diff options
203 files changed, 2717 insertions, 1084 deletions
@@ -70,6 +70,8 @@ Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang@unisoc.com> Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang7@gmail.com> Bart Van Assche <bvanassche@acm.org> <bart.vanassche@sandisk.com> Bart Van Assche <bvanassche@acm.org> <bart.vanassche@wdc.com> +Ben Dooks <ben-linux@fluff.org> <ben.dooks@simtec.co.uk> +Ben Dooks <ben-linux@fluff.org> <ben.dooks@sifive.com> Ben Gardner <bgardner@wabtec.com> Ben M Cahill <ben.m.cahill@intel.com> Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net> diff --git a/Documentation/devicetree/bindings/firmware/qcom,scm.yaml b/Documentation/devicetree/bindings/firmware/qcom,scm.yaml index 367d04ad1923..83381f3a1341 100644 --- a/Documentation/devicetree/bindings/firmware/qcom,scm.yaml +++ b/Documentation/devicetree/bindings/firmware/qcom,scm.yaml @@ -71,6 +71,8 @@ properties: minItems: 1 maxItems: 3 + dma-coherent: true + interconnects: maxItems: 1 diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index f79987e16cf4..e7b07313550a 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -14,10 +14,6 @@ Programs can view status of the events via /sys/kernel/tracing/user_events_status and can both register and write data out via /sys/kernel/tracing/user_events_data. -Programs can also use /sys/kernel/tracing/dynamic_events to register and -delete user based events via the u: prefix. The format of the command to -dynamic_events is the same as the ioctl with the u: prefix applied. - Typically programs will register a set of events that they wish to expose to tools that can read trace_events (such as ftrace and perf). The registration process tells the kernel which address and bit to reflect if any tool has @@ -144,6 +140,9 @@ its name. Delete will only succeed if there are no references left to the event (in both user and kernel space). User programs should use a separate file to request deletes than the one used for registration due to this. +**NOTE:** By default events will auto-delete when there are no references left +to the event. Flags in the future may change this logic. + Unregistering ------------- If after registering an event it is no longer wanted to be updated then it can diff --git a/MAINTAINERS b/MAINTAINERS index 6992b7cc7095..35e19594640d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9972,8 +9972,9 @@ M: Miquel Raynal <miquel.raynal@bootlin.com> L: linux-wpan@vger.kernel.org S: Maintained W: https://linux-wpan.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan-next.git +Q: https://patchwork.kernel.org/project/linux-wpan/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git F: Documentation/networking/ieee802154.rst F: drivers/net/ieee802154/ F: include/linux/ieee802154.h @@ -13269,10 +13270,11 @@ F: drivers/memory/mtk-smi.c F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER -M: Sean Wang <sean.wang@mediatek.com> +M: Arınç ÃœNAL <arinc.unal@arinc9.com> +M: Daniel Golle <daniel@makrotopia.org> M: Landen Chao <Landen.Chao@mediatek.com> M: DENG Qingfang <dqfext@gmail.com> -M: Daniel Golle <daniel@makrotopia.org> +M: Sean Wang <sean.wang@mediatek.com> L: netdev@vger.kernel.org S: Maintained F: drivers/net/dsa/mt7530-mdio.c @@ -16384,7 +16386,7 @@ F: Documentation/devicetree/bindings/pci/intel,keembay-pcie* F: drivers/pci/controller/dwc/pcie-keembay.c PCIE DRIVER FOR INTEL LGM GW SOC -M: Rahul Tanwar <rtanwar@maxlinear.com> +M: Chuanhua Lei <lchuanhua@maxlinear.com> L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/intel-gw-pcie.yaml @@ -17827,7 +17829,7 @@ F: tools/testing/selftests/rtc/ Real-time Linux Analysis (RTLA) tools M: Daniel Bristot de Oliveira <bristot@kernel.org> M: Steven Rostedt <rostedt@goodmis.org> -L: linux-trace-devel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/tools/rtla/ F: tools/tracing/rtla/ @@ -18397,7 +18399,7 @@ F: drivers/infiniband/ulp/rtrs/ RUNTIME VERIFICATION (RV) M: Daniel Bristot de Oliveira <bristot@kernel.org> M: Steven Rostedt <rostedt@goodmis.org> -L: linux-trace-devel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/trace/rv/ F: include/linux/rv.h diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index f4db3e75d75f..f3cd04ff022d 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -222,6 +222,11 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) return false; } +static inline bool kvm_set_pmuserenr(u64 val) +{ + return false; +} + /* PMU Version in DFR Register */ #define ARMV8_PMU_DFR_VER_NI 0 #define ARMV8_PMU_DFR_VER_V3P4 0x5 diff --git a/arch/arm64/boot/dts/qcom/sc7180-idp.dts b/arch/arm64/boot/dts/qcom/sc7180-idp.dts index 9f052270e090..299ef5dc225a 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-idp.dts +++ b/arch/arm64/boot/dts/qcom/sc7180-idp.dts @@ -393,6 +393,11 @@ qcom,spare-regs = <&tcsr_regs_2 0xb3e4>; }; +&scm { + /* TF-A firmware maps memory cached so mark dma-coherent to match. */ + dma-coherent; +}; + &sdhc_1 { status = "okay"; diff --git a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi index ca6920de7ea8..1472e7f10831 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi @@ -892,6 +892,11 @@ hp_i2c: &i2c9 { qcom,spare-regs = <&tcsr_regs_2 0xb3e4>; }; +&scm { + /* TF-A firmware maps memory cached so mark dma-coherent to match. */ + dma-coherent; +}; + &sdhc_1 { status = "okay"; diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi b/arch/arm64/boot/dts/qcom/sc7180.dtsi index f479cab8ab45..a65be760d1a7 100644 --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi @@ -369,7 +369,7 @@ }; firmware { - scm { + scm: scm { compatible = "qcom,scm-sc7180", "qcom,scm"; }; }; diff --git a/arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi b/arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi index f562e4d2b655..2e1cd219fc18 100644 --- a/arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi @@ -79,6 +79,11 @@ firmware-name = "ath11k/WCN6750/hw1.0/wpss.mdt"; }; +&scm { + /* TF-A firmware maps memory cached so mark dma-coherent to match. */ + dma-coherent; +}; + &wifi { status = "okay"; diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi b/arch/arm64/boot/dts/qcom/sc7280.dtsi index 2fd1d3c0eb34..36f0bb9b3cbb 100644 --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi @@ -656,7 +656,7 @@ }; firmware { - scm { + scm: scm { compatible = "qcom,scm-sc7280", "qcom,scm"; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3308.dtsi b/arch/arm64/boot/dts/rockchip/rk3308.dtsi index dd228a256a32..2ae4bb7d5e62 100644 --- a/arch/arm64/boot/dts/rockchip/rk3308.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3308.dtsi @@ -97,6 +97,7 @@ l2: l2-cache { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts index f69a38f42d2d..0a27fa5271f5 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts @@ -37,7 +37,8 @@ vin-supply = <&vcc_io>; }; - vcc_host_5v: vcc-host-5v-regulator { + /* Common enable line for all of the rails mentioned in the labels */ + vcc_host_5v: vcc_host1_5v: vcc_otg_5v: vcc-host-5v-regulator { compatible = "regulator-fixed"; gpio = <&gpio0 RK_PA2 GPIO_ACTIVE_LOW>; pinctrl-names = "default"; @@ -48,17 +49,6 @@ vin-supply = <&vcc_sys>; }; - vcc_host1_5v: vcc_otg_5v: vcc-host1-5v-regulator { - compatible = "regulator-fixed"; - gpio = <&gpio0 RK_PA2 GPIO_ACTIVE_LOW>; - pinctrl-names = "default"; - pinctrl-0 = <&usb20_host_drv>; - regulator-name = "vcc_host1_5v"; - regulator-always-on; - regulator-boot-on; - vin-supply = <&vcc_sys>; - }; - vcc_sys: vcc-sys { compatible = "regulator-fixed"; regulator-name = "vcc_sys"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 6d7a7bf72ac7..e729e7a22b23 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -103,6 +103,7 @@ l2: l2-cache0 { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-soquartz-cm4.dts b/arch/arm64/boot/dts/rockchip/rk3566-soquartz-cm4.dts index 263ce40770dd..cddf6cd2fecb 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-soquartz-cm4.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-soquartz-cm4.dts @@ -28,6 +28,16 @@ regulator-max-microvolt = <5000000>; vin-supply = <&vcc12v_dcin>; }; + + vcc_sd_pwr: vcc-sd-pwr-regulator { + compatible = "regulator-fixed"; + regulator-name = "vcc_sd_pwr"; + regulator-always-on; + regulator-boot-on; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + vin-supply = <&vcc3v3_sys>; + }; }; /* phy for pcie */ @@ -130,13 +140,7 @@ }; &sdmmc0 { - vmmc-supply = <&sdmmc_pwr>; - status = "okay"; -}; - -&sdmmc_pwr { - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; + vmmc-supply = <&vcc_sd_pwr>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi index 102e448bc026..31aa2b8efe39 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi @@ -104,16 +104,6 @@ regulator-max-microvolt = <3300000>; vin-supply = <&vcc5v0_sys>; }; - - sdmmc_pwr: sdmmc-pwr-regulator { - compatible = "regulator-fixed"; - enable-active-high; - gpio = <&gpio0 RK_PA5 GPIO_ACTIVE_HIGH>; - pinctrl-names = "default"; - pinctrl-0 = <&sdmmc_pwr_h>; - regulator-name = "sdmmc_pwr"; - status = "disabled"; - }; }; &cpu0 { @@ -155,6 +145,19 @@ status = "disabled"; }; +&gpio0 { + nextrst-hog { + gpio-hog; + /* + * GPIO_ACTIVE_LOW + output-low here means that the pin is set + * to high, because output-low decides the value pre-inversion. + */ + gpios = <RK_PA5 GPIO_ACTIVE_LOW>; + line-name = "nEXTRST"; + output-low; + }; +}; + &gpu { mali-supply = <&vdd_gpu>; status = "okay"; @@ -538,12 +541,6 @@ rockchip,pins = <2 RK_PC2 RK_FUNC_GPIO &pcfg_pull_none>; }; }; - - sdmmc-pwr { - sdmmc_pwr_h: sdmmc-pwr-h { - rockchip,pins = <0 RK_PA5 RK_FUNC_GPIO &pcfg_pull_none>; - }; - }; }; &pmu_io_domains { diff --git a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5c.dts b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5c.dts index f70ca9f0470a..c718b8dbb9c6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5c.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5c.dts @@ -106,7 +106,7 @@ rockchip-key { reset_button_pin: reset-button-pin { - rockchip,pins = <4 RK_PA0 RK_FUNC_GPIO &pcfg_pull_up>; + rockchip,pins = <0 RK_PB7 RK_FUNC_GPIO &pcfg_pull_up>; }; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts index 2a1118f15c29..b6ad8328c7eb 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts @@ -134,4 +134,3 @@ }; }; }; - diff --git a/arch/arm64/boot/dts/rockchip/rk3568.dtsi b/arch/arm64/boot/dts/rockchip/rk3568.dtsi index ba67b58f05b7..f1be76a54ceb 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3568.dtsi @@ -94,9 +94,10 @@ power-domains = <&power RK3568_PD_PIPE>; reg = <0x3 0xc0400000 0x0 0x00400000>, <0x0 0xfe270000 0x0 0x00010000>, - <0x3 0x7f000000 0x0 0x01000000>; - ranges = <0x01000000 0x0 0x3ef00000 0x3 0x7ef00000 0x0 0x00100000>, - <0x02000000 0x0 0x00000000 0x3 0x40000000 0x0 0x3ef00000>; + <0x0 0xf2000000 0x0 0x00100000>; + ranges = <0x01000000 0x0 0xf2100000 0x0 0xf2100000 0x0 0x00100000>, + <0x02000000 0x0 0xf2200000 0x0 0xf2200000 0x0 0x01e00000>, + <0x03000000 0x0 0x40000000 0x3 0x40000000 0x0 0x40000000>; reg-names = "dbi", "apb", "config"; resets = <&cru SRST_PCIE30X1_POWERUP>; reset-names = "pipe"; @@ -146,9 +147,10 @@ power-domains = <&power RK3568_PD_PIPE>; reg = <0x3 0xc0800000 0x0 0x00400000>, <0x0 0xfe280000 0x0 0x00010000>, - <0x3 0xbf000000 0x0 0x01000000>; - ranges = <0x01000000 0x0 0x3ef00000 0x3 0xbef00000 0x0 0x00100000>, - <0x02000000 0x0 0x00000000 0x3 0x80000000 0x0 0x3ef00000>; + <0x0 0xf0000000 0x0 0x00100000>; + ranges = <0x01000000 0x0 0xf0100000 0x0 0xf0100000 0x0 0x00100000>, + <0x02000000 0x0 0xf0200000 0x0 0xf0200000 0x0 0x01e00000>, + <0x03000000 0x0 0x40000000 0x3 0x80000000 0x0 0x40000000>; reg-names = "dbi", "apb", "config"; resets = <&cru SRST_PCIE30X2_POWERUP>; reset-names = "pipe"; diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi b/arch/arm64/boot/dts/rockchip/rk356x.dtsi index f62e0fd881a9..61680c7ac489 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi @@ -952,7 +952,7 @@ compatible = "rockchip,rk3568-pcie"; reg = <0x3 0xc0000000 0x0 0x00400000>, <0x0 0xfe260000 0x0 0x00010000>, - <0x3 0x3f000000 0x0 0x01000000>; + <0x0 0xf4000000 0x0 0x00100000>; reg-names = "dbi", "apb", "config"; interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>, @@ -982,8 +982,9 @@ phys = <&combphy2 PHY_TYPE_PCIE>; phy-names = "pcie-phy"; power-domains = <&power RK3568_PD_PIPE>; - ranges = <0x01000000 0x0 0x3ef00000 0x3 0x3ef00000 0x0 0x00100000 - 0x02000000 0x0 0x00000000 0x3 0x00000000 0x0 0x3ef00000>; + ranges = <0x01000000 0x0 0xf4100000 0x0 0xf4100000 0x0 0x00100000>, + <0x02000000 0x0 0xf4200000 0x0 0xf4200000 0x0 0x01e00000>, + <0x03000000 0x0 0x40000000 0x3 0x00000000 0x0 0x40000000>; resets = <&cru SRST_PCIE20_POWERUP>; reset-names = "pipe"; #address-cells = <3>; diff --git a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi index 657c019d27fa..a3124bd2e092 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi @@ -229,6 +229,7 @@ cache-line-size = <64>; cache-sets = <512>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -238,6 +239,7 @@ cache-line-size = <64>; cache-sets = <512>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -247,6 +249,7 @@ cache-line-size = <64>; cache-sets = <512>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -256,6 +259,7 @@ cache-line-size = <64>; cache-sets = <512>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -265,6 +269,7 @@ cache-line-size = <64>; cache-sets = <1024>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -274,6 +279,7 @@ cache-line-size = <64>; cache-sets = <1024>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -283,6 +289,7 @@ cache-line-size = <64>; cache-sets = <1024>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -292,6 +299,7 @@ cache-line-size = <64>; cache-sets = <1024>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -301,6 +309,7 @@ cache-line-size = <64>; cache-sets = <4096>; cache-level = <3>; + cache-unified; }; }; diff --git a/arch/arm64/hyperv/mshyperv.c b/arch/arm64/hyperv/mshyperv.c index a406454578f0..f1b8a04ee9f2 100644 --- a/arch/arm64/hyperv/mshyperv.c +++ b/arch/arm64/hyperv/mshyperv.c @@ -67,7 +67,7 @@ static int __init hyperv_init(void) if (ret) return ret; - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm64/hyperv_init:online", + ret = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "arm64/hyperv_init:online", hv_common_cpu_init, hv_common_cpu_die); if (ret < 0) { hv_common_free(); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e7e19ef6993..9787503ff43f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -699,6 +699,8 @@ struct kvm_vcpu_arch { #define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) /* Software step state is Active-pending */ #define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5)) +/* PMUSERENR for the guest EL0 is on physical CPU */ +#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ @@ -1065,9 +1067,14 @@ void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u32 clr); +bool kvm_set_pmuserenr(u64 val); #else static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} +static inline bool kvm_set_pmuserenr(u64 val) +{ + return false; +} #endif void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5c15c58f90cc..4fe217efa218 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -82,8 +82,14 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * EL1 instead of being trapped to EL2. */ if (kvm_arm_support_pmu_v3()) { + struct kvm_cpu_context *hctxt; + write_sysreg(0, pmselr_el0); + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); @@ -106,8 +112,13 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); write_sysreg(0, hstr_el2); - if (kvm_arm_support_pmu_v3()) - write_sysreg(0, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) { + struct kvm_cpu_context *hctxt; + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); + vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); + } if (cpus_have_final_cap(ARM64_SME)) { sysreg_clear_set_s(SYS_HFGRTR_EL2, 0, diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 7a1aa511e7da..b37e7c96efea 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -92,14 +92,28 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) } NOKPROBE_SYMBOL(__deactivate_traps); +/* + * Disable IRQs in {activate,deactivate}_traps_vhe_{load,put}() to + * prevent a race condition between context switching of PMUSERENR_EL0 + * in __{activate,deactivate}_traps_common() and IPIs that attempts to + * update PMUSERENR_EL0. See also kvm_set_pmuserenr(). + */ void activate_traps_vhe_load(struct kvm_vcpu *vcpu) { + unsigned long flags; + + local_irq_save(flags); __activate_traps_common(vcpu); + local_irq_restore(flags); } void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) { + unsigned long flags; + + local_irq_save(flags); __deactivate_traps_common(vcpu); + local_irq_restore(flags); } static const exit_handler_fn hyp_exit_handlers[] = { diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 491ca7eb2a4c..560650972478 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -700,7 +700,25 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) mutex_lock(&arm_pmus_lock); - cpu = smp_processor_id(); + /* + * It is safe to use a stale cpu to iterate the list of PMUs so long as + * the same value is used for the entirety of the loop. Given this, and + * the fact that no percpu data is used for the lookup there is no need + * to disable preemption. + * + * It is still necessary to get a valid cpu, though, to probe for the + * default PMU instance as userspace is not required to specify a PMU + * type. In order to uphold the preexisting behavior KVM selects the + * PMU instance for the core where the first call to the + * KVM_ARM_VCPU_PMU_V3_CTRL attribute group occurs. A dependent use case + * would be a user with disdain of all things big.LITTLE that affines + * the VMM to a particular cluster of cores. + * + * In any case, userspace should just do the sane thing and use the UAPI + * to select a PMU type directly. But, be wary of the baggage being + * carried here. + */ + cpu = raw_smp_processor_id(); list_for_each_entry(entry, &arm_pmus, entry) { tmp = entry->arm_pmu; diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 7887133d15f0..121f1a14c829 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -209,3 +209,30 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) kvm_vcpu_pmu_enable_el0(events_host); kvm_vcpu_pmu_disable_el0(events_guest); } + +/* + * With VHE, keep track of the PMUSERENR_EL0 value for the host EL0 on the pCPU + * where PMUSERENR_EL0 for the guest is loaded, since PMUSERENR_EL0 is switched + * to the value for the guest on vcpu_load(). The value for the host EL0 + * will be restored on vcpu_put(), before returning to userspace. + * This isn't necessary for nVHE, as the register is context switched for + * every guest enter/exit. + * + * Return true if KVM takes care of the register. Otherwise return false. + */ +bool kvm_set_pmuserenr(u64 val) +{ + struct kvm_cpu_context *hctxt; + struct kvm_vcpu *vcpu; + + if (!kvm_arm_support_pmu_v3() || !has_vhe()) + return false; + + vcpu = kvm_get_running_vcpu(); + if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU)) + return false; + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val; + return true; +} diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 6eafc2c45cfc..c8c3cb812783 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -446,6 +446,7 @@ int vgic_lazy_init(struct kvm *kvm) int kvm_vgic_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + enum vgic_type type; gpa_t dist_base; int ret = 0; @@ -460,10 +461,13 @@ int kvm_vgic_map_resources(struct kvm *kvm) if (!irqchip_in_kernel(kvm)) goto out; - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { ret = vgic_v2_map_resources(kvm); - else + type = VGIC_V2; + } else { ret = vgic_v3_map_resources(kvm); + type = VGIC_V3; + } if (ret) { __kvm_vgic_destroy(kvm); @@ -473,8 +477,7 @@ int kvm_vgic_map_resources(struct kvm *kvm) dist_base = dist->vgic_dist_base; mutex_unlock(&kvm->arch.config_lock); - ret = vgic_register_dist_iodev(kvm, dist_base, - kvm_vgic_global_state.type); + ret = vgic_register_dist_iodev(kvm, dist_base, type); if (ret) { kvm_err("Unable to register VGIC dist MMIO regions\n"); kvm_vgic_destroy(kvm); diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index ce804b7bf84e..0bd4866d9824 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -795,12 +795,20 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) goto out; if (current->active_mm == mm) { + unsigned long flags; + WARN_ON_ONCE(current->mm != NULL); - /* Is a kernel thread and is using mm as the lazy tlb */ + /* + * It is a kernel thread and is using mm as the lazy tlb, so + * switch it to init_mm. This is not always called from IPI + * (e.g., flush_type_needed), so must disable irqs. + */ + local_irq_save(flags); mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; switch_mm_irqs_off(mm, &init_mm, current); mmdrop_lazy_tlb(mm); + local_irq_restore(flags); } /* diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b39975977c03..fdc2e3abd615 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -305,6 +305,18 @@ ifeq ($(RETPOLINE_CFLAGS),) endif endif +ifdef CONFIG_UNWINDER_ORC +orc_hash_h := arch/$(SRCARCH)/include/generated/asm/orc_hash.h +orc_hash_sh := $(srctree)/scripts/orc_hash.sh +targets += $(orc_hash_h) +quiet_cmd_orc_hash = GEN $@ + cmd_orc_hash = mkdir -p $(dir $@); \ + $(CONFIG_SHELL) $(orc_hash_sh) < $< > $@ +$(orc_hash_h): $(srctree)/arch/x86/include/asm/orc_types.h $(orc_hash_sh) FORCE + $(call if_changed,orc_hash) +archprepare: $(orc_hash_h) +endif + archclean: $(Q)rm -rf $(objtree)/arch/i386 $(Q)rm -rf $(objtree)/arch/x86_64 diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 89b9c1cebb61..27f3a7b34bd5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -349,6 +349,16 @@ static struct event_constraint intel_spr_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_gnr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -6496,6 +6506,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_EMERALDRAPIDS_X: x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + x86_pmu.extra_regs = intel_spr_extra_regs; fallthrough; case INTEL_FAM6_GRANITERAPIDS_X: case INTEL_FAM6_GRANITERAPIDS_D: @@ -6506,7 +6517,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_spr_event_constraints; x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; - x86_pmu.extra_regs = intel_spr_extra_regs; + if (!x86_pmu.extra_regs) + x86_pmu.extra_regs = intel_gnr_extra_regs; x86_pmu.limit_period = spr_limit_period; x86_pmu.pebs_ept = 1; x86_pmu.pebs_aliases = NULL; @@ -6650,6 +6662,7 @@ __init int intel_pmu_init(void) pmu->pebs_constraints = intel_grt_pebs_event_constraints; pmu->extra_regs = intel_grt_extra_regs; if (is_mtl(boot_cpu_data.x86_model)) { + x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_gnr_extra_regs; x86_pmu.pebs_latency_data = mtl_latency_data_small; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a5f9474f08e1..6c04b52f139b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -416,7 +416,7 @@ void __init hyperv_init(void) goto free_vp_assist_page; } - cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online", hv_cpu_init, hv_cpu_die); if (cpuhp < 0) goto free_ghcb_page; diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 1ba5d3b99b16..85d38b9f3586 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -20,6 +20,8 @@ void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = x86_init_noop; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 1e51650b79d7..4f1ce5fc4e19 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +generated-y += orc_hash.h generated-y += syscalls_32.h generated-y += syscalls_64.h generated-y += syscalls_x32.h diff --git a/arch/x86/include/asm/orc_header.h b/arch/x86/include/asm/orc_header.h new file mode 100644 index 000000000000..07bacf3e160e --- /dev/null +++ b/arch/x86/include/asm/orc_header.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ + +#ifndef _ORC_HEADER_H +#define _ORC_HEADER_H + +#include <linux/types.h> +#include <linux/compiler.h> +#include <asm/orc_hash.h> + +/* + * The header is currently a 20-byte hash of the ORC entry definition; see + * scripts/orc_hash.sh. + */ +#define ORC_HEADER \ + __used __section(".orc_header") __aligned(4) \ + static const u8 orc_header[] = { ORC_HASH } + +#endif /* _ORC_HEADER_H */ diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6bde05a86b4e..896bc41cb2ba 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -97,7 +97,10 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) return 1; return apic == &apic_x2apic_phys; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 3ac50b7298d1..4d8e518365f4 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -7,6 +7,9 @@ #include <asm/unwind.h> #include <asm/orc_types.h> #include <asm/orc_lookup.h> +#include <asm/orc_header.h> + +ORC_HEADER; #define orc_warn(fmt, ...) \ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 557f0fe25dff..37db264866b6 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -172,10 +172,10 @@ void __meminit init_trampoline_kaslr(void) set_p4d(p4d_tramp, __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)); } else { - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1056bbf55b17..438adb695daa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2570,7 +2570,7 @@ out_image: } if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, proglen, pass + 1, image); + bpf_jit_dump(prog->len, proglen, pass + 1, rw_image); if (image) { if (!prog->is_func || extra_pass) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f0b5c9c41cde..dce1548a7a0c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -970,6 +970,7 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); struct llist_node *lnode; struct blkg_iostat_set *bisc, *next_bisc; + unsigned long flags; rcu_read_lock(); @@ -983,7 +984,7 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) * When flushing from cgroup, cgroup_rstat_lock is always held, so * this lock won't cause contention most of time. */ - raw_spin_lock(&blkg_stat_lock); + raw_spin_lock_irqsave(&blkg_stat_lock, flags); /* * Iterate only the iostat_cpu's queued in the lockless list. @@ -1009,7 +1010,7 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); } - raw_spin_unlock(&blkg_stat_lock); + raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); out: rcu_read_unlock(); } diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index e42c1f9ffff8..e9a1cb779b30 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -23,6 +23,7 @@ #include <linux/wait.h> #include <drm/drm_file.h> #include <drm/drm_gem.h> +#include <drm/drm_prime.h> #include <drm/drm_print.h> #include <uapi/drm/qaic_accel.h> @@ -616,8 +617,7 @@ static void qaic_free_object(struct drm_gem_object *obj) if (obj->import_attach) { /* DMABUF/PRIME Path */ - dma_buf_detach(obj->import_attach->dmabuf, obj->import_attach); - dma_buf_put(obj->import_attach->dmabuf); + drm_prime_gem_destroy(obj, NULL); } else { /* Private buffer allocation path */ qaic_free_sgt(bo->sgt); diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index ebf8fd373cf7..79bbfe00d241 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -101,8 +101,6 @@ acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, acpi_event_status *event_status); -acpi_status acpi_hw_disable_all_gpes(void); - acpi_status acpi_hw_enable_all_runtime_gpes(void); acpi_status acpi_hw_enable_all_wakeup_gpes(void); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 72470b9f16c4..f32570f72b90 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -636,11 +636,19 @@ static int acpi_suspend_enter(suspend_state_t pm_state) } /* - * Disable and clear GPE status before interrupt is enabled. Some GPEs - * (like wakeup GPE) haven't handler, this can avoid such GPE misfire. - * acpi_leave_sleep_state will reenable specific GPEs later + * Disable all GPE and clear their status bits before interrupts are + * enabled. Some GPEs (like wakeup GPEs) have no handlers and this can + * prevent them from producing spurious interrups. + * + * acpi_leave_sleep_state() will reenable specific GPEs later. + * + * Because this code runs on one CPU with disabled interrupts (all of + * the other CPUs are offline at this time), it need not acquire any + * sleeping locks which may trigger an implicit preemption point even + * if there is no contention, so avoid doing that by using a low-level + * library routine here. */ - acpi_disable_all_gpes(); + acpi_hw_disable_all_gpes(); /* Allow EC transactions to happen. */ acpi_ec_unblock_transactions(); diff --git a/drivers/base/regmap/regmap-spi-avmm.c b/drivers/base/regmap/regmap-spi-avmm.c index 4c2b94b3e30b..6af692844c19 100644 --- a/drivers/base/regmap/regmap-spi-avmm.c +++ b/drivers/base/regmap/regmap-spi-avmm.c @@ -660,7 +660,7 @@ static const struct regmap_bus regmap_spi_avmm_bus = { .reg_format_endian_default = REGMAP_ENDIAN_NATIVE, .val_format_endian_default = REGMAP_ENDIAN_NATIVE, .max_raw_read = SPI_AVMM_VAL_SIZE * MAX_READ_CNT, - .max_raw_write = SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, + .max_raw_write = SPI_AVMM_REG_SIZE + SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, .free_context = spi_avmm_bridge_ctx_free, }; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2b918e28acaa..b47358da92a2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -348,63 +348,33 @@ static inline void virtblk_request_done(struct request *req) blk_mq_end_request(req, status); } -static void virtblk_complete_batch(struct io_comp_batch *iob) -{ - struct request *req; - - rq_list_for_each(&iob->req_list, req) { - virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); - virtblk_cleanup_cmd(req); - } - blk_mq_end_request_batch(iob); -} - -static int virtblk_handle_req(struct virtio_blk_vq *vq, - struct io_comp_batch *iob) -{ - struct virtblk_req *vbr; - int req_done = 0; - unsigned int len; - - while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { - struct request *req = blk_mq_rq_from_pdu(vbr); - - if (likely(!blk_should_fake_timeout(req->q)) && - !blk_mq_complete_request_remote(req) && - !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), - virtblk_complete_batch)) - virtblk_request_done(req); - req_done++; - } - - return req_done; -} - static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; - struct virtio_blk_vq *vblk_vq = &vblk->vqs[vq->index]; - int req_done = 0; + bool req_done = false; + int qid = vq->index; + struct virtblk_req *vbr; unsigned long flags; - DEFINE_IO_COMP_BATCH(iob); + unsigned int len; - spin_lock_irqsave(&vblk_vq->lock, flags); + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); do { virtqueue_disable_cb(vq); - req_done += virtblk_handle_req(vblk_vq, &iob); + while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); + req_done = true; + } if (unlikely(virtqueue_is_broken(vq))) break; } while (!virtqueue_enable_cb(vq)); - if (req_done) { - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - - /* In case queue is stopped waiting for more buffers. */ + /* In case queue is stopped waiting for more buffers. */ + if (req_done) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); - } - spin_unlock_irqrestore(&vblk_vq->lock, flags); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) @@ -1283,15 +1253,37 @@ static void virtblk_map_queues(struct blk_mq_tag_set *set) } } +static void virtblk_complete_batch(struct io_comp_batch *iob) +{ + struct request *req; + + rq_list_for_each(&iob->req_list, req) { + virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); + virtblk_cleanup_cmd(req); + } + blk_mq_end_request_batch(iob); +} + static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct virtio_blk *vblk = hctx->queue->queuedata; struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx); + struct virtblk_req *vbr; unsigned long flags; + unsigned int len; int found = 0; spin_lock_irqsave(&vq->lock, flags); - found = virtblk_handle_req(vq, iob); + + while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + + found++; + if (!blk_mq_complete_request_remote(req) && + !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), + virtblk_complete_batch)) + virtblk_request_done(req); + } if (found) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 01f2e86f3f7c..12cf6bb2e3ce 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -12,7 +12,6 @@ #include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/udmabuf.h> -#include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/iosys-map.h> @@ -207,9 +206,7 @@ static long udmabuf_create(struct miscdevice *device, struct udmabuf *ubuf; struct dma_buf *buf; pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit; - struct page *page, *hpage = NULL; - pgoff_t subpgoff, maxsubpgs; - struct hstate *hpstate; + struct page *page; int seals, ret = -EINVAL; u32 i, flags; @@ -245,7 +242,7 @@ static long udmabuf_create(struct miscdevice *device, if (!memfd) goto err; mapping = memfd->f_mapping; - if (!shmem_mapping(mapping) && !is_file_hugepages(memfd)) + if (!shmem_mapping(mapping)) goto err; seals = memfd_fcntl(memfd, F_GET_SEALS, 0); if (seals == -EINVAL) @@ -256,48 +253,16 @@ static long udmabuf_create(struct miscdevice *device, goto err; pgoff = list[i].offset >> PAGE_SHIFT; pgcnt = list[i].size >> PAGE_SHIFT; - if (is_file_hugepages(memfd)) { - hpstate = hstate_file(memfd); - pgoff = list[i].offset >> huge_page_shift(hpstate); - subpgoff = (list[i].offset & - ~huge_page_mask(hpstate)) >> PAGE_SHIFT; - maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT; - } for (pgidx = 0; pgidx < pgcnt; pgidx++) { - if (is_file_hugepages(memfd)) { - if (!hpage) { - hpage = find_get_page_flags(mapping, pgoff, - FGP_ACCESSED); - if (!hpage) { - ret = -EINVAL; - goto err; - } - } - page = hpage + subpgoff; - get_page(page); - subpgoff++; - if (subpgoff == maxsubpgs) { - put_page(hpage); - hpage = NULL; - subpgoff = 0; - pgoff++; - } - } else { - page = shmem_read_mapping_page(mapping, - pgoff + pgidx); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto err; - } + page = shmem_read_mapping_page(mapping, pgoff + pgidx); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto err; } ubuf->pages[pgbuf++] = page; } fput(memfd); memfd = NULL; - if (hpage) { - put_page(hpage); - hpage = NULL; - } } exp_info.ops = &udmabuf_ops; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index abeff7dc0b58..34b9e7876538 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -361,24 +361,6 @@ static void __init efi_debugfs_init(void) static inline void efi_debugfs_init(void) {} #endif -static void refresh_nv_rng_seed(struct work_struct *work) -{ - u8 seed[EFI_RANDOM_SEED_SIZE]; - - get_random_bytes(seed, sizeof(seed)); - efi.set_variable(L"RandomSeed", &LINUX_EFI_RANDOM_SEED_TABLE_GUID, - EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, sizeof(seed), seed); - memzero_explicit(seed, sizeof(seed)); -} -static int refresh_nv_rng_seed_notification(struct notifier_block *nb, unsigned long action, void *data) -{ - static DECLARE_WORK(work, refresh_nv_rng_seed); - schedule_work(&work); - return NOTIFY_DONE; -} -static struct notifier_block refresh_nv_rng_seed_nb = { .notifier_call = refresh_nv_rng_seed_notification }; - /* * We register the efi subsystem with the firmware subsystem and the * efivars subsystem with the efi subsystem, if the system was booted with @@ -451,9 +433,6 @@ static int __init efisubsys_init(void) platform_device_register_simple("efi_secret", 0, NULL, 0); #endif - if (efi_rt_services_supported(EFI_RT_SUPPORTED_SET_VARIABLE)) - execute_with_initialized_rng(&refresh_nv_rng_seed_nb); - return 0; err_remove_group: diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 98939cd4a71e..745e5f67254e 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -221,8 +221,12 @@ static int sifive_gpio_probe(struct platform_device *pdev) return -ENODEV; } - for (i = 0; i < ngpio; i++) - chip->irq_number[i] = platform_get_irq(pdev, i); + for (i = 0; i < ngpio; i++) { + ret = platform_get_irq(pdev, i); + if (ret < 0) + return ret; + chip->irq_number[i] = ret; + } ret = bgpio_init(&chip->gc, dev, 4, chip->base + SIFIVE_GPIO_INPUT_VAL, diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index a7220e04a93e..5be8ad61523e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1745,7 +1745,7 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gc) } /* Remove all IRQ mappings and delete the domain */ - if (gc->irq.domain) { + if (!gc->irq.domain_is_allocated_externally && gc->irq.domain) { unsigned int irq; for (offset = 0; offset < gc->ngpio; offset++) { @@ -1791,6 +1791,15 @@ int gpiochip_irqchip_add_domain(struct gpio_chip *gc, gc->to_irq = gpiochip_to_irq; gc->irq.domain = domain; + gc->irq.domain_is_allocated_externally = true; + + /* + * Using barrier() here to prevent compiler from reordering + * gc->irq.initialized before adding irqdomain. + */ + barrier(); + + gc->irq.initialized = true; return 0; } diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c index 38dab76ae69e..e2e21ce79510 100644 --- a/drivers/gpu/drm/display/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c @@ -3404,7 +3404,7 @@ int drm_dp_add_payload_part2(struct drm_dp_mst_topology_mgr *mgr, /* Skip failed payloads */ if (payload->vc_start_slot == -1) { - drm_dbg_kms(state->dev, "Part 1 of payload creation for %s failed, skipping part 2\n", + drm_dbg_kms(mgr->dev, "Part 1 of payload creation for %s failed, skipping part 2\n", payload->port->connector->name); return -EIO; } diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 007f26d5f1a4..2f4d09ce027a 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -829,11 +829,22 @@ static void vmbus_wait_for_unload(void) if (completion_done(&vmbus_connection.unload_event)) goto completed; - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); + /* + * In a CoCo VM the synic_message_page is not allocated + * in hv_synic_alloc(). Instead it is set/cleared in + * hv_synic_enable_regs() and hv_synic_disable_regs() + * such that it is set only when the CPU is online. If + * not all present CPUs are online, the message page + * might be NULL, so skip such CPUs. + */ page_addr = hv_cpu->synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; @@ -867,11 +878,14 @@ completed: * maybe-pending messages on all CPUs to be able to receive new * messages after we reconnect. */ - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); page_addr = hv_cpu->synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; msg->header.message_type = HVMSG_NONE; } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 64f9ceca887b..542a1d53b303 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -364,13 +364,20 @@ int hv_common_cpu_init(unsigned int cpu) flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); - if (!(*inputarg)) - return -ENOMEM; - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + /* + * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already + * allocated if this CPU was previously online and then taken offline + */ + if (!*inputarg) { + *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); + if (!(*inputarg)) + return -ENOMEM; + + if (hv_root_partition) { + outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + } } msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); @@ -385,24 +392,17 @@ int hv_common_cpu_init(unsigned int cpu) int hv_common_cpu_die(unsigned int cpu) { - unsigned long flags; - void **inputarg, **outputarg; - void *mem; - - local_irq_save(flags); - - inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - mem = *inputarg; - *inputarg = NULL; - - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = NULL; - } - - local_irq_restore(flags); - - kfree(mem); + /* + * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory + * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg + * may be used by the Hyper-V vPCI driver in reassigning interrupts + * as part of the offlining process. The interrupt reassignment + * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and + * called this function. + * + * If a previously offlined CPU is brought back online again, the + * originally allocated memory is reused in hv_common_cpu_init(). + */ return 0; } diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 1c65a6dfb9fa..67f95a29aeca 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1372,7 +1372,7 @@ static int vmbus_bus_init(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", hv_synic_init, hv_synic_cleanup); if (ret < 0) - goto err_cpuhp; + goto err_alloc; hyperv_cpuhp_online = ret; ret = vmbus_connect(); @@ -1392,9 +1392,8 @@ static int vmbus_bus_init(void) err_connect: cpuhp_remove_state(hyperv_cpuhp_online); -err_cpuhp: - hv_synic_free(); err_alloc: + hv_synic_free(); if (vmbus_irq == -1) { hv_remove_vmbus_handler(); } else { diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index dc1ec6849775..e8a2e5984acb 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2078,10 +2078,6 @@ static struct protection_domain *protection_domain_alloc(unsigned int type) int mode = DEFAULT_PGTABLE_LEVEL; int ret; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - /* * Force IOMMU v1 page table when iommu=pt and * when allocating domain for pass-through devices. @@ -2097,6 +2093,10 @@ static struct protection_domain *protection_domain_alloc(unsigned int type) return NULL; } + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + switch (pgtable) { case AMD_IOMMU_V1: ret = protection_domain_init_v1(domain, mode); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 9e0c69958587..acffed750e3e 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_mutex without holding cmd->root_lock + * - must take shrinker_rwsem without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index b9461faa9f0d..9dd0409848ab 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1891,7 +1891,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs pmd root_lock). - * - must take shrinker_mutex without holding pmd->root_lock + * - must take shrinker_rwsem without holding pmd->root_lock */ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, THIN_MAX_CONCURRENT_LOCKS); diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c index 8648f7e63ca1..eea208856ce0 100644 --- a/drivers/mmc/host/bcm2835.c +++ b/drivers/mmc/host/bcm2835.c @@ -1403,8 +1403,8 @@ static int bcm2835_probe(struct platform_device *pdev) host->max_clk = clk_get_rate(clk); host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto err; } diff --git a/drivers/mmc/host/litex_mmc.c b/drivers/mmc/host/litex_mmc.c index 39c6707fdfdb..9af6b0902efe 100644 --- a/drivers/mmc/host/litex_mmc.c +++ b/drivers/mmc/host/litex_mmc.c @@ -649,6 +649,7 @@ static struct platform_driver litex_mmc_driver = { .driver = { .name = "litex-mmc", .of_match_table = litex_match, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; module_platform_driver(litex_mmc_driver); diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index b8514d9d5e73..ee9a25b900ae 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -991,11 +991,8 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id) if (data && !cmd->error) data->bytes_xfered = data->blksz * data->blocks; - if (meson_mmc_bounce_buf_read(data) || - meson_mmc_get_next_command(cmd)) - ret = IRQ_WAKE_THREAD; - else - ret = IRQ_HANDLED; + + return IRQ_WAKE_THREAD; } out: @@ -1007,9 +1004,6 @@ out: writel(start, host->regs + SD_EMMC_START); } - if (ret == IRQ_HANDLED) - meson_mmc_request_done(host->mmc, cmd->mrq); - return ret; } @@ -1192,8 +1186,8 @@ static int meson_mmc_probe(struct platform_device *pdev) return PTR_ERR(host->regs); host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) - return -EINVAL; + if (host->irq < 0) + return host->irq; cd_irq = platform_get_irq_optional(pdev, 1); mmc_gpio_set_cd_irq(mmc, cd_irq); diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index f2b2e8b0574e..696cbef3ff7d 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1735,7 +1735,8 @@ static void mmci_set_max_busy_timeout(struct mmc_host *mmc) return; if (host->variant->busy_timeout && mmc->actual_clock) - max_busy_timeout = ~0UL / (mmc->actual_clock / MSEC_PER_SEC); + max_busy_timeout = U32_MAX / DIV_ROUND_UP(mmc->actual_clock, + MSEC_PER_SEC); mmc->max_busy_timeout = max_busy_timeout; } diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index edade0e54a0c..9785ec91654f 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -2680,7 +2680,7 @@ static int msdc_drv_probe(struct platform_device *pdev) host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { - ret = -EINVAL; + ret = host->irq; goto host_free; } diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 629efbe639c4..b4f6a0a2fcb5 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -704,7 +704,7 @@ static int mvsd_probe(struct platform_device *pdev) } irq = platform_get_irq(pdev, 0); if (irq < 0) - return -ENXIO; + return irq; mmc = mmc_alloc_host(sizeof(struct mvsd_host), &pdev->dev); if (!mmc) { diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index ce78edfb402b..86454f1182bb 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1343,7 +1343,7 @@ static int mmc_omap_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) - return -ENXIO; + return irq; host->virt_base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(host->virt_base)) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 517dde777413..1e0f2d7774bd 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1791,9 +1791,11 @@ static int omap_hsmmc_probe(struct platform_device *pdev) } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - irq = platform_get_irq(pdev, 0); - if (res == NULL || irq < 0) + if (!res) return -ENXIO; + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(base)) diff --git a/drivers/mmc/host/owl-mmc.c b/drivers/mmc/host/owl-mmc.c index 6f9d31a886ba..1bf22b08b373 100644 --- a/drivers/mmc/host/owl-mmc.c +++ b/drivers/mmc/host/owl-mmc.c @@ -637,7 +637,7 @@ static int owl_mmc_probe(struct platform_device *pdev) owl_host->irq = platform_get_irq(pdev, 0); if (owl_host->irq < 0) { - ret = -EINVAL; + ret = owl_host->irq; goto err_release_channel; } diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 8f0e639236b1..edf2e6c14dc6 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -829,7 +829,7 @@ static int sdhci_acpi_probe(struct platform_device *pdev) host->ops = &sdhci_acpi_ops_dflt; host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { - err = -EINVAL; + err = host->irq; goto err_free; } diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 8ac81d57a3df..1877d583fe8c 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -2479,6 +2479,9 @@ static inline void sdhci_msm_get_of_property(struct platform_device *pdev, msm_host->ddr_config = DDR_CONFIG_POR_VAL; of_property_read_u32(node, "qcom,dll-config", &msm_host->dll_config); + + if (of_device_is_compatible(node, "qcom,msm8916-sdhci")) + host->quirks2 |= SDHCI_QUIRK2_BROKEN_64_BIT_DMA; } static int sdhci_msm_gcc_reset(struct device *dev, struct sdhci_host *host) diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index d463e2fd5b1a..c79035727b20 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -65,8 +65,8 @@ static int sdhci_probe(struct platform_device *pdev) host->hw_name = "sdhci"; host->ops = &sdhci_pltfm_ops; host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto err_host; } host->quirks = SDHCI_QUIRK_BROKEN_ADMA; diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 0fd4c9d644dd..5cf53348372a 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1400,7 +1400,7 @@ static int sh_mmcif_probe(struct platform_device *pdev) irq[0] = platform_get_irq(pdev, 0); irq[1] = platform_get_irq_optional(pdev, 1); if (irq[0] < 0) - return -ENXIO; + return irq[0]; reg = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(reg)) diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index 3db9f32d6a7b..69dcb8805e05 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1350,8 +1350,8 @@ static int sunxi_mmc_resource_request(struct sunxi_mmc_host *host, return ret; host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto error_disable_mmc; } diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 2f59917b105e..2e17903658fc 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -1757,8 +1757,10 @@ static int usdhi6_probe(struct platform_device *pdev) irq_cd = platform_get_irq_byname(pdev, "card detect"); irq_sd = platform_get_irq_byname(pdev, "data"); irq_sdio = platform_get_irq_byname(pdev, "SDIO"); - if (irq_sd < 0 || irq_sdio < 0) - return -ENODEV; + if (irq_sd < 0) + return irq_sd; + if (irq_sdio < 0) + return irq_sdio; mmc = mmc_alloc_host(sizeof(struct usdhi6_host), dev); if (!mmc) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 9bc54e1348cb..7e773c4ba046 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -399,6 +399,20 @@ static void mt7530_pll_setup(struct mt7530_priv *priv) core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN); } +/* If port 6 is available as a CPU port, always prefer that as the default, + * otherwise don't care. + */ +static struct dsa_port * +mt753x_preferred_default_local_cpu_port(struct dsa_switch *ds) +{ + struct dsa_port *cpu_dp = dsa_to_port(ds, 6); + + if (dsa_port_is_cpu(cpu_dp)) + return cpu_dp; + + return NULL; +} + /* Setup port 6 interface mode and TRGMII TX circuit */ static int mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface) @@ -985,6 +999,18 @@ unlock_exit: mutex_unlock(&priv->reg_mutex); } +static void +mt753x_trap_frames(struct mt7530_priv *priv) +{ + /* Trap BPDUs to the CPU port(s) */ + mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, + MT753X_BPDU_CPU_ONLY); + + /* Trap LLDP frames with :0E MAC DA to the CPU port(s) */ + mt7530_rmw(priv, MT753X_RGAC2, MT753X_R0E_PORT_FW_MASK, + MT753X_R0E_PORT_FW(MT753X_BPDU_CPU_ONLY)); +} + static int mt753x_cpu_port_enable(struct dsa_switch *ds, int port) { @@ -1007,9 +1033,16 @@ mt753x_cpu_port_enable(struct dsa_switch *ds, int port) UNU_FFP(BIT(port))); /* Set CPU port number */ - if (priv->id == ID_MT7621) + if (priv->id == ID_MT7530 || priv->id == ID_MT7621) mt7530_rmw(priv, MT7530_MFC, CPU_MASK, CPU_EN | CPU_PORT(port)); + /* Add the CPU port to the CPU port bitmap for MT7531 and the switch on + * the MT7988 SoC. Trapped frames will be forwarded to the CPU port that + * is affine to the inbound user port. + */ + if (priv->id == ID_MT7531 || priv->id == ID_MT7988) + mt7530_set(priv, MT7531_CFC, MT7531_CPU_PMAP(BIT(port))); + /* CPU port gets connected to all user ports of * the switch. */ @@ -2255,6 +2288,8 @@ mt7530_setup(struct dsa_switch *ds) priv->p6_interface = PHY_INTERFACE_MODE_NA; + mt753x_trap_frames(priv); + /* Enable and reset MIB counters */ mt7530_mib_reset(ds); @@ -2352,17 +2387,9 @@ static int mt7531_setup_common(struct dsa_switch *ds) { struct mt7530_priv *priv = ds->priv; - struct dsa_port *cpu_dp; int ret, i; - /* BPDU to CPU port */ - dsa_switch_for_each_cpu_port(cpu_dp, ds) { - mt7530_rmw(priv, MT7531_CFC, MT7531_CPU_PMAP_MASK, - BIT(cpu_dp->index)); - break; - } - mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, - MT753X_BPDU_CPU_ONLY); + mt753x_trap_frames(priv); /* Enable and reset MIB counters */ mt7530_mib_reset(ds); @@ -3085,6 +3112,7 @@ static int mt7988_setup(struct dsa_switch *ds) const struct dsa_switch_ops mt7530_switch_ops = { .get_tag_protocol = mtk_get_tag_protocol, .setup = mt753x_setup, + .preferred_default_local_cpu_port = mt753x_preferred_default_local_cpu_port, .get_strings = mt7530_get_strings, .get_ethtool_stats = mt7530_get_ethtool_stats, .get_sset_count = mt7530_get_sset_count, diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 5084f48a8869..08045b035e6a 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -54,6 +54,7 @@ enum mt753x_id { #define MT7531_MIRROR_PORT_GET(x) (((x) >> 16) & MIRROR_MASK) #define MT7531_MIRROR_PORT_SET(x) (((x) & MIRROR_MASK) << 16) #define MT7531_CPU_PMAP_MASK GENMASK(7, 0) +#define MT7531_CPU_PMAP(x) FIELD_PREP(MT7531_CPU_PMAP_MASK, x) #define MT753X_MIRROR_REG(id) ((((id) == ID_MT7531) || ((id) == ID_MT7988)) ? \ MT7531_CFC : MT7530_MFC) @@ -66,6 +67,11 @@ enum mt753x_id { #define MT753X_BPC 0x24 #define MT753X_BPDU_PORT_FW_MASK GENMASK(2, 0) +/* Register for :03 and :0E MAC DA frame control */ +#define MT753X_RGAC2 0x2c +#define MT753X_R0E_PORT_FW_MASK GENMASK(18, 16) +#define MT753X_R0E_PORT_FW(x) FIELD_PREP(MT753X_R0E_PORT_FW_MASK, x) + enum mt753x_bpdu_port_fw { MT753X_BPDU_FOLLOW_MFC, MT753X_BPDU_CPU_EXCLUDE = 4, diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 7e408bcc88de..0defd519ba62 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1135,8 +1135,8 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ? VLAN_ETH_HLEN : ETH_HLEN; if (skb->len <= 60 && - (lancer_chip(adapter) || skb_vlan_tag_present(skb)) && - is_ipv4_pkt(skb)) { + (lancer_chip(adapter) || BE3_chip(adapter) || + skb_vlan_tag_present(skb)) && is_ipv4_pkt(skb)) { ip = (struct iphdr *)ip_hdr(skb); pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len)); } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c index b1871e6c4006..00e50bd30189 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c @@ -54,6 +54,9 @@ static int phy_mode(enum dpmac_eth_if eth_if, phy_interface_t *if_mode) case DPMAC_ETH_IF_XFI: *if_mode = PHY_INTERFACE_MODE_10GBASER; break; + case DPMAC_ETH_IF_CAUI: + *if_mode = PHY_INTERFACE_MODE_25GBASER; + break; default: return -EINVAL; } @@ -79,6 +82,8 @@ static enum dpmac_eth_if dpmac_eth_if_mode(phy_interface_t if_mode) return DPMAC_ETH_IF_XFI; case PHY_INTERFACE_MODE_1000BASEX: return DPMAC_ETH_IF_1000BASEX; + case PHY_INTERFACE_MODE_25GBASER: + return DPMAC_ETH_IF_CAUI; default: return DPMAC_ETH_IF_MII; } @@ -418,7 +423,7 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac) mac->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD | MAC_2500FD | MAC_5000FD | - MAC_10000FD; + MAC_10000FD | MAC_25000FD; dpaa2_mac_set_supported_interfaces(mac); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 9c94807097cb..5ce28ff7685f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -732,7 +732,8 @@ static void mlx5e_rx_compute_wqe_bulk_params(struct mlx5e_params *params, static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, - struct mlx5e_rq_frags_info *info) + struct mlx5e_rq_frags_info *info, + u32 *xdp_frag_size) { u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu); int frag_size_max = DEFAULT_FRAG_SIZE; @@ -845,6 +846,8 @@ out: info->log_num_frags = order_base_2(info->num_frags); + *xdp_frag_size = info->num_frags > 1 && params->xdp_prog ? PAGE_SIZE : 0; + return 0; } @@ -989,7 +992,8 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, } default: /* MLX5_WQ_TYPE_CYCLIC */ MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames); - err = mlx5e_build_rq_frags_info(mdev, params, xsk, ¶m->frags_info); + err = mlx5e_build_rq_frags_info(mdev, params, xsk, ¶m->frags_info, + ¶m->xdp_frag_size); if (err) return err; ndsegs = param->frags_info.num_frags; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index a5d20f6d6d9c..6800949dafbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -24,6 +24,7 @@ struct mlx5e_rq_param { u32 rqc[MLX5_ST_SZ_DW(rqc)]; struct mlx5_wq_param wq; struct mlx5e_rq_frags_info frags_info; + u32 xdp_frag_size; }; struct mlx5e_sq_param { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index ead38ef69483..a254e728ac95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -2021,6 +2021,8 @@ void mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *attr) { + if (!attr->ct_attr.ft) /* no ct action, return */ + return; if (!attr->ct_attr.nf_ft) /* means only ct clear action, and not ct_clear,ct() */ return; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index ed279f450976..36826b582484 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -86,7 +86,7 @@ static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, if (err) return err; - return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0); + return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, c->napi.napi_id); } static int mlx5e_open_xsk_rq(struct mlx5e_channel *c, struct mlx5e_params *params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 55b38544422f..891d39b4bfd4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -61,16 +61,19 @@ static void mlx5e_ipsec_handle_tx_limit(struct work_struct *_work) struct mlx5e_ipsec_sa_entry *sa_entry = dwork->sa_entry; struct xfrm_state *x = sa_entry->x; - spin_lock(&x->lock); + if (sa_entry->attrs.drop) + return; + + spin_lock_bh(&x->lock); xfrm_state_check_expire(x); if (x->km.state == XFRM_STATE_EXPIRED) { sa_entry->attrs.drop = true; - mlx5e_accel_ipsec_fs_modify(sa_entry); - } - spin_unlock(&x->lock); + spin_unlock_bh(&x->lock); - if (sa_entry->attrs.drop) + mlx5e_accel_ipsec_fs_modify(sa_entry); return; + } + spin_unlock_bh(&x->lock); queue_delayed_work(sa_entry->ipsec->wq, &dwork->dwork, MLX5_IPSEC_RESCHED); @@ -1040,11 +1043,17 @@ err_fs: return err; } -static void mlx5e_xfrm_free_policy(struct xfrm_policy *x) +static void mlx5e_xfrm_del_policy(struct xfrm_policy *x) { struct mlx5e_ipsec_pol_entry *pol_entry = to_ipsec_pol_entry(x); mlx5e_accel_ipsec_fs_del_pol(pol_entry); +} + +static void mlx5e_xfrm_free_policy(struct xfrm_policy *x) +{ + struct mlx5e_ipsec_pol_entry *pol_entry = to_ipsec_pol_entry(x); + kfree(pol_entry); } @@ -1065,6 +1074,7 @@ static const struct xfrmdev_ops mlx5e_ipsec_packet_xfrmdev_ops = { .xdo_dev_state_update_curlft = mlx5e_xfrm_update_curlft, .xdo_dev_policy_add = mlx5e_xfrm_add_policy, + .xdo_dev_policy_delete = mlx5e_xfrm_del_policy, .xdo_dev_policy_free = mlx5e_xfrm_free_policy, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index df90e19066bc..a3554bde3e07 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -305,7 +305,17 @@ static void mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry, } mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs); + + /* It is safe to execute the modify below unlocked since the only flows + * that could affect this HW object, are create, destroy and this work. + * + * Creation flow can't co-exist with this modify work, the destruction + * flow would cancel this work, and this work is a single entity that + * can't conflict with it self. + */ + spin_unlock_bh(&sa_entry->x->lock); mlx5_accel_esp_modify_xfrm(sa_entry, &attrs); + spin_lock_bh(&sa_entry->x->lock); data.data_offset_condition_operand = MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET; @@ -431,7 +441,7 @@ static void mlx5e_ipsec_handle_event(struct work_struct *_work) aso = sa_entry->ipsec->aso; attrs = &sa_entry->attrs; - spin_lock(&sa_entry->x->lock); + spin_lock_bh(&sa_entry->x->lock); ret = mlx5e_ipsec_aso_query(sa_entry, NULL); if (ret) goto unlock; @@ -447,7 +457,7 @@ static void mlx5e_ipsec_handle_event(struct work_struct *_work) mlx5e_ipsec_handle_limits(sa_entry); unlock: - spin_unlock(&sa_entry->x->lock); + spin_unlock_bh(&sa_entry->x->lock); kfree(work); } @@ -596,7 +606,8 @@ int mlx5e_ipsec_aso_query(struct mlx5e_ipsec_sa_entry *sa_entry, do { ret = mlx5_aso_poll_cq(aso->aso, false); if (ret) - usleep_range(2, 10); + /* We are in atomic context */ + udelay(10); } while (ret && time_is_after_jiffies(expires)); spin_unlock_bh(&aso->lock); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a7c526ee5024..a5bdf78955d7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -641,7 +641,7 @@ static void mlx5e_free_mpwqe_rq_drop_page(struct mlx5e_rq *rq) } static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *params, - struct mlx5e_rq *rq) + u32 xdp_frag_size, struct mlx5e_rq *rq) { struct mlx5_core_dev *mdev = c->mdev; int err; @@ -665,7 +665,8 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param if (err) return err; - return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, c->napi.napi_id); + return __xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, c->napi.napi_id, + xdp_frag_size); } static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, @@ -2240,7 +2241,7 @@ static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param { int err; - err = mlx5e_init_rxq_rq(c, params, &c->rq); + err = mlx5e_init_rxq_rq(c, params, rq_params->xdp_frag_size, &c->rq); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 8a5a8703f0a3..b9b1da751a3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1439,6 +1439,7 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv, mlx5e_hairpin_flow_del(priv, flow); free_flow_post_acts(flow); + mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr); kvfree(attr->parse_attr); kfree(flow->attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 144e59480686..ec83e6483d1a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -511,10 +511,11 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, struct mlx5_flow_rule *dst; void *in_flow_context, *vlan; void *in_match_value; + int reformat_id = 0; unsigned int inlen; int dst_cnt_size; + u32 *in, action; void *in_dests; - u32 *in; int err; if (mlx5_set_extended_dest(dev, fte, &extended_dest)) @@ -553,22 +554,42 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(flow_context, in_flow_context, extended_destination, extended_dest); - if (extended_dest) { - u32 action; - action = fte->action.action & - ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; - MLX5_SET(flow_context, in_flow_context, action, action); - } else { - MLX5_SET(flow_context, in_flow_context, action, - fte->action.action); - if (fte->action.pkt_reformat) - MLX5_SET(flow_context, in_flow_context, packet_reformat_id, - fte->action.pkt_reformat->id); + action = fte->action.action; + if (extended_dest) + action &= ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + + MLX5_SET(flow_context, in_flow_context, action, action); + + if (!extended_dest && fte->action.pkt_reformat) { + struct mlx5_pkt_reformat *pkt_reformat = fte->action.pkt_reformat; + + if (pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_SW) { + reformat_id = mlx5_fs_dr_action_get_pkt_reformat_id(pkt_reformat); + if (reformat_id < 0) { + mlx5_core_err(dev, + "Unsupported SW-owned pkt_reformat type (%d) in FW-owned table\n", + pkt_reformat->reformat_type); + err = reformat_id; + goto err_out; + } + } else { + reformat_id = fte->action.pkt_reformat->id; + } } - if (fte->action.modify_hdr) + + MLX5_SET(flow_context, in_flow_context, packet_reformat_id, (u32)reformat_id); + + if (fte->action.modify_hdr) { + if (fte->action.modify_hdr->owner == MLX5_FLOW_RESOURCE_OWNER_SW) { + mlx5_core_err(dev, "Can't use SW-owned modify_hdr in FW-owned table\n"); + err = -EOPNOTSUPP; + goto err_out; + } + MLX5_SET(flow_context, in_flow_context, modify_header_id, fte->action.modify_hdr->id); + } MLX5_SET(flow_context, in_flow_context, encrypt_decrypt_type, fte->action.crypto.type); @@ -885,6 +906,8 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, pkt_reformat->id = MLX5_GET(alloc_packet_reformat_context_out, out, packet_reformat_id); + pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_FW; + kfree(in); return err; } @@ -969,6 +992,7 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); modify_hdr->id = MLX5_GET(alloc_modify_header_context_out, out, modify_header_id); + modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_FW; kfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index f137a0611b77..b043190e50a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -54,8 +54,14 @@ struct mlx5_flow_definer { u32 id; }; +enum mlx5_flow_resource_owner { + MLX5_FLOW_RESOURCE_OWNER_FW, + MLX5_FLOW_RESOURCE_OWNER_SW, +}; + struct mlx5_modify_hdr { enum mlx5_flow_namespace_type ns_type; + enum mlx5_flow_resource_owner owner; union { struct mlx5_fs_dr_action action; u32 id; @@ -65,6 +71,7 @@ struct mlx5_modify_hdr { struct mlx5_pkt_reformat { enum mlx5_flow_namespace_type ns_type; int reformat_type; /* from mlx5_ifc */ + enum mlx5_flow_resource_owner owner; union { struct mlx5_fs_dr_action action; u32 id; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 843da89a9035..98412bd5a696 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -126,14 +126,22 @@ out: return ret; } -static void irq_release(struct mlx5_irq *irq) +/* mlx5_system_free_irq - Free an IRQ + * @irq: IRQ to free + * + * Free the IRQ and other resources such as rmap from the system. + * BUT doesn't free or remove reference from mlx5. + * This function is very important for the shutdown flow, where we need to + * cleanup system resoruces but keep mlx5 objects alive, + * see mlx5_irq_table_free_irqs(). + */ +static void mlx5_system_free_irq(struct mlx5_irq *irq) { struct mlx5_irq_pool *pool = irq->pool; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap; #endif - xa_erase(&pool->irqs, irq->pool_index); /* free_irq requires that affinity_hint and rmap will be cleared before * calling it. To satisfy this requirement, we call * irq_cpu_rmap_remove() to remove the notifier @@ -145,10 +153,18 @@ static void irq_release(struct mlx5_irq *irq) irq_cpu_rmap_remove(rmap, irq->map.virq); #endif - free_cpumask_var(irq->mask); free_irq(irq->map.virq, &irq->nh); if (irq->map.index && pci_msix_can_alloc_dyn(pool->dev->pdev)) pci_msix_free_irq(pool->dev->pdev, irq->map); +} + +static void irq_release(struct mlx5_irq *irq) +{ + struct mlx5_irq_pool *pool = irq->pool; + + xa_erase(&pool->irqs, irq->pool_index); + mlx5_system_free_irq(irq); + free_cpumask_var(irq->mask); kfree(irq); } @@ -565,15 +581,21 @@ void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs) int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs, struct mlx5_irq **irqs, struct cpu_rmap **rmap) { + struct mlx5_irq_table *table = mlx5_irq_table_get(dev); + struct mlx5_irq_pool *pool = table->pcif_pool; struct irq_affinity_desc af_desc; struct mlx5_irq *irq; + int offset = 1; int i; + if (!pool->xa_num_irqs.max) + offset = 0; + af_desc.is_managed = false; for (i = 0; i < nirqs; i++) { cpumask_clear(&af_desc.mask); cpumask_set_cpu(cpus[i], &af_desc.mask); - irq = mlx5_irq_request(dev, i + 1, &af_desc, rmap); + irq = mlx5_irq_request(dev, i + offset, &af_desc, rmap); if (IS_ERR(irq)) break; irqs[i] = irq; @@ -699,7 +721,8 @@ static void mlx5_irq_pool_free_irqs(struct mlx5_irq_pool *pool) unsigned long index; xa_for_each(&pool->irqs, index, irq) - free_irq(irq->map.virq, &irq->nh); + mlx5_system_free_irq(irq); + } static void mlx5_irq_pools_free_irqs(struct mlx5_irq_table *table) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 0eb9a8d7f282..0f783e7906cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -1421,9 +1421,13 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, } case DR_ACTION_TYP_TNL_L3_TO_L2: { - u8 hw_actions[DR_ACTION_CACHE_LINE_SIZE] = {}; + u8 *hw_actions; int ret; + hw_actions = kzalloc(DR_ACTION_CACHE_LINE_SIZE, GFP_KERNEL); + if (!hw_actions) + return -ENOMEM; + ret = mlx5dr_ste_set_action_decap_l3_list(dmn->ste_ctx, data, data_sz, hw_actions, @@ -1431,6 +1435,7 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, &action->rewrite->num_of_actions); if (ret) { mlx5dr_dbg(dmn, "Failed creating decap l3 action list\n"); + kfree(hw_actions); return ret; } @@ -1440,6 +1445,7 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, ret = mlx5dr_ste_alloc_modify_hdr(action); if (ret) { mlx5dr_dbg(dmn, "Failed preparing reformat data\n"); + kfree(hw_actions); return ret; } return 0; @@ -2129,6 +2135,11 @@ mlx5dr_action_create_aso(struct mlx5dr_domain *dmn, u32 obj_id, return action; } +u32 mlx5dr_action_get_pkt_reformat_id(struct mlx5dr_action *action) +{ + return action->reformat->id; +} + int mlx5dr_action_destroy(struct mlx5dr_action *action) { if (WARN_ON_ONCE(refcount_read(&action->refcount) > 1)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 984653756779..cc215beb7436 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -331,8 +331,16 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, } if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) { - bool is_decap = fte->action.pkt_reformat->reformat_type == - MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + bool is_decap; + + if (fte->action.pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_FW) { + err = -EINVAL; + mlx5dr_err(domain, "FW-owned reformat can't be used in SW rule\n"); + goto free_actions; + } + + is_decap = fte->action.pkt_reformat->reformat_type == + MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; if (is_decap) actions[num_actions++] = @@ -661,6 +669,7 @@ static int mlx5_cmd_dr_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns return -EINVAL; } + pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_SW; pkt_reformat->action.dr_action = action; return 0; @@ -691,6 +700,7 @@ static int mlx5_cmd_dr_modify_header_alloc(struct mlx5_flow_root_namespace *ns, return -EINVAL; } + modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_SW; modify_hdr->action.dr_action = action; return 0; @@ -816,6 +826,19 @@ static u32 mlx5_cmd_dr_get_capabilities(struct mlx5_flow_root_namespace *ns, return steering_caps; } +int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +{ + switch (pkt_reformat->reformat_type) { + case MLX5_REFORMAT_TYPE_L2_TO_VXLAN: + case MLX5_REFORMAT_TYPE_L2_TO_NVGRE: + case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + case MLX5_REFORMAT_TYPE_INSERT_HDR: + return mlx5dr_action_get_pkt_reformat_id(pkt_reformat->action.dr_action); + } + return -EOPNOTSUPP; +} + bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) { return mlx5dr_is_supported(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h index d168622063d5..99a3b2eff6b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h @@ -38,6 +38,8 @@ struct mlx5_fs_dr_table { bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev); +int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat); + const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void); #else @@ -47,6 +49,11 @@ static inline const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void) return NULL; } +static inline u32 mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +{ + return 0; +} + static inline bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) { return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index 9afd268a2573..d1c04f43d86d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -150,6 +150,8 @@ mlx5dr_action_create_dest_match_range(struct mlx5dr_domain *dmn, int mlx5dr_action_destroy(struct mlx5dr_action *action); +u32 mlx5dr_action_get_pkt_reformat_id(struct mlx5dr_action *action); + int mlx5dr_definer_get(struct mlx5dr_domain *dmn, u16 format_id, u8 *dw_selectors, u8 *byte_selectors, u8 *match_mask, u32 *definer_id); diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index c865a4be05ee..4a1b94e5a8ea 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -582,8 +582,7 @@ qcaspi_spi_thread(void *data) while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if ((qca->intr_req == qca->intr_svc) && - (qca->txr.skb[qca->txr.head] == NULL) && - (qca->sync == QCASPI_SYNC_READY)) + !qca->txr.skb[qca->txr.head]) schedule(); set_current_state(TASK_RUNNING); diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index d30459dbfe8f..b63e47af6365 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -2950,7 +2950,7 @@ static u32 efx_ef10_extract_event_ts(efx_qword_t *event) return tstamp; } -static void +static int efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) { struct efx_nic *efx = channel->efx; @@ -2958,13 +2958,14 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) unsigned int tx_ev_desc_ptr; unsigned int tx_ev_q_label; unsigned int tx_ev_type; + int work_done; u64 ts_part; if (unlikely(READ_ONCE(efx->reset_pending))) - return; + return 0; if (unlikely(EFX_QWORD_FIELD(*event, ESF_DZ_TX_DROP_EVENT))) - return; + return 0; /* Get the transmit queue */ tx_ev_q_label = EFX_QWORD_FIELD(*event, ESF_DZ_TX_QLABEL); @@ -2973,8 +2974,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) if (!tx_queue->timestamping) { /* Transmit completion */ tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, ESF_DZ_TX_DESCR_INDX); - efx_xmit_done(tx_queue, tx_ev_desc_ptr & tx_queue->ptr_mask); - return; + return efx_xmit_done(tx_queue, tx_ev_desc_ptr & tx_queue->ptr_mask); } /* Transmit timestamps are only available for 8XXX series. They result @@ -3000,6 +3000,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) * fields in the event. */ tx_ev_type = EFX_QWORD_FIELD(*event, ESF_EZ_TX_SOFT1); + work_done = 0; switch (tx_ev_type) { case TX_TIMESTAMP_EVENT_TX_EV_COMPLETION: @@ -3016,6 +3017,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) tx_queue->completed_timestamp_major = ts_part; efx_xmit_done_single(tx_queue); + work_done = 1; break; default: @@ -3026,6 +3028,8 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) EFX_QWORD_VAL(*event)); break; } + + return work_done; } static void @@ -3081,13 +3085,16 @@ static void efx_ef10_handle_driver_generated_event(struct efx_channel *channel, } } +#define EFX_NAPI_MAX_TX 512 + static int efx_ef10_ev_process(struct efx_channel *channel, int quota) { struct efx_nic *efx = channel->efx; efx_qword_t event, *p_event; unsigned int read_ptr; - int ev_code; + int spent_tx = 0; int spent = 0; + int ev_code; if (quota <= 0) return spent; @@ -3126,7 +3133,11 @@ static int efx_ef10_ev_process(struct efx_channel *channel, int quota) } break; case ESE_DZ_EV_CODE_TX_EV: - efx_ef10_handle_tx_event(channel, &event); + spent_tx += efx_ef10_handle_tx_event(channel, &event); + if (spent_tx >= EFX_NAPI_MAX_TX) { + spent = quota; + goto out; + } break; case ESE_DZ_EV_CODE_DRIVER_EV: efx_ef10_handle_driver_event(channel, &event); diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index 4dc643b0d2db..7adde9639c8a 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -253,6 +253,8 @@ static void ef100_ev_read_ack(struct efx_channel *channel) efx_reg(channel->efx, ER_GZ_EVQ_INT_PRIME)); } +#define EFX_NAPI_MAX_TX 512 + static int ef100_ev_process(struct efx_channel *channel, int quota) { struct efx_nic *efx = channel->efx; @@ -260,6 +262,7 @@ static int ef100_ev_process(struct efx_channel *channel, int quota) bool evq_phase, old_evq_phase; unsigned int read_ptr; efx_qword_t *p_event; + int spent_tx = 0; int spent = 0; bool ev_phase; int ev_type; @@ -295,7 +298,9 @@ static int ef100_ev_process(struct efx_channel *channel, int quota) efx_mcdi_process_event(channel, p_event); break; case ESE_GZ_EF100_EV_TX_COMPLETION: - ef100_ev_tx(channel, p_event); + spent_tx += ef100_ev_tx(channel, p_event); + if (spent_tx >= EFX_NAPI_MAX_TX) + spent = quota; break; case ESE_GZ_EF100_EV_DRIVER: netif_info(efx, drv, efx->net_dev, diff --git a/drivers/net/ethernet/sfc/ef100_tx.c b/drivers/net/ethernet/sfc/ef100_tx.c index 29ffaf35559d..849e5555bd12 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.c +++ b/drivers/net/ethernet/sfc/ef100_tx.c @@ -346,7 +346,7 @@ void ef100_tx_write(struct efx_tx_queue *tx_queue) ef100_tx_push_buffers(tx_queue); } -void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) +int ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) { unsigned int tx_done = EFX_QWORD_FIELD(*p_event, ESF_GZ_EV_TXCMPL_NUM_DESC); @@ -357,7 +357,7 @@ void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) unsigned int tx_index = (tx_queue->read_count + tx_done - 1) & tx_queue->ptr_mask; - efx_xmit_done(tx_queue, tx_index); + return efx_xmit_done(tx_queue, tx_index); } /* Add a socket buffer to a TX queue diff --git a/drivers/net/ethernet/sfc/ef100_tx.h b/drivers/net/ethernet/sfc/ef100_tx.h index e9e11540fcde..d9a0819c5a72 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.h +++ b/drivers/net/ethernet/sfc/ef100_tx.h @@ -20,7 +20,7 @@ void ef100_tx_init(struct efx_tx_queue *tx_queue); void ef100_tx_write(struct efx_tx_queue *tx_queue); unsigned int ef100_tx_max_skb_descs(struct efx_nic *efx); -void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event); +int ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event); netdev_tx_t ef100_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); int __ef100_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb, diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index 67e789b96c43..755aa92bf823 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -249,7 +249,7 @@ void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue) } } -void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) +int efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) { unsigned int fill_level, pkts_compl = 0, bytes_compl = 0; unsigned int efv_pkts_compl = 0; @@ -279,6 +279,8 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) } efx_xmit_done_check_empty(tx_queue); + + return pkts_compl + efv_pkts_compl; } /* Remove buffers put into a tx_queue for the current packet. diff --git a/drivers/net/ethernet/sfc/tx_common.h b/drivers/net/ethernet/sfc/tx_common.h index d87aecbc7bf1..1e9f42938aac 100644 --- a/drivers/net/ethernet/sfc/tx_common.h +++ b/drivers/net/ethernet/sfc/tx_common.h @@ -28,7 +28,7 @@ static inline bool efx_tx_buffer_in_use(struct efx_tx_buffer *buffer) } void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue); -void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); +int efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); void efx_enqueue_unwind(struct efx_tx_queue *tx_queue, unsigned int insert_count); diff --git a/drivers/net/ieee802154/adf7242.c b/drivers/net/ieee802154/adf7242.c index f9972b8140f9..a03490ba2e5b 100644 --- a/drivers/net/ieee802154/adf7242.c +++ b/drivers/net/ieee802154/adf7242.c @@ -1348,3 +1348,5 @@ module_spi_driver(adf7242_driver); MODULE_AUTHOR("Michael Hennerich <michael.hennerich@analog.com>"); MODULE_DESCRIPTION("ADF7242 IEEE802.15.4 Transceiver Driver"); MODULE_LICENSE("GPL"); + +MODULE_FIRMWARE(FIRMWARE); diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 8445c2189d11..31cba9aa7636 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -685,7 +685,7 @@ static int hwsim_del_edge_nl(struct sk_buff *msg, struct genl_info *info) static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; - struct hwsim_edge_info *einfo; + struct hwsim_edge_info *einfo, *einfo_old; struct hwsim_phy *phy_v0; struct hwsim_edge *e; u32 v0, v1; @@ -723,8 +723,10 @@ static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { einfo->lqi = lqi; - rcu_assign_pointer(e->info, einfo); + einfo_old = rcu_replace_pointer(e->info, einfo, + lockdep_is_held(&hwsim_phys_lock)); rcu_read_unlock(); + kfree_rcu(einfo_old, rcu); mutex_unlock(&hwsim_phys_lock); return 0; } diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 76f5a2402fb0..e397e7d642d9 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -936,7 +936,7 @@ static int dp83867_phy_reset(struct phy_device *phydev) { int err; - err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESTART); + err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESET); if (err < 0) return err; diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 389f33a12534..8b3618d3da4a 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -1287,7 +1287,7 @@ EXPORT_SYMBOL_GPL(mdiobus_modify_changed); * @mask: bit mask of bits to clear * @set: bit mask of bits to set */ -int mdiobus_c45_modify_changed(struct mii_bus *bus, int devad, int addr, +int mdiobus_c45_modify_changed(struct mii_bus *bus, int addr, int devad, u32 regnum, u16 mask, u16 set) { int err; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 17d0d0555a79..53598210be6c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3021,6 +3021,15 @@ static int phy_led_blink_set(struct led_classdev *led_cdev, return err; } +static void phy_leds_unregister(struct phy_device *phydev) +{ + struct phy_led *phyled; + + list_for_each_entry(phyled, &phydev->leds, list) { + led_classdev_unregister(&phyled->led_cdev); + } +} + static int of_phy_led(struct phy_device *phydev, struct device_node *led) { @@ -3054,7 +3063,7 @@ static int of_phy_led(struct phy_device *phydev, init_data.fwnode = of_fwnode_handle(led); init_data.devname_mandatory = true; - err = devm_led_classdev_register_ext(dev, cdev, &init_data); + err = led_classdev_register_ext(dev, cdev, &init_data); if (err) return err; @@ -3083,6 +3092,7 @@ static int of_phy_leds(struct phy_device *phydev) err = of_phy_led(phydev, led); if (err) { of_node_put(led); + phy_leds_unregister(phydev); return err; } } @@ -3305,6 +3315,9 @@ static int phy_remove(struct device *dev) cancel_delayed_work_sync(&phydev->state_queue); + if (IS_ENABLED(CONFIG_PHYLIB_LEDS)) + phy_leds_unregister(phydev); + phydev->state = PHY_DOWN; sfp_bus_del_upstream(phydev->sfp_bus); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index dba112394838..79115eb1c285 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -548,6 +548,8 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_DEV_INFO(0x54F0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), IWL_DEV_INFO(0x7A70, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name), IWL_DEV_INFO(0x7A70, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), + IWL_DEV_INFO(0x7AF0, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name), + IWL_DEV_INFO(0x7AF0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), IWL_DEV_INFO(0x271C, 0x0214, iwl9260_2ac_cfg, iwl9260_1_name), IWL_DEV_INFO(0x7E40, 0x1691, iwl_cfg_ma_a0_gf4_a0, iwl_ax411_killer_1690s_name), diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c index d6b166fc5c0e..bff46f7ca59f 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c +++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c @@ -626,14 +626,12 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux, if (adth->signature != cpu_to_le32(IOSM_AGGR_MUX_SIG_ADTH)) goto adb_decode_err; - if (le16_to_cpu(adth->table_length) < (sizeof(struct mux_adth) - - sizeof(struct mux_adth_dg))) + if (le16_to_cpu(adth->table_length) < sizeof(struct mux_adth)) goto adb_decode_err; /* Calculate the number of datagrams. */ nr_of_dg = (le16_to_cpu(adth->table_length) - - sizeof(struct mux_adth) + - sizeof(struct mux_adth_dg)) / + sizeof(struct mux_adth)) / sizeof(struct mux_adth_dg); /* Is the datagram table empty ? */ @@ -649,7 +647,7 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux, } /* New aggregated datagram table. */ - dg = &adth->dg; + dg = adth->dg; if (mux_dl_process_dg(ipc_mux, adbh, dg, skb, if_id, nr_of_dg) < 0) goto adb_decode_err; @@ -849,7 +847,7 @@ static void ipc_mux_ul_encode_adth(struct iosm_mux *ipc_mux, adth->if_id = i; adth->table_length = cpu_to_le16(adth_dg_size); adth_dg_size -= offsetof(struct mux_adth, dg); - memcpy(&adth->dg, ul_adb->dg[i], adth_dg_size); + memcpy(adth->dg, ul_adb->dg[i], adth_dg_size); ul_adb->if_cnt++; } @@ -1426,14 +1424,13 @@ static int ipc_mux_get_payload_from_adb(struct iosm_mux *ipc_mux, if (adth->signature == cpu_to_le32(IOSM_AGGR_MUX_SIG_ADTH)) { nr_of_dg = (le16_to_cpu(adth->table_length) - - sizeof(struct mux_adth) + - sizeof(struct mux_adth_dg)) / + sizeof(struct mux_adth)) / sizeof(struct mux_adth_dg); if (nr_of_dg <= 0) return payload_size; - dg = &adth->dg; + dg = adth->dg; for (i = 0; i < nr_of_dg; i++, dg++) { if (le32_to_cpu(dg->datagram_index) < diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h index 5d4e3b89542c..f8df88f816c4 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h +++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h @@ -161,7 +161,7 @@ struct mux_adth { u8 opt_ipv4v6; __le32 next_table_index; __le32 reserved2; - struct mux_adth_dg dg; + struct mux_adth_dg dg[]; }; /** diff --git a/drivers/nfc/fdp/fdp.c b/drivers/nfc/fdp/fdp.c index f12f903a9dd1..da3e2dce8e70 100644 --- a/drivers/nfc/fdp/fdp.c +++ b/drivers/nfc/fdp/fdp.c @@ -762,3 +762,6 @@ EXPORT_SYMBOL(fdp_nci_remove); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("NFC NCI driver for Intel Fields Peak NFC controller"); MODULE_AUTHOR("Robert Dolca <robert.dolca@intel.com>"); + +MODULE_FIRMWARE(FDP_OTP_PATCH_NAME); +MODULE_FIRMWARE(FDP_RAM_PATCH_NAME); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index bc32662c6bb7..2d93d0c4f10d 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -489,7 +489,10 @@ struct hv_pcibus_device { struct fwnode_handle *fwnode; /* Protocol version negotiated with the host */ enum pci_protocol_version_t protocol_version; + + struct mutex state_lock; enum hv_pcibus_state state; + struct hv_device *hdev; resource_size_t low_mmio_space; resource_size_t high_mmio_space; @@ -545,19 +548,10 @@ struct hv_dr_state { struct hv_pcidev_description func[]; }; -enum hv_pcichild_state { - hv_pcichild_init = 0, - hv_pcichild_requirements, - hv_pcichild_resourced, - hv_pcichild_ejecting, - hv_pcichild_maximum -}; - struct hv_pci_dev { /* List protected by pci_rescan_remove_lock */ struct list_head list_entry; refcount_t refs; - enum hv_pcichild_state state; struct pci_slot *pci_slot; struct hv_pcidev_description desc; bool reported_missing; @@ -635,6 +629,11 @@ static void hv_arch_irq_unmask(struct irq_data *data) pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); int_desc = data->chip_data; + if (!int_desc) { + dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n", + __func__, data->irq); + return; + } local_irq_save(flags); @@ -2004,12 +2003,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) hv_pci_onchannelcallback(hbus); spin_unlock_irqrestore(&channel->sched_lock, flags); - if (hpdev->state == hv_pcichild_ejecting) { - dev_err_once(&hbus->hdev->device, - "the device is being ejected\n"); - goto enable_tasklet; - } - udelay(100); } @@ -2615,6 +2608,8 @@ static void pci_devices_present_work(struct work_struct *work) if (!dr) return; + mutex_lock(&hbus->state_lock); + /* First, mark all existing children as reported missing. */ spin_lock_irqsave(&hbus->device_list_lock, flags); list_for_each_entry(hpdev, &hbus->children, list_entry) { @@ -2696,6 +2691,8 @@ static void pci_devices_present_work(struct work_struct *work) break; } + mutex_unlock(&hbus->state_lock); + kfree(dr); } @@ -2844,7 +2841,7 @@ static void hv_eject_device_work(struct work_struct *work) hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; - WARN_ON(hpdev->state != hv_pcichild_ejecting); + mutex_lock(&hbus->state_lock); /* * Ejection can come before or after the PCI bus has been set up, so @@ -2882,6 +2879,8 @@ static void hv_eject_device_work(struct work_struct *work) put_pcichild(hpdev); put_pcichild(hpdev); /* hpdev has been freed. Do not use it any more. */ + + mutex_unlock(&hbus->state_lock); } /** @@ -2902,7 +2901,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev) return; } - hpdev->state = hv_pcichild_ejecting; get_pcichild(hpdev); INIT_WORK(&hpdev->wrk, hv_eject_device_work); queue_work(hbus->wq, &hpdev->wrk); @@ -3331,8 +3329,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev) struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct pci_packet *pkt; + bool retry = true; int ret; +enter_d0_retry: /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. This includes telling the host which region @@ -3359,6 +3359,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev) if (ret) goto exit; + /* + * In certain case (Kdump) the pci device of interest was + * not cleanly shut down and resource is still held on host + * side, the host could return invalid device status. + * We need to explicitly request host to release the resource + * and try to enter D0 again. + */ + if (comp_pkt.completion_status < 0 && retry) { + retry = false; + + dev_err(&hdev->device, "Retrying D0 Entry\n"); + + /* + * Hv_pci_bus_exit() calls hv_send_resource_released() + * to free up resources of its child devices. + * In the kdump kernel we need to set the + * wslot_res_allocated to 255 so it scans all child + * devices to release resources allocated in the + * normal kernel before panic happened. + */ + hbus->wslot_res_allocated = 255; + + ret = hv_pci_bus_exit(hdev, true); + + if (ret == 0) { + kfree(pkt); + goto enter_d0_retry; + } + dev_err(&hdev->device, + "Retrying D0 failed with ret %d\n", ret); + } + if (comp_pkt.completion_status < 0) { dev_err(&hdev->device, "PCI Pass-through VSP failed D0 Entry with status %x\n", @@ -3401,6 +3433,24 @@ static int hv_pci_query_relations(struct hv_device *hdev) if (!ret) ret = wait_for_response(hdev, &comp); + /* + * In the case of fast device addition/removal, it's possible that + * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we + * already got a PCI_BUS_RELATIONS* message from the host and the + * channel callback already scheduled a work to hbus->wq, which can be + * running pci_devices_present_work() -> survey_child_resources() -> + * complete(&hbus->survey_event), even after hv_pci_query_relations() + * exits and the stack variable 'comp' is no longer valid; as a result, + * a hang or a page fault may happen when the complete() calls + * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from + * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is + * -ENODEV, there can't be any more work item scheduled to hbus->wq + * after the flush_workqueue(): see vmbus_onoffer_rescind() -> + * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() -> + * channel->rescind = true. + */ + flush_workqueue(hbus->wq); + return ret; } @@ -3586,7 +3636,6 @@ static int hv_pci_probe(struct hv_device *hdev, struct hv_pcibus_device *hbus; u16 dom_req, dom; char *name; - bool enter_d0_retry = true; int ret; bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); @@ -3598,6 +3647,7 @@ static int hv_pci_probe(struct hv_device *hdev, return -ENOMEM; hbus->bridge = bridge; + mutex_init(&hbus->state_lock); hbus->state = hv_pcibus_init; hbus->wslot_res_allocated = -1; @@ -3703,49 +3753,15 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_fwnode; -retry: ret = hv_pci_query_relations(hdev); if (ret) goto free_irq_domain; - ret = hv_pci_enter_d0(hdev); - /* - * In certain case (Kdump) the pci device of interest was - * not cleanly shut down and resource is still held on host - * side, the host could return invalid device status. - * We need to explicitly request host to release the resource - * and try to enter D0 again. - * Since the hv_pci_bus_exit() call releases structures - * of all its child devices, we need to start the retry from - * hv_pci_query_relations() call, requesting host to send - * the synchronous child device relations message before this - * information is needed in hv_send_resources_allocated() - * call later. - */ - if (ret == -EPROTO && enter_d0_retry) { - enter_d0_retry = false; - - dev_err(&hdev->device, "Retrying D0 Entry\n"); - - /* - * Hv_pci_bus_exit() calls hv_send_resources_released() - * to free up resources of its child devices. - * In the kdump kernel we need to set the - * wslot_res_allocated to 255 so it scans all child - * devices to release resources allocated in the - * normal kernel before panic happened. - */ - hbus->wslot_res_allocated = 255; - ret = hv_pci_bus_exit(hdev, true); - - if (ret == 0) - goto retry; + mutex_lock(&hbus->state_lock); - dev_err(&hdev->device, - "Retrying D0 failed with ret %d\n", ret); - } + ret = hv_pci_enter_d0(hdev); if (ret) - goto free_irq_domain; + goto release_state_lock; ret = hv_pci_allocate_bridge_windows(hbus); if (ret) @@ -3763,12 +3779,15 @@ retry: if (ret) goto free_windows; + mutex_unlock(&hbus->state_lock); return 0; free_windows: hv_pci_free_bridge_windows(hbus); exit_d0: (void) hv_pci_bus_exit(hdev, true); +release_state_lock: + mutex_unlock(&hbus->state_lock); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: @@ -4018,20 +4037,26 @@ static int hv_pci_resume(struct hv_device *hdev) if (ret) goto out; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto out; + goto release_state_lock; ret = hv_send_resources_allocated(hdev); if (ret) - goto out; + goto release_state_lock; prepopulate_bars(hbus); hv_pci_restore_msi_state(hbus); hbus->state = hv_pcibus_installed; + mutex_unlock(&hbus->state_lock); return 0; + +release_state_lock: + mutex_unlock(&hbus->state_lock); out: vmbus_close(hdev->channel); return ret; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index c98e4039386d..93b7edb5f1e7 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -677,9 +677,25 @@ static inline u32 armv8pmu_getreset_flags(void) return value; } +static void update_pmuserenr(u64 val) +{ + lockdep_assert_irqs_disabled(); + + /* + * The current PMUSERENR_EL0 value might be the value for the guest. + * If that's the case, have KVM keep tracking of the register value + * for the host EL0 so that KVM can restore it before returning to + * the host EL0. Otherwise, update the register now. + */ + if (kvm_set_pmuserenr(val)) + return; + + write_pmuserenr(val); +} + static void armv8pmu_disable_user_access(void) { - write_pmuserenr(0); + update_pmuserenr(0); } static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) @@ -695,8 +711,7 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) armv8pmu_write_evcntr(i, 0); } - write_pmuserenr(0); - write_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR); + update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR); } static void armv8pmu_enable_event(struct perf_event *event) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index ee5f124f78b6..7780705917b7 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -297,6 +297,8 @@ static void amd_pmf_init_features(struct amd_pmf_dev *dev) /* Enable Static Slider */ if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) { amd_pmf_init_sps(dev); + dev->pwr_src_notifier.notifier_call = amd_pmf_pwr_src_notify_call; + power_supply_reg_notifier(&dev->pwr_src_notifier); dev_dbg(dev->dev, "SPS enabled and Platform Profiles registered\n"); } @@ -315,8 +317,10 @@ static void amd_pmf_init_features(struct amd_pmf_dev *dev) static void amd_pmf_deinit_features(struct amd_pmf_dev *dev) { - if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) + if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) { + power_supply_unreg_notifier(&dev->pwr_src_notifier); amd_pmf_deinit_sps(dev); + } if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { amd_pmf_deinit_auto_mode(dev); @@ -399,9 +403,6 @@ static int amd_pmf_probe(struct platform_device *pdev) apmf_install_handler(dev); amd_pmf_dbgfs_register(dev); - dev->pwr_src_notifier.notifier_call = amd_pmf_pwr_src_notify_call; - power_supply_reg_notifier(&dev->pwr_src_notifier); - dev_info(dev->dev, "registered PMF device successfully\n"); return 0; @@ -411,7 +412,6 @@ static void amd_pmf_remove(struct platform_device *pdev) { struct amd_pmf_dev *dev = platform_get_drvdata(pdev); - power_supply_unreg_notifier(&dev->pwr_src_notifier); amd_pmf_deinit_features(dev); apmf_acpi_deinit(dev); amd_pmf_dbgfs_unregister(dev); diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index a98b781b103a..b293428760bc 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -646,6 +646,8 @@ static int spi_geni_init(struct spi_geni_master *mas) geni_se_select_mode(se, GENI_GPI_DMA); dev_dbg(mas->dev, "Using GPI DMA mode for SPI\n"); break; + } else if (ret == -EPROBE_DEFER) { + goto out_pm; } /* * in case of failure to get gpi dma channel, we can still do the diff --git a/drivers/thermal/intel/intel_soc_dts_iosf.c b/drivers/thermal/intel/intel_soc_dts_iosf.c index f99dc7e4ae89..db97499f4f0a 100644 --- a/drivers/thermal/intel/intel_soc_dts_iosf.c +++ b/drivers/thermal/intel/intel_soc_dts_iosf.c @@ -398,7 +398,7 @@ struct intel_soc_dts_sensors *intel_soc_dts_iosf_init( spin_lock_init(&sensors->intr_notify_lock); mutex_init(&sensors->dts_update_lock); sensors->intr_type = intr_type; - sensors->tj_max = tj_max; + sensors->tj_max = tj_max * 1000; if (intr_type == INTEL_SOC_DTS_INTERRUPT_NONE) notification = false; else diff --git a/fs/afs/write.c b/fs/afs/write.c index c822d6006033..8750b99c3f56 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -731,6 +731,7 @@ static int afs_writepages_region(struct address_space *mapping, * (changing page->mapping to NULL), or even swizzled * back from swapper_space to tmpfs file mapping */ +try_again: if (wbc->sync_mode != WB_SYNC_NONE) { ret = folio_lock_killable(folio); if (ret < 0) { @@ -757,12 +758,14 @@ static int afs_writepages_region(struct address_space *mapping, #ifdef CONFIG_AFS_FSCACHE folio_wait_fscache(folio); #endif - } else { - start += folio_size(folio); + goto try_again; } + + start += folio_size(folio); if (wbc->sync_mode == WB_SYNC_NONE) { if (skips >= 5 || need_resched()) { *_next = start; + folio_batch_release(&fbatch); _leave(" = 0 [%llx]", *_next); return 0; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 590b03560265..e97af2e510c3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1973,7 +1973,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); if (!buf) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bceaa8c2007e..16c228344cbb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1304,7 +1304,7 @@ static int get_raid56_logic_offset(u64 physical, int num, u32 stripe_index; u32 rot; - *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT); + *offset = last_offset + btrfs_stripe_nr_to_offset(i); stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; @@ -1319,7 +1319,7 @@ static int get_raid56_logic_offset(u64 physical, int num, if (stripe_index < num) j++; } - *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT); + *offset = last_offset + btrfs_stripe_nr_to_offset(j); return 1; } @@ -1715,7 +1715,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, - nr_stripes << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(nr_stripes)); for (int i = 0; i < nr_stripes; i++) { stripe = &sctx->stripes[i]; scrub_submit_initial_read(sctx, stripe); @@ -1838,7 +1838,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bool all_empty = true; const int data_stripes = nr_data_stripes(map); unsigned long extent_bitmap = 0; - u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT; + u64 length = btrfs_stripe_nr_to_offset(data_stripes); int ret; ASSERT(sctx->raid56_data_stripes); @@ -1853,13 +1853,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; stripe_index = (i + rot) % map->num_stripes; physical = map->stripes[stripe_index].physical + - (rot << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(rot); scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); ret = scrub_find_fill_first_stripe(bg, map->stripes[stripe_index].dev, physical, 1, - full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT), + full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); if (ret < 0) goto out; @@ -1869,7 +1869,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ if (ret > 0) { stripe->logical = full_stripe_start + - (i << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(i); stripe->dev = map->stripes[stripe_index].dev; stripe->mirror_num = 1; set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); @@ -2062,7 +2062,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); - return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); } /* Get the logical bytenr for the stripe */ @@ -2078,7 +2078,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, * (stripe_index / sub_stripes) gives how many data stripes we need to * skip. */ - return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) + + return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + bg->start; } @@ -2204,7 +2204,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); - offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes); goto out; } @@ -2219,7 +2219,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Initialize @offset in case we need to go to out: label */ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); - increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * Due to the rotation, for RAID56 it's better to iterate each stripe diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e2b54793bf0c..2138e9fc0564 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -857,10 +857,10 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, * * Thus it should be a good way to catch obvious bitflips. */ - if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) { + if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { chunk_err(leaf, chunk, logical, "chunk length too large: have %llu limit %llu", - length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT); + length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 841e799dece5..72a838c97534 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5125,7 +5125,7 @@ static void init_alloc_chunk_ctl_policy_regular( /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); - ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT; + ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes); } static void init_alloc_chunk_ctl_policy_zoned( @@ -5801,7 +5801,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); free_extent_map(em); } return len; @@ -5975,12 +5975,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr); stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; - stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - + stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) - (offset + length); /* * after this, stripe_nr is the number of stripes on this @@ -6023,12 +6023,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT; + stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev); if (i / sub_stripes < remaining_stripes) stripes[i].length += BTRFS_STRIPE_LEN; @@ -6183,8 +6183,8 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, ASSERT(*stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = nr_data_stripes(map) << - BTRFS_STRIPE_LEN_SHIFT; + unsigned long full_stripe_len = + btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * For full stripe start, we use previously calculated @@ -6196,9 +6196,11 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * not ensured to be power of 2. */ *full_stripe_start = - rounddown(*stripe_nr, nr_data_stripes(map)) << - BTRFS_STRIPE_LEN_SHIFT; + btrfs_stripe_nr_to_offset( + rounddown(*stripe_nr, nr_data_stripes(map))); + ASSERT(*full_stripe_start + full_stripe_len > offset); + ASSERT(*full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. @@ -6221,7 +6223,7 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * { dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + - stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); } int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -6343,7 +6345,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* Return the length to the full stripe end */ *length = min(logical + *length, raid56_full_stripe_start + em->start + - (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical; + btrfs_stripe_nr_to_offset(data_stripes)) - + logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6433,7 +6436,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * modulo, to reduce one modulo call. */ bioc->full_stripe_logical = em->start + - ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); for (i = 0; i < num_stripes; i++) set_io_stripe(&bioc->stripes[i], map, (i + stripe_nr) % num_stripes, @@ -8030,7 +8033,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc, for (i = 0; i < data_stripes; i++) { u64 stripe_start = bioc->full_stripe_logical + - (i << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(i); if (logical >= stripe_start && logical < stripe_start + BTRFS_STRIPE_LEN) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bf47a1a70813..64066d48dce1 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -574,6 +574,17 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } +/* + * Do the type safe converstion from stripe_nr to offset inside the chunk. + * + * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger + * than 4G. This does the proper type cast to avoid overflow. + */ +static inline u64 btrfs_stripe_nr_to_offset(u32 stripe_nr) +{ + return (u64)stripe_nr << BTRFS_STRIPE_LEN_SHIFT; +} + void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5cf30827f244..b4e54d079b7d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) struct folio *folio = fbatch.folios[i]; folio_lock(folio); - nilfs_clear_dirty_page(&folio->page, silent); + + /* + * This folio may have been removed from the address + * space by truncation or invalidation when the lock + * was acquired. Skip processing in that case. + */ + if (likely(folio->mapping == mapping)) + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } folio_batch_release(&fbatch); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1362ccb64ec7..6e59dc19a732 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) if (unlikely(!bh)) return -ENOMEM; + lock_buffer(bh); + if (!buffer_uptodate(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); nilfs_segbuf_add_segsum_buffer(segbuf, bh); return 0; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ac949fd7603f..c2553024bd25 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -981,10 +981,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + + lock_buffer(bh_sr); raw_sr = (struct nilfs_super_root *)bh_sr->b_data; isz = nilfs->ns_inode_size; srsz = NILFS_SR_BYTES(isz); + raw_sr->sr_sum = 0; /* Ensure initialization within this update */ raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? @@ -998,6 +1001,8 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz), 1); memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); + set_buffer_uptodate(bh_sr); + unlock_buffer(bh_sr); } static void nilfs_redirty_inodes(struct list_head *head) @@ -1780,6 +1785,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1791,6 +1797,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) b_assoc_buffers) { clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 77f1e5778d1c..9ba4933087af 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) goto out; } nsbp = (void *)nsbh->b_data + offset; - memset(nsbp, 0, nilfs->ns_blocksize); + lock_buffer(nsbh); if (sb2i >= 0) { + /* + * The position of the second superblock only changes by 4KiB, + * which is larger than the maximum superblock data size + * (= 1KiB), so there is no need to use memmove() to allow + * overlap between source and destination. + */ memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + + /* + * Zero fill after copy to avoid overwriting in case of move + * within the same block. + */ + memset(nsbh->b_data, 0, offset); + memset((void *)nsbp + nilfs->ns_sbsize, 0, + nsbh->b_size - offset - nilfs->ns_sbsize); + } else { + memset(nsbh->b_data, 0, nsbh->b_size); + } + set_buffer_uptodate(nsbh); + unlock_buffer(nsbh); + + if (sb2i >= 0) { brelse(nilfs->ns_sbh[sb2i]); nilfs->ns_sbh[sb2i] = nsbh; nilfs->ns_sbp[sb2i] = nsbp; diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index f9b2e0f19b03..ced7a9e916f0 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -185,24 +185,31 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, goto send; } - if (conn->ops->check_user_session) { - rc = conn->ops->check_user_session(work); - if (rc < 0) { - command = conn->ops->get_cmd_val(work); - conn->ops->set_rsp_status(work, - STATUS_USER_SESSION_DELETED); - goto send; - } else if (rc > 0) { - rc = conn->ops->get_ksmbd_tcon(work); + do { + if (conn->ops->check_user_session) { + rc = conn->ops->check_user_session(work); if (rc < 0) { - conn->ops->set_rsp_status(work, - STATUS_NETWORK_NAME_DELETED); + if (rc == -EINVAL) + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + else + conn->ops->set_rsp_status(work, + STATUS_USER_SESSION_DELETED); goto send; + } else if (rc > 0) { + rc = conn->ops->get_ksmbd_tcon(work); + if (rc < 0) { + if (rc == -EINVAL) + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + else + conn->ops->set_rsp_status(work, + STATUS_NETWORK_NAME_DELETED); + goto send; + } } } - } - do { rc = __process_request(work, conn, &command); if (rc == SERVER_HANDLER_ABORT) break; diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 0ffe663b7590..33b7e6c4ceff 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -351,9 +351,16 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) int command; __u32 clc_len; /* calculated length */ __u32 len = get_rfc1002_len(work->request_buf); + __u32 req_struct_size, next_cmd = le32_to_cpu(hdr->NextCommand); - if (le32_to_cpu(hdr->NextCommand) > 0) - len = le32_to_cpu(hdr->NextCommand); + if ((u64)work->next_smb2_rcv_hdr_off + next_cmd > len) { + pr_err("next command(%u) offset exceeds smb msg size\n", + next_cmd); + return 1; + } + + if (next_cmd > 0) + len = next_cmd; else if (work->next_smb2_rcv_hdr_off) len -= work->next_smb2_rcv_hdr_off; @@ -373,17 +380,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } if (smb2_req_struct_sizes[command] != pdu->StructureSize2) { - if (command != SMB2_OPLOCK_BREAK_HE && - (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) { - /* error packets have 9 byte structure size */ - ksmbd_debug(SMB, - "Illegal request size %u for command %d\n", - le16_to_cpu(pdu->StructureSize2), command); - return 1; - } else if (command == SMB2_OPLOCK_BREAK_HE && - hdr->Status == 0 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { + if (command == SMB2_OPLOCK_BREAK_HE && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { /* special case for SMB2.1 lease break message */ ksmbd_debug(SMB, "Illegal request size %d for oplock break\n", @@ -392,6 +391,14 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } } + req_struct_size = le16_to_cpu(pdu->StructureSize2) + + __SMB2_HEADER_STRUCTURE_SIZE; + if (command == SMB2_LOCK_HE) + req_struct_size -= sizeof(struct smb2_lock_element); + + if (req_struct_size > len + 1) + return 1; + if (smb2_calc_size(hdr, &clc_len)) return 1; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 25c0ba04c59d..da1787c68ba0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -91,7 +91,6 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) unsigned int cmd = le16_to_cpu(req_hdr->Command); int tree_id; - work->tcon = NULL; if (cmd == SMB2_TREE_CONNECT_HE || cmd == SMB2_CANCEL_HE || cmd == SMB2_LOGOFF_HE) { @@ -105,10 +104,28 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) } tree_id = le32_to_cpu(req_hdr->Id.SyncId.TreeId); + + /* + * If request is not the first in Compound request, + * Just validate tree id in header with work->tcon->id. + */ + if (work->next_smb2_rcv_hdr_off) { + if (!work->tcon) { + pr_err("The first operation in the compound does not have tcon\n"); + return -EINVAL; + } + if (work->tcon->id != tree_id) { + pr_err("tree id(%u) is different with id(%u) in first operation\n", + tree_id, work->tcon->id); + return -EINVAL; + } + return 1; + } + work->tcon = ksmbd_tree_conn_lookup(work->sess, tree_id); if (!work->tcon) { pr_err("Invalid tid %d\n", tree_id); - return -EINVAL; + return -ENOENT; } return 1; @@ -547,7 +564,6 @@ int smb2_check_user_session(struct ksmbd_work *work) unsigned int cmd = conn->ops->get_cmd_val(work); unsigned long long sess_id; - work->sess = NULL; /* * SMB2_ECHO, SMB2_NEGOTIATE, SMB2_SESSION_SETUP command do not * require a session id, so no need to validate user session's for @@ -558,15 +574,33 @@ int smb2_check_user_session(struct ksmbd_work *work) return 0; if (!ksmbd_conn_good(conn)) - return -EINVAL; + return -EIO; sess_id = le64_to_cpu(req_hdr->SessionId); + + /* + * If request is not the first in Compound request, + * Just validate session id in header with work->sess->id. + */ + if (work->next_smb2_rcv_hdr_off) { + if (!work->sess) { + pr_err("The first operation in the compound does not have sess\n"); + return -EINVAL; + } + if (work->sess->id != sess_id) { + pr_err("session id(%llu) is different with the first operation(%lld)\n", + sess_id, work->sess->id); + return -EINVAL; + } + return 1; + } + /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); if (work->sess) return 1; ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); - return -EINVAL; + return -ENOENT; } static void destroy_previous_session(struct ksmbd_conn *conn, @@ -2249,7 +2283,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* delete the EA only when it exits */ if (rc > 0) { rc = ksmbd_vfs_remove_xattr(idmap, - path->dentry, + path, attr_name); if (rc < 0) { @@ -2263,8 +2297,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* if the EA doesn't exist, just do nothing. */ rc = 0; } else { - rc = ksmbd_vfs_setxattr(idmap, - path->dentry, attr_name, value, + rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value, le16_to_cpu(eabuf->EaValueLength), 0); if (rc < 0) { ksmbd_debug(SMB, @@ -2321,8 +2354,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, return -EBADF; } - rc = ksmbd_vfs_setxattr(idmap, path->dentry, - xattr_stream_name, NULL, 0, 0); + rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0); if (rc < 0) pr_err("Failed to store XATTR stream name :%d\n", rc); return 0; @@ -2350,7 +2382,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, path->dentry, + err = ksmbd_vfs_remove_xattr(idmap, path, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", @@ -2397,8 +2429,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path * da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), - path->dentry, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da); if (rc) ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); } @@ -2972,7 +3003,7 @@ int smb2_open(struct ksmbd_work *work) struct inode *inode = d_inode(path.dentry); posix_acl_rc = ksmbd_vfs_inherit_posix_acl(idmap, - path.dentry, + &path, d_inode(path.dentry->d_parent)); if (posix_acl_rc) ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); @@ -2988,7 +3019,7 @@ int smb2_open(struct ksmbd_work *work) if (rc) { if (posix_acl_rc) ksmbd_vfs_set_init_posix_acl(idmap, - path.dentry); + &path); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { @@ -3028,7 +3059,7 @@ int smb2_open(struct ksmbd_work *work) rc = ksmbd_vfs_set_sd_xattr(conn, idmap, - path.dentry, + &path, pntsd, pntsd_size); kfree(pntsd); @@ -5464,7 +5495,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp), - fp->filp->f_path.dentry, + &fp->filp->f_path, xattr_stream_name, NULL, 0, 0); if (rc < 0) { @@ -5629,8 +5660,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, - filp->f_path.dentry, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da); if (rc) ksmbd_debug(SMB, "failed to restore file attribute in EA\n"); @@ -7485,7 +7515,7 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, da.attr = le32_to_cpu(fp->f_ci->m_fattr); ret = ksmbd_vfs_set_dos_attrib_xattr(idmap, - fp->filp->f_path.dentry, &da); + &fp->filp->f_path, &da); if (ret) fp->f_ci->m_fattr = old_fattr; } diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 0a5862a61c77..ad919a4239d0 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1162,8 +1162,7 @@ pass: pntsd_size += sizeof(struct smb_acl) + nt_size; } - ksmbd_vfs_set_sd_xattr(conn, idmap, - path->dentry, pntsd, pntsd_size); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size); kfree(pntsd); } @@ -1383,7 +1382,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, newattrs.ia_valid |= ATTR_MODE; newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); - ksmbd_vfs_remove_acl_xattrs(idmap, path->dentry); + ksmbd_vfs_remove_acl_xattrs(idmap, path); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { rc = set_posix_acl(idmap, path->dentry, @@ -1414,9 +1413,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { /* Update WinACL in xattr */ - ksmbd_vfs_remove_sd_xattrs(idmap, path->dentry); - ksmbd_vfs_set_sd_xattr(conn, idmap, - path->dentry, pntsd, ntsd_len); + ksmbd_vfs_remove_sd_xattrs(idmap, path); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len); } out: diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index f9fb778247e7..81489fdedd8e 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -170,6 +170,10 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + err = mnt_want_write(path.mnt); + if (err) + goto out_err; + mode |= S_IFREG; err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), dentry, mode, true); @@ -179,6 +183,9 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } else { pr_err("File(%s): creation failed (err:%d)\n", name, err); } + mnt_drop_write(path.mnt); + +out_err: done_path_create(&path, dentry); return err; } @@ -209,30 +216,35 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + err = mnt_want_write(path.mnt); + if (err) + goto out_err2; + idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); - if (err) { - goto out; - } else if (d_unhashed(dentry)) { + if (!err && d_unhashed(dentry)) { struct dentry *d; d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); - goto out; + goto out_err1; } if (unlikely(d_is_negative(d))) { dput(d); err = -ENOENT; - goto out; + goto out_err1; } ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); dput(d); } -out: + +out_err1: + mnt_drop_write(path.mnt); +out_err2: done_path_create(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); @@ -443,7 +455,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, memcpy(&stream_buf[*pos], buf, count); err = ksmbd_vfs_setxattr(idmap, - fp->filp->f_path.dentry, + &fp->filp->f_path, fp->stream.name, (void *)stream_buf, size, @@ -589,6 +601,10 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) goto out_err; } + err = mnt_want_write(path->mnt); + if (err) + goto out_err; + idmap = mnt_idmap(path->mnt); if (S_ISDIR(d_inode(path->dentry)->i_mode)) { err = vfs_rmdir(idmap, d_inode(parent), path->dentry); @@ -599,6 +615,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) if (err) ksmbd_debug(VFS, "unlink failed, err %d\n", err); } + mnt_drop_write(path->mnt); out_err: ksmbd_revert_fsids(work); @@ -644,11 +661,16 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out3; } + err = mnt_want_write(newpath.mnt); + if (err) + goto out3; + err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt), d_inode(newpath.dentry), dentry, NULL); if (err) ksmbd_debug(VFS, "vfs_link failed err %d\n", err); + mnt_drop_write(newpath.mnt); out3: done_path_create(&newpath, dentry); @@ -694,6 +716,10 @@ retry: goto out2; } + err = mnt_want_write(old_path->mnt); + if (err) + goto out2; + trap = lock_rename_child(old_child, new_path.dentry); old_parent = dget(old_child->d_parent); @@ -757,6 +783,7 @@ out4: out3: dput(old_parent); unlock_rename(old_parent, new_path.dentry); + mnt_drop_write(old_path->mnt); out2: path_put(&new_path); @@ -897,19 +924,24 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, * Return: 0 on success, otherwise error */ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *attr_name, + const struct path *path, const char *attr_name, void *attr_value, size_t attr_size, int flags) { int err; + err = mnt_want_write(path->mnt); + if (err) + return err; + err = vfs_setxattr(idmap, - dentry, + path->dentry, attr_name, attr_value, attr_size, flags); if (err) ksmbd_debug(VFS, "setxattr failed, err %d\n", err); + mnt_drop_write(path->mnt); return err; } @@ -1013,9 +1045,18 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, } int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, char *attr_name) + const struct path *path, char *attr_name) { - return vfs_removexattr(idmap, dentry, attr_name); + int err; + + err = mnt_want_write(path->mnt); + if (err) + return err; + + err = vfs_removexattr(idmap, path->dentry, attr_name); + mnt_drop_write(path->mnt); + + return err; } int ksmbd_vfs_unlink(struct file *filp) @@ -1024,6 +1065,10 @@ int ksmbd_vfs_unlink(struct file *filp) struct dentry *dir, *dentry = filp->f_path.dentry; struct mnt_idmap *idmap = file_mnt_idmap(filp); + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + dir = dget_parent(dentry); err = ksmbd_vfs_lock_parent(dir, dentry); if (err) @@ -1041,6 +1086,7 @@ int ksmbd_vfs_unlink(struct file *filp) ksmbd_debug(VFS, "failed to delete, err %d\n", err); out: dput(dir); + mnt_drop_write(filp->f_path.mnt); return err; } @@ -1244,13 +1290,13 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, } int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry) + const struct path *path) { char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; - xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; } else if (!xattr_list_len) { @@ -1258,6 +1304,10 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, goto out; } + err = mnt_want_write(path->mnt); + if (err) + goto out; + for (name = xattr_list; name - xattr_list < xattr_list_len; name += strlen(name) + 1) { ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); @@ -1266,25 +1316,26 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = vfs_remove_acl(idmap, dentry, name); + err = vfs_remove_acl(idmap, path->dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); } } + mnt_drop_write(path->mnt); + out: kvfree(xattr_list); return err; } -int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry) +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path) { char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; - xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; } else if (!xattr_list_len) { @@ -1297,7 +1348,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, dentry, name); + err = ksmbd_vfs_remove_xattr(idmap, path, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } @@ -1374,13 +1425,14 @@ out: int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct smb_ntsd *pntsd, int len) { int rc; struct ndr sd_ndr = {0}, acl_ndr = {0}; struct xattr_ntacl acl = {0}; struct xattr_smb_acl *smb_acl, *def_smb_acl = NULL; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); acl.version = 4; @@ -1432,7 +1484,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_vfs_setxattr(idmap, dentry, + rc = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_SD, sd_ndr.data, sd_ndr.offset, 0); if (rc < 0) @@ -1522,7 +1574,7 @@ free_n_data: } int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct xattr_dos_attrib *da) { struct ndr n; @@ -1532,7 +1584,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, if (err) return err; - err = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_DOS_ATTRIBUTE, (void *)n.data, n.offset, 0); if (err) ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); @@ -1769,10 +1821,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry) + struct path *path) { struct posix_acl_state acl_state; struct posix_acl *acls; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); int rc; @@ -1802,6 +1855,11 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); + + rc = mnt_want_write(path->mnt); + if (rc) + goto out_err; + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1813,16 +1871,20 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } + mnt_drop_write(path->mnt); + +out_err: free_acl_state(&acl_state); posix_acl_release(acls); return rc; } int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *parent_inode) + struct path *path, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); int rc, i; @@ -1841,6 +1903,10 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, } } + rc = mnt_want_write(path->mnt); + if (rc) + goto out_err; + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1852,6 +1918,9 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } + mnt_drop_write(path->mnt); + +out_err: posix_acl_release(acls); return rc; } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index a4ae89f3230d..8c0931d4d531 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -108,12 +108,12 @@ ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len); int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *attr_name, + const struct path *path, const char *attr_name, void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, char *attr_name); + const struct path *path, char *attr_name); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); @@ -139,26 +139,25 @@ void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry); -int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry); + const struct path *path); +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path); int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct smb_ntsd *pntsd, int len); int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd); int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct xattr_dos_attrib *da); int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry); + struct path *path); int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry, + struct path *path, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 2d0138e72d78..f41f8d6108ce 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -252,7 +252,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) if (ksmbd_stream_fd(fp) && (ci->m_flags & S_DEL_ON_CLS_STREAM)) { ci->m_flags &= ~S_DEL_ON_CLS_STREAM; err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), - filp->f_path.dentry, + &filp->f_path, fp->stream.name); if (err) pr_err("remove xattr failed : %s\n", diff --git a/fs/super.c b/fs/super.c index 34afe411cf2b..04bc62ab7dfe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index e6098a08c914..9ffdc0425bc2 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -761,6 +761,7 @@ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_event_status *event_status)) ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number)) +ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_hw_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void)) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index cebdf1ca415d..da9e5629ea43 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -839,6 +839,9 @@ #ifdef CONFIG_UNWINDER_ORC #define ORC_UNWIND_TABLE \ + .orc_header : AT(ADDR(.orc_header) - LOAD_OFFSET) { \ + BOUNDED_SECTION_BY(.orc_header, _orc_header) \ + } \ . = ALIGN(4); \ .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.orc_unwind_ip, _orc_unwind_ip) \ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0f1001dca0e0..3ceb9dfa0993 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -200,6 +200,7 @@ enum cpuhp_state { /* Online section invoked on the hotplugged CPU from the hotplug thread */ CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_HYPERV_ONLINE, CPUHP_AP_KVM_ONLINE, CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 5c6db5533be6..67b8774eed8f 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -252,6 +252,14 @@ struct gpio_irq_chip { bool initialized; /** + * @domain_is_allocated_externally: + * + * True it the irq_domain was allocated outside of gpiolib, in which + * case gpiolib won't free the irq_domain itself. + */ + bool domain_is_allocated_externally; + + /** * @init_hw: optional routine to initialize hardware before * an IRQ chip will be added. This is quite useful when * a particular driver wants to clear IRQ related registers diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d5628a7b5eaa..c8dcfdbda1f4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1845,9 +1845,9 @@ int perf_event_exit_cpu(unsigned int cpu); #define perf_event_exit_cpu NULL #endif -extern void __weak arch_perf_update_userpage(struct perf_event *event, - struct perf_event_mmap_page *userpg, - u64 now); +extern void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, + u64 now); #ifdef CONFIG_MMU extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr); diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 3c01c2bf84f5..505c908dbb81 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -196,11 +196,11 @@ enum { /* PCA9450_REG_LDO3_VOLT bits */ #define LDO3_EN_MASK 0xC0 -#define LDO3OUT_MASK 0x0F +#define LDO3OUT_MASK 0x1F /* PCA9450_REG_LDO4_VOLT bits */ #define LDO4_EN_MASK 0xC0 -#define LDO4OUT_MASK 0x0F +#define LDO4OUT_MASK 0x1F /* PCA9450_REG_LDO5_VOLT bits */ #define LDO5L_EN_MASK 0xC0 diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 3992c994787f..683efe29fa69 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -68,7 +68,6 @@ enum { WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, - WORK_OFFQ_CANCELING = (1 << __WORK_OFFQ_CANCELING), /* * When a work item is off queue, its high bits point to the last @@ -79,12 +78,6 @@ enum { WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, - WORK_OFFQ_POOL_NONE = (1LU << WORK_OFFQ_POOL_BITS) - 1, - - /* convenience constants */ - WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, - WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, - WORK_STRUCT_NO_POOL = (unsigned long)WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, @@ -94,6 +87,14 @@ enum { WORKER_DESC_LEN = 24, }; +/* Convenience constants - of type 'unsigned long', not 'enum'! */ +#define WORK_OFFQ_CANCELING (1ul << __WORK_OFFQ_CANCELING) +#define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) +#define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) + +#define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) +#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) + struct work_struct { atomic_long_t data; struct list_head entry; diff --git a/include/net/dsa.h b/include/net/dsa.h index 8903053fa5aa..ab0f0a5b0860 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -959,6 +959,14 @@ struct dsa_switch_ops { void (*port_disable)(struct dsa_switch *ds, int port); /* + * Compatibility between device trees defining multiple CPU ports and + * drivers which are not OK to use by default the numerically smallest + * CPU port of a switch for its local ports. This can return NULL, + * meaning "don't know/don't care". + */ + struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); + + /* * Port's MAC EEE settings */ int (*set_mac_eee)(struct dsa_switch *ds, int port, diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 83db182decc8..ee47d7143d99 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -472,7 +472,8 @@ struct nft_set_ops { int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); - void (*destroy)(const struct nft_set *set); + void (*destroy)(const struct nft_ctx *ctx, + const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; @@ -809,6 +810,8 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem); /** * struct nft_set_gc_batch_head - nf_tables set garbage collection batch @@ -901,6 +904,7 @@ struct nft_expr_type { enum nft_trans_phase { NFT_TRANS_PREPARE, + NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE @@ -1009,7 +1013,10 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase); +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, @@ -1104,6 +1111,8 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_set_elem *elem); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, @@ -1140,11 +1149,17 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); +static inline bool nft_chain_binding(const struct nft_chain *chain) +{ + return chain->flags & NFT_CHAIN_BINDING; +} + static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } +int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); @@ -1558,6 +1573,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) * struct nft_trans - nf_tables object update in transaction * * @list: used internally + * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context @@ -1565,6 +1581,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) */ struct nft_trans { struct list_head list; + struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; @@ -1575,6 +1592,7 @@ struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; + bool bound; }; #define nft_trans_rule(trans) \ @@ -1583,6 +1601,8 @@ struct nft_trans_rule { (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) +#define nft_trans_rule_bound(trans) \ + (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; @@ -1607,15 +1627,19 @@ struct nft_trans_set { (((struct nft_trans_set *)trans->data)->gc_int) struct nft_trans_chain { + struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; + bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; +#define nft_trans_chain(trans) \ + (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ @@ -1624,6 +1648,8 @@ struct nft_trans_chain { (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) +#define nft_trans_chain_bound(trans) \ + (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct nft_trans_chain *)trans->data)->chain_id) #define nft_trans_basechain(trans) \ @@ -1700,6 +1726,7 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) { re struct nftables_pernet { struct list_head tables; struct list_head commit_list; + struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 33ee3f5936e6..151ca95dd08d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1054,6 +1054,7 @@ struct xfrm_offload { struct sec_path { int len; int olen; + int verified_cnt; struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 86b2a82da546..54e353c9f919 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(writeback_folio_template, strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); - __entry->ino = mapping ? mapping->host->i_ino : 0; + __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), diff --git a/io_uring/net.c b/io_uring/net.c index 51b0f7fbb4f5..c8a4b2ac00f7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -203,7 +203,7 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, &iomsg->free_iov); /* save msg_control as sys_sendmsg() overwrites it */ - sr->msg_control = iomsg->msg.msg_control; + sr->msg_control = iomsg->msg.msg_control_user; return ret; } @@ -302,7 +302,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; - kmsg->msg.msg_control = sr->msg_control; + kmsg->msg.msg_control_user = sr->msg_control; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) @@ -326,6 +326,8 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return io_setup_async_msg(req, kmsg, issue_flags); if (ret > 0 && io_net_retry(sock, flags)) { + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_control = NULL; sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_msg(req, kmsg, issue_flags); @@ -787,16 +789,19 @@ retry_multishot: flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_get_inq = 1; - if (req->flags & REQ_F_APOLL_MULTISHOT) + if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_multishot(sock, sr, kmsg, flags, &mshot_finished); - else + } else { + /* disable partial retry for recvmsg with cmsg attached */ + if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + } if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { diff --git a/io_uring/poll.c b/io_uring/poll.c index c90e47dc1e29..a78b8af7d9ab 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -977,8 +977,9 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - struct io_tw_state ts = {}; + struct io_tw_state ts = { .locked = true }; + io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ret2 = io_poll_disarm(preq); if (bucket) @@ -990,12 +991,10 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) goto out; } - io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); ret2 = io_poll_disarm(preq); if (bucket) spin_unlock(&bucket->lock); - io_ring_submit_unlock(ctx, issue_flags); if (ret2) { ret = ret2; goto out; @@ -1019,7 +1018,7 @@ found: if (poll_update->update_user_data) preq->cqe.user_data = poll_update->new_user_data; - ret2 = io_poll_add(preq, issue_flags); + ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED); /* successfully updated, don't complete poll request */ if (!ret2 || ret2 == -EIOCBQUEUED) goto out; @@ -1027,9 +1026,9 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - ts.locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &ts); out: + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { req_set_fail(req); return ret; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b682b8e4b50..72b32b7cd9cd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -744,13 +744,12 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) return offset < btf->hdr.str_len; } -static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +static bool __btf_name_char_ok(char c, bool first) { if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && - ((c == '.' && !dot_ok) || - c != '.')) + c != '.') return false; return true; } @@ -767,20 +766,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset) return NULL; } -static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) +static bool __btf_name_valid(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; - if (!__btf_name_char_ok(*src, true, dot_ok)) + if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!__btf_name_char_ok(*src, false, dot_ok)) + if (!__btf_name_char_ok(*src, false)) return false; src++; } @@ -788,17 +787,14 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) return !*src; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, false); + return __btf_name_valid(btf, offset); } static bool btf_name_valid_section(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, true); + return __btf_name_valid(btf, offset); } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) @@ -4422,7 +4418,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env, } if (!t->name_off || - !__btf_name_valid(env->btf, t->name_off, true)) { + !__btf_name_valid(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c21d0d8efe4..f1c8733f76b8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3440,6 +3440,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; + case BPF_PROG_TYPE_KPROBE: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + return 0; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5871aa78d01a..cf5f230360f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3868,6 +3868,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + /* Break the relation on a narrowing spill. */ + if (fls64(reg->umax_value) > BITS_PER_BYTE * size) + state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; @@ -17214,9 +17217,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) } /* finally lock prog and jit images for all functions and - * populate kallsysm + * populate kallsysm. Begin at the first subprogram, since + * bpf_prog_load will add the kallsyms for the main program. */ - for (i = 0; i < env->subprog_cnt; i++) { + for (i = 1; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -17242,6 +17246,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->jited_len = func[0]->jited_len; + prog->aux->extable = func[0]->aux->extable; + prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245cf62ce85a..4d42f0cbc11e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1798,7 +1798,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1842,7 +1842,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); @@ -1860,9 +1861,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 936473203a6b..122dacb3a443 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -108,16 +108,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css) struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); + cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - static_branch_inc(&freezer_active); + static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); + cpus_read_unlock(); return 0; } @@ -132,14 +134,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); + cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - static_branch_dec(&freezer_active); + static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 65b8658da829..e9138cd7a0f5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - ktime_t next_p; - u32 rem; - tick_do_timer_cpu = cpu; - - next_p = ktime_get(); - div_u64_rem(next_p, TICK_NSEC, &rem); - if (rem) { - next_p -= rem; - next_p += TICK_NSEC; - } - - tick_next_period = next_p; + tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52254679ec48..42c0be3080bd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ - if (last_jiffies_update == 0) + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. + */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + last_jiffies_update = tick_next_period; + } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index dbb14705d0d3..8df0550415e7 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -50,6 +50,18 @@ #define EVENT_STATUS_OTHER BIT(7) /* + * User register flags are not allowed yet, keep them here until we are + * ready to expose them out to the user ABI. + */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + +/* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. */ @@ -85,8 +97,10 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; + struct work_struct put_work; refcount_t refcnt; int min_size; + int reg_flags; char status; }; @@ -165,76 +179,151 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser); + struct user_event **newuser, int reg_flags); static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); +static int destroy_user_event(struct user_event *user); static u32 user_event_key(char *name) { return jhash(name, strlen(name), 0); } -static void user_event_group_destroy(struct user_event_group *group) +static struct user_event *user_event_get(struct user_event *user) { - kfree(group->system_name); - kfree(group); + refcount_inc(&user->refcnt); + + return user; } -static char *user_event_group_system_name(struct user_namespace *user_ns) +static void delayed_destroy_user_event(struct work_struct *work) { - char *system_name; - int len = sizeof(USER_EVENTS_SYSTEM) + 1; + struct user_event *user = container_of( + work, struct user_event, put_work); - if (user_ns != &init_user_ns) { + mutex_lock(&event_mutex); + + if (!refcount_dec_and_test(&user->refcnt)) + goto out; + + if (destroy_user_event(user)) { /* - * Unexpected at this point: - * We only currently support init_user_ns. - * When we enable more, this will trigger a failure so log. + * The only reason this would fail here is if we cannot + * update the visibility of the event. In this case the + * event stays in the hashtable, waiting for someone to + * attempt to delete it later. */ - pr_warn("user_events: Namespace other than init_user_ns!\n"); - return NULL; + pr_warn("user_events: Unable to delete event\n"); + refcount_set(&user->refcnt, 1); } +out: + mutex_unlock(&event_mutex); +} - system_name = kmalloc(len, GFP_KERNEL); +static void user_event_put(struct user_event *user, bool locked) +{ + bool delete; - if (!system_name) - return NULL; + if (unlikely(!user)) + return; - snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + /* + * When the event is not enabled for auto-delete there will always + * be at least 1 reference to the event. During the event creation + * we initially set the refcnt to 2 to achieve this. In those cases + * the caller must acquire event_mutex and after decrement check if + * the refcnt is 1, meaning this is the last reference. When auto + * delete is enabled, there will only be 1 ref, IE: refcnt will be + * only set to 1 during creation to allow the below checks to go + * through upon the last put. The last put must always be done with + * the event mutex held. + */ + if (!locked) { + lockdep_assert_not_held(&event_mutex); + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex); + } else { + lockdep_assert_held(&event_mutex); + delete = refcount_dec_and_test(&user->refcnt); + } - return system_name; + if (!delete) + return; + + /* + * We now have the event_mutex in all cases, which ensures that + * no new references will be taken until event_mutex is released. + * New references come through find_user_event(), which requires + * the event_mutex to be held. + */ + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* We should not get here when persist flag is set */ + pr_alert("BUG: Auto-delete engaged on persistent event\n"); + goto out; + } + + /* + * Unfortunately we have to attempt the actual destroy in a work + * queue. This is because not all cases handle a trace_event_call + * being removed within the class->reg() operation for unregister. + */ + INIT_WORK(&user->put_work, delayed_destroy_user_event); + + /* + * Since the event is still in the hashtable, we have to re-inc + * the ref count to 1. This count will be decremented and checked + * in the work queue to ensure it's still the last ref. This is + * needed because a user-process could register the same event in + * between the time of event_mutex release and the work queue + * running the delayed destroy. If we removed the item now from + * the hashtable, this would result in a timing window where a + * user process would fail a register because the trace_event_call + * register would fail in the tracing layers. + */ + refcount_set(&user->refcnt, 1); + + if (WARN_ON_ONCE(!schedule_work(&user->put_work))) { + /* + * If we fail we must wait for an admin to attempt delete or + * another register/close of the event, whichever is first. + */ + pr_warn("user_events: Unable to queue delayed destroy\n"); + } +out: + /* Ensure if we didn't have event_mutex before we unlock it */ + if (!locked) + mutex_unlock(&event_mutex); } -static inline struct user_event_group -*user_event_group_from_user_ns(struct user_namespace *user_ns) +static void user_event_group_destroy(struct user_event_group *group) { - if (user_ns == &init_user_ns) - return init_group; - - return NULL; + kfree(group->system_name); + kfree(group); } -static struct user_event_group *current_user_event_group(void) +static char *user_event_group_system_name(void) { - struct user_namespace *user_ns = current_user_ns(); - struct user_event_group *group = NULL; + char *system_name; + int len = sizeof(USER_EVENTS_SYSTEM) + 1; - while (user_ns) { - group = user_event_group_from_user_ns(user_ns); + system_name = kmalloc(len, GFP_KERNEL); - if (group) - break; + if (!system_name) + return NULL; - user_ns = user_ns->parent; - } + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); - return group; + return system_name; } -static struct user_event_group -*user_event_group_create(struct user_namespace *user_ns) +static struct user_event_group *current_user_event_group(void) +{ + return init_group; +} + +static struct user_event_group *user_event_group_create(void) { struct user_event_group *group; @@ -243,7 +332,7 @@ static struct user_event_group if (!group) return NULL; - group->system_name = user_event_group_system_name(user_ns); + group->system_name = user_event_group_system_name(); if (!group->system_name) goto error; @@ -259,12 +348,13 @@ error: return NULL; }; -static void user_event_enabler_destroy(struct user_event_enabler *enabler) +static void user_event_enabler_destroy(struct user_event_enabler *enabler, + bool locked) { list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ - refcount_dec(&enabler->event->refcnt); + user_event_put(enabler->event, locked); kfree(enabler); } @@ -326,7 +416,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) /* User asked for enabler to be removed during fault */ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); goto out; } @@ -501,14 +591,12 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, if (!enabler) return false; - enabler->event = orig->event; + enabler->event = user_event_get(orig->event); enabler->addr = orig->addr; /* Only dup part of value (ignore future flags, etc) */ enabler->values = orig->values & ENABLE_VAL_DUP_MASK; - refcount_inc(&enabler->event->refcnt); - /* Enablers not exposed yet, RCU not required */ list_add(&enabler->mm_enablers_link, &mm->enablers); @@ -625,7 +713,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm) struct user_event_enabler *enabler, *next; list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, false); mmdrop(mm->mm); kfree(mm); @@ -780,7 +868,7 @@ retry: * exit or run exec(), which includes forks and clones. */ if (!*write_result) { - refcount_inc(&enabler->event->refcnt); + user_event_get(user); list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } @@ -803,7 +891,12 @@ out: static __always_inline __must_check bool user_event_last_ref(struct user_event *user) { - return refcount_read(&user->refcnt) == 1; + int last = 0; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) + last = 1; + + return refcount_read(&user->refcnt) == last; } static __always_inline __must_check @@ -842,7 +935,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * Upon success user_event has its ref count increased by 1. */ static int user_event_parse_cmd(struct user_event_group *group, - char *raw_command, struct user_event **newuser) + char *raw_command, struct user_event **newuser, + int reg_flags) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -856,7 +950,7 @@ static int user_event_parse_cmd(struct user_event_group *group, if (flags) *flags++ = '\0'; - return user_event_parse(group, name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser, reg_flags); } static int user_field_array_size(const char *type) @@ -1367,10 +1461,8 @@ static struct user_event *find_user_event(struct user_event_group *group, *outkey = key; hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) { - refcount_inc(&user->refcnt); - return user; - } + if (!strcmp(EVENT_NAME(user), name)) + return user_event_get(user); return NULL; } @@ -1432,7 +1524,7 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i, if (unlikely(!entry)) return; - if (unlikely(!copy_nofault(entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1473,7 +1565,7 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i, perf_fetch_caller_regs(regs); - if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1584,12 +1676,12 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: - refcount_inc(&user->refcnt); + user_event_get(user); update_enable_bit_for(user); return 0; dec: update_enable_bit_for(user); - refcount_dec(&user->refcnt); + user_event_put(user, true); return 0; } @@ -1620,10 +1712,11 @@ static int user_event_create(const char *raw_command) mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(group, name, &user); + /* Dyn events persist, otherwise they would cleanup immediately */ + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST); if (!ret) - refcount_dec(&user->refcnt); + user_event_put(user, false); mutex_unlock(&group->reg_mutex); @@ -1745,6 +1838,8 @@ static bool user_event_match(const char *system, const char *event, if (match && argc > 0) match = user_fields_match(user, argc, argv); + else if (match && argc == 0) + match = list_empty(&user->fields); return match; } @@ -1781,11 +1876,17 @@ static int user_event_trace_register(struct user_event *user) */ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser) + struct user_event **newuser, int reg_flags) { int ret; u32 key; struct user_event *user; + int argc = 0; + char **argv; + + /* User register flags are not ready yet */ + if (reg_flags != 0 || flags != NULL) + return -EINVAL; /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); @@ -1793,13 +1894,35 @@ static int user_event_parse(struct user_event_group *group, char *name, mutex_unlock(&event_mutex); if (user) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + if (!argv) { + ret = -ENOMEM; + goto error; + } + + ret = user_fields_match(user, argc, (const char **)argv); + argv_free(argv); + + } else + ret = list_empty(&user->fields); + + if (ret) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); + } else { + ret = -EADDRINUSE; + goto error; + } + return 0; +error: + user_event_put(user, false); + return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); @@ -1852,8 +1975,15 @@ static int user_event_parse(struct user_event_group *group, char *name, if (ret) goto put_user_lock; - /* Ensure we track self ref and caller ref (2) */ - refcount_set(&user->refcnt, 2); + user->reg_flags = reg_flags; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); + } else { + /* Ensure we track only caller ref (1) */ + refcount_set(&user->refcnt, 1); + } dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); @@ -1885,7 +2015,7 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user) return -ENOENT; - refcount_dec(&user->refcnt); + user_event_put(user, true); if (!user_event_last_ref(user)) return -EBUSY; @@ -2044,9 +2174,7 @@ static int user_events_ref_add(struct user_event_file_info *info, for (i = 0; i < count; ++i) new_refs->events[i] = refs->events[i]; - new_refs->events[i] = user; - - refcount_inc(&user->refcnt); + new_refs->events[i] = user_event_get(user); rcu_assign_pointer(info->refs, new_refs); @@ -2077,8 +2205,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; - /* Ensure no flags, since we don't support any yet */ - if (kreg->flags != 0) + /* Ensure only valid flags */ + if (kreg->flags & ~(USER_EVENT_REG_MAX-1)) return -EINVAL; /* Ensure supported size */ @@ -2150,7 +2278,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, return ret; } - ret = user_event_parse_cmd(info->group, name, &user); + ret = user_event_parse_cmd(info->group, name, &user, reg.flags); if (ret) { kfree(name); @@ -2160,7 +2288,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - refcount_dec(&user->refcnt); + user_event_put(user, false); /* Positive number is index and valid */ if (ret < 0) @@ -2309,7 +2437,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); /* Removed at least one */ ret = 0; @@ -2367,7 +2495,6 @@ static int user_events_release(struct inode *node, struct file *file) struct user_event_file_info *info = file->private_data; struct user_event_group *group; struct user_event_refs *refs; - struct user_event *user; int i; if (!info) @@ -2391,12 +2518,9 @@ static int user_events_release(struct inode *node, struct file *file) * The underlying user_events are ref counted, and cannot be freed. * After this decrement, the user_events may be freed elsewhere. */ - for (i = 0; i < refs->count; ++i) { - user = refs->events[i]; + for (i = 0; i < refs->count; ++i) + user_event_put(refs->events[i], false); - if (user) - refcount_dec(&user->refcnt); - } out: file->private_data = NULL; @@ -2577,7 +2701,7 @@ static int __init trace_events_user_init(void) if (!fault_cache) return -ENOMEM; - init_group = user_event_group_create(&init_user_ns); + init_group = user_event_group_create(); if (!init_group) { kmem_cache_destroy(fault_cache); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 15f05faaae44..1e33f367783e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -847,7 +847,7 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c int ret; void *pos; - list_for_each_entry(field, head, link) { + list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); if (field->offset + field->size > iter->ent_size) { trace_seq_puts(&iter->seq, "<OVERFLOW>"); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4666a1a92a31..c913e333cce8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -705,12 +705,17 @@ static void clear_work_data(struct work_struct *work) set_work_data(work, WORK_STRUCT_NO_POOL, 0); } +static inline struct pool_workqueue *work_struct_pwq(unsigned long data) +{ + return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); +} + static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) - return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + return work_struct_pwq(data); else return NULL; } @@ -738,8 +743,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) - return ((struct pool_workqueue *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool; + return work_struct_pwq(data)->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) @@ -760,8 +764,7 @@ static int get_work_pool_id(struct work_struct *work) unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) - return ((struct pool_workqueue *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; + return work_struct_pwq(data)->pool->id; return data >> WORK_OFFQ_POOL_SHIFT; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b9d39d65b73..2d0d58fb4e7f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2070,7 +2070,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); - xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); diff --git a/mm/memfd.c b/mm/memfd.c index 69b90c31d38c..e763e76f1106 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create, inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - *file_seals |= F_SEAL_EXEC; + if (file_seals) { + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); diff --git a/mm/mprotect.c b/mm/mprotect.c index 92d3d3ca390a..c59e7561698c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -867,7 +867,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (!error && vma_iter_end(&vmi) < end) + if (!error && tmp < end) error = -ENOMEM; out: diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index fe10436d9911..3ab53fad8876 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -5,12 +5,10 @@ #include <linux/seq_file.h> #include <linux/shrinker.h> #include <linux/memcontrol.h> -#include <linux/srcu.h> /* defined in vmscan.c */ -extern struct mutex shrinker_mutex; +extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -extern struct srcu_struct shrinker_srcu; static DEFINE_IDA(shrinker_debugfs_ida); static struct dentry *shrinker_debugfs_root; @@ -51,13 +49,18 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret = 0, nid, srcu_idx; + int ret, nid; count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -88,7 +91,8 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); + up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -111,8 +115,9 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, .gfp_mask = GFP_KERNEL, }; struct mem_cgroup *memcg = NULL; - int nid, srcu_idx; + int nid; char kbuf[72]; + ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -141,7 +146,11 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } sc.nid = nid; sc.memcg = memcg; @@ -150,7 +159,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; @@ -168,7 +177,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -211,7 +220,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); old = shrinker->name; shrinker->name = new; @@ -229,7 +238,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) shrinker->debugfs_entry = entry; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); kfree_const(old); @@ -242,7 +251,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); shrinker->name = NULL; @@ -271,14 +280,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9683573f1225..1d13d71687d7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3098,11 +3098,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { - /* vm_area_alloc_pages() can also fail due to a fatal signal */ - if (!fatal_signal_pending(current)) + /* + * vm_area_alloc_pages() can fail due to insufficient memory but + * also:- + * + * - a pending fatal signal + * - insufficient huge page-order pages + * + * Since we always retry allocations at order-0 in the huge page + * case a warning for either is spurious. + */ + if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + "vmalloc error: size %lu, failed to allocate pages", + area->nr_pages * PAGE_SIZE); goto fail; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d0cd2840cf0..5bf98d0a22c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,7 @@ #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -57,7 +57,6 @@ #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> -#include <linux/srcu.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -190,9 +189,7 @@ struct scan_control { int vm_swappiness = 60; LIST_HEAD(shrinker_list); -DEFINE_MUTEX(shrinker_mutex); -DEFINE_SRCU(shrinker_srcu); -static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); +DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -211,21 +208,8 @@ static inline int shrinker_defer_size(int nr_items) static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { - return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu, - lockdep_is_held(&shrinker_mutex)); -} - -static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, - int nid) -{ - return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu); -} - -static void free_shrinker_info_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct shrinker_info, rcu)); + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, @@ -266,7 +250,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); - call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); + kvfree_rcu(old, rcu); } return 0; @@ -292,7 +276,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, size, ret = 0; int map_size, defer_size = 0; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); map_size = shrinker_map_size(shrinker_nr_max); defer_size = shrinker_defer_size(shrinker_nr_max); size = map_size + defer_size; @@ -308,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) info->map_nr_max = shrinker_nr_max; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -324,7 +308,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); map_size = shrinker_map_size(new_nr_max); defer_size = shrinker_defer_size(new_nr_max); @@ -352,16 +336,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - int srcu_idx; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); } } @@ -374,7 +357,8 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -388,7 +372,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -398,7 +382,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); idr_remove(&shrinker_idr, id); } @@ -408,7 +392,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } @@ -417,7 +401,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } @@ -433,7 +417,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - mutex_lock(&shrinker_mutex); + down_read(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,7 +426,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - mutex_unlock(&shrinker_mutex); + up_read(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -743,9 +727,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker) shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return; } @@ -755,11 +739,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { - mutex_lock(&shrinker_mutex); - list_add_tail_rcu(&shrinker->list, &shrinker_list); + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static int __register_shrinker(struct shrinker *shrinker) @@ -810,16 +794,13 @@ void unregister_shrinker(struct shrinker *shrinker) if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - mutex_lock(&shrinker_mutex); - list_del_rcu(&shrinker->list); + down_write(&shrinker_rwsem); + list_del(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - mutex_unlock(&shrinker_mutex); - - atomic_inc(&shrinker_srcu_generation); - synchronize_srcu(&shrinker_srcu); + up_write(&shrinker_rwsem); shrinker_debugfs_remove(debugfs_entry, debugfs_id); @@ -831,13 +812,15 @@ EXPORT_SYMBOL(unregister_shrinker); /** * synchronize_shrinkers - Wait for all running shrinkers to complete. * - * This is useful to guarantee that all shrinker invocations have seen an - * update, before freeing memory. + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. */ void synchronize_shrinkers(void) { - atomic_inc(&shrinker_srcu_generation); - synchronize_srcu(&shrinker_srcu); + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -946,20 +929,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx, generation; - int i = 0; + int i; if (!mem_cgroup_online(memcg)) return 0; -again: - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; - generation = atomic_read(&shrinker_srcu_generation); - for_each_set_bit_from(i, info->map, info->map_nr_max) { + for_each_set_bit(i, info->map, info->map_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1005,14 +987,14 @@ again: set_shrinker_bit(memcg, nid, i); } freed += ret; - if (atomic_read(&shrinker_srcu_generation) != generation) { - srcu_read_unlock(&shrinker_srcu, srcu_idx); - i++; - goto again; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; } } unlock: - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); return freed; } #else /* CONFIG_MEMCG */ @@ -1049,7 +1031,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx, generation; /* * The root memcg might be allocated even though memcg is disabled @@ -1061,11 +1042,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - srcu_idx = srcu_read_lock(&shrinker_srcu); + if (!down_read_trylock(&shrinker_rwsem)) + goto out; - generation = atomic_read(&shrinker_srcu_generation); - list_for_each_entry_srcu(shrinker, &shrinker_list, list, - srcu_read_lock_held(&shrinker_srcu)) { + list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1076,14 +1056,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - - if (atomic_read(&shrinker_srcu_generation) != generation) { + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { freed = freed ? : 1; break; } } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); +out: cond_resched(); return freed; } diff --git a/net/core/sock.c b/net/core/sock.c index 24f2761bdb1d..6e5662ca00fe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,12 +1362,6 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: - if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - break; - } - sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ab1afe67fd18..1afed89e03c0 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -403,6 +403,24 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) return 0; } +static struct dsa_port * +dsa_switch_preferred_default_local_cpu_port(struct dsa_switch *ds) +{ + struct dsa_port *cpu_dp; + + if (!ds->ops->preferred_default_local_cpu_port) + return NULL; + + cpu_dp = ds->ops->preferred_default_local_cpu_port(ds); + if (!cpu_dp) + return NULL; + + if (WARN_ON(!dsa_port_is_cpu(cpu_dp) || cpu_dp->ds != ds)) + return NULL; + + return cpu_dp; +} + /* Perform initial assignment of CPU ports to user ports and DSA links in the * fabric, giving preference to CPU ports local to each switch. Default to * using the first CPU port in the switch tree if the port does not have a CPU @@ -410,12 +428,16 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) */ static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp, *dp; + struct dsa_port *preferred_cpu_dp, *cpu_dp, *dp; list_for_each_entry(cpu_dp, &dst->ports, list) { if (!dsa_port_is_cpu(cpu_dp)) continue; + preferred_cpu_dp = dsa_switch_preferred_default_local_cpu_port(cpu_dp->ds); + if (preferred_cpu_dp && preferred_cpu_dp != cpu_dp) + continue; + /* Prefer a local CPU port */ dsa_switch_for_each_port(dp, cpu_dp->ds) { /* Prefer the first local CPU port found */ diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index e5d8439b9e45..c16db0b326fa 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -13,7 +13,7 @@ #define MAXNAME 32 #define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME) -#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \ +#define WPAN_PHY_ASSIGN strscpy(__entry->wpan_phy_name, \ wpan_phy_name(wpan_phy), \ MAXNAME) #define WPAN_PHY_PR_FMT "%s" diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 3969fa805679..ee848be59e65 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -340,6 +340,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index ad2afeef4f10..eac206a290d0 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -164,6 +164,7 @@ drop: kfree_skb(skb); return 0; } +EXPORT_SYMBOL(xfrm4_udp_encap_rcv); int xfrm4_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 75c02992c520..772340268997 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -374,6 +374,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 04cbeefd8982..4907ab241d6b 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -86,6 +86,9 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) __be32 *udpdata32; __u16 encap_type = up->encap_type; + if (skb->protocol == htons(ETH_P_IP)) + return xfrm4_udp_encap_rcv(sk, skb); + /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d996aa2579df..fc6e130364da 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2110,7 +2110,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; - if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE && + if (unlikely(ieee80211_is_beacon(fc) && (result & RX_DROP_UNUSABLE) && rx->sdata->dev)) cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, skb->data, skb->len); diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h index 689396d6c76a..1574ecc48075 100644 --- a/net/mac802154/trace.h +++ b/net/mac802154/trace.h @@ -14,7 +14,7 @@ #define MAXNAME 32 #define LOCAL_ENTRY __array(char, wpan_phy_name, MAXNAME) -#define LOCAL_ASSIGN strlcpy(__entry->wpan_phy_name, \ +#define LOCAL_ASSIGN strscpy(__entry->wpan_phy_name, \ wpan_phy_name(local->hw.phy), MAXNAME) #define LOCAL_PR_FMT "%s" #define LOCAL_PR_ARG __entry->wpan_phy_name diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 59f8f3124855..1224dfca5bf3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1047,6 +1047,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; + inet_sk_state_store(newsk, TCP_LISTEN); err = kernel_listen(ssock, backlog); if (err) return err; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 67311e7d5b21..a6c7f2d24909 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -44,7 +44,7 @@ enum { static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); -static void __mptcp_check_send_data_fin(struct sock *sk); +static void mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); static struct net_device mptcp_napi_dev; @@ -424,8 +424,7 @@ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - return !__mptcp_check_fallback(msk) && - ((1 << sk->sk_state) & + return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } @@ -583,9 +582,6 @@ static bool mptcp_check_data_fin(struct sock *sk) u64 rcv_data_fin_seq; bool ret = false; - if (__mptcp_check_fallback(msk)) - return ret; - /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options @@ -623,7 +619,8 @@ static bool mptcp_check_data_fin(struct sock *sk) } ret = true; - mptcp_send_ack(msk); + if (!__mptcp_check_fallback(msk)) + mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; @@ -850,12 +847,12 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) return true; } -static void __mptcp_flush_join_list(struct sock *sk) +static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) { + list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); @@ -897,49 +894,6 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -void mptcp_subflow_eof(struct sock *sk) -{ - if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) - mptcp_schedule_work(sk); -} - -static void mptcp_check_for_eof(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - int receivers = 0; - - mptcp_for_each_subflow(msk, subflow) - receivers += !subflow->rx_eof; - if (receivers) - return; - - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { - /* hopefully temporary hack: propagate shutdown status - * to msk, when all subflows agree on it - */ - WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); - - smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - sk->sk_data_ready(sk); - } - - switch (sk->sk_state) { - case TCP_ESTABLISHED: - inet_sk_state_store(sk, TCP_CLOSE_WAIT); - break; - case TCP_FIN_WAIT1: - inet_sk_state_store(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - inet_sk_state_store(sk, TCP_CLOSE); - break; - default: - return; - } - mptcp_close_wake_up(sk); -} - static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -1609,7 +1563,7 @@ out: if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); if (do_check_data_fin) - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) @@ -1727,7 +1681,13 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { - mptcp_disconnect(sk, 0); + /* The disconnect() op called by tcp_sendmsg_fastopen()/ + * __inet_stream_connect() can fail, due to looking check, + * see mptcp_disconnect(). + * Attempt it again outside the problematic scope. + */ + if (!mptcp_disconnect(sk, 0)) + sk->sk_socket->state = SS_UNCONNECTED; } inet_sk(sk)->defer_connect = 0; @@ -2158,9 +2118,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; } - if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) - mptcp_check_for_eof(msk); - if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check @@ -2389,7 +2346,10 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { - tcp_disconnect(ssk, 0); + /* The MPTCP code never wait on the subflow sockets, TCP-level + * disconnect should never fail + */ + WARN_ON_ONCE(tcp_disconnect(ssk, 0)); msk->subflow->state = SS_UNCONNECTED; mptcp_subflow_ctx_reset(subflow); release_sock(ssk); @@ -2408,13 +2368,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) { - tcp_set_state(ssk, TCP_CLOSE); - mptcp_subflow_queue_clean(sk, ssk); - inet_csk_listen_stop(ssk); - mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); - } - __tcp_close(ssk, 0); /* close acquired an extra ref */ @@ -2671,16 +2624,12 @@ static void mptcp_worker(struct work_struct *work) if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; - mptcp_check_data_fin_ack(sk); - mptcp_check_fastclose(msk); mptcp_pm_nl_work(msk); - if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) - mptcp_check_for_eof(msk); - - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); + mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) @@ -2812,13 +2761,19 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) break; fallthrough; case TCP_SYN_SENT: - tcp_disconnect(ssk, O_NONBLOCK); + WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { pr_debug("Fallback"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); + + /* simulate the data_fin ack reception to let the state + * machine move forward + */ + WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); + mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); tcp_send_ack(ssk); @@ -2858,7 +2813,7 @@ static int mptcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } -static void __mptcp_check_send_data_fin(struct sock *sk) +static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); @@ -2876,19 +2831,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk) WRITE_ONCE(msk->snd_nxt, msk->write_seq); - /* fallback socket will not get data_fin/ack, can move to the next - * state now - */ - if (__mptcp_check_fallback(msk)) { - WRITE_ONCE(msk->snd_una, msk->write_seq); - if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { - inet_sk_state_store(sk, TCP_CLOSE); - mptcp_close_wake_up(sk); - } else if (sk->sk_state == TCP_FIN_WAIT1) { - inet_sk_state_store(sk, TCP_FIN_WAIT2); - } - } - mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2908,7 +2850,7 @@ static void __mptcp_wr_shutdown(struct sock *sk) WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) @@ -2953,10 +2895,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) return EPOLLIN | EPOLLRDNORM; } -static void mptcp_listen_inuse_dec(struct sock *sk) +static void mptcp_check_listen_stop(struct sock *sk) { - if (inet_sk_state_load(sk) == TCP_LISTEN) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + struct sock *ssk; + + if (inet_sk_state_load(sk) != TCP_LISTEN) + return; + + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + ssk = mptcp_sk(sk)->first; + if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) + return; + + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); + tcp_set_state(ssk, TCP_CLOSE); + release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) @@ -2969,7 +2925,7 @@ bool __mptcp_close(struct sock *sk, long timeout) WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { - mptcp_listen_inuse_dec(sk); + mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); goto cleanup; } @@ -3073,15 +3029,20 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); + /* Deny disconnect if other threads are blocked in sk_wait_event() + * or inet_wait_for_connect(). + */ + if (sk->sk_wait_pending) + return -EBUSY; + /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under - * msk->firstsocket lock). Do nothing and leave the cleanup to the - * caller. + * msk->firstsocket lock). */ if (msk->fastopening) - return 0; + return -EBUSY; - mptcp_listen_inuse_dec(sk); + mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); @@ -3140,6 +3101,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif + nsk->sk_wait_pending = 0; __mptcp_init_sock(nsk); msk = mptcp_sk(nsk); @@ -3327,9 +3289,14 @@ static void mptcp_release_cb(struct sock *sk) for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | msk->push_pending; + struct list_head join_list; + if (!flags) break; + INIT_LIST_HEAD(&join_list); + list_splice_init(&msk->join_list, &join_list); + /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope @@ -3340,8 +3307,9 @@ static void mptcp_release_cb(struct sock *sk) msk->push_pending = 0; msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); + if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) - __mptcp_flush_join_list(sk); + __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 70c957bc56a8..d3783a7056e1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -113,7 +113,6 @@ /* MPTCP socket atomic flags */ #define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 -#define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 @@ -476,14 +475,13 @@ struct mptcp_subflow_context { send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, - rx_eof : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ - __unused : 8; + __unused : 9; enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -720,7 +718,6 @@ static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); -void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4688daa6b38b..d9c8b21c6076 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1749,14 +1749,16 @@ static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; + struct mptcp_sock *msk; __subflow_state_change(sk); + msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); - mptcp_rcv_space_init(mptcp_sk(parent), sk); - pr_fallback(mptcp_sk(parent)); + mptcp_rcv_space_init(msk, sk); + pr_fallback(msk); subflow->conn_finished = 1; mptcp_set_connected(parent); } @@ -1772,11 +1774,12 @@ static void subflow_state_change(struct sock *sk) subflow_sched_work_if_closed(mptcp_sk(parent), sk); - if (__mptcp_check_fallback(mptcp_sk(parent)) && - !subflow->rx_eof && subflow_is_done(sk)) { - subflow->rx_eof = 1; - mptcp_subflow_eof(parent); - } + /* when the fallback subflow closes the rx side, trigger a 'dummy' + * ingress data fin, so that the msk state will follow along + */ + if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && + mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) + mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index feb1d7fcb09f..a80b960223e1 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1207,6 +1207,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1349,6 +1350,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 69bceefaa5c8..4c7937fd803f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,6 +151,7 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, return NULL; INIT_LIST_HEAD(&trans->list); + INIT_LIST_HEAD(&trans->binding_list); trans->msg_type = msg_type; trans->ctx = *ctx; @@ -163,13 +164,20 @@ static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } -static void nft_trans_destroy(struct nft_trans *trans) +static void nft_trans_list_del(struct nft_trans *trans) { list_del(&trans->list); + list_del(&trans->binding_list); +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + nft_trans_list_del(trans); kfree(trans); } -static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, + bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; @@ -183,16 +191,80 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) - nft_trans_set_bound(trans) = true; + nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) - nft_trans_elem_set_bound(trans) = true; + nft_trans_elem_set_bound(trans) = bind; break; } } } +static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, true); +} + +static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, false); +} + +static void __nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain, bool bind) +{ + struct nftables_pernet *nft_net; + struct net *net = ctx->net; + struct nft_trans *trans; + + if (!nft_chain_binding(chain)) + return; + + nft_net = nft_pernet(net); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain(trans) == chain) + nft_trans_chain_bound(trans) = bind; + break; + case NFT_MSG_NEWRULE: + if (trans->ctx.chain == chain) + nft_trans_rule_bound(trans) = bind; + break; + } + } +} + +static void nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, true); +} + +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + if (!nft_chain_binding(chain)) + return 0; + + if (nft_chain_binding(ctx->chain)) + return -EOPNOTSUPP; + + if (chain->bound) + return -EBUSY; + + chain->bound = true; + chain->use++; + nft_chain_trans_bind(ctx, chain); + + return 0; +} + +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, false); +} + static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { @@ -292,6 +364,19 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr { struct nftables_pernet *nft_net = nft_pernet(net); + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; + } + list_add_tail(&trans->list, &nft_net->commit_list); } @@ -338,8 +423,9 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); } } - + nft_trans_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); + return trans; } @@ -357,8 +443,7 @@ static int nft_delchain(struct nft_ctx *ctx) return 0; } -static void nft_rule_expr_activate(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr; @@ -371,9 +456,8 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, } } -static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, - struct nft_rule *rule, - enum nft_trans_phase phase) +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase) { struct nft_expr *expr; @@ -495,6 +579,58 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } +static void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_deactivate(ctx->net, set, elem); + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + nft_setelem_data_deactivate(ctx->net, set, &elem); + break; + } +} + +static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_deactivate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_deactivate(ctx, set); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -503,6 +639,9 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + nft_deactivate_next(ctx->net, set); ctx->table->use--; @@ -2226,7 +2365,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, return 0; } -static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) +int nft_chain_add(struct nft_table *table, struct nft_chain *chain) { int err; @@ -2528,6 +2667,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_basechain(trans) = basechain; INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); list_splice(&hook.list, &nft_trans_chain_hooks(trans)); + if (nla[NFTA_CHAIN_HOOK]) + module_put(hook.type->owner); nft_trans_commit_list_add_tail(ctx->net, trans); @@ -2670,21 +2811,18 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack); } -static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_chain *chain, +static int nft_delchain_hook(struct nft_ctx *ctx, + struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { + const struct nft_chain *chain = &basechain->chain; const struct nlattr * const *nla = ctx->nla; struct nft_chain_hook chain_hook = {}; - struct nft_base_chain *basechain; struct nft_hook *this, *hook; LIST_HEAD(chain_del_list); struct nft_trans *trans; int err; - if (!nft_is_base_chain(chain)) - return -EOPNOTSUPP; - - basechain = nft_base_chain(chain); err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) @@ -2769,7 +2907,12 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, if (chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; - return nft_delchain_hook(&ctx, chain, extack); + if (nft_is_base_chain(chain)) { + struct nft_base_chain *basechain = nft_base_chain(chain); + + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) + return nft_delchain_hook(&ctx, basechain, extack); + } } if (info->nlh->nlmsg_flags & NLM_F_NONREC && @@ -3490,8 +3633,7 @@ err_fill_rule_info: return err; } -static void nf_tables_rule_destroy(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr, *next; @@ -3508,7 +3650,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) +static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -3596,12 +3738,6 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, return 0; } -struct nft_set_elem_catchall { - struct list_head list; - struct rcu_head rcu; - void *elem; -}; - int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); @@ -3844,7 +3980,7 @@ err_destroy_flow_rule: if (flow) nft_flow_rule_destroy(flow); err_release_rule: - nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE); + nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); nf_tables_rule_destroy(&ctx, rule); err_release_expr: for (i = 0; i < n; i++) { @@ -4777,6 +4913,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; @@ -4785,6 +4924,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } @@ -4831,6 +4974,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); if (err < 0) return err; @@ -4934,7 +5080,7 @@ err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); err_set_destroy: - ops->destroy(set); + ops->destroy(&ctx, set); err_set_init: kfree(set->name); err_set_name: @@ -4949,7 +5095,7 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); - nft_set_elem_destroy(set, catchall->elem, true); + nf_tables_set_elem_destroy(ctx, set, catchall->elem); kfree_rcu(catchall, rcu); } } @@ -4964,7 +5110,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(ctx, set->exprs[i]); - set->ops->destroy(set); + set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); kfree(set->name); kvfree(set); @@ -5129,10 +5275,60 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, } } +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_activate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_activate(ctx->net, set, elem); + + return 0; +} + +static void nft_map_catchall_activate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + nft_setelem_data_activate(ctx->net, set, &elem); + break; + } +} + +static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_activate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_activate(ctx, set); +} + void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) { - if (nft_set_is_anonymous(set)) + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(ctx, set); + nft_clear(ctx->net, set); + } set->use++; } @@ -5143,14 +5339,28 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, enum nft_trans_phase phase) { switch (phase) { - case NFT_TRANS_PREPARE: + case NFT_TRANS_PREPARE_ERROR: + nft_set_trans_unbind(ctx, set); if (nft_set_is_anonymous(set)) nft_deactivate_next(ctx->net, set); set->use--; + break; + case NFT_TRANS_PREPARE: + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + + nft_deactivate_next(ctx->net, set); + } + set->use--; return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: + if (nft_set_is_anonymous(set) && + set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + set->use--; fallthrough; default: @@ -5903,6 +6113,7 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, __nft_set_elem_expr_destroy(ctx, expr); } +/* Drop references and destroy. Called from gc, dynset and abort path. */ void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { @@ -5924,11 +6135,11 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); -/* Only called from commit path, nft_setelem_data_deactivate() already deals - * with the refcounting from the preparation phase. +/* Destroy element. References have been already dropped in the preparation + * path via nft_setelem_data_deactivate(). */ -static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem) +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); @@ -6491,19 +6702,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; + if (obj) { + *nft_set_ext_obj(ext) = obj; + obj->use++; + } if (ulen > 0) { if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { err = -EINVAL; - goto err_elem_userdata; + goto err_elem_free; } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } - if (obj) { - *nft_set_ext_obj(ext) = obj; - obj->use++; - } err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); if (err < 0) goto err_elem_free; @@ -6558,10 +6769,7 @@ err_set_full: err_element_clash: kfree(trans); err_elem_free: - if (obj) - obj->use--; -err_elem_userdata: - nf_tables_set_elem_destroy(ctx, set, elem.priv); + nft_set_elem_destroy(set, elem.priv, true); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); @@ -6605,7 +6813,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); @@ -6638,7 +6847,6 @@ static int nf_tables_newsetelem(struct sk_buff *skb, void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { struct nft_chain *chain; - struct nft_rule *rule; if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { @@ -6646,15 +6854,6 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) case NFT_GOTO: chain = data->verdict.chain; chain->use++; - - if (!nft_chain_is_bound(chain)) - break; - - chain->table->use++; - list_for_each_entry(rule, &chain->rules, list) - chain->use++; - - nft_chain_add(chain->table, chain); break; } } @@ -6889,7 +7088,9 @@ static int nf_tables_delsetelem(struct sk_buff *skb, set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); @@ -7671,6 +7872,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: @@ -8943,7 +9145,7 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nft_commit_release(trans); } } @@ -9308,6 +9510,27 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } + list_for_each_entry(trans, &nft_net->binding_list, binding_list) { + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans)) && + !nft_trans_set_bound(trans)) { + pr_warn_once("nftables ruleset with unbound set\n"); + return -EINVAL; + } + break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans)) && + !nft_trans_chain_bound(trans)) { + pr_warn_once("nftables ruleset with unbound chain\n"); + return -EINVAL; + } + break; + } + } + /* 0. Validate ruleset, otherwise roll back for error reporting. */ if (nf_tables_validate(net) < 0) return -EAGAIN; @@ -9677,7 +9900,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { - if (nft_chain_is_bound(trans->ctx.chain)) { + if (nft_trans_chain_bound(trans)) { nft_trans_destroy(trans); break; } @@ -9700,6 +9923,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: + if (nft_trans_rule_bound(trans)) { + nft_trans_destroy(trans); + break; + } trans->ctx.chain->use--; list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_expr_deactivate(&trans->ctx, @@ -9734,6 +9961,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSET: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_set(trans)); + if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(&trans->ctx, nft_trans_set(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -9814,7 +10044,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nf_tables_abort_release(trans); } @@ -10263,22 +10493,12 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { struct nft_chain *chain; - struct nft_rule *rule; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; chain->use--; - - if (!nft_chain_is_bound(chain)) - break; - - chain->table->use--; - list_for_each_entry(rule, &chain->rules, list) - chain->use--; - - nft_chain_del(chain); break; } } @@ -10513,6 +10733,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table) list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); table->use--; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(&ctx, set); + nft_set_destroy(&ctx, set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { @@ -10597,6 +10820,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index ee6840bd5933..8f1bfa6ccc2d 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -439,3 +439,4 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c9d2f7c29f53..3d76ebfe8939 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -76,11 +76,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx, switch (priv->data.verdict.code) { case NFT_JUMP: case NFT_GOTO: - if (nft_chain_is_bound(chain)) { - err = -EBUSY; - goto err1; - } - chain->bound = true; + err = nf_tables_bind_chain(ctx, chain); + if (err < 0) + return err; break; default: break; @@ -98,6 +96,31 @@ static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_activate(&chain_ctx, rule); + + nft_clear(ctx->net, chain); + break; + default: + break; + } + } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } @@ -107,6 +130,43 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_deactivate(&chain_ctx, rule, phase); + + switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nf_tables_unbind_chain(ctx, chain); + fallthrough; + case NFT_TRANS_PREPARE: + nft_deactivate_next(ctx->net, chain); + break; + default: + nft_chain_del(chain); + chain->bound = false; + chain->table->use--; + break; + } + break; + default: + break; + } + } if (phase == NFT_TRANS_COMMIT) return; @@ -131,15 +191,27 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, case NFT_GOTO: chain = data->verdict.chain; - if (!nft_chain_is_bound(chain)) + if (!nft_chain_binding(chain)) + break; + + /* Rule construction failed, but chain is already bound: + * let the transaction records release this chain and its rules. + */ + if (chain->bound) { + chain->use--; break; + } + /* Rule has been deleted, release chain and its rules. */ chain_ctx = *ctx; chain_ctx.chain = chain; - list_for_each_entry_safe(rule, n, &chain->rules, list) - nf_tables_rule_release(&chain_ctx, rule); - + chain->use--; + list_for_each_entry_safe(rule, n, &chain->rules, list) { + chain->use--; + list_del(&rule->list); + nf_tables_rule_destroy(&chain_ctx, rule); + } nf_tables_chain_destroy(&chain_ctx); break; default: diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 96081ac8d2b4..1e5e7a181e0b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -271,13 +271,14 @@ static int nft_bitmap_init(const struct nft_set *set, return 0; } -static void nft_bitmap_destroy(const struct nft_set *set) +static void nft_bitmap_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nft_set_elem_destroy(set, be, true); + nf_tables_set_elem_destroy(ctx, set, be); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 76de6c8d9865..0b73cb0e752f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -400,19 +400,31 @@ static int nft_rhash_init(const struct nft_set *set, return 0; } +struct nft_rhash_ctx { + const struct nft_ctx ctx; + const struct nft_set *set; +}; + static void nft_rhash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy(arg, ptr, true); + struct nft_rhash_ctx *rhash_ctx = arg; + + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr); } -static void nft_rhash_destroy(const struct nft_set *set) +static void nft_rhash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_ctx rhash_ctx = { + .ctx = *ctx, + .set = set, + }; cancel_delayed_work_sync(&priv->gc_work); rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, - (void *)set); + (void *)&rhash_ctx); } /* Number of buckets is stored in u32, so cap our result to 1U<<31 */ @@ -643,7 +655,8 @@ static int nft_hash_init(const struct nft_set *set, return 0; } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_hash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; @@ -653,7 +666,7 @@ static void nft_hash_destroy(const struct nft_set *set) for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nft_set_elem_destroy(set, he, true); + nf_tables_set_elem_destroy(ctx, set, he); } } } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 15e451dc3fc4..0452ee586c1c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1974,12 +1974,16 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); struct nft_pipapo_match *m; struct nft_pipapo_field *f; int i, r; rcu_read_lock(); - m = rcu_dereference(priv->match); + if (iter->genmask == nft_genmask_cur(net)) + m = rcu_dereference(priv->match); + else + m = priv->clone; if (unlikely(!m)) goto out; @@ -2148,10 +2152,12 @@ out_scratch: /** * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array + * @ctx: context * @set: nftables API set representation * @m: matching data pointing to key mapping array */ -static void nft_set_pipapo_match_destroy(const struct nft_set *set, +static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo_field *f; @@ -2168,15 +2174,17 @@ static void nft_set_pipapo_match_destroy(const struct nft_set *set, e = f->mt[r].e; - nft_set_elem_destroy(set, e, true); + nf_tables_set_elem_destroy(ctx, set, e); } } /** * nft_pipapo_destroy() - Free private data for set and all committed elements + * @ctx: context * @set: nftables API set representation */ -static void nft_pipapo_destroy(const struct nft_set *set) +static void nft_pipapo_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; @@ -2186,7 +2194,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) if (m) { rcu_barrier(); - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(ctx, set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(m->scratch_aligned); @@ -2203,7 +2211,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) m = priv->clone; if (priv->dirty) - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(ctx, set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(priv->clone->scratch_aligned); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2f114aa10f1a..5c05c9b990fb 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -664,7 +664,8 @@ static int nft_rbtree_init(const struct nft_set *set, return 0; } -static void nft_rbtree_destroy(const struct nft_set *set) +static void nft_rbtree_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; @@ -675,7 +676,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe, true); + nf_tables_set_elem_destroy(ctx, set, rbe); } } diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index e1990baf3a3b..dc9485854002 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); MODULE_ALIAS("ipt_osf"); MODULE_ALIAS("ip6t_osf"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 6ef3021e1169..e79be1b3e74d 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -966,6 +966,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, if (ret < 0) return ret; + sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; old_loss_model = q->loss_model; @@ -974,7 +975,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); if (ret) { q->loss_model = old_loss_model; - return ret; + goto unlock; } } else { q->loss_model = CLG_RANDOM; @@ -1041,6 +1042,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); +unlock: + sch_tree_unlock(sch); return ret; get_table_failure: @@ -1050,7 +1053,8 @@ get_table_failure: */ q->clg = old_clg; q->loss_model = old_loss_model; - return ret; + + goto unlock; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 39fb91ff23d9..815b38080401 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -131,6 +131,7 @@ struct sec_path *secpath_set(struct sk_buff *skb) memset(sp->ovec, 0, sizeof(sp->ovec)); sp->olen = 0; sp->len = 0; + sp->verified_cnt = 0; return sp; } @@ -330,11 +331,10 @@ xfrm_inner_mode_encap_remove(struct xfrm_state *x, { switch (x->props.mode) { case XFRM_MODE_BEET: - switch (XFRM_MODE_SKB_CB(skb)->protocol) { - case IPPROTO_IPIP: - case IPPROTO_BEETPH: + switch (x->sel.family) { + case AF_INET: return xfrm4_remove_beet_encap(x, skb); - case IPPROTO_IPV6: + case AF_INET6: return xfrm6_remove_beet_encap(x, skb); } break; diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 1f99dc469027..35279c220bd7 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } +static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type, unsigned short family) +{ + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (sp && (sp->len || sp->olen) && + !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + goto discard; + + XFRM_SPI_SKB_CB(skb)->family = family; + if (family == AF_INET) { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + } else { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + } + + return xfrm_input(skb, nexthdr, spi, encap_type); +discard: + kfree_skb(skb); + return 0; +} + +static int xfrmi4_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); +} + +static int xfrmi6_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0, 0, AF_INET6); +} + +static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); +} + +static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); +} + static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -945,8 +991,8 @@ static struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, + .handler = xfrmi6_rcv, + .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -996,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, + .handler = xfrmi4_rcv, + .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6d15788b5123..e7617c9959c3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1831,6 +1831,7 @@ again: __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); @@ -1869,6 +1870,7 @@ again: __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); @@ -3349,6 +3351,13 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { + if (idx < sp->verified_cnt) { + /* Secpath entry previously verified, consider optional and + * continue searching + */ + continue; + } + if (start == -1) start = -2-idx; break; @@ -3723,6 +3732,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. + * Upon success, marks secpath entries as having been + * verified to allow them to be skipped in future policy + * checks (e.g. nested tunnels). */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); @@ -3741,6 +3753,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } xfrm_pols_put(pols, npols); + sp->verified_cnt = k; + return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 471300ba176c..50a92c4e9984 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -48,12 +48,12 @@ if IS_BUILTIN(CONFIG_COMMON_CLK): LX_GDBPARSED(CLK_GET_RATE_NOCACHE) /* linux/fs.h */ -LX_VALUE(SB_RDONLY) -LX_VALUE(SB_SYNCHRONOUS) -LX_VALUE(SB_MANDLOCK) -LX_VALUE(SB_DIRSYNC) -LX_VALUE(SB_NOATIME) -LX_VALUE(SB_NODIRATIME) +LX_GDBPARSED(SB_RDONLY) +LX_GDBPARSED(SB_SYNCHRONOUS) +LX_GDBPARSED(SB_MANDLOCK) +LX_GDBPARSED(SB_DIRSYNC) +LX_GDBPARSED(SB_NOATIME) +LX_GDBPARSED(SB_NODIRATIME) /* linux/htimer.h */ LX_GDBPARSED(hrtimer_resolution) diff --git a/scripts/gfp-translate b/scripts/gfp-translate index b2ce416d944b..6c9aed17cf56 100755 --- a/scripts/gfp-translate +++ b/scripts/gfp-translate @@ -63,11 +63,11 @@ fi # Extract GFP flags from the kernel source TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1 -grep -q ___GFP $SOURCE/include/linux/gfp.h +grep -q ___GFP $SOURCE/include/linux/gfp_types.h if [ $? -eq 0 ]; then - grep "^#define ___GFP" $SOURCE/include/linux/gfp.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE + grep "^#define ___GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE else - grep "^#define __GFP" $SOURCE/include/linux/gfp.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE + grep "^#define __GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE fi # Parse the flags diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index d4531d09984d..c12150f96b88 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1979,6 +1979,11 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "#include <linux/vermagic.h>\n"); buf_printf(b, "#include <linux/compiler.h>\n"); buf_printf(b, "\n"); + buf_printf(b, "#ifdef CONFIG_UNWINDER_ORC\n"); + buf_printf(b, "#include <asm/orc_header.h>\n"); + buf_printf(b, "ORC_HEADER;\n"); + buf_printf(b, "#endif\n"); + buf_printf(b, "\n"); buf_printf(b, "BUILD_SALT;\n"); buf_printf(b, "BUILD_LTO_INFO;\n"); buf_printf(b, "\n"); diff --git a/scripts/orc_hash.sh b/scripts/orc_hash.sh new file mode 100644 index 000000000000..466611aa0053 --- /dev/null +++ b/scripts/orc_hash.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (c) Meta Platforms, Inc. and affiliates. + +set -e + +printf '%s' '#define ORC_HASH ' + +awk ' +/^#define ORC_(REG|TYPE)_/ { print } +/^struct orc_entry {$/ { p=1 } +p { print } +/^}/ { p=0 }' | + sha1sum | + cut -d " " -f 1 | + sed 's/\([0-9a-f]\{2\}\)/0x\1,/g' diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 308ec7034cc9..dabfdecece26 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9527,6 +9527,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK), SND_PCI_QUIRK(0x1043, 0x1473, "ASUS GU604V", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1483, "ASUS GU603V", ALC285_FIXUP_ASUS_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x1493, "ASUS GV601V", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1517, "Asus Zenbook UX31A", ALC269VB_FIXUP_ASUS_ZENBOOK_UX31A), SND_PCI_QUIRK(0x1043, 0x1662, "ASUS GV301QH", ALC294_FIXUP_ASUS_DUAL_SPK), SND_PCI_QUIRK(0x1043, 0x1683, "ASUS UM3402YAR", ALC287_FIXUP_CS35L41_I2C_2), @@ -9552,6 +9553,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1c23, "Asus X55U", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), + SND_PCI_QUIRK(0x1043, 0x1caf, "ASUS G634JYR/JZR", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1ccd, "ASUS X555UB", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x1d42, "ASUS Zephyrus G14 2022", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1d4e, "ASUS TM420", ALC256_FIXUP_ASUS_HPE), diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 6faf4a43eaf5..144f082c63fd 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -1347,7 +1347,7 @@ static int sof_card_dai_links_create(struct device *dev, if ((SDW_PART_ID(adr_link->adr_d[i].adr) != SDW_PART_ID(adr_link->adr_d[j].adr)) || (SDW_MFG_ID(adr_link->adr_d[i].adr) != - SDW_MFG_ID(adr_link->adr_d[i].adr))) { + SDW_MFG_ID(adr_link->adr_d[j].adr))) { append_codec_type = true; goto out; } diff --git a/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c b/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c new file mode 100644 index 000000000000..3afd9f775f68 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "test_subprogs_extable.skel.h" + +void test_subprogs_extable(void) +{ + const int read_sz = 456; + struct test_subprogs_extable *skel; + int err; + + skel = test_subprogs_extable__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = test_subprogs_extable__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger tracepoint */ + ASSERT_OK(trigger_module_test_read(read_sz), "trigger_read"); + + ASSERT_NEQ(skel->bss->triggered, 0, "verify at least one program ran"); + + test_subprogs_extable__detach(skel); + +cleanup: + test_subprogs_extable__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_subprogs_extable.c b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c new file mode 100644 index 000000000000..e2a21fbd4e44 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 8); + __type(key, __u32); + __type(value, __u64); +} test_array SEC(".maps"); + +unsigned int triggered; + +static __u64 test_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data) +{ + return 1; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs2, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs3, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 136e5530b72c..6115520154e3 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -371,4 +371,83 @@ __naked void and_then_at_fp_8(void) " ::: __clobber_all); } +SEC("xdp") +__description("32-bit spill of 64-bit reg should clear ID") +__failure __msg("math between ctx pointer and 4294967295 is not allowed") +__naked void spill_32bit_of_64bit_fail(void) +{ + asm volatile (" \ + r6 = r1; \ + /* Roll one bit to force the verifier to track both branches. */\ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x8; \ + /* Put a large number into r1. */ \ + r1 = 0xffffffff; \ + r1 <<= 32; \ + r1 += r0; \ + /* Assign an ID to r1. */ \ + r2 = r1; \ + /* 32-bit spill r1 to stack - should clear the ID! */\ + *(u32*)(r10 - 8) = r1; \ + /* 32-bit fill r2 from stack. */ \ + r2 = *(u32*)(r10 - 8); \ + /* Compare r2 with another register to trigger find_equal_scalars.\ + * Having one random bit is important here, otherwise the verifier cuts\ + * the corners. If the ID was mistakenly preserved on spill, this would\ + * cause the verifier to think that r1 is also equal to zero in one of\ + * the branches, and equal to eight on the other branch.\ + */ \ + r3 = 0; \ + if r2 != r3 goto l0_%=; \ +l0_%=: r1 >>= 32; \ + /* At this point, if the verifier thinks that r1 is 0, an out-of-bounds\ + * read will happen, because it actually contains 0xffffffff.\ + */ \ + r6 += r1; \ + r0 = *(u32*)(r6 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("xdp") +__description("16-bit spill of 32-bit reg should clear ID") +__failure __msg("dereference of modified ctx ptr R6 off=65535 disallowed") +__naked void spill_16bit_of_32bit_fail(void) +{ + asm volatile (" \ + r6 = r1; \ + /* Roll one bit to force the verifier to track both branches. */\ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x8; \ + /* Put a large number into r1. */ \ + w1 = 0xffff0000; \ + r1 += r0; \ + /* Assign an ID to r1. */ \ + r2 = r1; \ + /* 16-bit spill r1 to stack - should clear the ID! */\ + *(u16*)(r10 - 8) = r1; \ + /* 16-bit fill r2 from stack. */ \ + r2 = *(u16*)(r10 - 8); \ + /* Compare r2 with another register to trigger find_equal_scalars.\ + * Having one random bit is important here, otherwise the verifier cuts\ + * the corners. If the ID was mistakenly preserved on spill, this would\ + * cause the verifier to think that r1 is also equal to zero in one of\ + * the branches, and equal to eight on the other branch.\ + */ \ + r3 = 0; \ + if r2 != r3 goto l0_%=; \ +l0_%=: r1 >>= 16; \ + /* At this point, if the verifier thinks that r1 is 0, an out-of-bounds\ + * read will happen, because it actually contains 0xffff.\ + */ \ + r6 += r1; \ + r0 = *(u32*)(r6 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index d8bff2005dfc..5fd49ad0c696 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -249,7 +249,7 @@ /** * FIXTURE_SETUP() - Prepares the setup function for the fixture. - * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. + * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly. * * @fixture_name: fixture name * @@ -275,7 +275,7 @@ /** * FIXTURE_TEARDOWN() - * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. + * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly. * * @fixture_name: fixture name * @@ -388,7 +388,7 @@ if (setjmp(_metadata->env) == 0) { \ fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. */ \ - if (!_metadata->passed) \ + if (!_metadata->passed || _metadata->skip) \ return; \ _metadata->setup_completed = true; \ fixture_name##_##test_name(_metadata, &self, variant->data); \ diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 23af4633f0f4..4f0c50c33ba7 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -5,12 +5,15 @@ LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h include local_config.mk +ifeq ($(ARCH),) + ifeq ($(CROSS_COMPILE),) uname_M := $(shell uname -m 2>/dev/null || echo not) else uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') endif -MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +endif # Without this, failed build products remain, with up-to-date timestamps, # thus tricking Make (and you!) into believing that All Is Well, in subsequent @@ -65,7 +68,7 @@ TEST_GEN_PROGS += ksm_tests TEST_GEN_PROGS += ksm_functional_tests TEST_GEN_PROGS += mdwe_test -ifeq ($(MACHINE),x86_64) +ifeq ($(ARCH),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) @@ -87,13 +90,13 @@ TEST_GEN_PROGS += $(BINARIES_64) endif else -ifneq (,$(findstring $(MACHINE),ppc64)) +ifneq (,$(findstring $(ARCH),ppc64)) TEST_GEN_PROGS += protection_keys endif endif -ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) TEST_GEN_PROGS += va_high_addr_switch TEST_GEN_PROGS += virtual_address_range TEST_GEN_PROGS += write_to_hugetlbfs @@ -112,7 +115,7 @@ $(TEST_GEN_PROGS): vm_util.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c -ifeq ($(MACHINE),x86_64) +ifeq ($(ARCH),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 21ca91473c09..ee6880ac3e5e 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -92,6 +92,13 @@ NSC_CMD="ip netns exec ${NSC}" which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) +# Check if FIPS mode is enabled +if [ -f /proc/sys/crypto/fips_enabled ]; then + fips_enabled=`cat /proc/sys/crypto/fips_enabled` +else + fips_enabled=0 +fi + ################################################################################ # utilities @@ -1216,7 +1223,7 @@ ipv4_tcp_novrf() run_cmd nettest -d ${NSA_DEV} -r ${a} log_test_addr ${a} $? 1 "No server, device client, local conn" - ipv4_tcp_md5_novrf + [ "$fips_enabled" = "1" ] || ipv4_tcp_md5_novrf } ipv4_tcp_vrf() @@ -1270,9 +1277,11 @@ ipv4_tcp_vrf() log_test_addr ${a} $? 1 "Global server, local connection" # run MD5 tests - setup_vrf_dup - ipv4_tcp_md5 - cleanup_vrf_dup + if [ "$fips_enabled" = "0" ]; then + setup_vrf_dup + ipv4_tcp_md5 + cleanup_vrf_dup + fi # # enable VRF global server @@ -2772,7 +2781,7 @@ ipv6_tcp_novrf() log_test_addr ${a} $? 1 "No server, device client, local conn" done - ipv6_tcp_md5_novrf + [ "$fips_enabled" = "1" ] || ipv6_tcp_md5_novrf } ipv6_tcp_vrf() @@ -2842,9 +2851,11 @@ ipv6_tcp_vrf() log_test_addr ${a} $? 1 "Global server, local connection" # run MD5 tests - setup_vrf_dup - ipv6_tcp_md5 - cleanup_vrf_dup + if [ "$fips_enabled" = "0" ]; then + setup_vrf_dup + ipv6_tcp_md5 + cleanup_vrf_dup + fi # # enable VRF global server diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh index c5095da7f6bf..aec752a22e9e 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh @@ -93,12 +93,16 @@ cleanup() test_gretap() { + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" } test_ip6gretap() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap" full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap" } diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh index 9ff22f28032d..0cf4c47a46f9 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh @@ -90,12 +90,16 @@ cleanup() test_gretap() { + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br1 full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" } test_ip6gretap() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br1 full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap" full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap" } diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index e699548d4247..ff36844d14b4 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -25,6 +25,8 @@ #define TLS_PAYLOAD_MAX_LEN 16384 #define SOL_TLS 282 +static int fips_enabled; + struct tls_crypto_info_keys { union { struct tls12_crypto_info_aes_gcm_128 aes128; @@ -235,7 +237,7 @@ FIXTURE_VARIANT(tls) { uint16_t tls_version; uint16_t cipher_type; - bool nopad; + bool nopad, fips_non_compliant; }; FIXTURE_VARIANT_ADD(tls, 12_aes_gcm) @@ -254,24 +256,28 @@ FIXTURE_VARIANT_ADD(tls, 12_chacha) { .tls_version = TLS_1_2_VERSION, .cipher_type = TLS_CIPHER_CHACHA20_POLY1305, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_chacha) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_CHACHA20_POLY1305, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_sm4_gcm) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_SM4_GCM, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_sm4_ccm) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_SM4_CCM, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 12_aes_ccm) @@ -311,6 +317,9 @@ FIXTURE_SETUP(tls) int one = 1; int ret; + if (fips_enabled && variant->fips_non_compliant) + SKIP(return, "Unsupported cipher in FIPS mode"); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12); @@ -1865,4 +1874,17 @@ TEST(prequeue) { close(cfd); } +static void __attribute__((constructor)) fips_check(void) { + int res; + FILE *f; + + f = fopen("/proc/sys/crypto/fips_enabled", "r"); + if (f) { + res = fscanf(f, "%d", &fips_enabled); + if (res != 1) + ksft_print_msg("ERROR: Couldn't read /proc/sys/crypto/fips_enabled\n"); + fclose(f); + } +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh index 184da81f554f..452638ae8aed 100755 --- a/tools/testing/selftests/net/vrf-xfrm-tests.sh +++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh @@ -264,60 +264,60 @@ setup_xfrm() ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_4} dst ${h2_4} ${devarg} ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_4} dst ${h2_4} ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_4} dst ${h1_4} ${devarg} ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_4} dst ${h1_4} ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_6} dst ${h2_6} ${devarg} ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_6} dst ${h2_6} ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_6} dst ${h1_6} ${devarg} ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_6} dst ${h1_6} } diff --git a/tools/testing/selftests/user_events/dyn_test.c b/tools/testing/selftests/user_events/dyn_test.c index 8879a7b04c6a..d6979a48478f 100644 --- a/tools/testing/selftests/user_events/dyn_test.c +++ b/tools/testing/selftests/user_events/dyn_test.c @@ -16,42 +16,140 @@ #include "../kselftest_harness.h" -const char *dyn_file = "/sys/kernel/tracing/dynamic_events"; -const char *clear = "!u:__test_event"; +const char *abi_file = "/sys/kernel/tracing/user_events_data"; +const char *enable_file = "/sys/kernel/tracing/events/user_events/__test_event/enable"; -static int Append(const char *value) +static bool wait_for_delete(void) { - int fd = open(dyn_file, O_RDWR | O_APPEND); - int ret = write(fd, value, strlen(value)); + int i; + + for (i = 0; i < 1000; ++i) { + int fd = open(enable_file, O_RDONLY); + + if (fd == -1) + return true; + + close(fd); + usleep(1000); + } + + return false; +} + +static int reg_event(int fd, int *check, int bit, const char *value) +{ + struct user_reg reg = {0}; + + reg.size = sizeof(reg); + reg.name_args = (__u64)value; + reg.enable_bit = bit; + reg.enable_addr = (__u64)check; + reg.enable_size = sizeof(*check); + + if (ioctl(fd, DIAG_IOCSREG, ®) == -1) + return -1; + + return 0; +} + +static int unreg_event(int fd, int *check, int bit) +{ + struct user_unreg unreg = {0}; + + unreg.size = sizeof(unreg); + unreg.disable_bit = bit; + unreg.disable_addr = (__u64)check; + + return ioctl(fd, DIAG_IOCSUNREG, &unreg); +} + +static int parse(int *check, const char *value) +{ + int fd = open(abi_file, O_RDWR); + int ret; + + if (fd == -1) + return -1; + + /* Until we have persist flags via dynamic events, use the base name */ + if (value[0] != 'u' || value[1] != ':') { + close(fd); + return -1; + } + + ret = reg_event(fd, check, 31, value + 2); + + if (ret != -1) { + if (unreg_event(fd, check, 31) == -1) + printf("WARN: Couldn't unreg event\n"); + } close(fd); + return ret; } -#define CLEAR() \ +static int check_match(int *check, const char *first, const char *second, bool *match) +{ + int fd = open(abi_file, O_RDWR); + int ret = -1; + + if (fd == -1) + return -1; + + if (reg_event(fd, check, 31, first) == -1) + goto cleanup; + + if (reg_event(fd, check, 30, second) == -1) { + if (errno == EADDRINUSE) { + /* Name is in use, with different fields */ + *match = false; + ret = 0; + } + + goto cleanup; + } + + *match = true; + ret = 0; +cleanup: + unreg_event(fd, check, 31); + unreg_event(fd, check, 30); + + close(fd); + + wait_for_delete(); + + return ret; +} + +#define TEST_MATCH(x, y) \ do { \ - int ret = Append(clear); \ - if (ret == -1) \ - ASSERT_EQ(ENOENT, errno); \ + bool match; \ + ASSERT_NE(-1, check_match(&self->check, x, y, &match)); \ + ASSERT_EQ(true, match); \ } while (0) -#define TEST_PARSE(x) \ +#define TEST_NMATCH(x, y) \ do { \ - ASSERT_NE(-1, Append(x)); \ - CLEAR(); \ + bool match; \ + ASSERT_NE(-1, check_match(&self->check, x, y, &match)); \ + ASSERT_EQ(false, match); \ } while (0) -#define TEST_NPARSE(x) ASSERT_EQ(-1, Append(x)) +#define TEST_PARSE(x) ASSERT_NE(-1, parse(&self->check, x)) + +#define TEST_NPARSE(x) ASSERT_EQ(-1, parse(&self->check, x)) FIXTURE(user) { + int check; }; FIXTURE_SETUP(user) { - CLEAR(); } FIXTURE_TEARDOWN(user) { - CLEAR(); + wait_for_delete(); } TEST_F(user, basic_types) { @@ -95,33 +193,30 @@ TEST_F(user, size_types) { TEST_NPARSE("u:__test_event char a 20"); } -TEST_F(user, flags) { - /* Should work */ - TEST_PARSE("u:__test_event:BPF_ITER u32 a"); - /* Forward compat */ - TEST_PARSE("u:__test_event:BPF_ITER,FLAG_FUTURE u32 a"); -} - TEST_F(user, matching) { - /* Register */ - ASSERT_NE(-1, Append("u:__test_event struct custom a 20")); - /* Should not match */ - TEST_NPARSE("!u:__test_event struct custom b"); - /* Should match */ - TEST_PARSE("!u:__test_event struct custom a"); - /* Multi field reg */ - ASSERT_NE(-1, Append("u:__test_event u32 a; u32 b")); - /* Non matching cases */ - TEST_NPARSE("!u:__test_event u32 a"); - TEST_NPARSE("!u:__test_event u32 b"); - TEST_NPARSE("!u:__test_event u32 a; u32 "); - TEST_NPARSE("!u:__test_event u32 a; u32 a"); - /* Matching case */ - TEST_PARSE("!u:__test_event u32 a; u32 b"); - /* Register */ - ASSERT_NE(-1, Append("u:__test_event u32 a; u32 b")); - /* Ensure trailing semi-colon case */ - TEST_PARSE("!u:__test_event u32 a; u32 b;"); + /* Single name matches */ + TEST_MATCH("__test_event u32 a", + "__test_event u32 a"); + + /* Multiple names match */ + TEST_MATCH("__test_event u32 a; u32 b", + "__test_event u32 a; u32 b"); + + /* Multiple names match with dangling ; */ + TEST_MATCH("__test_event u32 a; u32 b", + "__test_event u32 a; u32 b;"); + + /* Single name doesn't match */ + TEST_NMATCH("__test_event u32 a", + "__test_event u32 b"); + + /* Multiple names don't match */ + TEST_NMATCH("__test_event u32 a; u32 b", + "__test_event u32 b; u32 a"); + + /* Types don't match */ + TEST_NMATCH("__test_event u64 a; u64 b", + "__test_event u32 a; u32 b"); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index 7c99cef94a65..eb6904d89f14 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -102,30 +102,56 @@ err: return -1; } +static bool wait_for_delete(void) +{ + int i; + + for (i = 0; i < 1000; ++i) { + int fd = open(enable_file, O_RDONLY); + + if (fd == -1) + return true; + + close(fd); + usleep(1000); + } + + return false; +} + static int clear(int *check) { struct user_unreg unreg = {0}; + int fd; unreg.size = sizeof(unreg); unreg.disable_bit = 31; unreg.disable_addr = (__u64)check; - int fd = open(data_file, O_RDWR); + fd = open(data_file, O_RDWR); if (fd == -1) return -1; if (ioctl(fd, DIAG_IOCSUNREG, &unreg) == -1) if (errno != ENOENT) - return -1; - - if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) - if (errno != ENOENT) - return -1; + goto fail; + + if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) { + if (errno == EBUSY) { + if (!wait_for_delete()) + goto fail; + } else if (errno != ENOENT) + goto fail; + } close(fd); return 0; +fail: + close(fd); + + return -1; } static int check_print_fmt(const char *event, const char *expected, int *check) @@ -155,9 +181,8 @@ static int check_print_fmt(const char *event, const char *expected, int *check) /* Register should work */ ret = ioctl(fd, DIAG_IOCSREG, ®); - close(fd); - if (ret != 0) { + close(fd); printf("Reg failed in fmt\n"); return ret; } @@ -165,6 +190,8 @@ static int check_print_fmt(const char *event, const char *expected, int *check) /* Ensure correct print_fmt */ ret = get_print_fmt(print_fmt, sizeof(print_fmt)); + close(fd); + if (ret != 0) return ret; @@ -228,6 +255,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;"; + ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(EADDRINUSE, errno); + /* Ensure disabled */ self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, self->enable_fd); @@ -250,10 +283,10 @@ TEST_F(user, register_events) { unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); - /* Delete should work only after close and unregister */ + /* Delete should have been auto-done after close and unregister */ close(self->data_fd); - self->data_fd = open(data_file, O_RDWR); - ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); + + ASSERT_EQ(true, wait_for_delete()); } TEST_F(user, write_events) { @@ -310,6 +343,39 @@ TEST_F(user, write_events) { ASSERT_EQ(EINVAL, errno); } +TEST_F(user, write_empty_events) { + struct user_reg reg = {0}; + struct iovec io[1]; + int before = 0, after = 0; + + reg.size = sizeof(reg); + reg.name_args = (__u64)"__test_event"; + reg.enable_bit = 31; + reg.enable_addr = (__u64)&self->check; + reg.enable_size = sizeof(self->check); + + io[0].iov_base = ®.write_index; + io[0].iov_len = sizeof(reg.write_index); + + /* Register should work */ + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(0, reg.write_index); + ASSERT_EQ(0, self->check); + + /* Enable event */ + self->enable_fd = open(enable_file, O_RDWR); + ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + + /* Event should now be enabled */ + ASSERT_EQ(1 << reg.enable_bit, self->check); + + /* Write should make it out to ftrace buffers */ + before = trace_bytes(); + ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 1)); + after = trace_bytes(); + ASSERT_GT(after, before); +} + TEST_F(user, write_fault) { struct user_reg reg = {0}; struct iovec io[2]; diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index a070258d4449..8b09be566fa2 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -81,6 +81,32 @@ static int get_offset(void) return offset; } +static int clear(int *check) +{ + struct user_unreg unreg = {0}; + + unreg.size = sizeof(unreg); + unreg.disable_bit = 31; + unreg.disable_addr = (__u64)check; + + int fd = open(data_file, O_RDWR); + + if (fd == -1) + return -1; + + if (ioctl(fd, DIAG_IOCSUNREG, &unreg) == -1) + if (errno != ENOENT) + return -1; + + if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) + if (errno != ENOENT) + return -1; + + close(fd); + + return 0; +} + FIXTURE(user) { int data_fd; int check; @@ -93,6 +119,9 @@ FIXTURE_SETUP(user) { FIXTURE_TEARDOWN(user) { close(self->data_fd); + + if (clear(&self->check) != 0) + printf("WARNING: Clear didn't work!\n"); } TEST_F(user, perf_write) { @@ -160,6 +189,59 @@ TEST_F(user, perf_write) { ASSERT_EQ(0, self->check); } +TEST_F(user, perf_empty_events) { + struct perf_event_attr pe = {0}; + struct user_reg reg = {0}; + struct perf_event_mmap_page *perf_page; + int page_size = sysconf(_SC_PAGESIZE); + int id, fd; + __u32 *val; + + reg.size = sizeof(reg); + reg.name_args = (__u64)"__test_event"; + reg.enable_bit = 31; + reg.enable_addr = (__u64)&self->check; + reg.enable_size = sizeof(self->check); + + /* Register should work */ + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(0, reg.write_index); + ASSERT_EQ(0, self->check); + + /* Id should be there */ + id = get_id(); + ASSERT_NE(-1, id); + + pe.type = PERF_TYPE_TRACEPOINT; + pe.size = sizeof(pe); + pe.config = id; + pe.sample_type = PERF_SAMPLE_RAW; + pe.sample_period = 1; + pe.wakeup_events = 1; + + /* Tracepoint attach should work */ + fd = perf_event_open(&pe, 0, -1, -1, 0); + ASSERT_NE(-1, fd); + + perf_page = mmap(NULL, page_size * 2, PROT_READ, MAP_SHARED, fd, 0); + ASSERT_NE(MAP_FAILED, perf_page); + + /* Status should be updated */ + ASSERT_EQ(1 << reg.enable_bit, self->check); + + /* Ensure write shows up at correct offset */ + ASSERT_NE(-1, write(self->data_fd, ®.write_index, + sizeof(reg.write_index))); + val = (void *)(((char *)perf_page) + perf_page->data_offset); + ASSERT_EQ(PERF_RECORD_SAMPLE, *val); + + munmap(perf_page, page_size * 2); + close(fd); + + /* Status should be updated */ + ASSERT_EQ(0, self->check); +} + int main(int argc, char **argv) { return test_harness_run(argc, argv); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 479802a892d4..65f94f592ff8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -686,6 +686,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn return __kvm_handle_hva_range(kvm, &range); } + +static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + /* + * Skipping invalid memslots is correct if and only change_pte() is + * surrounded by invalidate_range_{start,end}(), which is currently + * guaranteed by the primary MMU. If that ever changes, KVM needs to + * unmap the memslot instead of skipping the memslot to ensure that KVM + * doesn't hold references to the old PFN. + */ + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); + + if (range->slot->flags & KVM_MEMSLOT_INVALID) + return false; + + return kvm_set_spte_gfn(kvm, range); +} + static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, @@ -707,7 +725,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, if (!READ_ONCE(kvm->mmu_invalidate_in_progress)) return; - kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); + kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn); } void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, |