From 923386f761f5ff35da2ac778839876fe4a2f165f Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 18 Dec 2015 15:36:57 +0200 Subject: clk: ti: remove un-used definitions from public clk_hw_omap struct Clksel support has been deprecated a while back, so remove these from the struct also. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- include/linux/clk/ti.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 6110fe09ed18..07308db5a15d 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -129,8 +129,6 @@ struct clk_hw_omap_ops { * @enable_bit: bitshift to write to enable/disable the clock (see @enable_reg) * @flags: see "struct clk.flags possibilities" above * @clksel_reg: for clksel clks, register va containing src/divisor select - * @clksel_mask: bitmask in @clksel_reg for the src/divisor selector - * @clksel: for clksel clks, pointer to struct clksel for this clock * @dpll_data: for DPLLs, pointer to struct dpll_data for this clock * @clkdm_name: clockdomain name that this clock is contained in * @clkdm: pointer to struct clockdomain, resolved from @clkdm_name at runtime @@ -145,8 +143,6 @@ struct clk_hw_omap { u8 enable_bit; u8 flags; void __iomem *clksel_reg; - u32 clksel_mask; - const struct clksel *clksel; struct dpll_data *dpll_data; const char *clkdm_name; struct clockdomain *clkdm; -- cgit v1.2.3-70-g09d2 From b6f27b2db2df395d65b02a758861c7fc54edbec1 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 30 Sep 2016 14:10:11 +0300 Subject: clk: ti: add clkdm_lookup to the exported functions This will be needed to move some additional clockdomain functionality under clock driver. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- arch/arm/mach-omap2/clock.c | 1 + include/linux/clk/ti.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c index 1270afdcacdf..6fac82609c96 100644 --- a/arch/arm/mach-omap2/clock.c +++ b/arch/arm/mach-omap2/clock.c @@ -57,6 +57,7 @@ u16 cpu_mask; static struct ti_clk_ll_ops omap_clk_ll_ops = { .clkdm_clk_enable = clkdm_clk_enable, .clkdm_clk_disable = clkdm_clk_disable, + .clkdm_lookup = clkdm_lookup, .cm_wait_module_ready = omap_cm_wait_module_ready, .cm_split_idlest_reg = cm_split_idlest_reg, }; diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 07308db5a15d..bc7fd8f0fb5d 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -213,6 +213,7 @@ struct clk_omap_reg { * @clk_writel: pointer to register write function * @clkdm_clk_enable: pointer to clockdomain enable function * @clkdm_clk_disable: pointer to clockdomain disable function + * @clkdm_lookup: pointer to clockdomain lookup function * @cm_wait_module_ready: pointer to CM module wait ready function * @cm_split_idlest_reg: pointer to CM module function to split idlest reg * @@ -228,6 +229,7 @@ struct ti_clk_ll_ops { int (*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk); int (*clkdm_clk_disable)(struct clockdomain *clkdm, struct clk *clk); + struct clockdomain * (*clkdm_lookup)(const char *name); int (*cm_wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); int (*cm_split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, -- cgit v1.2.3-70-g09d2 From 2e1a294c0f2273a6d3537c91965ca46a6483bd8c Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 30 Sep 2016 14:13:38 +0300 Subject: clk: ti: move omap2_init_clk_clkdm under TI clock driver 
This is not needed outside the driver, so move it inside it and remove the prototype from the public header also. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- arch/arm/mach-omap2/clock.c | 32 -------------------------------- drivers/clk/ti/clock.h | 1 + drivers/clk/ti/clockdomain.c | 30 ++++++++++++++++++++++++++++++ include/linux/clk/ti.h | 1 - 4 files changed, 31 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c index 6fac82609c96..ae5b23c19b83 100644 --- a/arch/arm/mach-omap2/clock.c +++ b/arch/arm/mach-omap2/clock.c @@ -79,38 +79,6 @@ int __init omap2_clk_setup_ll_ops(void) * OMAP2+ specific clock functions */ -/* Public functions */ - -/** - * omap2_init_clk_clkdm - look up a clockdomain name, store pointer in clk - * @clk: OMAP clock struct ptr to use - * - * Convert a clockdomain name stored in a struct clk 'clk' into a - * clockdomain pointer, and save it into the struct clk. Intended to be - * called during clk_register(). No return value. - */ -void omap2_init_clk_clkdm(struct clk_hw *hw) -{ - struct clk_hw_omap *clk = to_clk_hw_omap(hw); - struct clockdomain *clkdm; - const char *clk_name; - - if (!clk->clkdm_name) - return; - - clk_name = __clk_get_name(hw->clk); - - clkdm = clkdm_lookup(clk->clkdm_name); - if (clkdm) { - pr_debug("clock: associated clk %s to clkdm %s\n", - clk_name, clk->clkdm_name); - clk->clkdm = clkdm; - } else { - pr_debug("clock: could not associate clk %s to clkdm %s\n", - clk_name, clk->clkdm_name); - } -} - /** * ti_clk_init_features - init clock features struct for the SoC * diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h index ee6d22507a3d..ecf82d86118c 100644 --- a/drivers/clk/ti/clock.h +++ b/drivers/clk/ti/clock.h @@ -228,6 +228,7 @@ extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait; extern const struct clk_ops ti_clk_divider_ops; extern const struct clk_ops ti_clk_mux_ops; +void omap2_init_clk_clkdm(struct clk_hw *hw); int omap2_clkops_enable_clkdm(struct clk_hw *hw); void omap2_clkops_disable_clkdm(struct clk_hw *hw); diff --git a/drivers/clk/ti/clockdomain.c b/drivers/clk/ti/clockdomain.c index 6cf9dd189a92..704157d8c0b7 100644 --- a/drivers/clk/ti/clockdomain.c +++ b/drivers/clk/ti/clockdomain.c @@ -103,6 +103,36 @@ void omap2_clkops_disable_clkdm(struct clk_hw *hw) ti_clk_ll_ops->clkdm_clk_disable(clk->clkdm, hw->clk); } +/** + * omap2_init_clk_clkdm - look up a clockdomain name, store pointer in clk + * @clk: OMAP clock struct ptr to use + * + * Convert a clockdomain name stored in a struct clk 'clk' into a + * clockdomain pointer, and save it into the struct clk. Intended to be + * called during clk_register(). No return value. 
+ */ +void omap2_init_clk_clkdm(struct clk_hw *hw) +{ + struct clk_hw_omap *clk = to_clk_hw_omap(hw); + struct clockdomain *clkdm; + const char *clk_name; + + if (!clk->clkdm_name) + return; + + clk_name = __clk_get_name(hw->clk); + + clkdm = ti_clk_ll_ops->clkdm_lookup(clk->clkdm_name); + if (clkdm) { + pr_debug("clock: associated clk %s to clkdm %s\n", + clk_name, clk->clkdm_name); + clk->clkdm = clkdm; + } else { + pr_debug("clock: could not associate clk %s to clkdm %s\n", + clk_name, clk->clkdm_name); + } +} + static void __init of_ti_clockdomain_setup(struct device_node *node) { struct clk *clk; diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index bc7fd8f0fb5d..626ae94b7444 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -238,7 +238,6 @@ struct ti_clk_ll_ops { #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw) -void omap2_init_clk_clkdm(struct clk_hw *clk); int omap2_clk_disable_autoidle_all(void); int omap2_clk_enable_autoidle_all(void); int omap2_clk_allow_idle(struct clk *clk); -- cgit v1.2.3-70-g09d2 From c91f07801f144920f8467486a1e36e42ed9d9ff2 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 30 Jan 2017 16:01:36 +0200 Subject: clk: ti: drop unnecessary MEMMAP_ADDRESSING flag This has been superceded by the usage of ti_clk_ll_ops for now. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- drivers/clk/ti/apll.c | 1 - drivers/clk/ti/dpll.c | 2 -- drivers/clk/ti/gate.c | 4 +--- drivers/clk/ti/interface.c | 1 - include/linux/clk/ti.h | 2 -- 5 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/ti/apll.c b/drivers/clk/ti/apll.c index 62b5db7b8fe9..5cba28c6ab35 100644 --- a/drivers/clk/ti/apll.c +++ b/drivers/clk/ti/apll.c @@ -194,7 +194,6 @@ static void __init of_dra7_apll_setup(struct device_node *node) clk_hw->dpll_data = ad; clk_hw->hw.init = init; - clk_hw->flags = MEMMAP_ADDRESSING; init->name = node->name; init->ops = &apll_ck_ops; diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c index 37624e16cf04..c149bd169f43 100644 --- a/drivers/clk/ti/dpll.c +++ b/drivers/clk/ti/dpll.c @@ -248,7 +248,6 @@ struct clk *ti_clk_register_dpll(struct ti_clk *setup) clk_hw->dpll_data = dd; clk_hw->ops = &clkhwops_omap3_dpll; clk_hw->hw.init = &init; - clk_hw->flags = MEMMAP_ADDRESSING; init.name = setup->name; init.ops = ops; @@ -380,7 +379,6 @@ static void __init of_ti_dpll_setup(struct device_node *node, clk_hw->dpll_data = dd; clk_hw->ops = &clkhwops_omap3_dpll; clk_hw->hw.init = init; - clk_hw->flags = MEMMAP_ADDRESSING; init->name = node->name; init->ops = ops; diff --git a/drivers/clk/ti/gate.c b/drivers/clk/ti/gate.c index b3291dbca5ed..5ff62e2c63d0 100644 --- a/drivers/clk/ti/gate.c +++ b/drivers/clk/ti/gate.c @@ -113,7 +113,7 @@ static struct clk *_register_gate(struct device *dev, const char *name, clk_hw->enable_bit = bit_idx; clk_hw->ops = hw_ops; - clk_hw->flags = MEMMAP_ADDRESSING | clk_gate_flags; + clk_hw->flags = clk_gate_flags; init.parent_names = &parent_name; init.num_parents = 1; @@ -203,7 +203,6 @@ struct clk_hw *ti_clk_build_component_gate(struct ti_clk_gate *setup) ops = &clkhwops_iclk_wait; gate->ops = ops; - gate->flags = MEMMAP_ADDRESSING; return &gate->hw; } @@ -269,7 +268,6 @@ _of_ti_composite_gate_clk_setup(struct device_node *node, gate->enable_bit = val; gate->ops = hw_ops; - gate->flags = MEMMAP_ADDRESSING; if (!ti_clk_add_component(node, &gate->hw, CLK_COMPONENT_TYPE_GATE)) return; diff --git a/drivers/clk/ti/interface.c 
b/drivers/clk/ti/interface.c index 7927e1a2ba02..42d9fd4f5f6a 100644 --- a/drivers/clk/ti/interface.c +++ b/drivers/clk/ti/interface.c @@ -47,7 +47,6 @@ static struct clk *_register_interface(struct device *dev, const char *name, clk_hw->hw.init = &init; clk_hw->ops = ops; - clk_hw->flags = MEMMAP_ADDRESSING; clk_hw->enable_reg = reg; clk_hw->enable_bit = bit_idx; diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 626ae94b7444..affdabd0b6a1 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -168,7 +168,6 @@ struct clk_hw_omap { * should be used. This is a temporary solution - a better approach * would be to associate clock type-specific data with the clock, * similar to the struct dpll_data approach. - * MEMMAP_ADDRESSING: Use memmap addressing to access clock registers. */ #define ENABLE_REG_32BIT (1 << 0) /* Use 32-bit access */ #define CLOCK_IDLE_CONTROL (1 << 1) @@ -176,7 +175,6 @@ struct clk_hw_omap { #define ENABLE_ON_INIT (1 << 3) /* Enable upon framework init */ #define INVERT_ENABLE (1 << 4) /* 0 enables, 1 disables */ #define CLOCK_CLKOUTX2 (1 << 5) -#define MEMMAP_ADDRESSING (1 << 6) /* CM_CLKEN_PLL*.EN* bit values - not all are available for every DPLL */ #define DPLL_LOW_POWER_STOP 0x1 -- cgit v1.2.3-70-g09d2 From 6c0afb503937a12a8d20a805fcf263e31afa9871 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Thu, 9 Feb 2017 11:24:37 +0200 Subject: clk: ti: convert to use proper register definition for all accesses Currently, TI clock driver uses an encapsulated struct that is cast into a void pointer to store all register addresses. This can be considered as rather nasty hackery, and prevents from expanding the register address field also. Instead, replace all the code to use proper struct in place for this, which contains all the previously used data. This patch is rather large as it is touching multiple files, but this can't be split up as we need to avoid any boot breakage. 
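As a rough illustration of the conversion described above (not part of the patch itself), the register reference moves from an opaque void __iomem pointer to a struct clk_omap_reg passed by address. The fragment below is a minimal sketch: example_enable() and its enable_bit argument are invented for illustration, while ti_clk_get_reg_addr(), ti_clk_ll_ops and struct clk_omap_reg are the interfaces this diff changes.

/*
 * Sketch only -- mirrors the converted interfaces, not a complete driver.
 *
 * Old style: the "register address" was a struct clk_omap_reg cast into a
 * void __iomem pointer, so the field layout could never grow:
 *
 *	void __iomem *reg = ti_clk_get_reg_addr(node, 0);
 *	if (IS_ERR(reg))
 *		return PTR_ERR(reg);
 *	val = ti_clk_ll_ops->clk_readl(reg);
 *
 * New style: the struct is used directly and handed around by address.
 */
static int example_enable(struct device_node *node, u8 enable_bit)
{
	struct clk_omap_reg reg;
	u32 val;
	int ret;

	ret = ti_clk_get_reg_addr(node, 0, &reg);	/* fills .index and .offset */
	if (ret)
		return ret;

	val = ti_clk_ll_ops->clk_readl(&reg);		/* low-level ops now take the struct */
	val |= BIT(enable_bit);
	ti_clk_ll_ops->clk_writel(val, &reg);

	return 0;
}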
Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- arch/arm/mach-omap2/clkt2xxx_dpllcore.c | 3 +- arch/arm/mach-omap2/clock.c | 2 +- arch/arm/mach-omap2/clock.h | 2 ++ arch/arm/mach-omap2/cm.h | 5 +-- arch/arm/mach-omap2/cm2xxx.c | 9 ++--- arch/arm/mach-omap2/cm3xxx.c | 10 ++---- arch/arm/mach-omap2/cm_common.c | 2 +- drivers/clk/ti/apll.c | 47 ++++++++++++------------- drivers/clk/ti/autoidle.c | 18 +++++----- drivers/clk/ti/clk-3xxx.c | 55 +++++++++++++++-------------- drivers/clk/ti/clk.c | 47 ++++++++++++------------- drivers/clk/ti/clkt_dflt.c | 61 ++++++++++++--------------------- drivers/clk/ti/clkt_dpll.c | 6 ++-- drivers/clk/ti/clkt_iclk.c | 29 ++++++++-------- drivers/clk/ti/clock.h | 11 +++--- drivers/clk/ti/clockdomain.c | 8 ----- drivers/clk/ti/divider.c | 24 +++++++------ drivers/clk/ti/dpll.c | 48 ++++++++++---------------- drivers/clk/ti/dpll3xxx.c | 38 ++++++++++---------- drivers/clk/ti/dpll44xx.c | 14 ++++---- drivers/clk/ti/gate.c | 32 ++++++++--------- drivers/clk/ti/interface.c | 22 ++++++------ drivers/clk/ti/mux.c | 41 +++++++++------------- include/linux/clk/ti.h | 46 +++++++++++++------------ 24 files changed, 264 insertions(+), 316 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/clkt2xxx_dpllcore.c b/arch/arm/mach-omap2/clkt2xxx_dpllcore.c index 59cf310bc1e9..e8d417309f33 100644 --- a/arch/arm/mach-omap2/clkt2xxx_dpllcore.c +++ b/arch/arm/mach-omap2/clkt2xxx_dpllcore.c @@ -138,7 +138,8 @@ int omap2_reprogram_dpllcore(struct clk_hw *hw, unsigned long rate, if (!dd) return -EINVAL; - tmpset.cm_clksel1_pll = readl_relaxed(dd->mult_div1_reg); + tmpset.cm_clksel1_pll = + omap_clk_ll_ops.clk_readl(&dd->mult_div1_reg); tmpset.cm_clksel1_pll &= ~(dd->mult_mask | dd->div1_mask); div = ((curr_prcm_set->xtal_speed / 1000000) - 1); diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c index ae5b23c19b83..42881f21cede 100644 --- a/arch/arm/mach-omap2/clock.c +++ b/arch/arm/mach-omap2/clock.c @@ -54,7 +54,7 @@ u16 cpu_mask; #define OMAP3PLUS_DPLL_FINT_MIN 32000 #define OMAP3PLUS_DPLL_FINT_MAX 52000000 -static struct ti_clk_ll_ops omap_clk_ll_ops = { +struct ti_clk_ll_ops omap_clk_ll_ops = { .clkdm_clk_enable = clkdm_clk_enable, .clkdm_clk_disable = clkdm_clk_disable, .clkdm_lookup = clkdm_lookup, diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h index 4e66295dca25..cf45550197e6 100644 --- a/arch/arm/mach-omap2/clock.h +++ b/arch/arm/mach-omap2/clock.h @@ -64,6 +64,8 @@ #define OMAP4XXX_EN_DPLL_FRBYPASS 0x6 #define OMAP4XXX_EN_DPLL_LOCKED 0x7 +extern struct ti_clk_ll_ops omap_clk_ll_ops; + extern u16 cpu_mask; extern const struct clkops clkops_omap2_dflt_wait; diff --git a/arch/arm/mach-omap2/cm.h b/arch/arm/mach-omap2/cm.h index 1fe3e6b833d2..de75cbcdc9d1 100644 --- a/arch/arm/mach-omap2/cm.h +++ b/arch/arm/mach-omap2/cm.h @@ -23,6 +23,7 @@ #define MAX_MODULE_READY_TIME 2000 # ifndef __ASSEMBLER__ +#include extern void __iomem *cm_base; extern void __iomem *cm2_base; extern void omap2_set_globals_cm(void __iomem *cm, void __iomem *cm2); @@ -50,7 +51,7 @@ extern void omap2_set_globals_cm(void __iomem *cm, void __iomem *cm2); * @module_disable: ptr to the SoC CM-specific module_disable impl */ struct cm_ll_data { - int (*split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, + int (*split_idlest_reg)(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id); int (*wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); @@ -60,7 +61,7 @@ struct cm_ll_data { void 
(*module_disable)(u8 part, u16 inst, u16 clkctrl_offs); }; -extern int cm_split_idlest_reg(void __iomem *idlest_reg, s16 *prcm_inst, +extern int cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id); int omap_cm_wait_module_ready(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); diff --git a/arch/arm/mach-omap2/cm2xxx.c b/arch/arm/mach-omap2/cm2xxx.c index 3e5fd3587eb1..cd90b4c6a06b 100644 --- a/arch/arm/mach-omap2/cm2xxx.c +++ b/arch/arm/mach-omap2/cm2xxx.c @@ -204,7 +204,7 @@ void omap2xxx_cm_apll96_disable(void) * XXX This function is only needed until absolute register addresses are * removed from the OMAP struct clk records. */ -static int omap2xxx_cm_split_idlest_reg(void __iomem *idlest_reg, +static int omap2xxx_cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id) { @@ -212,10 +212,7 @@ static int omap2xxx_cm_split_idlest_reg(void __iomem *idlest_reg, u8 idlest_offs; int i; - if (idlest_reg < cm_base || idlest_reg > (cm_base + 0x0fff)) - return -EINVAL; - - idlest_offs = (unsigned long)idlest_reg & 0xff; + idlest_offs = idlest_reg->offset & 0xff; for (i = 0; i < ARRAY_SIZE(omap2xxx_cm_idlest_offs); i++) { if (idlest_offs == omap2xxx_cm_idlest_offs[i]) { *idlest_reg_id = i + 1; @@ -226,7 +223,7 @@ static int omap2xxx_cm_split_idlest_reg(void __iomem *idlest_reg, if (i == ARRAY_SIZE(omap2xxx_cm_idlest_offs)) return -EINVAL; - offs = idlest_reg - cm_base; + offs = idlest_reg->offset; offs &= 0xff00; *prcm_inst = offs; diff --git a/arch/arm/mach-omap2/cm3xxx.c b/arch/arm/mach-omap2/cm3xxx.c index d91ae8206d1e..55b046a719dc 100644 --- a/arch/arm/mach-omap2/cm3xxx.c +++ b/arch/arm/mach-omap2/cm3xxx.c @@ -118,7 +118,7 @@ static int omap3xxx_cm_wait_module_ready(u8 part, s16 prcm_mod, u16 idlest_id, * XXX This function is only needed until absolute register addresses are * removed from the OMAP struct clk records. */ -static int omap3xxx_cm_split_idlest_reg(void __iomem *idlest_reg, +static int omap3xxx_cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id) { @@ -126,11 +126,7 @@ static int omap3xxx_cm_split_idlest_reg(void __iomem *idlest_reg, u8 idlest_offs; int i; - if (idlest_reg < (cm_base + OMAP3430_IVA2_MOD) || - idlest_reg > (cm_base + 0x1ffff)) - return -EINVAL; - - idlest_offs = (unsigned long)idlest_reg & 0xff; + idlest_offs = idlest_reg->offset & 0xff; for (i = 0; i < ARRAY_SIZE(omap3xxx_cm_idlest_offs); i++) { if (idlest_offs == omap3xxx_cm_idlest_offs[i]) { *idlest_reg_id = i + 1; @@ -141,7 +137,7 @@ static int omap3xxx_cm_split_idlest_reg(void __iomem *idlest_reg, if (i == ARRAY_SIZE(omap3xxx_cm_idlest_offs)) return -EINVAL; - offs = idlest_reg - cm_base; + offs = idlest_reg->offset; offs &= 0xff00; *prcm_inst = offs; diff --git a/arch/arm/mach-omap2/cm_common.c b/arch/arm/mach-omap2/cm_common.c index 23e8bcec34e3..bbe41f4c9dc8 100644 --- a/arch/arm/mach-omap2/cm_common.c +++ b/arch/arm/mach-omap2/cm_common.c @@ -65,7 +65,7 @@ void __init omap2_set_globals_cm(void __iomem *cm, void __iomem *cm2) * or 0 upon success. XXX This function is only needed until absolute * register addresses are removed from the OMAP struct clk records. 
*/ -int cm_split_idlest_reg(void __iomem *idlest_reg, s16 *prcm_inst, +int cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id) { if (!cm_ll_data->split_idlest_reg) { diff --git a/drivers/clk/ti/apll.c b/drivers/clk/ti/apll.c index 5cba28c6ab35..06f486b3488c 100644 --- a/drivers/clk/ti/apll.c +++ b/drivers/clk/ti/apll.c @@ -55,20 +55,20 @@ static int dra7_apll_enable(struct clk_hw *hw) state <<= __ffs(ad->idlest_mask); /* Check is already locked */ - v = ti_clk_ll_ops->clk_readl(ad->idlest_reg); + v = ti_clk_ll_ops->clk_readl(&ad->idlest_reg); if ((v & ad->idlest_mask) == state) return r; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ~ad->enable_mask; v |= APLL_FORCE_LOCK << __ffs(ad->enable_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); state <<= __ffs(ad->idlest_mask); while (1) { - v = ti_clk_ll_ops->clk_readl(ad->idlest_reg); + v = ti_clk_ll_ops->clk_readl(&ad->idlest_reg); if ((v & ad->idlest_mask) == state) break; if (i > MAX_APLL_WAIT_TRIES) @@ -99,10 +99,10 @@ static void dra7_apll_disable(struct clk_hw *hw) state <<= __ffs(ad->idlest_mask); - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ~ad->enable_mask; v |= APLL_AUTO_IDLE << __ffs(ad->enable_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); } static int dra7_apll_is_enabled(struct clk_hw *hw) @@ -113,7 +113,7 @@ static int dra7_apll_is_enabled(struct clk_hw *hw) ad = clk->dpll_data; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ad->enable_mask; v >>= __ffs(ad->enable_mask); @@ -185,6 +185,7 @@ static void __init of_dra7_apll_setup(struct device_node *node) struct clk_hw_omap *clk_hw = NULL; struct clk_init_data *init = NULL; const char **parent_names = NULL; + int ret; ad = kzalloc(sizeof(*ad), GFP_KERNEL); clk_hw = kzalloc(sizeof(*clk_hw), GFP_KERNEL); @@ -212,10 +213,10 @@ static void __init of_dra7_apll_setup(struct device_node *node) init->parent_names = parent_names; - ad->control_reg = ti_clk_get_reg_addr(node, 0); - ad->idlest_reg = ti_clk_get_reg_addr(node, 1); + ret = ti_clk_get_reg_addr(node, 0, &ad->control_reg); + ret |= ti_clk_get_reg_addr(node, 1, &ad->idlest_reg); - if (IS_ERR(ad->control_reg) || IS_ERR(ad->idlest_reg)) + if (ret) goto cleanup; ad->idlest_mask = 0x1; @@ -241,7 +242,7 @@ static int omap2_apll_is_enabled(struct clk_hw *hw) struct dpll_data *ad = clk->dpll_data; u32 v; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ad->enable_mask; v >>= __ffs(ad->enable_mask); @@ -267,13 +268,13 @@ static int omap2_apll_enable(struct clk_hw *hw) u32 v; int i = 0; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ~ad->enable_mask; v |= OMAP2_EN_APLL_LOCKED << __ffs(ad->enable_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); while (1) { - v = ti_clk_ll_ops->clk_readl(ad->idlest_reg); + v = ti_clk_ll_ops->clk_readl(&ad->idlest_reg); if (v & ad->idlest_mask) break; if (i > MAX_APLL_WAIT_TRIES) @@ -297,10 +298,10 @@ static void omap2_apll_disable(struct clk_hw *hw) struct dpll_data *ad = clk->dpll_data; u32 v; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ~ad->enable_mask; v |= 
OMAP2_EN_APLL_STOPPED << __ffs(ad->enable_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); } static struct clk_ops omap2_apll_ops = { @@ -315,10 +316,10 @@ static void omap2_apll_set_autoidle(struct clk_hw_omap *clk, u32 val) struct dpll_data *ad = clk->dpll_data; u32 v; - v = ti_clk_ll_ops->clk_readl(ad->autoidle_reg); + v = ti_clk_ll_ops->clk_readl(&ad->autoidle_reg); v &= ~ad->autoidle_mask; v |= val << __ffs(ad->autoidle_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); } #define OMAP2_APLL_AUTOIDLE_LOW_POWER_STOP 0x3 @@ -347,6 +348,7 @@ static void __init of_omap2_apll_setup(struct device_node *node) struct clk *clk; const char *parent_name; u32 val; + int ret; ad = kzalloc(sizeof(*ad), GFP_KERNEL); clk_hw = kzalloc(sizeof(*clk_hw), GFP_KERNEL); @@ -392,12 +394,11 @@ static void __init of_omap2_apll_setup(struct device_node *node) ad->idlest_mask = 1 << val; - ad->control_reg = ti_clk_get_reg_addr(node, 0); - ad->autoidle_reg = ti_clk_get_reg_addr(node, 1); - ad->idlest_reg = ti_clk_get_reg_addr(node, 2); + ret = ti_clk_get_reg_addr(node, 0, &ad->control_reg); + ret |= ti_clk_get_reg_addr(node, 1, &ad->autoidle_reg); + ret |= ti_clk_get_reg_addr(node, 2, &ad->idlest_reg); - if (IS_ERR(ad->control_reg) || IS_ERR(ad->autoidle_reg) || - IS_ERR(ad->idlest_reg)) + if (ret) goto cleanup; clk = clk_register(NULL, &clk_hw->hw); diff --git a/drivers/clk/ti/autoidle.c b/drivers/clk/ti/autoidle.c index 345af43465f0..7bb9afbe4058 100644 --- a/drivers/clk/ti/autoidle.c +++ b/drivers/clk/ti/autoidle.c @@ -25,7 +25,7 @@ #include "clock.h" struct clk_ti_autoidle { - void __iomem *reg; + struct clk_omap_reg reg; u8 shift; u8 flags; const char *name; @@ -73,28 +73,28 @@ static void _allow_autoidle(struct clk_ti_autoidle *clk) { u32 val; - val = ti_clk_ll_ops->clk_readl(clk->reg); + val = ti_clk_ll_ops->clk_readl(&clk->reg); if (clk->flags & AUTOIDLE_LOW) val &= ~(1 << clk->shift); else val |= (1 << clk->shift); - ti_clk_ll_ops->clk_writel(val, clk->reg); + ti_clk_ll_ops->clk_writel(val, &clk->reg); } static void _deny_autoidle(struct clk_ti_autoidle *clk) { u32 val; - val = ti_clk_ll_ops->clk_readl(clk->reg); + val = ti_clk_ll_ops->clk_readl(&clk->reg); if (clk->flags & AUTOIDLE_LOW) val |= (1 << clk->shift); else val &= ~(1 << clk->shift); - ti_clk_ll_ops->clk_writel(val, clk->reg); + ti_clk_ll_ops->clk_writel(val, &clk->reg); } /** @@ -140,6 +140,7 @@ int __init of_ti_clk_autoidle_setup(struct device_node *node) { u32 shift; struct clk_ti_autoidle *clk; + int ret; /* Check if this clock has autoidle support or not */ if (of_property_read_u32(node, "ti,autoidle-shift", &shift)) @@ -152,11 +153,10 @@ int __init of_ti_clk_autoidle_setup(struct device_node *node) clk->shift = shift; clk->name = node->name; - clk->reg = ti_clk_get_reg_addr(node, 0); - - if (IS_ERR(clk->reg)) { + ret = ti_clk_get_reg_addr(node, 0, &clk->reg); + if (ret) { kfree(clk); - return -EINVAL; + return ret; } if (of_property_read_bool(node, "ti,invert-autoidle-bit")) diff --git a/drivers/clk/ti/clk-3xxx.c b/drivers/clk/ti/clk-3xxx.c index 11d8aa3ec186..b1251cae98b8 100644 --- a/drivers/clk/ti/clk-3xxx.c +++ b/drivers/clk/ti/clk-3xxx.c @@ -52,14 +52,13 @@ * @idlest_reg and @idlest_bit. No return value. 
*/ static void omap3430es2_clk_ssi_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - u32 r; - - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; *idlest_bit = OMAP3430ES2_ST_SSI_IDLE_SHIFT; *idlest_val = OMAP34XX_CM_IDLEST_VAL; } @@ -85,15 +84,15 @@ const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait = { * default find_idlest code assumes that they are at the same * position.) No return value. */ -static void omap3430es2_clk_dss_usbhost_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, - u8 *idlest_bit, - u8 *idlest_val) +static void +omap3430es2_clk_dss_usbhost_find_idlest(struct clk_hw_omap *clk, + struct clk_omap_reg *idlest_reg, + u8 *idlest_bit, u8 *idlest_val) { - u32 r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; /* USBHOST_IDLE has same shift */ *idlest_bit = OMAP3430ES2_ST_DSS_IDLE_SHIFT; *idlest_val = OMAP34XX_CM_IDLEST_VAL; @@ -122,15 +121,15 @@ const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait = { * shift from the CM_{I,F}CLKEN bit. Pass back the correct info via * @idlest_reg and @idlest_bit. No return value. */ -static void omap3430es2_clk_hsotgusb_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, - u8 *idlest_bit, - u8 *idlest_val) +static void +omap3430es2_clk_hsotgusb_find_idlest(struct clk_hw_omap *clk, + struct clk_omap_reg *idlest_reg, + u8 *idlest_bit, + u8 *idlest_val) { - u32 r; - - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; *idlest_bit = OMAP3430ES2_ST_HSOTGUSB_IDLE_SHIFT; *idlest_val = OMAP34XX_CM_IDLEST_VAL; } @@ -154,11 +153,11 @@ const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait = { * bit. A value of 1 indicates that clock is enabled. */ static void am35xx_clk_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - *idlest_reg = (__force void __iomem *)(clk->enable_reg); + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); *idlest_bit = clk->enable_bit + AM35XX_IPSS_ICK_EN_ACK_OFFSET; *idlest_val = AM35XX_IPSS_CLK_IDLEST_VAL; } @@ -178,10 +177,10 @@ static void am35xx_clk_find_idlest(struct clk_hw_omap *clk, * avoid this issue, and remove the casts. No return value. */ static void am35xx_clk_find_companion(struct clk_hw_omap *clk, - void __iomem **other_reg, + struct clk_omap_reg *other_reg, u8 *other_bit) { - *other_reg = (__force void __iomem *)(clk->enable_reg); + memcpy(other_reg, &clk->enable_reg, sizeof(*other_reg)); if (clk->enable_bit & AM35XX_IPSS_ICK_MASK) *other_bit = clk->enable_bit + AM35XX_IPSS_ICK_FCK_OFFSET; else @@ -205,14 +204,14 @@ const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait = { * and @idlest_bit. No return value. 
*/ static void am35xx_clk_ipss_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - u32 r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; *idlest_bit = AM35XX_ST_IPSS_SHIFT; *idlest_val = OMAP34XX_CM_IDLEST_VAL; } diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index 024123f421d6..ddbad7e8d7c9 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -43,27 +43,29 @@ struct clk_iomap { static struct clk_iomap *clk_memmaps[CLK_MAX_MEMMAPS]; -static void clk_memmap_writel(u32 val, void __iomem *reg) +static void clk_memmap_writel(u32 val, const struct clk_omap_reg *reg) { - struct clk_omap_reg *r = (struct clk_omap_reg *)® - struct clk_iomap *io = clk_memmaps[r->index]; + struct clk_iomap *io = clk_memmaps[reg->index]; - if (io->regmap) - regmap_write(io->regmap, r->offset, val); + if (reg->ptr) + writel_relaxed(val, reg->ptr); + else if (io->regmap) + regmap_write(io->regmap, reg->offset, val); else - writel_relaxed(val, io->mem + r->offset); + writel_relaxed(val, io->mem + reg->offset); } -static u32 clk_memmap_readl(void __iomem *reg) +static u32 clk_memmap_readl(const struct clk_omap_reg *reg) { u32 val; - struct clk_omap_reg *r = (struct clk_omap_reg *)® - struct clk_iomap *io = clk_memmaps[r->index]; + struct clk_iomap *io = clk_memmaps[reg->index]; - if (io->regmap) - regmap_read(io->regmap, r->offset, &val); + if (reg->ptr) + val = readl_relaxed(reg->ptr); + else if (io->regmap) + regmap_read(io->regmap, reg->offset, &val); else - val = readl_relaxed(io->mem + r->offset); + val = readl_relaxed(io->mem + reg->offset); return val; } @@ -162,20 +164,18 @@ int __init ti_clk_retry_init(struct device_node *node, struct clk_hw *hw, * ti_clk_get_reg_addr - get register address for a clock register * @node: device node for the clock * @index: register index from the clock node + * @reg: pointer to target register struct * - * Builds clock register address from device tree information. This - * is a struct of type clk_omap_reg. Returns a pointer to the register - * address, or a pointer error value in failure. + * Builds clock register address from device tree information, and returns + * the data via the provided output pointer @reg. Returns 0 on success, + * negative error value on failure. 
*/ -void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index) +int ti_clk_get_reg_addr(struct device_node *node, int index, + struct clk_omap_reg *reg) { - struct clk_omap_reg *reg; u32 val; - u32 tmp; int i; - reg = (struct clk_omap_reg *)&tmp; - for (i = 0; i < CLK_MAX_MEMMAPS; i++) { if (clocks_node_ptr[i] == node->parent) break; @@ -183,19 +183,20 @@ void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index) if (i == CLK_MAX_MEMMAPS) { pr_err("clk-provider not found for %s!\n", node->name); - return IOMEM_ERR_PTR(-ENOENT); + return -ENOENT; } reg->index = i; if (of_property_read_u32_index(node, "reg", index, &val)) { pr_err("%s must have reg[%d]!\n", node->name, index); - return IOMEM_ERR_PTR(-EINVAL); + return -EINVAL; } reg->offset = val; + reg->ptr = NULL; - return (__force void __iomem *)tmp; + return 0; } /** diff --git a/drivers/clk/ti/clkt_dflt.c b/drivers/clk/ti/clkt_dflt.c index c6ae563801d7..91751dd26b16 100644 --- a/drivers/clk/ti/clkt_dflt.c +++ b/drivers/clk/ti/clkt_dflt.c @@ -55,7 +55,8 @@ * elapsed. XXX Deprecated - should be moved into drivers for the * individual IP block that the IDLEST register exists in. */ -static int _wait_idlest_generic(struct clk_hw_omap *clk, void __iomem *reg, +static int _wait_idlest_generic(struct clk_hw_omap *clk, + struct clk_omap_reg *reg, u32 mask, u8 idlest, const char *name) { int i = 0, ena = 0; @@ -91,7 +92,7 @@ static int _wait_idlest_generic(struct clk_hw_omap *clk, void __iomem *reg, */ static void _omap2_module_wait_ready(struct clk_hw_omap *clk) { - void __iomem *companion_reg, *idlest_reg; + struct clk_omap_reg companion_reg, idlest_reg; u8 other_bit, idlest_bit, idlest_val, idlest_reg_id; s16 prcm_mod; int r; @@ -99,17 +100,17 @@ static void _omap2_module_wait_ready(struct clk_hw_omap *clk) /* Not all modules have multiple clocks that their IDLEST depends on */ if (clk->ops->find_companion) { clk->ops->find_companion(clk, &companion_reg, &other_bit); - if (!(ti_clk_ll_ops->clk_readl(companion_reg) & + if (!(ti_clk_ll_ops->clk_readl(&companion_reg) & (1 << other_bit))) return; } clk->ops->find_idlest(clk, &idlest_reg, &idlest_bit, &idlest_val); - r = ti_clk_ll_ops->cm_split_idlest_reg(idlest_reg, &prcm_mod, + r = ti_clk_ll_ops->cm_split_idlest_reg(&idlest_reg, &prcm_mod, &idlest_reg_id); if (r) { /* IDLEST register not in the CM module */ - _wait_idlest_generic(clk, idlest_reg, (1 << idlest_bit), + _wait_idlest_generic(clk, &idlest_reg, (1 << idlest_bit), idlest_val, clk_hw_get_name(&clk->hw)); } else { ti_clk_ll_ops->cm_wait_module_ready(0, prcm_mod, idlest_reg_id, @@ -139,17 +140,17 @@ static void _omap2_module_wait_ready(struct clk_hw_omap *clk) * avoid this issue, and remove the casts. No return value. */ void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk, - void __iomem **other_reg, u8 *other_bit) + struct clk_omap_reg *other_reg, + u8 *other_bit) { - u32 r; + memcpy(other_reg, &clk->enable_reg, sizeof(*other_reg)); /* * Convert CM_ICLKEN* <-> CM_FCLKEN*. This conversion assumes * it's just a matter of XORing the bits. */ - r = ((__force u32)clk->enable_reg ^ (CM_FCLKEN ^ CM_ICLKEN)); + other_reg->offset ^= (CM_FCLKEN ^ CM_ICLKEN); - *other_reg = (__force void __iomem *)r; *other_bit = clk->enable_bit; } @@ -168,13 +169,14 @@ void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk, * CM_IDLEST2). This is not true for all modules. No return value. 
*/ void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, u8 *idlest_bit, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - u32 r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); + + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; *idlest_bit = clk->enable_bit; /* @@ -222,31 +224,19 @@ int omap2_dflt_clk_enable(struct clk_hw *hw) } } - if (IS_ERR(clk->enable_reg)) { - pr_err("%s: %s missing enable_reg\n", __func__, - clk_hw_get_name(hw)); - ret = -EINVAL; - goto err; - } - /* FIXME should not have INVERT_ENABLE bit here */ - v = ti_clk_ll_ops->clk_readl(clk->enable_reg); + v = ti_clk_ll_ops->clk_readl(&clk->enable_reg); if (clk->flags & INVERT_ENABLE) v &= ~(1 << clk->enable_bit); else v |= (1 << clk->enable_bit); - ti_clk_ll_ops->clk_writel(v, clk->enable_reg); - v = ti_clk_ll_ops->clk_readl(clk->enable_reg); /* OCP barrier */ + ti_clk_ll_ops->clk_writel(v, &clk->enable_reg); + v = ti_clk_ll_ops->clk_readl(&clk->enable_reg); /* OCP barrier */ if (clk->ops && clk->ops->find_idlest) _omap2_module_wait_ready(clk); return 0; - -err: - if (clkdm_control && clk->clkdm) - ti_clk_ll_ops->clkdm_clk_disable(clk->clkdm, hw->clk); - return ret; } /** @@ -264,22 +254,13 @@ void omap2_dflt_clk_disable(struct clk_hw *hw) u32 v; clk = to_clk_hw_omap(hw); - if (IS_ERR(clk->enable_reg)) { - /* - * 'independent' here refers to a clock which is not - * controlled by its parent. - */ - pr_err("%s: independent clock %s has no enable_reg\n", - __func__, clk_hw_get_name(hw)); - return; - } - v = ti_clk_ll_ops->clk_readl(clk->enable_reg); + v = ti_clk_ll_ops->clk_readl(&clk->enable_reg); if (clk->flags & INVERT_ENABLE) v |= (1 << clk->enable_bit); else v &= ~(1 << clk->enable_bit); - ti_clk_ll_ops->clk_writel(v, clk->enable_reg); + ti_clk_ll_ops->clk_writel(v, &clk->enable_reg); /* No OCP barrier needed here since it is a disable operation */ if (!(ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) && @@ -300,7 +281,7 @@ int omap2_dflt_clk_is_enabled(struct clk_hw *hw) struct clk_hw_omap *clk = to_clk_hw_omap(hw); u32 v; - v = ti_clk_ll_ops->clk_readl(clk->enable_reg); + v = ti_clk_ll_ops->clk_readl(&clk->enable_reg); if (clk->flags & INVERT_ENABLE) v ^= BIT(clk->enable_bit); diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c index b919fdfe8256..ce98da2c10be 100644 --- a/drivers/clk/ti/clkt_dpll.c +++ b/drivers/clk/ti/clkt_dpll.c @@ -213,7 +213,7 @@ u8 omap2_init_dpll_parent(struct clk_hw *hw) if (!dd) return -EINVAL; - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); v &= dd->enable_mask; v >>= __ffs(dd->enable_mask); @@ -249,14 +249,14 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) return 0; /* Return bypass rate if DPLL is bypassed */ - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); v &= dd->enable_mask; v >>= __ffs(dd->enable_mask); if (_omap2_dpll_is_in_bypass(v)) return clk_hw_get_rate(dd->clk_bypass); - v = ti_clk_ll_ops->clk_readl(dd->mult_div1_reg); + v = ti_clk_ll_ops->clk_readl(&dd->mult_div1_reg); dpll_mult = v & dd->mult_mask; dpll_mult >>= __ffs(dd->mult_mask); dpll_div = v & dd->div1_mask; diff --git a/drivers/clk/ti/clkt_iclk.c b/drivers/clk/ti/clkt_iclk.c index 38c36908cf88..60b583d7db33 100644 --- a/drivers/clk/ti/clkt_iclk.c +++ b/drivers/clk/ti/clkt_iclk.c @@ -31,28 +31,29 @@ void 
omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk) { u32 v; - void __iomem *r; + struct clk_omap_reg r; - r = (__force void __iomem *) - ((__force u32)clk->enable_reg ^ (CM_AUTOIDLE ^ CM_ICLKEN)); + memcpy(&r, &clk->enable_reg, sizeof(r)); + r.offset ^= (CM_AUTOIDLE ^ CM_ICLKEN); - v = ti_clk_ll_ops->clk_readl(r); + v = ti_clk_ll_ops->clk_readl(&r); v |= (1 << clk->enable_bit); - ti_clk_ll_ops->clk_writel(v, r); + ti_clk_ll_ops->clk_writel(v, &r); } /* XXX */ void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk) { u32 v; - void __iomem *r; + struct clk_omap_reg r; - r = (__force void __iomem *) - ((__force u32)clk->enable_reg ^ (CM_AUTOIDLE ^ CM_ICLKEN)); + memcpy(&r, &clk->enable_reg, sizeof(r)); - v = ti_clk_ll_ops->clk_readl(r); + r.offset ^= (CM_AUTOIDLE ^ CM_ICLKEN); + + v = ti_clk_ll_ops->clk_readl(&r); v &= ~(1 << clk->enable_bit); - ti_clk_ll_ops->clk_writel(v, r); + ti_clk_ll_ops->clk_writel(v, &r); } /** @@ -68,14 +69,12 @@ void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk) * modules. No return value. */ static void omap2430_clk_i2chs_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - u32 r; - - r = ((__force u32)clk->enable_reg ^ (OMAP24XX_CM_FCLKEN2 ^ CM_IDLEST)); - *idlest_reg = (__force void __iomem *)r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); + idlest_reg->offset ^= (OMAP24XX_CM_FCLKEN2 ^ CM_IDLEST); *idlest_bit = clk->enable_bit; *idlest_val = OMAP24XX_CM_IDLEST_VAL; } diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h index 437ea768f837..3f7b26540be8 100644 --- a/drivers/clk/ti/clock.h +++ b/drivers/clk/ti/clock.h @@ -18,7 +18,7 @@ struct clk_omap_divider { struct clk_hw hw; - void __iomem *reg; + struct clk_omap_reg reg; u8 shift; u8 width; u8 flags; @@ -29,7 +29,7 @@ struct clk_omap_divider { struct clk_omap_mux { struct clk_hw hw; - void __iomem *reg; + struct clk_omap_reg reg; u32 *table; u32 mask; u8 shift; @@ -228,7 +228,8 @@ void ti_clk_patch_legacy_clks(struct ti_clk **patch); struct clk *ti_clk_register_clk(struct ti_clk *setup); int ti_clk_register_legacy_clks(struct ti_clk_alias *clks); -void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index); +int ti_clk_get_reg_addr(struct device_node *node, int index, + struct clk_omap_reg *reg); void ti_dt_clocks_register(struct ti_dt_clk *oclks); int ti_clk_retry_init(struct device_node *node, struct clk_hw *hw, ti_of_clk_init_cb_t func); @@ -263,10 +264,10 @@ int omap2_dflt_clk_enable(struct clk_hw *hw); void omap2_dflt_clk_disable(struct clk_hw *hw); int omap2_dflt_clk_is_enabled(struct clk_hw *hw); void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk, - void __iomem **other_reg, + struct clk_omap_reg *other_reg, u8 *other_bit); void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val); void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk); diff --git a/drivers/clk/ti/clockdomain.c b/drivers/clk/ti/clockdomain.c index 704157d8c0b7..fbedc6a9fed0 100644 --- a/drivers/clk/ti/clockdomain.c +++ b/drivers/clk/ti/clockdomain.c @@ -52,10 +52,6 @@ int omap2_clkops_enable_clkdm(struct clk_hw *hw) return -EINVAL; } - if (unlikely(clk->enable_reg)) - pr_err("%s: %s: should use dflt_clk_enable ?!\n", __func__, - clk_hw_get_name(hw)); - if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) { pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n", __func__, clk_hw_get_name(hw)); 
@@ -90,10 +86,6 @@ void omap2_clkops_disable_clkdm(struct clk_hw *hw) return; } - if (unlikely(clk->enable_reg)) - pr_err("%s: %s: should use dflt_clk_disable ?!\n", __func__, - clk_hw_get_name(hw)); - if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) { pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n", __func__, clk_hw_get_name(hw)); diff --git a/drivers/clk/ti/divider.c b/drivers/clk/ti/divider.c index 1cc0242d823f..d6dcb283b72b 100644 --- a/drivers/clk/ti/divider.c +++ b/drivers/clk/ti/divider.c @@ -100,7 +100,7 @@ static unsigned long ti_clk_divider_recalc_rate(struct clk_hw *hw, struct clk_omap_divider *divider = to_clk_omap_divider(hw); unsigned int div, val; - val = ti_clk_ll_ops->clk_readl(divider->reg) >> divider->shift; + val = ti_clk_ll_ops->clk_readl(÷r->reg) >> divider->shift; val &= div_mask(divider); div = _get_div(divider, val); @@ -257,11 +257,11 @@ static int ti_clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, if (divider->flags & CLK_DIVIDER_HIWORD_MASK) { val = div_mask(divider) << (divider->shift + 16); } else { - val = ti_clk_ll_ops->clk_readl(divider->reg); + val = ti_clk_ll_ops->clk_readl(÷r->reg); val &= ~(div_mask(divider) << divider->shift); } val |= value << divider->shift; - ti_clk_ll_ops->clk_writel(val, divider->reg); + ti_clk_ll_ops->clk_writel(val, ÷r->reg); return 0; } @@ -274,7 +274,8 @@ const struct clk_ops ti_clk_divider_ops = { static struct clk *_register_divider(struct device *dev, const char *name, const char *parent_name, - unsigned long flags, void __iomem *reg, + unsigned long flags, + struct clk_omap_reg *reg, u8 shift, u8 width, u8 clk_divider_flags, const struct clk_div_table *table) { @@ -303,7 +304,7 @@ static struct clk *_register_divider(struct device *dev, const char *name, init.num_parents = (parent_name ? 
1 : 0); /* struct clk_divider assignments */ - div->reg = reg; + memcpy(&div->reg, reg, sizeof(*reg)); div->shift = shift; div->width = width; div->flags = clk_divider_flags; @@ -561,14 +562,15 @@ static int _get_divider_width(struct device_node *node, } static int __init ti_clk_divider_populate(struct device_node *node, - void __iomem **reg, const struct clk_div_table **table, + struct clk_omap_reg *reg, const struct clk_div_table **table, u32 *flags, u8 *div_flags, u8 *width, u8 *shift) { u32 val; + int ret; - *reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(*reg)) - return PTR_ERR(*reg); + ret = ti_clk_get_reg_addr(node, 0, reg); + if (ret) + return ret; if (!of_property_read_u32(node, "ti,bit-shift", &val)) *shift = val; @@ -607,7 +609,7 @@ static void __init of_ti_divider_clk_setup(struct device_node *node) { struct clk *clk; const char *parent_name; - void __iomem *reg; + struct clk_omap_reg reg; u8 clk_divider_flags = 0; u8 width = 0; u8 shift = 0; @@ -620,7 +622,7 @@ static void __init of_ti_divider_clk_setup(struct device_node *node) &clk_divider_flags, &width, &shift)) goto cleanup; - clk = _register_divider(NULL, node->name, parent_name, flags, reg, + clk = _register_divider(NULL, node->name, parent_name, flags, ®, shift, width, clk_divider_flags, table); if (!IS_ERR(clk)) { diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c index 778bc90955b9..96d84888c6c5 100644 --- a/drivers/clk/ti/dpll.c +++ b/drivers/clk/ti/dpll.c @@ -203,17 +203,10 @@ cleanup: } #if defined(CONFIG_ARCH_OMAP3) && defined(CONFIG_ATAGS) -static void __iomem *_get_reg(u8 module, u16 offset) +void _get_reg(u8 module, u16 offset, struct clk_omap_reg *reg) { - u32 reg; - struct clk_omap_reg *reg_setup; - - reg_setup = (struct clk_omap_reg *)® - - reg_setup->index = module; - reg_setup->offset = offset; - - return (void __iomem *)reg; + reg->index = module; + reg->offset = offset; } struct clk *ti_clk_register_dpll(struct ti_clk *setup) @@ -255,10 +248,10 @@ struct clk *ti_clk_register_dpll(struct ti_clk *setup) init.num_parents = dpll->num_parents; init.parent_names = dpll->parents; - dd->control_reg = _get_reg(dpll->module, dpll->control_reg); - dd->idlest_reg = _get_reg(dpll->module, dpll->idlest_reg); - dd->mult_div1_reg = _get_reg(dpll->module, dpll->mult_div1_reg); - dd->autoidle_reg = _get_reg(dpll->module, dpll->autoidle_reg); + _get_reg(dpll->module, dpll->control_reg, &dd->control_reg); + _get_reg(dpll->module, dpll->idlest_reg, &dd->idlest_reg); + _get_reg(dpll->module, dpll->mult_div1_reg, &dd->mult_div1_reg); + _get_reg(dpll->module, dpll->autoidle_reg, &dd->autoidle_reg); dd->modes = dpll->modes; dd->div1_mask = dpll->div1_mask; @@ -344,12 +337,9 @@ static void _register_dpll_x2(struct device_node *node, ret = of_property_count_elems_of_size(node, "reg", 1); if (ret <= 0) { hw_ops = NULL; - } else { - clk_hw->clksel_reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(clk_hw->clksel_reg)) { - kfree(clk_hw); - return; - } + } else if (ti_clk_get_reg_addr(node, 0, &clk_hw->clksel_reg)) { + kfree(clk_hw); + return; } } @@ -412,7 +402,8 @@ static void __init of_ti_dpll_setup(struct device_node *node, init->parent_names = parent_names; - dd->control_reg = ti_clk_get_reg_addr(node, 0); + if (ti_clk_get_reg_addr(node, 0, &dd->control_reg)) + goto cleanup; /* * Special case for OMAP2 DPLL, register order is different due to @@ -420,25 +411,22 @@ static void __init of_ti_dpll_setup(struct device_node *node, * missing idlest_mask. 
*/ if (!dd->idlest_mask) { - dd->mult_div1_reg = ti_clk_get_reg_addr(node, 1); + if (ti_clk_get_reg_addr(node, 1, &dd->mult_div1_reg)) + goto cleanup; #ifdef CONFIG_ARCH_OMAP2 clk_hw->ops = &clkhwops_omap2xxx_dpll; omap2xxx_clkt_dpllcore_init(&clk_hw->hw); #endif } else { - dd->idlest_reg = ti_clk_get_reg_addr(node, 1); - if (IS_ERR(dd->idlest_reg)) + if (ti_clk_get_reg_addr(node, 1, &dd->idlest_reg)) goto cleanup; - dd->mult_div1_reg = ti_clk_get_reg_addr(node, 2); + if (ti_clk_get_reg_addr(node, 2, &dd->mult_div1_reg)) + goto cleanup; } - if (IS_ERR(dd->control_reg) || IS_ERR(dd->mult_div1_reg)) - goto cleanup; - if (dd->autoidle_mask) { - dd->autoidle_reg = ti_clk_get_reg_addr(node, 3); - if (IS_ERR(dd->autoidle_reg)) + if (ti_clk_get_reg_addr(node, 3, &dd->autoidle_reg)) goto cleanup; } diff --git a/drivers/clk/ti/dpll3xxx.c b/drivers/clk/ti/dpll3xxx.c index 4cdd28a25584..4534de2ef455 100644 --- a/drivers/clk/ti/dpll3xxx.c +++ b/drivers/clk/ti/dpll3xxx.c @@ -54,10 +54,10 @@ static void _omap3_dpll_write_clken(struct clk_hw_omap *clk, u8 clken_bits) dd = clk->dpll_data; - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); v &= ~dd->enable_mask; v |= clken_bits << __ffs(dd->enable_mask); - ti_clk_ll_ops->clk_writel(v, dd->control_reg); + ti_clk_ll_ops->clk_writel(v, &dd->control_reg); } /* _omap3_wait_dpll_status: wait for a DPLL to enter a specific state */ @@ -73,7 +73,7 @@ static int _omap3_wait_dpll_status(struct clk_hw_omap *clk, u8 state) state <<= __ffs(dd->idlest_mask); - while (((ti_clk_ll_ops->clk_readl(dd->idlest_reg) & dd->idlest_mask) + while (((ti_clk_ll_ops->clk_readl(&dd->idlest_reg) & dd->idlest_mask) != state) && i < MAX_DPLL_WAIT_TRIES) { i++; udelay(1); @@ -151,7 +151,7 @@ static int _omap3_noncore_dpll_lock(struct clk_hw_omap *clk) state <<= __ffs(dd->idlest_mask); /* Check if already locked */ - if ((ti_clk_ll_ops->clk_readl(dd->idlest_reg) & dd->idlest_mask) == + if ((ti_clk_ll_ops->clk_readl(&dd->idlest_reg) & dd->idlest_mask) == state) goto done; @@ -317,14 +317,14 @@ static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel) * only since freqsel field is no longer present on other devices. 
*/ if (ti_clk_get_features()->flags & TI_CLK_DPLL_HAS_FREQSEL) { - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); v &= ~dd->freqsel_mask; v |= freqsel << __ffs(dd->freqsel_mask); - ti_clk_ll_ops->clk_writel(v, dd->control_reg); + ti_clk_ll_ops->clk_writel(v, &dd->control_reg); } /* Set DPLL multiplier, divider */ - v = ti_clk_ll_ops->clk_readl(dd->mult_div1_reg); + v = ti_clk_ll_ops->clk_readl(&dd->mult_div1_reg); /* Handle Duty Cycle Correction */ if (dd->dcc_mask) { @@ -370,11 +370,11 @@ static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel) } } - ti_clk_ll_ops->clk_writel(v, dd->mult_div1_reg); + ti_clk_ll_ops->clk_writel(v, &dd->mult_div1_reg); /* Set 4X multiplier and low-power mode */ if (dd->m4xen_mask || dd->lpmode_mask) { - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); if (dd->m4xen_mask) { if (dd->last_rounded_m4xen) @@ -390,7 +390,7 @@ static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel) v &= ~dd->lpmode_mask; } - ti_clk_ll_ops->clk_writel(v, dd->control_reg); + ti_clk_ll_ops->clk_writel(v, &dd->control_reg); } /* We let the clock framework set the other output dividers later */ @@ -652,10 +652,10 @@ static u32 omap3_dpll_autoidle_read(struct clk_hw_omap *clk) dd = clk->dpll_data; - if (!dd->autoidle_reg) + if (!dd->autoidle_mask) return -EINVAL; - v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg); + v = ti_clk_ll_ops->clk_readl(&dd->autoidle_reg); v &= dd->autoidle_mask; v >>= __ffs(dd->autoidle_mask); @@ -681,7 +681,7 @@ static void omap3_dpll_allow_idle(struct clk_hw_omap *clk) dd = clk->dpll_data; - if (!dd->autoidle_reg) + if (!dd->autoidle_mask) return; /* @@ -689,10 +689,10 @@ static void omap3_dpll_allow_idle(struct clk_hw_omap *clk) * by writing 0x5 instead of 0x1. Add some mechanism to * optionally enter this mode. */ - v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg); + v = ti_clk_ll_ops->clk_readl(&dd->autoidle_reg); v &= ~dd->autoidle_mask; v |= DPLL_AUTOIDLE_LOW_POWER_STOP << __ffs(dd->autoidle_mask); - ti_clk_ll_ops->clk_writel(v, dd->autoidle_reg); + ti_clk_ll_ops->clk_writel(v, &dd->autoidle_reg); } /** @@ -711,13 +711,13 @@ static void omap3_dpll_deny_idle(struct clk_hw_omap *clk) dd = clk->dpll_data; - if (!dd->autoidle_reg) + if (!dd->autoidle_mask) return; - v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg); + v = ti_clk_ll_ops->clk_readl(&dd->autoidle_reg); v &= ~dd->autoidle_mask; v |= DPLL_AUTOIDLE_DISABLE << __ffs(dd->autoidle_mask); - ti_clk_ll_ops->clk_writel(v, dd->autoidle_reg); + ti_clk_ll_ops->clk_writel(v, &dd->autoidle_reg); } /* Clock control for DPLL outputs */ @@ -773,7 +773,7 @@ unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw, WARN_ON(!dd->enable_mask); - v = ti_clk_ll_ops->clk_readl(dd->control_reg) & dd->enable_mask; + v = ti_clk_ll_ops->clk_readl(&dd->control_reg) & dd->enable_mask; v >>= __ffs(dd->enable_mask); if ((v != OMAP3XXX_EN_DPLL_LOCKED) || (dd->flags & DPLL_J_TYPE)) rate = parent_rate; diff --git a/drivers/clk/ti/dpll44xx.c b/drivers/clk/ti/dpll44xx.c index 82c05b55a7be..d7a3f7ec8d77 100644 --- a/drivers/clk/ti/dpll44xx.c +++ b/drivers/clk/ti/dpll44xx.c @@ -42,17 +42,17 @@ static void omap4_dpllmx_allow_gatectrl(struct clk_hw_omap *clk) u32 v; u32 mask; - if (!clk || !clk->clksel_reg) + if (!clk) return; mask = clk->flags & CLOCK_CLKOUTX2 ? 
OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK : OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK; - v = ti_clk_ll_ops->clk_readl(clk->clksel_reg); + v = ti_clk_ll_ops->clk_readl(&clk->clksel_reg); /* Clear the bit to allow gatectrl */ v &= ~mask; - ti_clk_ll_ops->clk_writel(v, clk->clksel_reg); + ti_clk_ll_ops->clk_writel(v, &clk->clksel_reg); } static void omap4_dpllmx_deny_gatectrl(struct clk_hw_omap *clk) @@ -60,17 +60,17 @@ static void omap4_dpllmx_deny_gatectrl(struct clk_hw_omap *clk) u32 v; u32 mask; - if (!clk || !clk->clksel_reg) + if (!clk) return; mask = clk->flags & CLOCK_CLKOUTX2 ? OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK : OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK; - v = ti_clk_ll_ops->clk_readl(clk->clksel_reg); + v = ti_clk_ll_ops->clk_readl(&clk->clksel_reg); /* Set the bit to deny gatectrl */ v |= mask; - ti_clk_ll_ops->clk_writel(v, clk->clksel_reg); + ti_clk_ll_ops->clk_writel(v, &clk->clksel_reg); } const struct clk_hw_omap_ops clkhwops_omap4_dpllmx = { @@ -128,7 +128,7 @@ unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw, rate = omap2_get_dpll_rate(clk); /* regm4xen adds a multiplier of 4 to DPLL calculations */ - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); if (v & OMAP4430_DPLL_REGM4XEN_MASK) rate *= OMAP4430_REGM4XEN_MULT; diff --git a/drivers/clk/ti/gate.c b/drivers/clk/ti/gate.c index 77f0920e12f1..7151ec3a1b07 100644 --- a/drivers/clk/ti/gate.c +++ b/drivers/clk/ti/gate.c @@ -76,15 +76,15 @@ static int omap36xx_gate_clk_enable_with_hsdiv_restore(struct clk_hw *hw) /* Restore the dividers */ if (!ret) { - orig_v = ti_clk_ll_ops->clk_readl(parent->reg); + orig_v = ti_clk_ll_ops->clk_readl(&parent->reg); dummy_v = orig_v; /* Write any other value different from the Read value */ dummy_v ^= (1 << parent->shift); - ti_clk_ll_ops->clk_writel(dummy_v, parent->reg); + ti_clk_ll_ops->clk_writel(dummy_v, &parent->reg); /* Write the original divider */ - ti_clk_ll_ops->clk_writel(orig_v, parent->reg); + ti_clk_ll_ops->clk_writel(orig_v, &parent->reg); } return ret; @@ -92,7 +92,7 @@ static int omap36xx_gate_clk_enable_with_hsdiv_restore(struct clk_hw *hw) static struct clk *_register_gate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, - void __iomem *reg, u8 bit_idx, + struct clk_omap_reg *reg, u8 bit_idx, u8 clk_gate_flags, const struct clk_ops *ops, const struct clk_hw_omap_ops *hw_ops) { @@ -109,7 +109,7 @@ static struct clk *_register_gate(struct device *dev, const char *name, init.name = name; init.ops = ops; - clk_hw->enable_reg = reg; + memcpy(&clk_hw->enable_reg, reg, sizeof(*reg)); clk_hw->enable_bit = bit_idx; clk_hw->ops = hw_ops; @@ -133,8 +133,7 @@ struct clk *ti_clk_register_gate(struct ti_clk *setup) { const struct clk_ops *ops = &omap_gate_clk_ops; const struct clk_hw_omap_ops *hw_ops = NULL; - u32 reg; - struct clk_omap_reg *reg_setup; + struct clk_omap_reg reg; u32 flags = 0; u8 clk_gate_flags = 0; struct ti_clk_gate *gate; @@ -144,8 +143,6 @@ struct clk *ti_clk_register_gate(struct ti_clk *setup) if (gate->flags & CLKF_INTERFACE) return ti_clk_register_interface(setup); - reg_setup = (struct clk_omap_reg *)® - if (gate->flags & CLKF_SET_RATE_PARENT) flags |= CLK_SET_RATE_PARENT; @@ -169,11 +166,12 @@ struct clk *ti_clk_register_gate(struct ti_clk *setup) if (gate->flags & CLKF_AM35XX) hw_ops = &clkhwops_am35xx_ipss_module_wait; - reg_setup->index = gate->module; - reg_setup->offset = gate->reg; + reg.index = gate->module; + reg.offset = gate->reg; + reg.ptr = NULL; return 
_register_gate(NULL, setup->name, gate->parent, flags, - (void __iomem *)reg, gate->bit_shift, + ®, gate->bit_shift, clk_gate_flags, ops, hw_ops); } @@ -214,15 +212,14 @@ static void __init _of_ti_gate_clk_setup(struct device_node *node, { struct clk *clk; const char *parent_name; - void __iomem *reg = NULL; + struct clk_omap_reg reg; u8 enable_bit = 0; u32 val; u32 flags = 0; u8 clk_gate_flags = 0; if (ops != &omap_gate_clkdm_clk_ops) { - reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(reg)) + if (ti_clk_get_reg_addr(node, 0, ®)) return; if (!of_property_read_u32(node, "ti,bit-shift", &val)) @@ -242,7 +239,7 @@ static void __init _of_ti_gate_clk_setup(struct device_node *node, if (of_property_read_bool(node, "ti,set-bit-to-disable")) clk_gate_flags |= INVERT_ENABLE; - clk = _register_gate(NULL, node->name, parent_name, flags, reg, + clk = _register_gate(NULL, node->name, parent_name, flags, ®, enable_bit, clk_gate_flags, ops, hw_ops); if (!IS_ERR(clk)) @@ -260,8 +257,7 @@ _of_ti_composite_gate_clk_setup(struct device_node *node, if (!gate) return; - gate->enable_reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(gate->enable_reg)) + if (ti_clk_get_reg_addr(node, 0, &gate->enable_reg)) goto cleanup; of_property_read_u32(node, "ti,bit-shift", &val); diff --git a/drivers/clk/ti/interface.c b/drivers/clk/ti/interface.c index 42d9fd4f5f6a..62cf50c1e1e3 100644 --- a/drivers/clk/ti/interface.c +++ b/drivers/clk/ti/interface.c @@ -34,7 +34,7 @@ static const struct clk_ops ti_interface_clk_ops = { static struct clk *_register_interface(struct device *dev, const char *name, const char *parent_name, - void __iomem *reg, u8 bit_idx, + struct clk_omap_reg *reg, u8 bit_idx, const struct clk_hw_omap_ops *ops) { struct clk_init_data init = { NULL }; @@ -47,7 +47,7 @@ static struct clk *_register_interface(struct device *dev, const char *name, clk_hw->hw.init = &init; clk_hw->ops = ops; - clk_hw->enable_reg = reg; + memcpy(&clk_hw->enable_reg, reg, sizeof(*reg)); clk_hw->enable_bit = bit_idx; init.name = name; @@ -71,14 +71,13 @@ static struct clk *_register_interface(struct device *dev, const char *name, struct clk *ti_clk_register_interface(struct ti_clk *setup) { const struct clk_hw_omap_ops *ops = &clkhwops_iclk_wait; - u32 reg; - struct clk_omap_reg *reg_setup; + struct clk_omap_reg reg; struct ti_clk_gate *gate; gate = setup->data; - reg_setup = (struct clk_omap_reg *)® - reg_setup->index = gate->module; - reg_setup->offset = gate->reg; + reg.index = gate->module; + reg.offset = gate->reg; + reg.ptr = NULL; if (gate->flags & CLKF_NO_WAIT) ops = &clkhwops_iclk; @@ -96,7 +95,7 @@ struct clk *ti_clk_register_interface(struct ti_clk *setup) ops = &clkhwops_am35xx_ipss_wait; return _register_interface(NULL, setup->name, gate->parent, - (void __iomem *)reg, gate->bit_shift, ops); + ®, gate->bit_shift, ops); } #endif @@ -105,12 +104,11 @@ static void __init _of_ti_interface_clk_setup(struct device_node *node, { struct clk *clk; const char *parent_name; - void __iomem *reg; + struct clk_omap_reg reg; u8 enable_bit = 0; u32 val; - reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(reg)) + if (ti_clk_get_reg_addr(node, 0, ®)) return; if (!of_property_read_u32(node, "ti,bit-shift", &val)) @@ -122,7 +120,7 @@ static void __init _of_ti_interface_clk_setup(struct device_node *node, return; } - clk = _register_interface(NULL, node->name, parent_name, reg, + clk = _register_interface(NULL, node->name, parent_name, ®, enable_bit, ops); if (!IS_ERR(clk)) diff --git a/drivers/clk/ti/mux.c b/drivers/clk/ti/mux.c index 
daa2dee6bafe..18c267b38461 100644 --- a/drivers/clk/ti/mux.c +++ b/drivers/clk/ti/mux.c @@ -39,7 +39,7 @@ static u8 ti_clk_mux_get_parent(struct clk_hw *hw) * OTOH, pmd_trace_clk_mux_ck uses a separate bit for each clock, so * val = 0x4 really means "bit 2, index starts at bit 0" */ - val = ti_clk_ll_ops->clk_readl(mux->reg) >> mux->shift; + val = ti_clk_ll_ops->clk_readl(&mux->reg) >> mux->shift; val &= mux->mask; if (mux->table) { @@ -81,11 +81,11 @@ static int ti_clk_mux_set_parent(struct clk_hw *hw, u8 index) if (mux->flags & CLK_MUX_HIWORD_MASK) { val = mux->mask << (mux->shift + 16); } else { - val = ti_clk_ll_ops->clk_readl(mux->reg); + val = ti_clk_ll_ops->clk_readl(&mux->reg); val &= ~(mux->mask << mux->shift); } val |= index << mux->shift; - ti_clk_ll_ops->clk_writel(val, mux->reg); + ti_clk_ll_ops->clk_writel(val, &mux->reg); return 0; } @@ -99,7 +99,7 @@ const struct clk_ops ti_clk_mux_ops = { static struct clk *_register_mux(struct device *dev, const char *name, const char * const *parent_names, u8 num_parents, unsigned long flags, - void __iomem *reg, u8 shift, u32 mask, + struct clk_omap_reg *reg, u8 shift, u32 mask, u8 clk_mux_flags, u32 *table) { struct clk_omap_mux *mux; @@ -120,7 +120,7 @@ static struct clk *_register_mux(struct device *dev, const char *name, init.num_parents = num_parents; /* struct clk_mux assignments */ - mux->reg = reg; + memcpy(&mux->reg, reg, sizeof(*reg)); mux->shift = shift; mux->mask = mask; mux->flags = clk_mux_flags; @@ -140,12 +140,9 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) struct ti_clk_mux *mux; u32 flags; u8 mux_flags = 0; - struct clk_omap_reg *reg_setup; - u32 reg; + struct clk_omap_reg reg; u32 mask; - reg_setup = (struct clk_omap_reg *)® - mux = setup->data; flags = CLK_SET_RATE_NO_REPARENT; @@ -154,8 +151,9 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) mask--; mask = (1 << fls(mask)) - 1; - reg_setup->index = mux->module; - reg_setup->offset = mux->reg; + reg.index = mux->module; + reg.offset = mux->reg; + reg.ptr = NULL; if (mux->flags & CLKF_INDEX_STARTS_AT_ONE) mux_flags |= CLK_MUX_INDEX_ONE; @@ -164,7 +162,7 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) flags |= CLK_SET_RATE_PARENT; return _register_mux(NULL, setup->name, mux->parents, mux->num_parents, - flags, (void __iomem *)reg, mux->bit_shift, mask, + flags, ®, mux->bit_shift, mask, mux_flags, NULL); } @@ -177,7 +175,7 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) static void of_mux_clk_setup(struct device_node *node) { struct clk *clk; - void __iomem *reg; + struct clk_omap_reg reg; unsigned int num_parents; const char **parent_names; u8 clk_mux_flags = 0; @@ -196,9 +194,7 @@ static void of_mux_clk_setup(struct device_node *node) of_clk_parent_fill(node, parent_names, num_parents); - reg = ti_clk_get_reg_addr(node, 0); - - if (IS_ERR(reg)) + if (ti_clk_get_reg_addr(node, 0, ®)) goto cleanup; of_property_read_u32(node, "ti,bit-shift", &shift); @@ -217,7 +213,7 @@ static void of_mux_clk_setup(struct device_node *node) mask = (1 << fls(mask)) - 1; clk = _register_mux(NULL, node->name, parent_names, num_parents, - flags, reg, shift, mask, clk_mux_flags, NULL); + flags, ®, shift, mask, clk_mux_flags, NULL); if (!IS_ERR(clk)) of_clk_add_provider(node, of_clk_src_simple_get, clk); @@ -230,7 +226,6 @@ CLK_OF_DECLARE(mux_clk, "ti,mux-clock", of_mux_clk_setup); struct clk_hw *ti_clk_build_component_mux(struct ti_clk_mux *setup) { struct clk_omap_mux *mux; - struct clk_omap_reg *reg; int num_parents; if (!setup) @@ -240,12 +235,10 @@ 
struct clk_hw *ti_clk_build_component_mux(struct ti_clk_mux *setup) if (!mux) return ERR_PTR(-ENOMEM); - reg = (struct clk_omap_reg *)&mux->reg; - mux->shift = setup->bit_shift; - reg->index = setup->module; - reg->offset = setup->reg; + mux->reg.index = setup->module; + mux->reg.offset = setup->reg; if (setup->flags & CLKF_INDEX_STARTS_AT_ONE) mux->flags |= CLK_MUX_INDEX_ONE; @@ -268,9 +261,7 @@ static void __init of_ti_composite_mux_clk_setup(struct device_node *node) if (!mux) return; - mux->reg = ti_clk_get_reg_addr(node, 0); - - if (IS_ERR(mux->reg)) + if (ti_clk_get_reg_addr(node, 0, &mux->reg)) goto cleanup; if (!of_property_read_u32(node, "ti,bit-shift", &val)) diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index affdabd0b6a1..d18da839b810 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -18,6 +18,18 @@ #include #include +/** + * struct clk_omap_reg - OMAP register declaration + * @offset: offset from the master IP module base address + * @index: index of the master IP module + */ +struct clk_omap_reg { + void __iomem *ptr; + u16 offset; + u8 index; + u8 flags; +}; + /** * struct dpll_data - DPLL registers and integration data * @mult_div1_reg: register containing the DPLL M and N bitfields @@ -67,12 +79,12 @@ * can be placed into read-only space. */ struct dpll_data { - void __iomem *mult_div1_reg; + struct clk_omap_reg mult_div1_reg; u32 mult_mask; u32 div1_mask; struct clk_hw *clk_bypass; struct clk_hw *clk_ref; - void __iomem *control_reg; + struct clk_omap_reg control_reg; u32 enable_mask; unsigned long last_rounded_rate; u16 last_rounded_m; @@ -84,8 +96,8 @@ struct dpll_data { u16 max_divider; unsigned long max_rate; u8 modes; - void __iomem *autoidle_reg; - void __iomem *idlest_reg; + struct clk_omap_reg autoidle_reg; + struct clk_omap_reg idlest_reg; u32 autoidle_mask; u32 freqsel_mask; u32 idlest_mask; @@ -113,10 +125,10 @@ struct clk_hw_omap; */ struct clk_hw_omap_ops { void (*find_idlest)(struct clk_hw_omap *oclk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val); void (*find_companion)(struct clk_hw_omap *oclk, - void __iomem **other_reg, + struct clk_omap_reg *other_reg, u8 *other_bit); void (*allow_idle)(struct clk_hw_omap *oclk); void (*deny_idle)(struct clk_hw_omap *oclk); @@ -139,10 +151,10 @@ struct clk_hw_omap { struct list_head node; unsigned long fixed_rate; u8 fixed_div; - void __iomem *enable_reg; + struct clk_omap_reg enable_reg; u8 enable_bit; u8 flags; - void __iomem *clksel_reg; + struct clk_omap_reg clksel_reg; struct dpll_data *dpll_data; const char *clkdm_name; struct clockdomain *clkdm; @@ -195,16 +207,6 @@ enum { CLK_MAX_MEMMAPS }; -/** - * struct clk_omap_reg - OMAP register declaration - * @offset: offset from the master IP module base address - * @index: index of the master IP module - */ -struct clk_omap_reg { - u16 offset; - u16 index; -}; - /** * struct ti_clk_ll_ops - low-level ops for clocks * @clk_readl: pointer to register read function @@ -222,16 +224,16 @@ struct clk_omap_reg { * operations not provided directly by clock drivers. 
*/ struct ti_clk_ll_ops { - u32 (*clk_readl)(void __iomem *reg); - void (*clk_writel)(u32 val, void __iomem *reg); + u32 (*clk_readl)(const struct clk_omap_reg *reg); + void (*clk_writel)(u32 val, const struct clk_omap_reg *reg); int (*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk); int (*clkdm_clk_disable)(struct clockdomain *clkdm, struct clk *clk); struct clockdomain * (*clkdm_lookup)(const char *name); int (*cm_wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); - int (*cm_split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, - u8 *idlest_reg_id); + int (*cm_split_idlest_reg)(struct clk_omap_reg *idlest_reg, + s16 *prcm_inst, u8 *idlest_reg_id); }; #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw) -- cgit v1.2.3-70-g09d2 From 7f501f0a72036dc29ad9a53811474c393634b401 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 24 May 2016 19:20:05 +0200 Subject: mtd: nand: Store nand ID in struct nand_chip Store the NAND ID in struct nand_chip to avoid passing id_data and id_len as function parameters. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger Reviewed-by: Marek Vasut --- drivers/mtd/nand/nand_base.c | 55 ++++++++++++++++++++++++-------------------- include/linux/mtd/nand.h | 13 +++++++++++ 2 files changed, 43 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index f7969e0d59fc..6f3ae626cabc 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3821,18 +3821,16 @@ static int nand_get_bits_per_cell(u8 cellinfo) * chip. The rest of the parameters must be decoded according to generic or * manufacturer-specific "extended ID" decoding patterns. */ -static void nand_decode_ext_id(struct nand_chip *chip, u8 id_data[8], - int *busw) +static void nand_decode_ext_id(struct nand_chip *chip, int *busw) { struct mtd_info *mtd = nand_to_mtd(chip); - int extid, id_len; + int extid, id_len = chip->id.len; + u8 *id_data = chip->id.data; /* The 3rd id byte holds MLC / multichip data */ chip->bits_per_cell = nand_get_bits_per_cell(id_data[2]); /* The 4th id byte is the important one */ extid = id_data[3]; - id_len = nand_id_len(id_data, 8); - /* * Field definitions are in the following datasheets: * Old style (4,5 byte ID): Samsung K9GAG08U0M (p.32) @@ -3956,9 +3954,10 @@ static void nand_decode_ext_id(struct nand_chip *chip, u8 id_data[8], * the chip. */ static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type, - u8 id_data[8], int *busw) + int *busw) { struct mtd_info *mtd = nand_to_mtd(chip); + u8 *id_data = chip->id.data; int maf_id = id_data[0]; mtd->erasesize = type->erasesize; @@ -3988,9 +3987,10 @@ static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type, * heuristic patterns using various detected parameters (e.g., manufacturer, * page size, cell-type information). 
*/ -static void nand_decode_bbm_options(struct nand_chip *chip, u8 id_data[8]) +static void nand_decode_bbm_options(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); + u8 *id_data = chip->id.data; int maf_id = id_data[0]; /* Set the bad block position */ @@ -4026,10 +4026,10 @@ static inline bool is_full_id_nand(struct nand_flash_dev *type) } static bool find_full_id_nand(struct nand_chip *chip, - struct nand_flash_dev *type, u8 *id_data, - int *busw) + struct nand_flash_dev *type, int *busw) { struct mtd_info *mtd = nand_to_mtd(chip); + u8 *id_data = chip->id.data; if (!strncmp(type->id, id_data, type->id_len)) { mtd->writesize = type->pagesize; @@ -4058,13 +4058,13 @@ static bool find_full_id_nand(struct nand_chip *chip, * Get the flash and manufacturer id and lookup if the type is supported. */ static int nand_get_flash_type(struct nand_chip *chip, - int *maf_id, int *dev_id, struct nand_flash_dev *type) { struct mtd_info *mtd = nand_to_mtd(chip); int busw; int i, maf_idx; - u8 id_data[8]; + u8 *id_data = chip->id.data; + u8 maf_id, dev_id; /* * Reset the chip, required by some chips (e.g. Micron MT29FxGxxxxx) @@ -4079,8 +4079,8 @@ static int nand_get_flash_type(struct nand_chip *chip, chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1); /* Read manufacturer and device IDs */ - *maf_id = chip->read_byte(mtd); - *dev_id = chip->read_byte(mtd); + maf_id = chip->read_byte(mtd); + dev_id = chip->read_byte(mtd); /* * Try again to make sure, as some systems the bus-hold or other @@ -4095,20 +4095,22 @@ static int nand_get_flash_type(struct nand_chip *chip, for (i = 0; i < 8; i++) id_data[i] = chip->read_byte(mtd); - if (id_data[0] != *maf_id || id_data[1] != *dev_id) { + if (id_data[0] != maf_id || id_data[1] != dev_id) { pr_info("second ID read did not match %02x,%02x against %02x,%02x\n", - *maf_id, *dev_id, id_data[0], id_data[1]); + maf_id, dev_id, id_data[0], id_data[1]); return -ENODEV; } + chip->id.len = nand_id_len(id_data, 8); + if (!type) type = nand_flash_ids; for (; type->name != NULL; type++) { if (is_full_id_nand(type)) { - if (find_full_id_nand(chip, type, id_data, &busw)) + if (find_full_id_nand(chip, type, &busw)) goto ident_done; - } else if (*dev_id == type->dev_id) { + } else if (dev_id == type->dev_id) { break; } } @@ -4134,9 +4136,9 @@ static int nand_get_flash_type(struct nand_chip *chip, if (!type->pagesize) { /* Decode parameters from extended ID */ - nand_decode_ext_id(chip, id_data, &busw); + nand_decode_ext_id(chip, &busw); } else { - nand_decode_id(chip, type, id_data, &busw); + nand_decode_id(chip, type, &busw); } /* Get chip options */ chip->options |= type->options; @@ -4145,13 +4147,13 @@ static int nand_get_flash_type(struct nand_chip *chip, * Check if chip is not a Samsung device. Do not clear the * options for chips which do not have an extended id. */ - if (*maf_id != NAND_MFR_SAMSUNG && !type->pagesize) + if (maf_id != NAND_MFR_SAMSUNG && !type->pagesize) chip->options &= ~NAND_SAMSUNG_LP_OPTIONS; ident_done: /* Try to identify manufacturer */ for (maf_idx = 0; nand_manuf_ids[maf_idx].id != 0x0; maf_idx++) { - if (nand_manuf_ids[maf_idx].id == *maf_id) + if (nand_manuf_ids[maf_idx].id == maf_id) break; } @@ -4165,7 +4167,7 @@ ident_done: * chip correct! */ pr_info("device found, Manufacturer ID: 0x%02x, Chip ID: 0x%02x\n", - *maf_id, *dev_id); + maf_id, dev_id); pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, mtd->name); pr_warn("bus width %d instead %d bit\n", (chip->options & NAND_BUSWIDTH_16) ? 
16 : 8, @@ -4173,7 +4175,7 @@ ident_done: return -EINVAL; } - nand_decode_bbm_options(chip, id_data); + nand_decode_bbm_options(chip); /* Calculate the address shift from the page size */ chip->page_shift = ffs(mtd->writesize) - 1; @@ -4197,7 +4199,7 @@ ident_done: chip->cmdfunc = nand_command_lp; pr_info("device found, Manufacturer ID: 0x%02x, Chip ID: 0x%02x\n", - *maf_id, *dev_id); + maf_id, dev_id); if (chip->onfi_version) pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, @@ -4400,7 +4402,7 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips, nand_set_defaults(chip, chip->options & NAND_BUSWIDTH_16); /* Read the flash type */ - ret = nand_get_flash_type(chip, &nand_maf_id, &nand_dev_id, table); + ret = nand_get_flash_type(chip, table); if (ret) { if (!(chip->options & NAND_SCAN_SILENT_NODEV)) pr_warn("No NAND device found\n"); @@ -4425,6 +4427,9 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips, if (ret) return ret; + nand_maf_id = chip->id.data[0]; + nand_dev_id = chip->id.data[1]; + chip->select_chip(mtd, -1); /* Check for a chip array */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9591e0fbe5bd..e2c11351b1bd 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -464,6 +464,17 @@ struct nand_jedec_params { __le16 crc; } __packed; +/** + * struct nand_id - NAND id structure + * @data: buffer containing the id bytes. Currently 8 bytes large, but can + * be extended if required. + * @len: ID length. + */ +struct nand_id { + u8 data[8]; + int len; +}; + /** * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independent devices * @lock: protection lock @@ -793,6 +804,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * @pagebuf_bitflips: [INTERN] holds the bitflip count for the page which is * currently in data_buf. * @subpagesize: [INTERN] holds the subpagesize + * @id: [INTERN] holds NAND ID * @onfi_version: [INTERN] holds the chip ONFI version (BCD encoded), * non 0 if ONFI supported. * @jedec_version: [INTERN] holds the chip JEDEC version (BCD encoded), @@ -881,6 +893,7 @@ struct nand_chip { int badblockpos; int badblockbits; + struct nand_id id; int onfi_version; int jedec_version; union { -- cgit v1.2.3-70-g09d2 From 8cfb9ab68f90703d419870fce7ac21ac401399f2 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sat, 7 Jan 2017 15:15:57 +0100 Subject: mtd: nand: Rename the nand_manufacturers struct Drop the 's' at the end of nand_manufacturers since the struct is actually describing a single manufacturer, not a manufacturer table. 
Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_ids.c | 2 +- include/linux/mtd/nand.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 4a2f75b0c200..3f80cfcb5e37 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -169,7 +169,7 @@ struct nand_flash_dev nand_flash_ids[] = { }; /* Manufacturer IDs */ -struct nand_manufacturers nand_manuf_ids[] = { +struct nand_manufacturer nand_manuf_ids[] = { {NAND_MFR_TOSHIBA, "Toshiba"}, {NAND_MFR_ESMT, "ESMT"}, {NAND_MFR_SAMSUNG, "Samsung"}, diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index e2c11351b1bd..9c679e8bde42 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1062,17 +1062,17 @@ struct nand_flash_dev { }; /** - * struct nand_manufacturers - NAND Flash Manufacturer ID Structure + * struct nand_manufacturer - NAND Flash Manufacturer structure * @name: Manufacturer name * @id: manufacturer ID code of device. */ -struct nand_manufacturers { +struct nand_manufacturer { int id; char *name; }; extern struct nand_flash_dev nand_flash_ids[]; -extern struct nand_manufacturers nand_manuf_ids[]; +extern struct nand_manufacturer nand_manuf_ids[]; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3-70-g09d2 From bcc678c2d7a0e0af14cb3d858ebd367be378c172 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sat, 7 Jan 2017 15:48:25 +0100 Subject: mtd: nand: Do not expose the NAND manufacturer table directly There is no reason to expose the NAND manufacturer table. Provide an helper function to find manufacturers by their id. We also turn the nand_manufacturers table into a const array, since its members are not modified after the initial assignment. Finally, we remove the sentinel manufacturer entry from the manufacturers table (we already have the array size information given by ARRAY_SIZE()), and add the nand_manufacturer_name() helper to handle the "Unknown" case properly. 
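As an illustration of the intended usage (a sketch only; the surrounding function and the chip pointer are assumed here, not taken from the patch), a caller in the core can now resolve and print the vendor name without touching any exported table:

	const struct nand_manufacturer *manufacturer;

	/* chip->id.data[0] holds the manufacturer ID byte read during detection */
	manufacturer = nand_get_manufacturer(chip->id.data[0]);

	/* nand_manufacturer_name() maps a NULL lookup result to "Unknown" */
	pr_info("NAND vendor: %s\n", nand_manufacturer_name(manufacturer));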
Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 23 +++++++++++------------ drivers/mtd/nand/nand_ids.c | 22 ++++++++++++++++++++-- include/linux/mtd/nand.h | 9 ++++++++- 3 files changed, 39 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index b1eb99e84044..0120252e0710 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -4052,9 +4052,10 @@ static bool find_full_id_nand(struct nand_chip *chip, */ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) { + const struct nand_manufacturer *manufacturer; struct mtd_info *mtd = nand_to_mtd(chip); int busw; - int i, maf_idx; + int i; u8 *id_data = chip->id.data; u8 maf_id, dev_id; @@ -4159,10 +4160,7 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) ident_done: /* Try to identify manufacturer */ - for (maf_idx = 0; nand_manuf_ids[maf_idx].id != 0x0; maf_idx++) { - if (nand_manuf_ids[maf_idx].id == maf_id) - break; - } + manufacturer = nand_get_manufacturer(maf_id); if (chip->options & NAND_BUSWIDTH_AUTO) { WARN_ON(busw & NAND_BUSWIDTH_16); @@ -4174,7 +4172,8 @@ ident_done: */ pr_info("device found, Manufacturer ID: 0x%02x, Chip ID: 0x%02x\n", maf_id, dev_id); - pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, mtd->name); + pr_info("%s %s\n", nand_manufacturer_name(manufacturer), + mtd->name); pr_warn("bus width %d instead of %d bits\n", busw ? 16 : 8, (chip->options & NAND_BUSWIDTH_16) ? 16 : 8); return -EINVAL; @@ -4207,14 +4206,14 @@ ident_done: maf_id, dev_id); if (chip->onfi_version) - pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, - chip->onfi_params.model); + pr_info("%s %s\n", nand_manufacturer_name(manufacturer), + chip->onfi_params.model); else if (chip->jedec_version) - pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, - chip->jedec_params.model); + pr_info("%s %s\n", nand_manufacturer_name(manufacturer), + chip->jedec_params.model); else - pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, - type->name); + pr_info("%s %s\n", nand_manufacturer_name(manufacturer), + type->name); pr_info("%d MiB, %s, erase size: %d KiB, page size: %d, OOB size: %d\n", (int)(chip->chipsize >> 20), nand_is_slc(chip) ? "SLC" : "MLC", diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index bd267ade0742..06f59a6b74ff 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -169,7 +169,7 @@ struct nand_flash_dev nand_flash_ids[] = { }; /* Manufacturer IDs */ -struct nand_manufacturer nand_manuf_ids[] = { +static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_TOSHIBA, "Toshiba"}, {NAND_MFR_ESMT, "ESMT"}, {NAND_MFR_SAMSUNG, "Samsung"}, @@ -186,5 +186,23 @@ struct nand_manufacturer nand_manuf_ids[] = { {NAND_MFR_INTEL, "Intel"}, {NAND_MFR_ATO, "ATO"}, {NAND_MFR_WINBOND, "Winbond"}, - {0x0, "Unknown"} }; + +/** + * nand_get_manufacturer - Get manufacturer information from the manufacturer + * ID + * @id: manufacturer ID + * + * Returns a pointer a nand_manufacturer object if the manufacturer is defined + * in the NAND manufacturers database, NULL otherwise. 
+ */ +const struct nand_manufacturer *nand_get_manufacturer(u8 id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nand_manufacturers); i++) + if (nand_manufacturers[i].id == id) + return &nand_manufacturers[i]; + + return NULL; +} diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9c679e8bde42..6415aa16043c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1071,8 +1071,15 @@ struct nand_manufacturer { char *name; }; +const struct nand_manufacturer *nand_get_manufacturer(u8 id); + +static inline const char * +nand_manufacturer_name(const struct nand_manufacturer *manufacturer) +{ + return manufacturer ? manufacturer->name : "Unknown"; +} + extern struct nand_flash_dev nand_flash_ids[]; -extern struct nand_manufacturer nand_manuf_ids[]; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3-70-g09d2 From abbe26d144ec22bb067fa414d717b9f7ca2e12bd Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 09:32:55 +0200 Subject: mtd: nand: Add manufacturer specific initialization/detection steps A lot of NANDs are implementing generic features in a non-generic way, or are providing advanced auto-detection logic where the NAND ID bytes meaning changes with the NAND generation. Providing this vendor specific initialization step will allow us to get rid of full-id entries in the nand_ids table or all the vendor specific cases added over the time in the generic NAND ID decoding logic. Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 75 ++++++++++++++++++++++++++++++++++++++------ include/linux/mtd/nand.h | 35 +++++++++++++++++++++ 2 files changed, 100 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0120252e0710..92ff6adbd7c9 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3819,7 +3819,7 @@ static int nand_get_bits_per_cell(u8 cellinfo) * chip. The rest of the parameters must be decoded according to generic or * manufacturer-specific "extended ID" decoding patterns. */ -static void nand_decode_ext_id(struct nand_chip *chip) +void nand_decode_ext_id(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); int extid, id_len = chip->id.len; @@ -3944,6 +3944,7 @@ static void nand_decode_ext_id(struct nand_chip *chip) } } +EXPORT_SYMBOL_GPL(nand_decode_ext_id); /* * Old devices have chip data hardcoded in the device ID table. nand_decode_id @@ -4047,6 +4048,53 @@ static bool find_full_id_nand(struct nand_chip *chip, return false; } +/* + * Manufacturer detection. Only used when the NAND is not ONFI or JEDEC + * compliant and does not have a full-id or legacy-id entry in the nand_ids + * table. + */ +static void nand_manufacturer_detect(struct nand_chip *chip) +{ + /* + * Try manufacturer detection if available and use + * nand_decode_ext_id() otherwise. + */ + if (chip->manufacturer.desc && chip->manufacturer.desc->ops && + chip->manufacturer.desc->ops->detect) + chip->manufacturer.desc->ops->detect(chip); + else + nand_decode_ext_id(chip); +} + +/* + * Manufacturer initialization. This function is called for all NANDs including + * ONFI and JEDEC compliant ones. + * Manufacturer drivers should put all their specific initialization code in + * their ->init() hook. 
+ */ +static int nand_manufacturer_init(struct nand_chip *chip) +{ + if (!chip->manufacturer.desc || !chip->manufacturer.desc->ops || + !chip->manufacturer.desc->ops->init) + return 0; + + return chip->manufacturer.desc->ops->init(chip); +} + +/* + * Manufacturer cleanup. This function is called for all NANDs including + * ONFI and JEDEC compliant ones. + * Manufacturer drivers should put all their specific cleanup code in their + * ->cleanup() hook. + */ +static void nand_manufacturer_cleanup(struct nand_chip *chip) +{ + /* Release manufacturer private data */ + if (chip->manufacturer.desc && chip->manufacturer.desc->ops && + chip->manufacturer.desc->ops->cleanup) + chip->manufacturer.desc->ops->cleanup(chip); +} + /* * Get the flash and manufacturer id and lookup if the type is supported. */ @@ -4055,7 +4103,7 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) const struct nand_manufacturer *manufacturer; struct mtd_info *mtd = nand_to_mtd(chip); int busw; - int i; + int i, ret; u8 *id_data = chip->id.data; u8 maf_id, dev_id; @@ -4096,6 +4144,10 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) chip->id.len = nand_id_len(id_data, 8); + /* Try to identify manufacturer */ + manufacturer = nand_get_manufacturer(maf_id); + chip->manufacturer.desc = manufacturer; + if (!type) type = nand_flash_ids; @@ -4142,12 +4194,11 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) chip->chipsize = (uint64_t)type->chipsize << 20; - if (!type->pagesize) { - /* Decode parameters from extended ID */ - nand_decode_ext_id(chip); - } else { + if (!type->pagesize) + nand_manufacturer_detect(chip); + else nand_decode_id(chip, type); - } + /* Get chip options */ chip->options |= type->options; @@ -4159,9 +4210,6 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) chip->options &= ~NAND_SAMSUNG_LP_OPTIONS; ident_done: - /* Try to identify manufacturer */ - manufacturer = nand_get_manufacturer(maf_id); - if (chip->options & NAND_BUSWIDTH_AUTO) { WARN_ON(busw & NAND_BUSWIDTH_16); nand_set_defaults(chip); @@ -4202,6 +4250,10 @@ ident_done: if (mtd->writesize > 512 && chip->cmdfunc == nand_command) chip->cmdfunc = nand_command_lp; + ret = nand_manufacturer_init(chip); + if (ret) + return ret; + pr_info("device found, Manufacturer ID: 0x%02x, Chip ID: 0x%02x\n", maf_id, dev_id); @@ -4947,6 +4999,9 @@ void nand_cleanup(struct nand_chip *chip) if (chip->badblock_pattern && chip->badblock_pattern->options & NAND_BBT_DYNAMICSTRUCT) kfree(chip->badblock_pattern); + + /* Free manufacturer priv data. */ + nand_manufacturer_cleanup(chip); } EXPORT_SYMBOL_GPL(nand_cleanup); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 6415aa16043c..ee9a19f42293 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -731,6 +731,20 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) return &conf->timings.sdr; } +/** + * struct nand_manufacturer_ops - NAND Manufacturer operations + * @detect: detect the NAND memory organization and capabilities + * @init: initialize all vendor specific fields (like the ->read_retry() + * implementation) if any. + * @cleanup: the ->init() function may have allocated resources, ->cleanup() + * is here to let vendor specific code release those resources. 
+ */ +struct nand_manufacturer_ops { + void (*detect)(struct nand_chip *chip); + int (*init)(struct nand_chip *chip); + void (*cleanup)(struct nand_chip *chip); +}; + /** * struct nand_chip - NAND Private Flash Chip Data * @mtd: MTD device registered to the MTD framework @@ -835,6 +849,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * additional error status checks (determine if errors are * correctable). * @write_page: [REPLACEABLE] High-level page write function + * @manufacturer: [INTERN] Contains manufacturer information */ struct nand_chip { @@ -923,6 +938,11 @@ struct nand_chip { struct nand_bbt_descr *badblock_pattern; void *priv; + + struct { + const struct nand_manufacturer *desc; + void *priv; + } manufacturer; }; extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops; @@ -959,6 +979,17 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv) chip->priv = priv; } +static inline void nand_set_manufacturer_data(struct nand_chip *chip, + void *priv) +{ + chip->manufacturer.priv = priv; +} + +static inline void *nand_get_manufacturer_data(struct nand_chip *chip) +{ + return chip->manufacturer.priv; +} + /* * NAND Flash Manufacturer ID Codes */ @@ -1065,10 +1096,12 @@ struct nand_flash_dev { * struct nand_manufacturer - NAND Flash Manufacturer structure * @name: Manufacturer name * @id: manufacturer ID code of device. + * @ops: manufacturer operations */ struct nand_manufacturer { int id; char *name; + const struct nand_manufacturer_ops *ops; }; const struct nand_manufacturer *nand_get_manufacturer(u8 id); @@ -1246,4 +1279,6 @@ int nand_reset(struct nand_chip *chip, int chipnr); /* Free resources held by the NAND device */ void nand_cleanup(struct nand_chip *chip); +/* Default extended ID decoding function */ +void nand_decode_ext_id(struct nand_chip *chip); #endif /* __LINUX_MTD_NAND_H */ -- cgit v1.2.3-70-g09d2 From c51d0ac59f24200dfdccc897ff7c3c9446c7599a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:22:19 +0200 Subject: mtd: nand: Move Samsung specific init/detection logic in nand_samsung.c Move Samsung specific initialization and detection logic into nand_samsung.c. This is part of the "separate vendor specific code from core" cleanup process. 
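The same split is repeated for each vendor in the patches that follow. As a hedged sketch (the "foo" vendor, file name and hook bodies below are hypothetical, shown only to outline the structure), a new vendor file boils down to:

	/* drivers/mtd/nand/nand_foo.c - hypothetical skeleton */
	#include <linux/mtd/nand.h>

	static void foo_nand_decode_id(struct nand_chip *chip)
	{
		/* vendor specific ID parsing; fall back to the generic decoder */
		nand_decode_ext_id(chip);
	}

	static int foo_nand_init(struct nand_chip *chip)
	{
		/* vendor specific options/bbt setup would go here */
		return 0;
	}

	const struct nand_manufacturer_ops foo_nand_manuf_ops = {
		.detect = foo_nand_decode_id,
		.init = foo_nand_init,
	};

plus a Makefile line (nand-objs += nand_foo.o) and an ops pointer added to the matching nand_manufacturers[] entry.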
Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 52 ++--------------------- drivers/mtd/nand/nand_ids.c | 4 +- drivers/mtd/nand/nand_samsung.c | 92 +++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 2 + 5 files changed, 101 insertions(+), 50 deletions(-) create mode 100644 drivers/mtd/nand/nand_samsung.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index bfd5d12b9ade..d4b90b0f879e 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -61,3 +61,4 @@ obj-$(CONFIG_MTD_NAND_QCOM) += qcom_nandc.o obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o +nand-objs += nand_samsung.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 92ff6adbd7c9..fd38d59d33a6 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3832,48 +3832,13 @@ void nand_decode_ext_id(struct nand_chip *chip) /* * Field definitions are in the following datasheets: * Old style (4,5 byte ID): Samsung K9GAG08U0M (p.32) - * New Samsung (6 byte ID): Samsung K9GAG08U0F (p.44) * Hynix MLC (6 byte ID): Hynix H27UBG8T2B (p.22) * * Check for ID length, non-zero 6th byte, cell type, and Hynix/Samsung * ID to decide what to do. */ - if (id_len == 6 && id_data[0] == NAND_MFR_SAMSUNG && - !nand_is_slc(chip) && id_data[5] != 0x00) { - /* Calc pagesize */ - mtd->writesize = 2048 << (extid & 0x03); - extid >>= 2; - /* Calc oobsize */ - switch (((extid >> 2) & 0x04) | (extid & 0x03)) { - case 1: - mtd->oobsize = 128; - break; - case 2: - mtd->oobsize = 218; - break; - case 3: - mtd->oobsize = 400; - break; - case 4: - mtd->oobsize = 436; - break; - case 5: - mtd->oobsize = 512; - break; - case 6: - mtd->oobsize = 640; - break; - case 7: - default: /* Other cases are "reserved" (unknown) */ - mtd->oobsize = 1024; - break; - } - extid >>= 2; - /* Calc blocksize */ - mtd->erasesize = (128 * 1024) << - (((extid >> 1) & 0x04) | (extid & 0x03)); - } else if (id_len == 6 && id_data[0] == NAND_MFR_HYNIX && - !nand_is_slc(chip)) { + if (id_len == 6 && id_data[0] == NAND_MFR_HYNIX && + !nand_is_slc(chip)) { unsigned int tmp; /* Calc pagesize */ @@ -4001,13 +3966,10 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, * AMD/Spansion, and Macronix. All others scan only the first page. */ - if (!nand_is_slc(chip) && - (maf_id == NAND_MFR_SAMSUNG || - maf_id == NAND_MFR_HYNIX)) + if (!nand_is_slc(chip) && maf_id == NAND_MFR_HYNIX) chip->bbt_options |= NAND_BBT_SCANLASTPAGE; else if ((nand_is_slc(chip) && - (maf_id == NAND_MFR_SAMSUNG || - maf_id == NAND_MFR_HYNIX || + (maf_id == NAND_MFR_HYNIX || maf_id == NAND_MFR_TOSHIBA || maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) || @@ -4202,12 +4164,6 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) /* Get chip options */ chip->options |= type->options; - /* - * Check if chip is not a Samsung device. Do not clear the - * options for chips which do not have an extended id. 
- */ - if (maf_id != NAND_MFR_SAMSUNG && !type->pagesize) - chip->options &= ~NAND_SAMSUNG_LP_OPTIONS; ident_done: if (chip->options & NAND_BUSWIDTH_AUTO) { diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 06f59a6b74ff..8eba7dfe7c1f 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -10,7 +10,7 @@ #include #include -#define LP_OPTIONS NAND_SAMSUNG_LP_OPTIONS +#define LP_OPTIONS 0 #define LP_OPTIONS16 (LP_OPTIONS | NAND_BUSWIDTH_16) #define SP_OPTIONS NAND_NEED_READRDY @@ -172,7 +172,7 @@ struct nand_flash_dev nand_flash_ids[] = { static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_TOSHIBA, "Toshiba"}, {NAND_MFR_ESMT, "ESMT"}, - {NAND_MFR_SAMSUNG, "Samsung"}, + {NAND_MFR_SAMSUNG, "Samsung", &samsung_nand_manuf_ops}, {NAND_MFR_FUJITSU, "Fujitsu"}, {NAND_MFR_NATIONAL, "National"}, {NAND_MFR_RENESAS, "Renesas"}, diff --git a/drivers/mtd/nand/nand_samsung.c b/drivers/mtd/nand/nand_samsung.c new file mode 100644 index 000000000000..5c259dd5b652 --- /dev/null +++ b/drivers/mtd/nand/nand_samsung.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static void samsung_nand_decode_id(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + /* New Samsung (6 byte ID): Samsung K9GAG08U0F (p.44) */ + if (chip->id.len == 6 && !nand_is_slc(chip) && + chip->id.data[5] != 0x00) { + u8 extid = chip->id.data[3]; + + /* Get pagesize */ + mtd->writesize = 2048 << (extid & 0x03); + + extid >>= 2; + + /* Get oobsize */ + switch (((extid >> 2) & 0x4) | (extid & 0x3)) { + case 1: + mtd->oobsize = 128; + break; + case 2: + mtd->oobsize = 218; + break; + case 3: + mtd->oobsize = 400; + break; + case 4: + mtd->oobsize = 436; + break; + case 5: + mtd->oobsize = 512; + break; + case 6: + mtd->oobsize = 640; + break; + default: + /* + * We should never reach this case, but if that + * happens, this probably means Samsung decided to use + * a different extended ID format, and we should find + * a way to support it. 
+ */ + WARN(1, "Invalid OOB size value"); + break; + } + + /* Get blocksize */ + extid >>= 2; + mtd->erasesize = (128 * 1024) << + (((extid >> 1) & 0x04) | (extid & 0x03)); + } else { + nand_decode_ext_id(chip); + } +} + +static int samsung_nand_init(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + if (mtd->writesize > 512) + chip->options |= NAND_SAMSUNG_LP_OPTIONS; + + if (!nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCANLASTPAGE; + else + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops samsung_nand_manuf_ops = { + .detect = samsung_nand_decode_id, + .init = samsung_nand_init, +}; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index ee9a19f42293..2f83cb55392f 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1114,6 +1114,8 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; +extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; + int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3-70-g09d2 From 01389b6bd2f4f7649cdbb4a99a15d9e0c05d6f8c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:30:18 +0200 Subject: mtd: nand: Move Hynix specific init/detection logic in nand_hynix.c Move Hynix specific initialization and detection logic into nand_hynix.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 114 +++++++++++------------------------------- drivers/mtd/nand/nand_hynix.c | 84 +++++++++++++++++++++++++++++++ drivers/mtd/nand/nand_ids.c | 2 +- include/linux/mtd/nand.h | 1 + 5 files changed, 115 insertions(+), 87 deletions(-) create mode 100644 drivers/mtd/nand/nand_hynix.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index d4b90b0f879e..ddb2670d9767 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -61,4 +61,5 @@ obj-$(CONFIG_MTD_NAND_QCOM) += qcom_nandc.o obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o +nand-objs += nand_hynix.o nand-objs += nand_samsung.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index fd38d59d33a6..d65db5921a0f 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3829,85 +3829,32 @@ void nand_decode_ext_id(struct nand_chip *chip) /* The 4th id byte is the important one */ extid = id_data[3]; + /* Calc pagesize */ + mtd->writesize = 1024 << (extid & 0x03); + extid >>= 2; + /* Calc oobsize */ + mtd->oobsize = (8 << (extid & 0x01)) * (mtd->writesize >> 9); + extid >>= 2; + /* Calc blocksize. Blocksize is multiples of 64KiB */ + mtd->erasesize = (64 * 1024) << (extid & 0x03); + extid >>= 2; + /* Get buswidth information */ + if (extid & 0x1) + chip->options |= NAND_BUSWIDTH_16; + /* - * Field definitions are in the following datasheets: - * Old style (4,5 byte ID): Samsung K9GAG08U0M (p.32) - * Hynix MLC (6 byte ID): Hynix H27UBG8T2B (p.22) - * - * Check for ID length, non-zero 6th byte, cell type, and Hynix/Samsung - * ID to decide what to do. + * Toshiba 24nm raw SLC (i.e., not BENAND) have 32B OOB per + * 512B page. 
For Toshiba SLC, we decode the 5th/6th byte as + * follows: + * - ID byte 6, bits[2:0]: 100b -> 43nm, 101b -> 32nm, + * 110b -> 24nm + * - ID byte 5, bit[7]: 1 -> BENAND, 0 -> raw SLC */ - if (id_len == 6 && id_data[0] == NAND_MFR_HYNIX && - !nand_is_slc(chip)) { - unsigned int tmp; - - /* Calc pagesize */ - mtd->writesize = 2048 << (extid & 0x03); - extid >>= 2; - /* Calc oobsize */ - switch (((extid >> 2) & 0x04) | (extid & 0x03)) { - case 0: - mtd->oobsize = 128; - break; - case 1: - mtd->oobsize = 224; - break; - case 2: - mtd->oobsize = 448; - break; - case 3: - mtd->oobsize = 64; - break; - case 4: - mtd->oobsize = 32; - break; - case 5: - mtd->oobsize = 16; - break; - default: - mtd->oobsize = 640; - break; - } - extid >>= 2; - /* Calc blocksize */ - tmp = ((extid >> 1) & 0x04) | (extid & 0x03); - if (tmp < 0x03) - mtd->erasesize = (128 * 1024) << tmp; - else if (tmp == 0x03) - mtd->erasesize = 768 * 1024; - else - mtd->erasesize = (64 * 1024) << tmp; - } else { - /* Calc pagesize */ - mtd->writesize = 1024 << (extid & 0x03); - extid >>= 2; - /* Calc oobsize */ - mtd->oobsize = (8 << (extid & 0x01)) * - (mtd->writesize >> 9); - extid >>= 2; - /* Calc blocksize. Blocksize is multiples of 64KiB */ - mtd->erasesize = (64 * 1024) << (extid & 0x03); - extid >>= 2; - /* Get buswidth information */ - if (extid & 0x1) - chip->options |= NAND_BUSWIDTH_16; - - /* - * Toshiba 24nm raw SLC (i.e., not BENAND) have 32B OOB per - * 512B page. For Toshiba SLC, we decode the 5th/6th byte as - * follows: - * - ID byte 6, bits[2:0]: 100b -> 43nm, 101b -> 32nm, - * 110b -> 24nm - * - ID byte 5, bit[7]: 1 -> BENAND, 0 -> raw SLC - */ - if (id_len >= 6 && id_data[0] == NAND_MFR_TOSHIBA && - nand_is_slc(chip) && - (id_data[5] & 0x7) == 0x6 /* 24nm */ && - !(id_data[4] & 0x80) /* !BENAND */) { - mtd->oobsize = 32 * mtd->writesize >> 9; - } - - } + if (id_len >= 6 && id_data[0] == NAND_MFR_TOSHIBA && + nand_is_slc(chip) && + (id_data[5] & 0x7) == 0x6 /* 24nm */ && + !(id_data[4] & 0x80) /* !BENAND */) + mtd->oobsize = 32 * mtd->writesize >> 9; } EXPORT_SYMBOL_GPL(nand_decode_ext_id); @@ -3966,15 +3913,10 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, * AMD/Spansion, and Macronix. All others scan only the first page. */ - if (!nand_is_slc(chip) && maf_id == NAND_MFR_HYNIX) - chip->bbt_options |= NAND_BBT_SCANLASTPAGE; - else if ((nand_is_slc(chip) && - (maf_id == NAND_MFR_HYNIX || - maf_id == NAND_MFR_TOSHIBA || - maf_id == NAND_MFR_AMD || - maf_id == NAND_MFR_MACRONIX)) || - (mtd->writesize == 2048 && - maf_id == NAND_MFR_MICRON)) + if ((nand_is_slc(chip) && + (maf_id == NAND_MFR_TOSHIBA || maf_id == NAND_MFR_AMD || + maf_id == NAND_MFR_MACRONIX)) || + (mtd->writesize == 2048 && maf_id == NAND_MFR_MICRON)) chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } diff --git a/drivers/mtd/nand/nand_hynix.c b/drivers/mtd/nand/nand_hynix.c new file mode 100644 index 000000000000..fbd661688158 --- /dev/null +++ b/drivers/mtd/nand/nand_hynix.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static void hynix_nand_decode_id(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + /* Hynix MLC (6 byte ID): Hynix H27UBG8T2B (p.22) */ + if (chip->id.len == 6 && !nand_is_slc(chip)) { + u8 tmp, extid = chip->id.data[3]; + + /* Extract pagesize */ + mtd->writesize = 2048 << (extid & 0x03); + extid >>= 2; + + /* Extract oobsize */ + switch (((extid >> 2) & 0x4) | (extid & 0x3)) { + case 0: + mtd->oobsize = 128; + break; + case 1: + mtd->oobsize = 224; + break; + case 2: + mtd->oobsize = 448; + break; + case 3: + mtd->oobsize = 64; + break; + case 4: + mtd->oobsize = 32; + break; + case 5: + mtd->oobsize = 16; + break; + default: + mtd->oobsize = 640; + break; + } + + /* Extract blocksize */ + extid >>= 2; + tmp = ((extid >> 1) & 0x04) | (extid & 0x03); + if (tmp < 0x03) + mtd->erasesize = (128 * 1024) << tmp; + else if (tmp == 0x03) + mtd->erasesize = 768 * 1024; + else + mtd->erasesize = (64 * 1024) << tmp; + } else { + nand_decode_ext_id(chip); + } +} + +static int hynix_nand_init(struct nand_chip *chip) +{ + if (!nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCANLASTPAGE; + else + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops hynix_nand_manuf_ops = { + .detect = hynix_nand_decode_id, + .init = hynix_nand_init, +}; diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 8eba7dfe7c1f..7dc06a807d00 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -177,7 +177,7 @@ static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_NATIONAL, "National"}, {NAND_MFR_RENESAS, "Renesas"}, {NAND_MFR_STMICRO, "ST Micro"}, - {NAND_MFR_HYNIX, "Hynix"}, + {NAND_MFR_HYNIX, "Hynix", &hynix_nand_manuf_ops}, {NAND_MFR_MICRON, "Micron"}, {NAND_MFR_AMD, "AMD/Spansion"}, {NAND_MFR_MACRONIX, "Macronix"}, diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 2f83cb55392f..74e3a231cb56 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1115,6 +1115,7 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; +extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3-70-g09d2 From 9b2d61f80b060ce3ea5af2a99e148b0b214932b2 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:34:57 +0200 Subject: mtd: nand: Move Toshiba specific init/detection logic in nand_toshiba.c Move Toshiba specific initialization and detection logic into nand_toshiba.c. This is part of the "separate vendor specific code from core" cleanup process. 
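For reference, the Toshiba OOB special case that moves into the new file works out as follows (a worked example with an assumed 4 KiB page size, not a value taken from the patch):

	/* Toshiba 24nm raw SLC: 32 bytes of OOB per 512-byte sector */
	mtd->oobsize = 32 * mtd->writesize >> 9;	/* 32 * 4096 >> 9 = 256 bytes */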
Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 19 ++------------- drivers/mtd/nand/nand_ids.c | 2 +- drivers/mtd/nand/nand_toshiba.c | 51 +++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 1 + 5 files changed, 56 insertions(+), 18 deletions(-) create mode 100644 drivers/mtd/nand/nand_toshiba.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index ddb2670d9767..7c059822f479 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -63,3 +63,4 @@ obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o nand-objs += nand_hynix.o nand-objs += nand_samsung.o +nand-objs += nand_toshiba.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index d65db5921a0f..36bca97900af 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3822,7 +3822,7 @@ static int nand_get_bits_per_cell(u8 cellinfo) void nand_decode_ext_id(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); - int extid, id_len = chip->id.len; + int extid; u8 *id_data = chip->id.data; /* The 3rd id byte holds MLC / multichip data */ chip->bits_per_cell = nand_get_bits_per_cell(id_data[2]); @@ -3841,20 +3841,6 @@ void nand_decode_ext_id(struct nand_chip *chip) /* Get buswidth information */ if (extid & 0x1) chip->options |= NAND_BUSWIDTH_16; - - /* - * Toshiba 24nm raw SLC (i.e., not BENAND) have 32B OOB per - * 512B page. For Toshiba SLC, we decode the 5th/6th byte as - * follows: - * - ID byte 6, bits[2:0]: 100b -> 43nm, 101b -> 32nm, - * 110b -> 24nm - * - ID byte 5, bit[7]: 1 -> BENAND, 0 -> raw SLC - */ - if (id_len >= 6 && id_data[0] == NAND_MFR_TOSHIBA && - nand_is_slc(chip) && - (id_data[5] & 0x7) == 0x6 /* 24nm */ && - !(id_data[4] & 0x80) /* !BENAND */) - mtd->oobsize = 32 * mtd->writesize >> 9; } EXPORT_SYMBOL_GPL(nand_decode_ext_id); @@ -3914,8 +3900,7 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * AMD/Spansion, and Macronix. All others scan only the first page. */ if ((nand_is_slc(chip) && - (maf_id == NAND_MFR_TOSHIBA || maf_id == NAND_MFR_AMD || - maf_id == NAND_MFR_MACRONIX)) || + (maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) || (mtd->writesize == 2048 && maf_id == NAND_MFR_MICRON)) chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 7dc06a807d00..22ff89e9c478 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -170,7 +170,7 @@ struct nand_flash_dev nand_flash_ids[] = { /* Manufacturer IDs */ static const struct nand_manufacturer nand_manufacturers[] = { - {NAND_MFR_TOSHIBA, "Toshiba"}, + {NAND_MFR_TOSHIBA, "Toshiba", &toshiba_nand_manuf_ops}, {NAND_MFR_ESMT, "ESMT"}, {NAND_MFR_SAMSUNG, "Samsung", &samsung_nand_manuf_ops}, {NAND_MFR_FUJITSU, "Fujitsu"}, diff --git a/drivers/mtd/nand/nand_toshiba.c b/drivers/mtd/nand/nand_toshiba.c new file mode 100644 index 000000000000..fa787ba38dcd --- /dev/null +++ b/drivers/mtd/nand/nand_toshiba.c @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static void toshiba_nand_decode_id(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + nand_decode_ext_id(chip); + + /* + * Toshiba 24nm raw SLC (i.e., not BENAND) have 32B OOB per + * 512B page. For Toshiba SLC, we decode the 5th/6th byte as + * follows: + * - ID byte 6, bits[2:0]: 100b -> 43nm, 101b -> 32nm, + * 110b -> 24nm + * - ID byte 5, bit[7]: 1 -> BENAND, 0 -> raw SLC + */ + if (chip->id.len >= 6 && nand_is_slc(chip) && + (chip->id.data[5] & 0x7) == 0x6 /* 24nm */ && + !(chip->id.data[4] & 0x80) /* !BENAND */) + mtd->oobsize = 32 * mtd->writesize >> 9; +} + +static int toshiba_nand_init(struct nand_chip *chip) +{ + if (nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops toshiba_nand_manuf_ops = { + .detect = toshiba_nand_decode_id, + .init = toshiba_nand_init, +}; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 74e3a231cb56..dd9e3b5ddd4f 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1114,6 +1114,7 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; +extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; -- cgit v1.2.3-70-g09d2 From 10d4e75c36f6c16311dde1461f318210da357219 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:38:57 +0200 Subject: mtd: nand: Move Micron specific init logic in nand_micron.c Move Micron specific initialization logic into nand_micron.c. This is part of the "separate vendor specific code from core" cleanup process. 
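The read-retry plumbing that the Micron hook fills in is consumed elsewhere in the core. Roughly (an illustrative sketch, not the exact core code), an uncorrectable-ECC path can cycle through the advertised modes like this:

	int retry_mode, ret;

	/* illustrative only: step through the vendor supplied read-retry modes */
	for (retry_mode = 1; retry_mode < chip->read_retries; retry_mode++) {
		ret = chip->setup_read_retry(mtd, retry_mode);
		if (ret)
			break;
		/* ... re-read the page and re-run ECC correction ... */
	}

chip->read_retries and chip->setup_read_retry() are exactly the two fields micron_nand_onfi_init() populates from the vendor area of the ONFI parameter page.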
Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 32 +--------------- drivers/mtd/nand/nand_ids.c | 2 +- drivers/mtd/nand/nand_micron.c | 86 ++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 21 +---------- 5 files changed, 91 insertions(+), 51 deletions(-) create mode 100644 drivers/mtd/nand/nand_micron.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index 7c059822f479..11d743155810 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -62,5 +62,6 @@ obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o nand-objs += nand_hynix.o +nand-objs += nand_micron.o nand-objs += nand_samsung.o nand-objs += nand_toshiba.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 36bca97900af..f45e3bbe5f85 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3537,30 +3537,6 @@ ext_out: return ret; } -static int nand_setup_read_retry_micron(struct mtd_info *mtd, int retry_mode) -{ - struct nand_chip *chip = mtd_to_nand(mtd); - uint8_t feature[ONFI_SUBFEATURE_PARAM_LEN] = {retry_mode}; - - return chip->onfi_set_features(mtd, chip, ONFI_FEATURE_ADDR_READ_RETRY, - feature); -} - -/* - * Configure chip properties from Micron vendor-specific ONFI table - */ -static void nand_onfi_detect_micron(struct nand_chip *chip, - struct nand_onfi_params *p) -{ - struct nand_onfi_vendor_micron *micron = (void *)p->vendor; - - if (le16_to_cpu(p->vendor_revision) < 1) - return; - - chip->read_retries = micron->read_retry_options; - chip->setup_read_retry = nand_setup_read_retry_micron; -} - /* * Check if the NAND chip is ONFI compliant, returns 1 if it is, 0 otherwise. */ @@ -3660,9 +3636,6 @@ static int nand_flash_detect_onfi(struct nand_chip *chip) pr_warn("Could not retrieve ONFI ECC requirements\n"); } - if (p->jedec_id == NAND_MFR_MICRON) - nand_onfi_detect_micron(chip, p); - return 1; } @@ -3899,9 +3872,8 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, * AMD/Spansion, and Macronix. All others scan only the first page. 
*/ - if ((nand_is_slc(chip) && - (maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) || - (mtd->writesize == 2048 && maf_id == NAND_MFR_MICRON)) + if (nand_is_slc(chip) && + (maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 22ff89e9c478..42c6743401c5 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -178,7 +178,7 @@ static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_RENESAS, "Renesas"}, {NAND_MFR_STMICRO, "ST Micro"}, {NAND_MFR_HYNIX, "Hynix", &hynix_nand_manuf_ops}, - {NAND_MFR_MICRON, "Micron"}, + {NAND_MFR_MICRON, "Micron", µn_nand_manuf_ops}, {NAND_MFR_AMD, "AMD/Spansion"}, {NAND_MFR_MACRONIX, "Macronix"}, {NAND_MFR_EON, "Eon"}, diff --git a/drivers/mtd/nand/nand_micron.c b/drivers/mtd/nand/nand_micron.c new file mode 100644 index 000000000000..877011069251 --- /dev/null +++ b/drivers/mtd/nand/nand_micron.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +struct nand_onfi_vendor_micron { + u8 two_plane_read; + u8 read_cache; + u8 read_unique_id; + u8 dq_imped; + u8 dq_imped_num_settings; + u8 dq_imped_feat_addr; + u8 rb_pulldown_strength; + u8 rb_pulldown_strength_feat_addr; + u8 rb_pulldown_strength_num_settings; + u8 otp_mode; + u8 otp_page_start; + u8 otp_data_prot_addr; + u8 otp_num_pages; + u8 otp_feat_addr; + u8 read_retry_options; + u8 reserved[72]; + u8 param_revision; +} __packed; + +static int micron_nand_setup_read_retry(struct mtd_info *mtd, int retry_mode) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + u8 feature[ONFI_SUBFEATURE_PARAM_LEN] = {retry_mode}; + + return chip->onfi_set_features(mtd, chip, ONFI_FEATURE_ADDR_READ_RETRY, + feature); +} + +/* + * Configure chip properties from Micron vendor-specific ONFI table + */ +static int micron_nand_onfi_init(struct nand_chip *chip) +{ + struct nand_onfi_params *p = &chip->onfi_params; + struct nand_onfi_vendor_micron *micron = (void *)p->vendor; + + if (!chip->onfi_version) + return 0; + + if (le16_to_cpu(p->vendor_revision) < 1) + return 0; + + chip->read_retries = micron->read_retry_options; + chip->setup_read_retry = micron_nand_setup_read_retry; + + return 0; +} + +static int micron_nand_init(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + int ret; + + ret = micron_nand_onfi_init(chip); + if (ret) + return ret; + + if (mtd->writesize == 2048) + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops micron_nand_manuf_ops = { + .init = micron_nand_init, +}; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index dd9e3b5ddd4f..7d0f18ecbf57 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -366,26 +366,6 @@ struct onfi_ext_param_page { */ } __packed; -struct nand_onfi_vendor_micron { - u8 two_plane_read; - u8 read_cache; - u8 
read_unique_id; - u8 dq_imped; - u8 dq_imped_num_settings; - u8 dq_imped_feat_addr; - u8 rb_pulldown_strength; - u8 rb_pulldown_strength_feat_addr; - u8 rb_pulldown_strength_num_settings; - u8 otp_mode; - u8 otp_page_start; - u8 otp_data_prot_addr; - u8 otp_num_pages; - u8 otp_feat_addr; - u8 read_retry_options; - u8 reserved[72]; - u8 param_revision; -} __packed; - struct jedec_ecc_info { u8 ecc_bits; u8 codeword_size; @@ -1117,6 +1097,7 @@ extern struct nand_flash_dev nand_flash_ids[]; extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; +extern const struct nand_manufacturer_ops micron_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3-70-g09d2 From 229204da53b31d576fcc1c93a33626943ea8202c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:42:23 +0200 Subject: mtd: nand: Move AMD/Spansion specific init/detection logic in nand_amd.c Move AMD/Spansion specific initialization/detection logic into nand_amd.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_amd.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ drivers/mtd/nand/nand_base.c | 18 +--------------- drivers/mtd/nand/nand_ids.c | 2 +- include/linux/mtd/nand.h | 1 + 5 files changed, 55 insertions(+), 18 deletions(-) create mode 100644 drivers/mtd/nand/nand_amd.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index 11d743155810..f48ddcc132bc 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_MTD_NAND_QCOM) += qcom_nandc.o obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o +nand-objs += nand_amd.o nand-objs += nand_hynix.o nand-objs += nand_micron.o nand-objs += nand_samsung.o diff --git a/drivers/mtd/nand/nand_amd.c b/drivers/mtd/nand/nand_amd.c new file mode 100644 index 000000000000..170403a3bfa8 --- /dev/null +++ b/drivers/mtd/nand/nand_amd.c @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static void amd_nand_decode_id(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + nand_decode_ext_id(chip); + + /* + * Check for Spansion/AMD ID + repeating 5th, 6th byte since + * some Spansion chips have erasesize that conflicts with size + * listed in nand_ids table. 
+ * Data sheet (5 byte ID): Spansion S30ML-P ORNAND (p.39) + */ + if (chip->id.data[4] != 0x00 && chip->id.data[5] == 0x00 && + chip->id.data[6] == 0x00 && chip->id.data[7] == 0x00 && + mtd->writesize == 512) { + mtd->erasesize = 128 * 1024; + mtd->erasesize <<= ((chip->id.data[3] & 0x03) << 1); + } +} + +static int amd_nand_init(struct nand_chip *chip) +{ + if (nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops amd_nand_manuf_ops = { + .detect = amd_nand_decode_id, + .init = amd_nand_init, +}; diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index f45e3bbe5f85..a9060bcb9eb5 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3825,8 +3825,6 @@ EXPORT_SYMBOL_GPL(nand_decode_ext_id); static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type) { struct mtd_info *mtd = nand_to_mtd(chip); - u8 *id_data = chip->id.data; - int maf_id = id_data[0]; mtd->erasesize = type->erasesize; mtd->writesize = type->pagesize; @@ -3834,19 +3832,6 @@ static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type) /* All legacy ID NAND are small-page, SLC */ chip->bits_per_cell = 1; - - /* - * Check for Spansion/AMD ID + repeating 5th, 6th byte since - * some Spansion chips have erasesize that conflicts with size - * listed in nand_ids table. - * Data sheet (5 byte ID): Spansion S30ML-P ORNAND (p.39) - */ - if (maf_id == NAND_MFR_AMD && id_data[4] != 0x00 && id_data[5] == 0x00 - && id_data[6] == 0x00 && id_data[7] == 0x00 - && mtd->writesize == 512) { - mtd->erasesize = 128 * 1024; - mtd->erasesize <<= ((id_data[3] & 0x03) << 1); - } } /* @@ -3872,8 +3857,7 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, * AMD/Spansion, and Macronix. All others scan only the first page. 
*/ - if (nand_is_slc(chip) && - (maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) + if (nand_is_slc(chip) && maf_id == NAND_MFR_MACRONIX) chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 42c6743401c5..af7b92e08fbd 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -179,7 +179,7 @@ static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_STMICRO, "ST Micro"}, {NAND_MFR_HYNIX, "Hynix", &hynix_nand_manuf_ops}, {NAND_MFR_MICRON, "Micron", µn_nand_manuf_ops}, - {NAND_MFR_AMD, "AMD/Spansion"}, + {NAND_MFR_AMD, "AMD/Spansion", &amd_nand_manuf_ops}, {NAND_MFR_MACRONIX, "Macronix"}, {NAND_MFR_EON, "Eon"}, {NAND_MFR_SANDISK, "SanDisk"}, diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7d0f18ecbf57..97dce42778e9 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1098,6 +1098,7 @@ extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; extern const struct nand_manufacturer_ops micron_nand_manuf_ops; +extern const struct nand_manufacturer_ops amd_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3-70-g09d2 From 3b5206f4be9b65d2f0f85b3239cf117a1d0de7ce Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:43:26 +0200 Subject: mtd: nand: Move Macronix specific initialization in nand_macronix.c Move Macronix specific initialization logic into nand_macronix.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 11 ----------- drivers/mtd/nand/nand_ids.c | 2 +- drivers/mtd/nand/nand_macronix.c | 30 ++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 1 + 5 files changed, 33 insertions(+), 12 deletions(-) create mode 100644 drivers/mtd/nand/nand_macronix.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index f48ddcc132bc..098b8791f10a 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o nand-objs += nand_amd.o nand-objs += nand_hynix.o +nand-objs += nand_macronix.o nand-objs += nand_micron.o nand-objs += nand_samsung.o nand-objs += nand_toshiba.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index a9060bcb9eb5..685376d8ceab 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3842,23 +3842,12 @@ static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type) static void nand_decode_bbm_options(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); - u8 *id_data = chip->id.data; - int maf_id = id_data[0]; /* Set the bad block position */ if (mtd->writesize > 512 || (chip->options & NAND_BUSWIDTH_16)) chip->badblockpos = NAND_LARGE_BADBLOCK_POS; else chip->badblockpos = NAND_SMALL_BADBLOCK_POS; - - /* - * Bad block marker is stored in the last page of each block on Samsung - * and Hynix MLC devices; stored in first two pages of each block on - * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, - * AMD/Spansion, and Macronix. All others scan only the first page. 
- */ - if (nand_is_slc(chip) && maf_id == NAND_MFR_MACRONIX) - chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } static inline bool is_full_id_nand(struct nand_flash_dev *type) diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index af7b92e08fbd..9d5ca0e540b5 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -180,7 +180,7 @@ static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_HYNIX, "Hynix", &hynix_nand_manuf_ops}, {NAND_MFR_MICRON, "Micron", µn_nand_manuf_ops}, {NAND_MFR_AMD, "AMD/Spansion", &amd_nand_manuf_ops}, - {NAND_MFR_MACRONIX, "Macronix"}, + {NAND_MFR_MACRONIX, "Macronix", ¯onix_nand_manuf_ops}, {NAND_MFR_EON, "Eon"}, {NAND_MFR_SANDISK, "SanDisk"}, {NAND_MFR_INTEL, "Intel"}, diff --git a/drivers/mtd/nand/nand_macronix.c b/drivers/mtd/nand/nand_macronix.c new file mode 100644 index 000000000000..84855c3e1a02 --- /dev/null +++ b/drivers/mtd/nand/nand_macronix.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static int macronix_nand_init(struct nand_chip *chip) +{ + if (nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops macronix_nand_manuf_ops = { + .init = macronix_nand_init, +}; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 97dce42778e9..c7de017c7f4c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1099,6 +1099,7 @@ extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; extern const struct nand_manufacturer_ops micron_nand_manuf_ops; extern const struct nand_manufacturer_ops amd_nand_manuf_ops; +extern const struct nand_manufacturer_ops macronix_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3-70-g09d2 From 3843832fc8cadc2d48ba4ea4cd350a696906ac42 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Tue, 28 Feb 2017 17:19:24 +0200 Subject: clk: tegra: Handle UTMIPLL IDDQ Export UTMIPLL IDDQ functions. These will be needed when powergating the XUSB partition. 
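As a rough illustration of the intended use (not part of this patch), a caller that powergates the XUSB partition might bracket the gate/ungate steps with the new exports roughly as below; the example_* functions are hypothetical placeholders and the actual PMC powergate calls are elided.

#include <linux/clk/tegra.h>

/* Hypothetical sketch: hold UTMIPLL in IDDQ while XUSB is powergated. */
static void example_xusb_partition_power_down(void)
{
	/*
	 * tegra210_put_utmipll_in_iddq() refuses to assert IDDQ and logs
	 * an error if the PLL is still locked, so the PLL must be shut
	 * down before this point.
	 */
	tegra210_put_utmipll_in_iddq();

	/* ... powergate the XUSB partition via the PMC here ... */
}

static void example_xusb_partition_power_up(void)
{
	/* ... unpowergate the XUSB partition via the PMC here ... */

	/* Lift the IDDQ override before the USB pads are brought back up. */
	tegra210_put_utmipll_out_iddq();
}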
Signed-off-by: BH Hsieh Signed-off-by: Peter De Schrijver Signed-off-by: Thierry Reding --- drivers/clk/tegra/clk-tegra210.c | 26 ++++++++++++++++++++++++++ include/linux/clk/tegra.h | 2 ++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index 9a2512a6b419..f89d2a912273 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -2313,6 +2313,32 @@ static const char * const aclk_parents[] = { "clk_m" }; +void tegra210_put_utmipll_in_iddq(void) +{ + u32 reg; + + reg = readl_relaxed(clk_base + UTMIPLL_HW_PWRDN_CFG0); + + if (reg & UTMIPLL_HW_PWRDN_CFG0_UTMIPLL_LOCK) { + pr_err("trying to assert IDDQ while UTMIPLL is locked\n"); + return; + } + + reg |= UTMIPLL_HW_PWRDN_CFG0_IDDQ_OVERRIDE; + writel_relaxed(reg, clk_base + UTMIPLL_HW_PWRDN_CFG0); +} +EXPORT_SYMBOL_GPL(tegra210_put_utmipll_in_iddq); + +void tegra210_put_utmipll_out_iddq(void) +{ + u32 reg; + + reg = readl_relaxed(clk_base + UTMIPLL_HW_PWRDN_CFG0); + reg &= ~UTMIPLL_HW_PWRDN_CFG0_IDDQ_OVERRIDE; + writel_relaxed(reg, clk_base + UTMIPLL_HW_PWRDN_CFG0); +} +EXPORT_SYMBOL_GPL(tegra210_put_utmipll_out_iddq); + static __init void tegra210_periph_clk_init(void __iomem *clk_base, void __iomem *pmc_base) { diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index 7007a5f48080..e17d32831e28 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -125,5 +125,7 @@ extern void tegra210_xusb_pll_hw_control_enable(void); extern void tegra210_xusb_pll_hw_sequence_start(void); extern void tegra210_sata_pll_hw_control_enable(void); extern void tegra210_sata_pll_hw_sequence_start(void); +extern void tegra210_put_utmipll_in_iddq(void); +extern void tegra210_put_utmipll_out_iddq(void); #endif /* __LINUX_CLK_TEGRA_H_ */ -- cgit v1.2.3-70-g09d2 From 59af78d78db8bde6a63e09772aa44192f772fa96 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Wed, 15 Mar 2017 17:42:05 +0200 Subject: clk: tegra: Add SATA seq input control This will be used by the powergating driver to ensure proper sequencer state when the SATA domain is powergated. 
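As a rough illustration only (not part of this patch), the powergate path for the SATA domain might drive the new helper as sketched below; the ordering shown is one plausible use and the example_* wrappers are hypothetical.

#include <linux/clk/tegra.h>

/*
 * Hypothetical sketch: force the SATA pad PLL sequencer inputs to safe,
 * software-controlled values while the SATA partition is gated.
 */
static void example_sata_partition_power_down(void)
{
	/* Override the sequencer inputs (reset/lane PD/pad PLL PD asserted). */
	tegra210_set_sata_pll_seq_sw(true);

	/* ... powergate the SATA partition here ... */
}

static void example_sata_partition_power_up(void)
{
	/* ... unpowergate the SATA partition here ... */

	/* Return the sequencer inputs to hardware control. */
	tegra210_set_sata_pll_seq_sw(false);
}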
Signed-off-by: Peter De Schrijver Signed-off-by: Thierry Reding --- drivers/clk/tegra/clk-tegra210.c | 25 +++++++++++++++++++++++++ include/linux/clk/tegra.h | 1 + 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index 6f29125ec439..f3e51e640d4d 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -181,6 +181,11 @@ #define SATA_PLL_CFG0 0x490 #define SATA_PLL_CFG0_PADPLL_RESET_SWCTL BIT(0) #define SATA_PLL_CFG0_PADPLL_USE_LOCKDET BIT(2) +#define SATA_PLL_CFG0_SATA_SEQ_IN_SWCTL BIT(4) +#define SATA_PLL_CFG0_SATA_SEQ_RESET_INPUT_VALUE BIT(5) +#define SATA_PLL_CFG0_SATA_SEQ_LANE_PD_INPUT_VALUE BIT(6) +#define SATA_PLL_CFG0_SATA_SEQ_PADPLL_PD_INPUT_VALUE BIT(7) + #define SATA_PLL_CFG0_PADPLL_SLEEP_IDDQ BIT(13) #define SATA_PLL_CFG0_SEQ_ENABLE BIT(24) @@ -483,6 +488,26 @@ void tegra210_sata_pll_hw_sequence_start(void) } EXPORT_SYMBOL_GPL(tegra210_sata_pll_hw_sequence_start); +void tegra210_set_sata_pll_seq_sw(bool state) +{ + u32 val; + + val = readl_relaxed(clk_base + SATA_PLL_CFG0); + if (state) { + val |= SATA_PLL_CFG0_SATA_SEQ_IN_SWCTL; + val |= SATA_PLL_CFG0_SATA_SEQ_RESET_INPUT_VALUE; + val |= SATA_PLL_CFG0_SATA_SEQ_LANE_PD_INPUT_VALUE; + val |= SATA_PLL_CFG0_SATA_SEQ_PADPLL_PD_INPUT_VALUE; + } else { + val &= ~SATA_PLL_CFG0_SATA_SEQ_IN_SWCTL; + val &= ~SATA_PLL_CFG0_SATA_SEQ_RESET_INPUT_VALUE; + val &= ~SATA_PLL_CFG0_SATA_SEQ_LANE_PD_INPUT_VALUE; + val &= ~SATA_PLL_CFG0_SATA_SEQ_PADPLL_PD_INPUT_VALUE; + } + writel_relaxed(val, clk_base + SATA_PLL_CFG0); +} +EXPORT_SYMBOL_GPL(tegra210_set_sata_pll_seq_sw); + static inline void _pll_misc_chk_default(void __iomem *base, struct tegra_clk_pll_params *params, u8 misc_num, u32 default_val, u32 mask) diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index e17d32831e28..d23c9cf26993 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -125,6 +125,7 @@ extern void tegra210_xusb_pll_hw_control_enable(void); extern void tegra210_xusb_pll_hw_sequence_start(void); extern void tegra210_sata_pll_hw_control_enable(void); extern void tegra210_sata_pll_hw_sequence_start(void); +extern void tegra210_set_sata_pll_seq_sw(bool state); extern void tegra210_put_utmipll_in_iddq(void); extern void tegra210_put_utmipll_out_iddq(void); -- cgit v1.2.3-70-g09d2 From 61012985eb132a2fa5e4a3eddbc33528334fa377 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 16 Mar 2017 16:23:55 +0200 Subject: iommu/vt-d: Use lo_hi_readq() / lo_hi_writeq() There is already helper functions to do 64-bit I/O on 32-bit machines or buses, thus we don't need to reinvent the wheel. 
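For context (not part of this patch): on targets without a native readq/writeq, the lo_hi helpers from include/linux/io-64-nonatomic-lo-hi.h compose the 64-bit access from two 32-bit operations, lower half first, just like the open-coded fallback removed below. A minimal sketch of that composition, with example_* names standing in for the real helpers:

#include <linux/io.h>
#include <linux/types.h>

/* Sketch of the lo-hi composition: low 32 bits first, then the high 32 bits. */
static inline u64 example_lo_hi_readq(const volatile void __iomem *addr)
{
	u32 lo = readl(addr);
	u32 hi = readl(addr + 4);

	return ((u64)hi << 32) | lo;
}

static inline void example_lo_hi_writeq(u64 val, volatile void __iomem *addr)
{
	writel((u32)val, addr);		/* low half first */
	writel((u32)(val >> 32), addr + 4);
}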
Signed-off-by: Andy Shevchenko Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index c573a52ae440..485a5b48f038 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -30,6 +30,8 @@ #include #include #include +#include + #include #include @@ -72,24 +74,8 @@ #define OFFSET_STRIDE (9) -#ifdef CONFIG_64BIT #define dmar_readq(a) readq(a) #define dmar_writeq(a,v) writeq(v,a) -#else -static inline u64 dmar_readq(void __iomem *addr) -{ - u32 lo, hi; - lo = readl(addr); - hi = readl(addr + 4); - return (((u64) hi) << 32) + lo; -} - -static inline void dmar_writeq(void __iomem *addr, u64 val) -{ - writel((u32)val, addr); - writel((u32)(val >> 32), addr + 4); -} -#endif #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) #define DMAR_VER_MINOR(v) ((v) & 0x0f) -- cgit v1.2.3-70-g09d2 From 273df9635385b2156851c7ee49f40658d7bcb29d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 16 Mar 2017 17:00:19 +0000 Subject: iommu/dma: Make PCI window reservation generic Now that we're applying the IOMMU API reserved regions to our IOVA domains, we shouldn't need to privately special-case PCI windows, or indeed anything else which isn't specific to our iommu-dma layer. However, since those aren't IOMMU-specific either, rather than start duplicating code into IOMMU drivers let's transform the existing function into an iommu_get_resv_regions() helper that they can share. Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/arm-smmu-v3.c | 2 ++ drivers/iommu/arm-smmu.c | 2 ++ drivers/iommu/dma-iommu.c | 38 ++++++++++++++++++++++++++++---------- include/linux/dma-iommu.h | 5 +++++ 4 files changed, 37 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 591bb96047c9..bbd46efbe075 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1893,6 +1893,8 @@ static void arm_smmu_get_resv_regions(struct device *dev, return; list_add_tail(®ion->list, head); + + iommu_dma_get_resv_regions(dev, head); } static void arm_smmu_put_resv_regions(struct device *dev, diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index b493c99e17f7..9b33700b7c69 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1613,6 +1613,8 @@ static void arm_smmu_get_resv_regions(struct device *dev, return; list_add_tail(®ion->list, head); + + iommu_dma_get_resv_regions(dev, head); } static void arm_smmu_put_resv_regions(struct device *dev, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 5787f919f4ec..85652110c8ff 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -167,22 +167,43 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) } EXPORT_SYMBOL(iommu_put_dma_cookie); -static void iova_reserve_pci_windows(struct pci_dev *dev, - struct iova_domain *iovad) +/** + * iommu_dma_get_resv_regions - Reserved region driver helper + * @dev: Device from iommu_get_resv_regions() + * @list: Reserved region list from iommu_get_resv_regions() + * + * IOMMU drivers can use this to implement their .get_resv_regions callback + * for general non-IOMMU-specific reservations. Currently, this covers host + * bridge windows for PCI devices. 
+ */ +void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { - struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus); + struct pci_host_bridge *bridge; struct resource_entry *window; - unsigned long lo, hi; + if (!dev_is_pci(dev)) + return; + + bridge = pci_find_host_bridge(to_pci_dev(dev)->bus); resource_list_for_each_entry(window, &bridge->windows) { + struct iommu_resv_region *region; + phys_addr_t start; + size_t length; + if (resource_type(window->res) != IORESOURCE_MEM) continue; - lo = iova_pfn(iovad, window->res->start - window->offset); - hi = iova_pfn(iovad, window->res->end - window->offset); - reserve_iova(iovad, lo, hi); + start = window->res->start - window->offset; + length = window->res->end - window->res->start + 1; + region = iommu_alloc_resv_region(start, length, 0, + IOMMU_RESV_RESERVED); + if (!region) + return; + + list_add_tail(®ion->list, list); } } +EXPORT_SYMBOL(iommu_dma_get_resv_regions); static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie, phys_addr_t start, phys_addr_t end) @@ -218,9 +239,6 @@ static int iova_reserve_iommu_regions(struct device *dev, LIST_HEAD(resv_regions); int ret = 0; - if (dev_is_pci(dev)) - iova_reserve_pci_windows(to_pci_dev(dev), iovad); - iommu_get_resv_regions(dev, &resv_regions); list_for_each_entry(region, &resv_regions, list) { unsigned long lo, hi; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 5725c94b1f12..b6635a46fc7c 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -71,6 +71,7 @@ int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); /* The DMA API isn't _quite_ the whole story, though... */ void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg); +void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); #else @@ -100,6 +101,10 @@ static inline void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg) { } +static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) +{ +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __KERNEL__ */ #endif /* __DMA_IOMMU_H */ -- cgit v1.2.3-70-g09d2 From e8bb4673596ea28fab287dbc417e8100d798cd40 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 27 Mar 2017 07:31:03 +0200 Subject: dmaengine: pl330: remove pdata based initialization This driver is now used only on platforms which support device tree, so it is safe to remove legacy platform data based initialization code. Signed-off-by: Marek Szyprowski Reviewed-by: Ulf Hansson Acked-by: Arnd Bergmann For plat-samsung: Acked-by: Krzysztof Kozlowski Signed-off-by: Vinod Koul --- arch/arm/plat-samsung/devs.c | 1 - drivers/dma/pl330.c | 42 ++++++++---------------------------------- include/linux/amba/pl330.h | 35 ----------------------------------- 3 files changed, 8 insertions(+), 70 deletions(-) delete mode 100644 include/linux/amba/pl330.h (limited to 'include/linux') diff --git a/arch/arm/plat-samsung/devs.c b/arch/arm/plat-samsung/devs.c index 03fac123676d..dc269d9143bc 100644 --- a/arch/arm/plat-samsung/devs.c +++ b/arch/arm/plat-samsung/devs.c @@ -10,7 +10,6 @@ * published by the Free Software Foundation. 
*/ -#include #include #include #include diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index f37f4978dabb..8b0da7fa520d 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -2077,18 +2076,6 @@ static void pl330_tasklet(unsigned long data) } } -bool pl330_filter(struct dma_chan *chan, void *param) -{ - u8 *peri_id; - - if (chan->device->dev->driver != &pl330_driver.drv) - return false; - - peri_id = chan->private; - return *peri_id == (unsigned long)param; -} -EXPORT_SYMBOL(pl330_filter); - static struct dma_chan *of_dma_pl330_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma) { @@ -2833,7 +2820,6 @@ static SIMPLE_DEV_PM_OPS(pl330_pm, pl330_suspend, pl330_resume); static int pl330_probe(struct amba_device *adev, const struct amba_id *id) { - struct dma_pl330_platdata *pdat; struct pl330_config *pcfg; struct pl330_dmac *pl330; struct dma_pl330_chan *pch, *_p; @@ -2843,8 +2829,6 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) int num_chan; struct device_node *np = adev->dev.of_node; - pdat = dev_get_platdata(&adev->dev); - ret = dma_set_mask_and_coherent(&adev->dev, DMA_BIT_MASK(32)); if (ret) return ret; @@ -2857,7 +2841,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) pd = &pl330->ddma; pd->dev = &adev->dev; - pl330->mcbufsz = pdat ? pdat->mcbuf_sz : 0; + pl330->mcbufsz = 0; /* get quirk */ for (i = 0; i < ARRAY_SIZE(of_quirks); i++) @@ -2901,10 +2885,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) INIT_LIST_HEAD(&pd->channels); /* Initialize channel parameters */ - if (pdat) - num_chan = max_t(int, pdat->nr_valid_peri, pcfg->num_chan); - else - num_chan = max_t(int, pcfg->num_peri, pcfg->num_chan); + num_chan = max_t(int, pcfg->num_peri, pcfg->num_chan); pl330->num_peripherals = num_chan; @@ -2916,11 +2897,8 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) for (i = 0; i < num_chan; i++) { pch = &pl330->peripherals[i]; - if (!adev->dev.of_node) - pch->chan.private = pdat ? &pdat->peri_id[i] : NULL; - else - pch->chan.private = adev->dev.of_node; + pch->chan.private = adev->dev.of_node; INIT_LIST_HEAD(&pch->submitted_list); INIT_LIST_HEAD(&pch->work_list); INIT_LIST_HEAD(&pch->completed_list); @@ -2933,15 +2911,11 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) list_add_tail(&pch->chan.device_node, &pd->channels); } - if (pdat) { - pd->cap_mask = pdat->cap_mask; - } else { - dma_cap_set(DMA_MEMCPY, pd->cap_mask); - if (pcfg->num_peri) { - dma_cap_set(DMA_SLAVE, pd->cap_mask); - dma_cap_set(DMA_CYCLIC, pd->cap_mask); - dma_cap_set(DMA_PRIVATE, pd->cap_mask); - } + dma_cap_set(DMA_MEMCPY, pd->cap_mask); + if (pcfg->num_peri) { + dma_cap_set(DMA_SLAVE, pd->cap_mask); + dma_cap_set(DMA_CYCLIC, pd->cap_mask); + dma_cap_set(DMA_PRIVATE, pd->cap_mask); } pd->device_alloc_chan_resources = pl330_alloc_chan_resources; diff --git a/include/linux/amba/pl330.h b/include/linux/amba/pl330.h deleted file mode 100644 index fe93758e8403..000000000000 --- a/include/linux/amba/pl330.h +++ /dev/null @@ -1,35 +0,0 @@ -/* linux/include/linux/amba/pl330.h - * - * Copyright (C) 2010 Samsung Electronics Co. Ltd. - * Jaswinder Singh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#ifndef __AMBA_PL330_H_ -#define __AMBA_PL330_H_ - -#include - -struct dma_pl330_platdata { - /* - * Number of valid peripherals connected to DMAC. - * This may be different from the value read from - * CR0, as the PL330 implementation might have 'holes' - * in the peri list or the peri could also be reached - * from another DMAC which the platform prefers. - */ - u8 nr_valid_peri; - /* Array of valid peripherals */ - u8 *peri_id; - /* Operational capabilities */ - dma_cap_mask_t cap_mask; - /* Bytes to allocate for MC buffer */ - unsigned mcbuf_sz; -}; - -extern bool pl330_filter(struct dma_chan *chan, void *param); -#endif /* __AMBA_PL330_H_ */ -- cgit v1.2.3-70-g09d2 From bf616d21f41174389c6d720ae21bf40f154474c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 4 Apr 2017 16:54:21 +0100 Subject: Annotate module params that specify hardware parameters (eg. ioport) Provided an annotation for module parameters that specify hardware parameters (such as io ports, iomem addresses, irqs, dma channels, fixed dma buffers and other types). This will enable such parameters to be locked down in the core parameter parser for secure boot support. I've also included annotations as to what sort of hardware configuration each module is dealing with for future use. Some of these are straightforward (ioport, iomem, irq, dma), but there are also: (1) drivers that switch the semantics of a parameter between ioport and iomem depending on a second parameter, (2) drivers that appear to reserve a CPU memory buffer at a fixed address, (3) other parameters, such as bus types and irq selection bitmasks. For the moment, the hardware configuration type isn't actually stored, though its validity is checked. Signed-off-by: David Howells --- include/linux/moduleparam.h | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 52666d90ca94..6be1949ebcdf 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -60,9 +60,11 @@ struct kernel_param_ops { * Flags available for kernel_param * * UNSAFE - the parameter is dangerous and setting it will taint the kernel + * HWPARAM - Hardware param not permitted in lockdown mode */ enum { - KERNEL_PARAM_FL_UNSAFE = (1 << 0) + KERNEL_PARAM_FL_UNSAFE = (1 << 0), + KERNEL_PARAM_FL_HWPARAM = (1 << 1), }; struct kernel_param { @@ -451,6 +453,67 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); perm, -1, 0); \ __MODULE_PARM_TYPE(name, "array of " #type) +enum hwparam_type { + hwparam_ioport, /* Module parameter configures an I/O port */ + hwparam_iomem, /* Module parameter configures an I/O mem address */ + hwparam_ioport_or_iomem, /* Module parameter could be either, depending on other option */ + hwparam_irq, /* Module parameter configures an I/O port */ + hwparam_dma, /* Module parameter configures a DMA channel */ + hwparam_dma_addr, /* Module parameter configures a DMA buffer address */ + hwparam_other, /* Module parameter configures some other value */ +}; + +/** + * module_param_hw_named - A parameter representing a hw parameters + * @name: a valid C identifier which is the parameter name. + * @value: the actual lvalue to alter. + * @type: the type of the parameter + * @hwtype: what the value represents (enum hwparam_type) + * @perm: visibility in sysfs. 
+ * + * Usually it's a good idea to have variable names and user-exposed names the + * same, but that's harder if the variable must be non-static or is inside a + * structure. This allows exposure under a different name. + */ +#define module_param_hw_named(name, value, type, hwtype, perm) \ + param_check_##type(name, &(value)); \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + ¶m_ops_##type, &value, \ + perm, -1, \ + KERNEL_PARAM_FL_HWPARAM | (hwparam_##hwtype & 0)); \ + __MODULE_PARM_TYPE(name, #type) + +#define module_param_hw(name, type, hwtype, perm) \ + module_param_hw_named(name, name, type, hwtype, perm) + +/** + * module_param_hw_array - A parameter representing an array of hw parameters + * @name: the name of the array variable + * @type: the type, as per module_param() + * @hwtype: what the value represents (enum hwparam_type) + * @nump: optional pointer filled in with the number written + * @perm: visibility in sysfs + * + * Input and output are as comma-separated values. Commas inside values + * don't work properly (eg. an array of charp). + * + * ARRAY_SIZE(@name) is used to determine the number of elements in the + * array, so the definition must be visible. + */ +#define module_param_hw_array(name, type, hwtype, nump, perm) \ + param_check_##type(name, &(name)[0]); \ + static const struct kparam_array __param_arr_##name \ + = { .max = ARRAY_SIZE(name), .num = nump, \ + .ops = ¶m_ops_##type, \ + .elemsize = sizeof(name[0]), .elem = name }; \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + ¶m_array_ops, \ + .arr = &__param_arr_##name, \ + perm, -1, \ + KERNEL_PARAM_FL_HWPARAM | (hwparam_##hwtype & 0)); \ + __MODULE_PARM_TYPE(name, "array of " #type) + + extern const struct kernel_param_ops param_array_ops; extern const struct kernel_param_ops param_ops_string; -- cgit v1.2.3-70-g09d2 From adf5e5168bd51c42332ebaa709351fa6ed65ea73 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 27 Jan 2017 12:22:54 +0000 Subject: iommu: Better document the IOMMU_PRIV flag This is a fairly subtle thing - let's make sure it's described as clearly as possible to avoid potential misunderstandings. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- include/linux/iommu.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2e4de0deee53..88ec8c6580d3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -32,10 +32,13 @@ #define IOMMU_NOEXEC (1 << 3) #define IOMMU_MMIO (1 << 4) /* e.g. things like MSI doorbells */ /* - * This is to make the IOMMU API setup privileged - * mapppings accessible by the master only at higher - * privileged execution level and inaccessible at - * less privileged levels. + * Where the bus hardware includes a privilege level as part of its access type + * markings, and certain devices are capable of issuing transactions marked as + * either 'supervisor' or 'user', the IOMMU_PRIV flag requests that the other + * given permission flags only apply to accesses at the higher privilege level, + * and that unprivileged transactions should have as little access as possible. + * This would usually imply the same permissions as kernel mappings on the CPU, + * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) -- cgit v1.2.3-70-g09d2 From b8c17e6664c461e4aed545a943304c3b32dd309c Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 8 Nov 2016 14:25:21 -0800 Subject: rcu: Maintain special bits at bottom of ->dynticks counter Currently, IPIs are used to force other CPUs to invalidate their TLBs in response to a kernel virtual-memory mapping change. This works, but degrades both battery lifetime (for idle CPUs) and real-time response (for nohz_full CPUs), and in addition results in unnecessary IPIs due to the fact that CPUs executing in usermode are unaffected by stale kernel mappings. It would be better to cause a CPU executing in usermode to wait until it is entering kernel mode to do the flush, first to avoid interrupting usemode tasks and second to handle multiple flush requests with a single flush in the case of a long-running user task. This commit therefore reserves a bit at the bottom of the ->dynticks counter, which is checked upon exit from extended quiescent states. If it is set, it is cleared and then a new rcu_eqs_special_exit() macro is invoked, which, if not supplied, is an empty single-pass do-while loop. If this bottom bit is set on -entry- to an extended quiescent state, then a WARN_ON_ONCE() triggers. This bottom bit may be set using a new rcu_eqs_special_set() function, which returns true if the bit was set, or false if the CPU turned out to not be in an extended quiescent state. Please note that this function refuses to set the bit for a non-nohz_full CPU when that CPU is executing in usermode because usermode execution is tracked by RCU as a dyntick-idle extended quiescent state only for nohz_full CPUs. Reported-by: Andy Lutomirski Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 5 ++++ kernel/rcu/tree.c | 77 +++++++++++++++++++++++++++++++++++++++---------- kernel/rcu/tree.h | 1 + 3 files changed, 67 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b452953e21c8..6c9d941e3962 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) return 0; } +static inline bool rcu_eqs_special_set(int cpu) +{ + return false; /* Never flag non-existent other CPUs! */ +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50fee7689e71..0efad311ded4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -274,9 +274,19 @@ void rcu_bh_qs(void) static DEFINE_PER_CPU(int, rcu_sched_qs_mask); +/* + * Steal a bit from the bottom of ->dynticks for idle entry/exit + * control. Initially this is for TLB flushing. + */ +#define RCU_DYNTICK_CTRL_MASK 0x1 +#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) +#ifndef rcu_eqs_special_exit +#define rcu_eqs_special_exit() do { } while (0) +#endif + static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), + .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, .dynticks_idle = ATOMIC_INIT(1), @@ -290,15 +300,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { static void rcu_dynticks_eqs_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior RCU read-side + * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. 
*/ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* Better be in an extended quiescent state! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_CTR)); + /* Better not have special action (TLB flush) pending! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_MASK)); } /* @@ -308,15 +323,22 @@ static void rcu_dynticks_eqs_enter(void) static void rcu_dynticks_eqs_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior idle sojourns, + * CPUs seeing atomic_add_return() must see prior idle sojourns, * and we also must force ordering with the next RCU read-side * critical section. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(seq & RCU_DYNTICK_CTRL_CTR)); + if (seq & RCU_DYNTICK_CTRL_MASK) { + atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + smp_mb__after_atomic(); /* _exit after clearing mask. */ + /* Prefer duplicate flushes to losing a flush. */ + rcu_eqs_special_exit(); + } } /* @@ -333,9 +355,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (atomic_read(&rdtp->dynticks) & 0x1) + if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) return; - atomic_add(0x1, &rdtp->dynticks); + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); } /* @@ -347,7 +369,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - return !(atomic_read(&rdtp->dynticks) & 0x1); + return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -358,7 +380,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) { int snap = atomic_add_return(0, &rdtp->dynticks); - return snap; + return snap & ~RCU_DYNTICK_CTRL_MASK; } /* @@ -367,7 +389,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & 0x1); + return !(snap & RCU_DYNTICK_CTRL_CTR); } /* @@ -387,10 +409,33 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) static void rcu_dynticks_momentary_idle(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2, &rdtp->dynticks); + int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &rdtp->dynticks); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & 0x1)); + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); +} + +/* + * Set the special (bottom) bit of the specified CPU so that it + * will take special action (such as flushing its TLB) on the + * next exit from an extended quiescent state. Returns true if + * the bit was successfully set, or false if the CPU was not in + * an extended quiescent state. 
+ */ +bool rcu_eqs_special_set(int cpu) +{ + int old; + int new; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + do { + old = atomic_read(&rdtp->dynticks); + if (old & RCU_DYNTICK_CTRL_CTR) + return false; + new = old | RCU_DYNTICK_CTRL_MASK; + } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + return true; } DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3..7468b4de7e0c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -596,6 +596,7 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -- cgit v1.2.3-70-g09d2 From 77e5849688670280b173bb9e0544e9da7b2acc36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Jan 2017 13:32:50 -0800 Subject: rcu: Make arch select smp_mb__after_unlock_lock() strength The definition of smp_mb__after_unlock_lock() is currently smp_mb() for CONFIG_PPC and a no-op otherwise. It would be better to instead provide an architecture-selectable Kconfig option, and select the strength of smp_mb__after_unlock_lock() based on that option. This commit therefore creates ARCH_WEAK_RELEASE_ACQUIRE, has PPC select it, and bases the definition of smp_mb__after_unlock_lock() on this new ARCH_WEAK_RELEASE_ACQUIRE Kconfig option. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: Boqun Feng Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman Cc: Reviewed-by: Josh Triplett --- arch/Kconfig | 3 +++ arch/powerpc/Kconfig | 1 + include/linux/rcupdate.h | 6 +++--- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..adefaf344239 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -320,6 +320,9 @@ config HAVE_CMPXCHG_LOCAL config HAVE_CMPXCHG_DOUBLE bool +config ARCH_WEAK_RELEASE_ACQUIRE + bool + config ARCH_WANT_IPC_PARSE_VERSION bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..7a5c9b764cd2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -99,6 +99,7 @@ config PPC select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974..e6146d0074f8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1127,11 +1127,11 @@ do { \ * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ -#ifdef CONFIG_PPC +#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ +#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ +#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #endif /* __LINUX_RCUPDATE_H */ -- cgit v1.2.3-70-g09d2 From 900b1028ec388e50c98200641ae4274794c807cf Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 10 Feb 2017 14:32:54 -0800 Subject: srcu: Allow SRCU to access rcu_scheduler_active This is primarily a code-movement commit in preparation for allowing SRCU to handle early-boot SRCU grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 6 +++--- kernel/rcu/tiny_plugin.h | 9 +++++---- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_exp.h | 12 ----------- kernel/rcu/update.c | 52 +++++++++++++++++++++++++++++++----------------- 5 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6c9d941e3962..5219be250f00 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -217,14 +217,14 @@ static inline void exit_rcu(void) { } -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) extern int rcu_scheduler_active __read_mostly; void rcu_scheduler_starting(void); -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ static inline void rcu_scheduler_starting(void) { } -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index df3a60e19f07..371034e77f87 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { RCU_TRACE(.name = "rcu_bh") }; -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include int rcu_scheduler_active __read_mostly; @@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. * The reason for this is that Tiny RCU does not need kthreads, so does * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. + * at a certain phase of the boot process. Unless SRCU is in the mix. */ void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) + ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; } -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8cc9d40b41ea..2380d1e3dfb8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3974,7 +3974,7 @@ early_initcall(rcu_spawn_gp_kthread); * task is booting the system, and such primitives are no-ops). After this * function is called, any synchronous grace-period primitives are run as * expedited, with the requesting task driving the grace period forward. - * A later core_initcall() rcu_exp_runtime_mode() will switch to full + * A later core_initcall() rcu_set_runtime_mode() will switch to full * runtime RCU functionality. */ void rcu_scheduler_starting(void) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a1f52bbe9db6..51ca287828a2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -737,15 +737,3 @@ void synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - -/* - * Switch to run-time mode once Tree RCU has fully initialized. 
- */ -static int __init rcu_exp_runtime_mode(void) -{ - rcu_test_sync_prims(); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; - rcu_test_sync_prims(); - return 0; -} -core_initcall(rcu_exp_runtime_mode); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 55c8530316c7..c5df0d756900 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); * non-expedited counterparts? Intended for use within RCU. Note * that if the user specifies both rcu_expedited and rcu_normal, then * rcu_normal wins. (Except during the time period during boot from - * when the first task is spawned until the rcu_exp_runtime_mode() + * when the first task is spawned until the rcu_set_runtime_mode() * core_initcall() is invoked, at which point everything is expedited.) */ bool rcu_gp_is_normal(void) @@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void) #endif /* #ifndef CONFIG_TINY_RCU */ +/* + * Test each non-SRCU synchronous grace-period wait API. This is + * useful just after a change in mode for these primitives, and + * during early boot. + */ +void rcu_test_sync_prims(void) +{ + if (!IS_ENABLED(CONFIG_PROVE_RCU)) + return; + synchronize_rcu(); + synchronize_rcu_bh(); + synchronize_sched(); + synchronize_rcu_expedited(); + synchronize_rcu_bh_expedited(); + synchronize_sched_expedited(); +} + +#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) + +/* + * Switch to run-time mode once RCU has fully initialized. + */ +static int __init rcu_set_runtime_mode(void) +{ + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_test_sync_prims(); + return 0; +} +core_initcall(rcu_set_runtime_mode); + +#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ + #ifdef CONFIG_PREEMPT_RCU /* @@ -817,23 +850,6 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ -/* - * Test each non-SRCU synchronous grace-period wait API. This is - * useful just after a change in mode for these primitives, and - * during early boot. - */ -void rcu_test_sync_prims(void) -{ - if (!IS_ENABLED(CONFIG_PROVE_RCU)) - return; - synchronize_rcu(); - synchronize_rcu_bh(); - synchronize_sched(); - synchronize_rcu_expedited(); - synchronize_rcu_bh_expedited(); - synchronize_sched_expedited(); -} - #ifdef CONFIG_PROVE_RCU /* -- cgit v1.2.3-70-g09d2 From c2a8ec0778b2ca0d360ba9b5cac7fcd5ddfe798f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Mar 2017 15:31:55 -0800 Subject: srcu: Move to state-based grace-period sequencing The current SRCU grace-period processing might never reach the last portion of srcu_advance_batches(). This is OK given the current implementation, as the first portion, up to the try_check_zero() following the srcu_flip() is sufficient to drive grace periods forward. However, it has the unfortunate side-effect of making it impossible to determine when a given grace period has ended, and it will be necessary to efficiently trace ends of grace periods in order to efficiently handle per-CPU SRCU callback lists. This commit therefore adds states to the SRCU grace-period processing, so that the end of a given SRCU grace period is marked by the transition to the SRCU_STATE_DONE state. Signed-off-by: Paul E. 
McKenney --- include/linux/srcu.h | 10 ++++- kernel/rcu/srcu.c | 111 ++++++++++++++++++++++++++++----------------------- 2 files changed, 69 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a598cf3ac70c..f149a685896c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -48,7 +48,7 @@ struct srcu_struct { unsigned long completed; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ - bool running; + int srcu_state; /* callbacks just queued */ struct rcu_batch batch_queue; /* callbacks try to do the first check_zero */ @@ -62,6 +62,12 @@ struct srcu_struct { #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; +/* Values for -> state variable. */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 +#define SRCU_STATE_DONE 3 + #ifdef CONFIG_DEBUG_LOCK_ALLOC int __init_srcu_struct(struct srcu_struct *sp, const char *name, @@ -89,7 +95,7 @@ void process_srcu(struct work_struct *work); .completed = -300, \ .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .running = false, \ + .srcu_state = SRCU_STATE_IDLE, \ .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 821ecda873f2..70286f68f3a1 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -111,7 +111,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; spin_lock_init(&sp->queue_lock); - sp->running = false; + sp->srcu_state = SRCU_STATE_IDLE; rcu_batch_init(&sp->batch_queue); rcu_batch_init(&sp->batch_check0); rcu_batch_init(&sp->batch_check1); @@ -270,7 +270,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) if (WARN_ON(!rcu_all_batches_empty(sp))) return; /* Leakage unless caller handles error. */ flush_delayed_work(&sp->work); - if (WARN_ON(sp->running)) + if (WARN_ON(READ_ONCE(sp->srcu_state) != SRCU_STATE_IDLE)) return; /* Caller forgot to stop doing call_srcu()? */ free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; @@ -391,8 +391,8 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, spin_lock_irqsave(&sp->queue_lock, flags); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_batch_queue(&sp->batch_queue, head); - if (!sp->running) { - sp->running = true; + if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); @@ -424,9 +424,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. 
*/ - if (!sp->running) { + if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { /* steal the processing owner */ - sp->running = true; + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); rcu_batch_queue(&sp->batch_check0, head); spin_unlock_irq(&sp->queue_lock); /* give the processing owner to work_struct */ @@ -548,7 +548,9 @@ static void srcu_collect_new(struct srcu_struct *sp) */ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) { - int idx = 1 ^ (sp->completed & 1); + int idx; + + WARN_ON_ONCE(sp->srcu_state == SRCU_STATE_IDLE); /* * Because readers might be delayed for an extended period after @@ -558,48 +560,56 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) * invoking a callback. */ - if (rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_check1)) - return; /* no callbacks need to be advanced */ - - if (!try_check_zero(sp, idx, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done with their - * first zero check and flip back when they were enqueued on - * ->batch_check0 in a previous invocation of srcu_advance_batches(). - * (Presumably try_check_zero() returned false during that - * invocation, leaving the callbacks stranded on ->batch_check1.) - * They are therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - - if (rcu_batch_empty(&sp->batch_check0)) - return; /* no callbacks need to be advanced */ - srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - - /* - * SRCU read-side critical sections are normally short, so check - * at least twice in quick succession after a flip. - */ - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx^1, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ + if (sp->srcu_state == SRCU_STATE_DONE) + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + + if (sp->srcu_state == SRCU_STATE_SCAN1) { + idx = 1 ^ (sp->completed & 1); + if (!try_check_zero(sp, idx, trycount)) + return; /* readers present, retry after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have already done + * with their first zero check and flip back when they were + * enqueued on ->batch_check0 in a previous invocation of + * srcu_advance_batches(). (Presumably try_check_zero() + * returned false during that invocation, leaving the + * callbacks stranded on ->batch_check1.) They are therefore + * ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); + srcu_flip(sp); + + /* + * The callbacks in ->batch_check0 just finished their + * first check zero and flip, so move them to ->batch_check1 + * for future checking on the other idx. + */ + rcu_batch_move(&sp->batch_check1, &sp->batch_check0); + + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN2); + } - /* - * The callbacks in ->batch_check1 have now waited for all - * pre-existing readers using both idx values. They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); + if (sp->srcu_state == SRCU_STATE_SCAN2) { + + /* + * SRCU read-side critical sections are normally short, + * so check at least twice in quick succession after a flip. 
+ */ + idx = 1 ^ (sp->completed & 1); + trycount = trycount < 2 ? 2 : trycount; + if (!try_check_zero(sp, idx, trycount)) + return; /* readers present, retry after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have now waited for + * all pre-existing readers using both idx values. They are + * therefore ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); + + WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); + } } /* @@ -633,8 +643,9 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) if (rcu_all_batches_empty(sp)) { spin_lock_irq(&sp->queue_lock); - if (rcu_all_batches_empty(sp)) { - sp->running = false; + if (rcu_all_batches_empty(sp) && + READ_ONCE(sp->srcu_state) == SRCU_STATE_DONE) { + WRITE_ONCE(sp->srcu_state, SRCU_STATE_IDLE); pending = false; } spin_unlock_irq(&sp->queue_lock); -- cgit v1.2.3-70-g09d2 From ac367c1c621b75689f6d5cd8301d364ba2c9f292 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 11 Mar 2017 07:14:06 -0800 Subject: srcu: Add grace-period sequence numbers This commit adds grace-period sequence numbers, which will be used to handle mid-boot grace periods and per-CPU callback lists. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + kernel/rcu/srcu.c | 27 +++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f149a685896c..047ac8c28a4e 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -46,6 +46,7 @@ struct rcu_batch { struct srcu_struct { unsigned long completed; + unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ int srcu_state; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 70286f68f3a1..d464986d82b6 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -110,6 +110,7 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; + sp->srcu_gp_seq = 0; spin_lock_init(&sp->queue_lock); sp->srcu_state = SRCU_STATE_IDLE; rcu_batch_init(&sp->batch_queue); @@ -318,6 +319,15 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); #define SYNCHRONIZE_SRCU_TRYCOUNT 2 #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 +/* + * Start an SRCU grace period. + */ +static void srcu_gp_start(struct srcu_struct *sp) +{ + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + rcu_seq_start(&sp->srcu_gp_seq); +} + /* * @@@ Wait until all pre-existing readers complete. Such readers * will have used the index specified by "idx". @@ -354,6 +364,15 @@ static void srcu_flip(struct srcu_struct *sp) smp_mb(); /* D */ /* Pairs with C. */ } +/* + * End an SRCU grace period. + */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + rcu_seq_end(&sp->srcu_gp_seq); + WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); +} + /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. @@ -392,7 +411,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. 
*/ rcu_batch_queue(&sp->batch_queue, head); if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + srcu_gp_start(sp); queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); @@ -426,7 +445,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { /* steal the processing owner */ - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + srcu_gp_start(sp); rcu_batch_queue(&sp->batch_check0, head); spin_unlock_irq(&sp->queue_lock); /* give the processing owner to work_struct */ @@ -561,7 +580,7 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) */ if (sp->srcu_state == SRCU_STATE_DONE) - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + srcu_gp_start(sp); if (sp->srcu_state == SRCU_STATE_SCAN1) { idx = 1 ^ (sp->completed & 1); @@ -608,7 +627,7 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) */ rcu_batch_move(&sp->batch_done, &sp->batch_check1); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); + srcu_gp_end(sp); } } -- cgit v1.2.3-70-g09d2 From 8660b7d8a545227fd9ee80508aa82528ea9947d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Mar 2017 16:48:18 -0700 Subject: srcu: Use rcu_segcblist to track SRCU callbacks This commit switches SRCU from custom-built callback queues to the new rcu_segcblist structure. This change associates grace-period sequence numbers with groups of callbacks, which will be needed for efficient processing of per-CPU callbacks. Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 678 ++++++++++++++++++++++++++++++++++++++++++ include/linux/srcu.h | 24 +- kernel/rcu/rcu.h | 6 + kernel/rcu/rcu_segcblist.h | 670 ----------------------------------------- kernel/rcu/srcu.c | 159 ++-------- kernel/rcu/tree.h | 2 +- 6 files changed, 721 insertions(+), 818 deletions(-) create mode 100644 include/linux/rcu_segcblist.h delete mode 100644 kernel/rcu/rcu_segcblist.h (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h new file mode 100644 index 000000000000..74b1e7243955 --- /dev/null +++ b/include/linux/rcu_segcblist.h @@ -0,0 +1,678 @@ +/* + * RCU segmented callback lists + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#ifndef __KERNEL_RCU_SEGCBLIST_H +#define __KERNEL_RCU_SEGCBLIST_H + +/* Simple unsegmented callback lists. */ +struct rcu_cblist { + struct rcu_head *head; + struct rcu_head **tail; + long len; + long len_lazy; +}; + +#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } + +/* Initialize simple callback list. 
*/ +static inline void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* Is simple callback list empty? */ +static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) +{ + return !rclp->head; +} + +/* Return number of callbacks in simple callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return rclp->len; +} + +/* Return number of lazy callbacks in simple callback list. */ +static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) +{ + return rclp->len_lazy; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(rcu_cblist_empty(rclp)); + return rclp->tail; +} + +/* Complicated segmented callback lists. ;-) */ + +/* + * Index values for segments in rcu_segcblist structure. + * + * The segments are as follows: + * + * [head, *tails[RCU_DONE_TAIL]): + * Callbacks whose grace period has elapsed, and thus can be invoked. + * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): + * Callbacks waiting for the current GP from the current CPU's viewpoint. + * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): + * Callbacks that arrived before the next GP started, again from + * the current CPU's viewpoint. These can be handled by the next GP. + * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): + * Callbacks that might have arrived after the next GP started. + * There is some uncertainty as to when a given GP starts and + * ends, but a CPU knows the exact times if it is the one starting + * or ending the GP. Other CPUs know that the previous GP ends + * before the next one starts. + * + * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also + * empty. + * + * The ->gp_seq[] array contains the grace-period number at which the + * corresponding segment of callbacks will be ready to invoke. 
A given + * element of this array is meaningful only when the corresponding segment + * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks + * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have + * not yet been assigned a grace-period number). + */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_CBLIST_NSEGS 4 + +struct rcu_segcblist { + struct rcu_head *head; + struct rcu_head **tails[RCU_CBLIST_NSEGS]; + unsigned long gp_seq[RCU_CBLIST_NSEGS]; + long len; + long len_lazy; +}; + +#define RCU_SEGCBLIST_INITIALIZER(n) \ +{ \ + .head = NULL, \ + .tails[RCU_DONE_TAIL] = &n.head, \ + .tails[RCU_WAIT_TAIL] = &n.head, \ + .tails[RCU_NEXT_READY_TAIL] = &n.head, \ + .tails[RCU_NEXT_TAIL] = &n.head, \ +} + +/* + * Initialize an rcu_segcblist structure. + */ +static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? 
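For readers new to the ->tails[] idiom, a stand-alone toy version may help. It is illustrative only: the toy_* names are made up, and the real structure additionally carries the ->gp_seq[], ->len and ->len_lazy fields shown above. The sketch shows that all four tail pointers initially aim at ->head, that enqueueing advances only the RCU_NEXT tail (as rcu_segcblist_enqueue() later in this file does), and that segment emptiness is detected by comparing adjacent tail pointers, as in rcu_segcblist_segempty() just below.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_head { struct toy_head *next; };

#define TOY_DONE_TAIL           0
#define TOY_WAIT_TAIL           1
#define TOY_NEXT_READY_TAIL     2
#define TOY_NEXT_TAIL           3
#define TOY_NSEGS               4

struct toy_segcblist {
        struct toy_head *head;
        struct toy_head **tails[TOY_NSEGS];
};

static void toy_init(struct toy_segcblist *l)
{
        int i;

        l->head = NULL;
        for (i = 0; i < TOY_NSEGS; i++)
                l->tails[i] = &l->head;
}

/* A segment is empty when its tail equals the previous segment's tail. */
static bool toy_segempty(struct toy_segcblist *l, int seg)
{
        if (seg == TOY_DONE_TAIL)
                return &l->head == l->tails[TOY_DONE_TAIL];
        return l->tails[seg - 1] == l->tails[seg];
}

/* New callbacks always land at the end of the RCU_NEXT segment. */
static void toy_enqueue(struct toy_segcblist *l, struct toy_head *h)
{
        h->next = NULL;
        *l->tails[TOY_NEXT_TAIL] = h;
        l->tails[TOY_NEXT_TAIL] = &h->next;
}

int main(void)
{
        struct toy_segcblist l;
        struct toy_head cb;

        toy_init(&l);
        printf("NEXT empty before enqueue: %d\n", toy_segempty(&l, TOY_NEXT_TAIL));
        toy_enqueue(&l, &cb);
        printf("NEXT empty after enqueue:  %d\n", toy_segempty(&l, TOY_NEXT_TAIL));
        return 0;
}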
+ */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +static inline struct rcu_head * +rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +static inline struct rcu_head * +rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. + */ +static inline struct rcu_head * +rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? 
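A corresponding sketch of the simple-list side (modeled on rcu_cblist_dequeue() above, with an enqueue in the style of the rcu_batch_queue() helper this patch removes; the toy_* names are made up) highlights the tail fix-up that rcu_segcblist_dequeue() has to repeat for every segment whose tail pointed at the element just removed.

#include <stddef.h>
#include <stdio.h>

struct toy_head { struct toy_head *next; };

struct toy_cblist {
        struct toy_head *head;
        struct toy_head **tail;         /* &head, or &last->next */
};

static void toy_cblist_init(struct toy_cblist *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

static void toy_cblist_enqueue(struct toy_cblist *l, struct toy_head *h)
{
        h->next = NULL;
        *l->tail = h;
        l->tail = &h->next;
}

/*
 * Pop the head and, if the list is now empty, point ->tail back at
 * ->head so that later enqueues keep working.
 */
static struct toy_head *toy_cblist_dequeue(struct toy_cblist *l)
{
        struct toy_head *h = l->head;

        if (!h)
                return NULL;
        l->head = h->next;
        if (!l->head)
                l->tail = &l->head;
        return h;
}

int main(void)
{
        struct toy_cblist l;
        struct toy_head a, b;
        int n = 0;

        toy_cblist_init(&l);
        toy_cblist_enqueue(&l, &a);
        toy_cblist_enqueue(&l, &b);
        while (toy_cblist_dequeue(&l))
                n++;
        printf("dequeued %d callbacks\n", n);
        return 0;
}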
+ */ +static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +static inline void +rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Move the entire contents of the specified rcu_segcblist structure, + * counts, callbacks, and all, to the specified rcu_cblist structure. + * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? + * @@@ Memory barrier needed? (Not if only used at boot time...) 
+ */ +static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rcu_segcblist_extract_done_cbs(rsclp, rclp); + rcu_segcblist_extract_pend_cbs(rsclp, rclp); + rcu_segcblist_extract_count(rsclp, rclp); +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. */ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. 
When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number at which new callbacks would become + * ready to invoke. + */ +static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. 
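The ULONG_CMP_LT() tests used by rcu_segcblist_advance(), rcu_segcblist_accelerate() and rcu_segcblist_future_gp_needed() above compare grace-period sequence numbers in a wraparound-safe way. A stand-in with the same shape (toy_cmp_lt() is a made-up name, assuming the usual half-the-counter-space distance test) behaves sensibly even when the counter wraps, which a plain "<" would not:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for ULONG_CMP_LT(): wraparound-safe "a is before b". */
static bool toy_cmp_lt(unsigned long a, unsigned long b)
{
        return ULONG_MAX / 2 < a - b;
}

int main(void)
{
        unsigned long near_wrap = ULONG_MAX - 1;

        printf("%d\n", toy_cmp_lt(near_wrap, near_wrap + 4));  /* 1: before */
        printf("%d\n", toy_cmp_lt(near_wrap + 4, near_wrap));  /* 0: after */
        return 0;
}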
+ */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} + +#endif /* __KERNEL_RCU_SEGCBLIST_H */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 047ac8c28a4e..ad154a7bc114 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -22,7 +22,7 @@ * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -32,31 +32,20 @@ #include #include #include +#include struct srcu_array { unsigned long lock_count[2]; unsigned long unlock_count[2]; }; -struct rcu_batch { - struct rcu_head *head, **tail; -}; - -#define RCU_BATCH_INIT(name) { NULL, &(name.head) } - struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + spinlock_t queue_lock; /* protect ->srcu_cblist, ->srcu_state */ int srcu_state; - /* callbacks just queued */ - struct rcu_batch batch_queue; - /* callbacks try to do the first check_zero */ - struct rcu_batch batch_check0; - /* callbacks done with the first check_zero and the flip */ - struct rcu_batch batch_check1; - struct rcu_batch batch_done; + struct rcu_segcblist srcu_cblist; struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -97,10 +86,7 @@ void process_srcu(struct work_struct *work); .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ .srcu_state = SRCU_STATE_IDLE, \ - .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ - .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ - .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ - .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ __SRCU_DEP_MAP_INIT(name) \ } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0bc1313c49e2..a943b42a9cf7 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -87,6 +87,12 @@ static inline unsigned long rcu_seq_snap(unsigned long *sp) return s; } +/* Return the current value the update side's sequence number, no ordering. */ +static inline unsigned long rcu_seq_current(unsigned long *sp) +{ + return READ_ONCE(*sp); +} + /* * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred. diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h deleted file mode 100644 index 982e3e05b22a..000000000000 --- a/kernel/rcu/rcu_segcblist.h +++ /dev/null @@ -1,670 +0,0 @@ -/* - * RCU segmented callback lists - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright IBM Corporation, 2017 - * - * Authors: Paul E. 
McKenney - */ - -#ifndef __KERNEL_RCU_SEGCBLIST_H -#define __KERNEL_RCU_SEGCBLIST_H - -/* Simple unsegmented callback lists. */ -struct rcu_cblist { - struct rcu_head *head; - struct rcu_head **tail; - long len; - long len_lazy; -}; - -#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } - -/* Initialize simple callback list. */ -static inline void rcu_cblist_init(struct rcu_cblist *rclp) -{ - rclp->head = NULL; - rclp->tail = &rclp->head; - rclp->len = 0; - rclp->len_lazy = 0; -} - -/* Is simple callback list empty? */ -static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) -{ - return !rclp->head; -} - -/* Return number of callbacks in simple callback list. */ -static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) -{ - return rclp->len; -} - -/* Return number of lazy callbacks in simple callback list. */ -static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) -{ - return rclp->len_lazy; -} - -/* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. - */ -static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - -/* - * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. - */ -static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) -{ - struct rcu_head *rhp; - - rhp = rclp->head; - if (!rhp) - return NULL; - prefetch(rhp); - rclp->len--; - rclp->head = rhp->next; - if (!rclp->head) - rclp->tail = &rclp->head; - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(rcu_cblist_empty(rclp)); - return rclp->tail; -} - -/* Complicated segmented callback lists. ;-) */ - -/* - * Index values for segments in rcu_segcblist structure. - * - * The segments are as follows: - * - * [head, *tails[RCU_DONE_TAIL]): - * Callbacks whose grace period has elapsed, and thus can be invoked. - * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): - * Callbacks waiting for the current GP from the current CPU's viewpoint. - * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): - * Callbacks that arrived before the next GP started, again from - * the current CPU's viewpoint. These can be handled by the next GP. - * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): - * Callbacks that might have arrived after the next GP started. 
- * There is some uncertainty as to when a given GP starts and - * ends, but a CPU knows the exact times if it is the one starting - * or ending the GP. Other CPUs know that the previous GP ends - * before the next one starts. - * - * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also - * empty. - * - * The ->gp_seq[] array contains the grace-period number at which the - * corresponding segment of callbacks will be ready to invoke. A given - * element of this array is meaningful only when the corresponding segment - * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks - * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have - * not yet been assigned a grace-period number). - */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_CBLIST_NSEGS 4 - -struct rcu_segcblist { - struct rcu_head *head; - struct rcu_head **tails[RCU_CBLIST_NSEGS]; - unsigned long gp_seq[RCU_CBLIST_NSEGS]; - long len; - long len_lazy; -}; - -/* - * Initialize an rcu_segcblist structure. - */ -static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) -{ - int i; - - BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); - BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); - rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = &rsclp->head; - rsclp->len = 0; - rsclp->len_lazy = 0; -} - -/* - * Is the specified rcu_segcblist structure empty? - * - * But careful! The fact that the ->head field is NULL does not - * necessarily imply that there are no callbacks associated with - * this structure. When callbacks are being invoked, they are - * removed as a group. If callback invocation must be preempted, - * the remaining callbacks will be added back to the list. Either - * way, the counts are updated later. - * - * So it is often the case that rcu_segcblist_n_cbs() should be used - * instead. - */ -static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) -{ - return !rsclp->head; -} - -/* Return number of callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) -{ - return READ_ONCE(rsclp->len); -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len - rsclp->len_lazy; -} - -/* - * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? - */ -static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) -{ - return !!rsclp->tails[RCU_NEXT_TAIL]; -} - -/* - * Disable the specified rcu_segcblist structure, so that callbacks can - * no longer be posted to it. This structure must be empty. - */ -static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); - rsclp->tails[RCU_NEXT_TAIL] = NULL; -} - -/* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? 
- */ -static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - -/* - * Are all segments following the specified segment of the specified - * rcu_segcblist structure empty of callbacks? (The specified - * segment might well contain callbacks.) - */ -static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) -{ - return !*rsclp->tails[seg]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are ready to be invoked? - */ -static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are still pending, that is, not yet ready to be invoked? - */ -static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); -} - -/* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. - */ -static inline struct rcu_head * -rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - -/* - * Return a pointer to the first callback in the specified rcu_segcblist - * structure. This is useful for diagnostics. - */ -static inline struct rcu_head * -rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return rsclp->head; - return NULL; -} - -/* - * Return a pointer to the first pending callback in the specified - * rcu_segcblist structure. This is useful just after posting a given - * callback -- if that callback is the first pending callback, then - * you cannot rely on someone else having already started up the required - * grace period. - */ -static inline struct rcu_head * -rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return *rsclp->tails[RCU_DONE_TAIL]; - return NULL; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? 
- */ -static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - -/* - * Enqueue the specified callback onto the specified rcu_segcblist - * structure, updating accounting as needed. Note that the ->len - * field may be accessed locklessly, hence the WRITE_ONCE(). - * The ->len field is used by rcu_barrier() and friends to determine - * if it must post a callback on this structure, and it is OK - * for rcu_barrier() to sometimes post callbacks needlessly, but - * absolutely not OK for it to ever miss posting a callback. - */ -static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) -{ - WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is enqueued. */ - rhp->next = NULL; - *rsclp->tails[RCU_NEXT_TAIL] = rhp; - rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; -} - -/* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len_lazy += rsclp->len_lazy; - rclp->len += rsclp->len; - rsclp->len_lazy = 0; - WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ -} - -/* - * Extract only those callbacks ready to be invoked from the specified - * rcu_segcblist structure and place them in the specified rcu_cblist - * structure. - */ -static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_ready_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = rsclp->head; - rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - rclp->tail = rsclp->tails[RCU_DONE_TAIL]; - for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) - if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) - rsclp->tails[i] = &rsclp->head; -} - -/* - * Extract only those callbacks still pending (not yet ready to be - * invoked) from the specified rcu_segcblist structure and place them in - * the specified rcu_cblist structure. Note that this loses information - * about any callbacks that might have been partway done waiting for - * their grace period. Too bad! They will have to start over. - */ -static inline void -rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_pend_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; - rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Move the entire contents of the specified rcu_segcblist structure, - * counts, callbacks, and all, to the specified rcu_cblist structure. - * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? - * @@@ Memory barrier needed? (Not if only used at boot time...) 
- */ -static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rcu_segcblist_extract_done_cbs(rsclp, rclp); - rcu_segcblist_extract_pend_cbs(rsclp, rclp); - rcu_segcblist_extract_count(rsclp, rclp); -} - -/* - * Insert counts from the specified rcu_cblist structure in the - * specified rcu_segcblist structure. - */ -static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rsclp->len_lazy += rclp->len_lazy; - /* ->len sampled locklessly. */ - WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); - rclp->len_lazy = 0; - rclp->len = 0; -} - -/* - * Move callbacks from the specified rcu_cblist to the beginning of the - * done-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rclp->head) - return; /* No callbacks to move. */ - *rclp->tail = rsclp->head; - rsclp->head = rclp->head; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) - if (&rsclp->head == rsclp->tails[i]) - rsclp->tails[i] = rclp->tail; - else - break; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Move callbacks from the specified rcu_cblist to the end of the - * new-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - if (!rclp->head) - return; /* Nothing to do. */ - *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; - rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Advance the callbacks in the specified rcu_segcblist structure based - * on the current value passed in for the grace-period counter. - */ -static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i, j; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); - - /* - * Find all callbacks whose ->gp_seq numbers indicate that they - * are ready to invoke, and put them into the RCU_DONE_TAIL segment. - */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - break; - rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; - } - - /* If no callbacks moved, nothing more need be done. */ - if (i == RCU_WAIT_TAIL) - return; - - /* Clean up tail pointers that might have been misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; - - /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. - */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) - break; /* No more callbacks. */ - rsclp->tails[j] = rsclp->tails[i]; - rsclp->gp_seq[j] = rsclp->gp_seq[i]; - } -} - -/* - * "Accelerate" callbacks based on more-accurate grace-period information. - * The reason for this is that RCU does not synchronize the beginnings and - * ends of grace periods, and that callbacks are posted locally. This in - * turn means that the callbacks must be labelled conservatively early - * on, as getting exact information would degrade both performance and - * scalability. 
When more accurate grace-period information becomes - * available, previously posted callbacks can be "accelerated", marking - * them to complete at the end of the earlier grace period. - * - * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number at which new callbacks would become - * ready to invoke. - */ -static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); - - /* - * Find the segment preceding the oldest segment of callbacks - * whose ->gp_seq[] completion is at or after that passed in via - * "seq", skipping any empty segments. This oldest segment, along - * with any later segments, can be merged in with any newly arrived - * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" - * as their ->gp_seq[] grace-period completion sequence number. - */ - for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && - ULONG_CMP_LT(rsclp->gp_seq[i], seq)) - break; - - /* - * If all the segments contain callbacks that correspond to - * earlier grace-period sequence numbers than "seq", leave. - * Assuming that the rcu_segcblist structure has enough - * segments in its arrays, this can only happen if some of - * the non-done segments contain callbacks that really are - * ready to invoke. This situation will get straightened - * out by the next call to rcu_segcblist_advance(). - * - * Also advance to the oldest segment of callbacks whose - * ->gp_seq[] completion is at or after that passed in via "seq", - * skipping any empty segments. - */ - if (++i >= RCU_NEXT_TAIL) - return false; - - /* - * Merge all later callbacks, including newly arrived callbacks, - * into the segment located by the for-loop above. Assign "seq" - * as the ->gp_seq[] value in order to correctly handle the case - * where there were no pending callbacks in the rcu_segcblist - * structure other than in the RCU_NEXT_TAIL segment. - */ - for (; i < RCU_NEXT_TAIL; i++) { - rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; - rsclp->gp_seq[i] = seq; - } - return true; -} - -/* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. 
- */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; -} - -#endif /* __KERNEL_RCU_SEGCBLIST_H */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index d464986d82b6..56fd30862122 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -22,7 +22,7 @@ * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -38,85 +38,13 @@ #include "rcu.h" -/* - * Initialize an rcu_batch structure to empty. - */ -static inline void rcu_batch_init(struct rcu_batch *b) -{ - b->head = NULL; - b->tail = &b->head; -} - -/* - * Enqueue a callback onto the tail of the specified rcu_batch structure. - */ -static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) -{ - *b->tail = head; - b->tail = &head->next; -} - -/* - * Is the specified rcu_batch structure empty? - */ -static inline bool rcu_batch_empty(struct rcu_batch *b) -{ - return b->tail == &b->head; -} - -/* - * Are all batches empty for the specified srcu_struct? - */ -static inline bool rcu_all_batches_empty(struct srcu_struct *sp) -{ - return rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue); -} - -/* - * Remove the callback at the head of the specified rcu_batch structure - * and return a pointer to it, or return NULL if the structure is empty. - */ -static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) -{ - struct rcu_head *head; - - if (rcu_batch_empty(b)) - return NULL; - - head = b->head; - b->head = head->next; - if (b->tail == &head->next) - rcu_batch_init(b); - - return head; -} - -/* - * Move all callbacks from the rcu_batch structure specified by "from" to - * the structure specified by "to". - */ -static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) -{ - if (!rcu_batch_empty(from)) { - *to->tail = from->head; - to->tail = from->tail; - rcu_batch_init(from); - } -} - static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; sp->srcu_gp_seq = 0; spin_lock_init(&sp->queue_lock); sp->srcu_state = SRCU_STATE_IDLE; - rcu_batch_init(&sp->batch_queue); - rcu_batch_init(&sp->batch_check0); - rcu_batch_init(&sp->batch_check1); - rcu_batch_init(&sp->batch_done); + rcu_segcblist_init(&sp->srcu_cblist); INIT_DELAYED_WORK(&sp->work, process_srcu); sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; @@ -268,7 +196,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ - if (WARN_ON(!rcu_all_batches_empty(sp))) + if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) return; /* Leakage unless caller handles error. 
*/ flush_delayed_work(&sp->work); if (WARN_ON(READ_ONCE(sp->srcu_state) != SRCU_STATE_IDLE)) @@ -324,6 +252,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *sp) { + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); rcu_seq_start(&sp->srcu_gp_seq); } @@ -371,6 +301,11 @@ static void srcu_gp_end(struct srcu_struct *sp) { rcu_seq_end(&sp->srcu_gp_seq); WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); + + spin_lock_irq(&sp->queue_lock); + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + spin_unlock_irq(&sp->queue_lock); } /* @@ -409,7 +344,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_batch_queue(&sp->batch_queue, head); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { srcu_gp_start(sp); queue_delayed_work(system_power_efficient_wq, &sp->work, 0); @@ -445,13 +380,13 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { /* steal the processing owner */ + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); srcu_gp_start(sp); - rcu_batch_queue(&sp->batch_check0, head); spin_unlock_irq(&sp->queue_lock); /* give the processing owner to work_struct */ srcu_reschedule(sp, 0); } else { - rcu_batch_queue(&sp->batch_queue, head); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); spin_unlock_irq(&sp->queue_lock); } @@ -548,19 +483,6 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed); #define SRCU_CALLBACK_BATCH 10 #define SRCU_INTERVAL 1 -/* - * Move any new SRCU callbacks to the first stage of the SRCU grace - * period pipeline. - */ -static void srcu_collect_new(struct srcu_struct *sp) -{ - if (!rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - rcu_batch_move(&sp->batch_check0, &sp->batch_queue); - spin_unlock_irq(&sp->queue_lock); - } -} - /* * Core SRCU state machine. Advance callbacks from ->batch_check0 to * ->batch_check1 and then to ->batch_done as readers drain. @@ -586,26 +508,7 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) idx = 1 ^ (sp->completed & 1); if (!try_check_zero(sp, idx, trycount)) return; /* readers present, retry after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done - * with their first zero check and flip back when they were - * enqueued on ->batch_check0 in a previous invocation of - * srcu_advance_batches(). (Presumably try_check_zero() - * returned false during that invocation, leaving the - * callbacks stranded on ->batch_check1.) They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN2); } @@ -619,14 +522,6 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) trycount = trycount < 2 ? 
2 : trycount; if (!try_check_zero(sp, idx, trycount)) return; /* readers present, retry after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have now waited for - * all pre-existing readers using both idx values. They are - * therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - srcu_gp_end(sp); } } @@ -639,17 +534,26 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { - int i; - struct rcu_head *head; + struct rcu_cblist ready_cbs; + struct rcu_head *rhp; - for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { - head = rcu_batch_dequeue(&sp->batch_done); - if (!head) - break; + spin_lock_irq(&sp->queue_lock); + if (!rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + spin_unlock_irq(&sp->queue_lock); + return; + } + rcu_cblist_init(&ready_cbs); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sp->queue_lock); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { local_bh_disable(); - head->func(head); + rhp->func(rhp); local_bh_enable(); } + spin_lock_irq(&sp->queue_lock); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sp->queue_lock); } /* @@ -660,9 +564,9 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pending = true; - if (rcu_all_batches_empty(sp)) { + if (rcu_segcblist_empty(&sp->srcu_cblist)) { spin_lock_irq(&sp->queue_lock); - if (rcu_all_batches_empty(sp) && + if (rcu_segcblist_empty(&sp->srcu_cblist) && READ_ONCE(sp->srcu_state) == SRCU_STATE_DONE) { WRITE_ONCE(sp->srcu_state, SRCU_STATE_IDLE); pending = false; @@ -683,7 +587,6 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); - srcu_collect_new(sp); srcu_advance_batches(sp, 1); srcu_invoke_callbacks(sp); srcu_reschedule(sp, SRCU_INTERVAL); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 93889ff21dbb..4f62651588ea 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,7 +30,7 @@ #include #include #include -#include "rcu_segcblist.h" +#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and -- cgit v1.2.3-70-g09d2 From f2425b4efb0c69e77c0b9666b605ae4a1ecaae47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 12:42:30 -0700 Subject: srcu: Move combining-tree definitions for SRCU's benefit This commit moves the C preprocessor code that defines the default shape of the rcu_node combining tree to a new include/linux/rcu_node_tree.h file as a first step towards enabling SRCU to create its own combining tree, which in turn enables SRCU to implement per-CPU callback handling, thus avoiding contention on the lock currently guarding the single list of callbacks. Note that users of SRCU still need to know the size of the srcu_struct structure, hence include/linux rather than kernel/rcu. This commit is code-movement only. Signed-off-by: Paul E. 
McKenney --- include/linux/rcu_node_tree.h | 102 ++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree.h | 71 +---------------------------- 2 files changed, 103 insertions(+), 70 deletions(-) create mode 100644 include/linux/rcu_node_tree.h (limited to 'include/linux') diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h new file mode 100644 index 000000000000..b7eb97096b1c --- /dev/null +++ b/include/linux/rcu_node_tree.h @@ -0,0 +1,102 @@ +/* + * RCU node combining tree definitions. These are used to compute + * global attributes while avoiding common-case global contention. A key + * property that these computations rely on is a tournament-style approach + * where only one of the tasks contending a lower level in the tree need + * advance to the next higher level. If properly configured, this allows + * unlimited scalability while maintaining a constant level of contention + * on the root node. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Author: Paul E. McKenney + */ + +#ifndef __LINUX_RCU_NODE_TREE_H +#define __LINUX_RCU_NODE_TREE_H + +/* + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. 
+ */ + +#ifdef CONFIG_RCU_FANOUT +#define RCU_FANOUT CONFIG_RCU_FANOUT +#else /* #ifdef CONFIG_RCU_FANOUT */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT 64 +# else +# define RCU_FANOUT 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT */ + +#ifdef CONFIG_RCU_FANOUT_LEAF +#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ +#define RCU_FANOUT_LEAF 16 +#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ + +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 +# define RCU_NUM_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +#elif NR_CPUS <= RCU_FANOUT_2 +# define RCU_NUM_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +#elif NR_CPUS <= RCU_FANOUT_3 +# define RCU_NUM_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +#elif NR_CPUS <= RCU_FANOUT_4 +# define RCU_NUM_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ + +extern int rcu_num_lvls; +extern int rcu_num_nodes; + +#endif /* __LINUX_RCU_NODE_TREE_H */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4f62651588ea..1bec3958d44f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -31,76 +31,7 @@ #include #include #include - -/* - * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and - * CONFIG_RCU_FANOUT_LEAF. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this did work well going from three levels to four. - * Of course, your mileage may vary. 
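/*
 * A minimal standalone sketch of how the macros above size the combining
 * tree, assuming a hypothetical NR_CPUS of 400 with the default
 * RCU_FANOUT=64 and RCU_FANOUT_LEAF=16 on a 64-bit build; the arithmetic
 * is worked out in user space purely for illustration.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))	/* mirrors the kernel macro */

int main(void)
{
	const int nr_cpus = 400;		/* hypothetical NR_CPUS */
	const int fanout_1 = 16;		/* RCU_FANOUT_LEAF */
	const int fanout_2 = fanout_1 * 64;	/* RCU_FANOUT_1 * RCU_FANOUT = 1024 */

	if (nr_cpus <= fanout_1)
		printf("1 level, 1 rcu_node\n");
	else if (nr_cpus <= fanout_2)
		/* NUM_RCU_LVL_1 = DIV_ROUND_UP(400, 16) = 25 leaves, plus the root */
		printf("2 levels, %d rcu_nodes\n",
		       1 + DIV_ROUND_UP(nr_cpus, fanout_1));
	return 0;
}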
- */ - -#ifdef CONFIG_RCU_FANOUT -#define RCU_FANOUT CONFIG_RCU_FANOUT -#else /* #ifdef CONFIG_RCU_FANOUT */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT 64 -# else -# define RCU_FANOUT 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT */ - -#ifdef CONFIG_RCU_FANOUT_LEAF -#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF -#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -#define RCU_FANOUT_LEAF 16 -#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ - -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT_1 -# define RCU_NUM_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_NODES NUM_RCU_LVL_0 -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } -# define RCU_NODE_NAME_INIT { "rcu_node_0" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -#elif NR_CPUS <= RCU_FANOUT_2 -# define RCU_NUM_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -#elif NR_CPUS <= RCU_FANOUT_3 -# define RCU_NUM_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -#elif NR_CPUS <= RCU_FANOUT_4 -# define RCU_NUM_LVLS 4 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -#else -# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ - -extern int rcu_num_lvls; -extern int rcu_num_nodes; +#include /* * Dynticks per-CPU state. -- cgit v1.2.3-70-g09d2 From 2b34c43cc1671c59bad6dd1682ae3ee4f0919eb7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 14:29:53 -0700 Subject: srcu: Move rcu_init_levelspread() to rcu_tree_node.h This commit moves the rcu_init_levelspread() function from kernel/rcu/tree.c to kernel/rcu/rcu.h so that SRCU can access it. This is another step towards enabling SRCU to create its own combining tree. This commit is code-movement only, give or take knock-on adjustments. Signed-off-by: Paul E. 
McKenney --- include/linux/rcu_node_tree.h | 3 --- kernel/rcu/rcu.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/rcu/srcu.c | 1 + kernel/rcu/tree.c | 25 ------------------------- kernel/rcu/tree_trace.c | 1 + 5 files changed, 38 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h index b7eb97096b1c..4b766b61e1a0 100644 --- a/include/linux/rcu_node_tree.h +++ b/include/linux/rcu_node_tree.h @@ -96,7 +96,4 @@ # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -extern int rcu_num_lvls; -extern int rcu_num_nodes; - #endif /* __LINUX_RCU_NODE_TREE_H */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a943b42a9cf7..87326479b39a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -190,4 +190,40 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); +#if defined(SRCU) || !defined(TINY_RCU) + +#include + +extern int rcu_num_lvls; +extern int rcu_num_nodes; +static bool rcu_fanout_exact; +static int rcu_fanout_leaf; + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on the rcu_fanout_exact boot parameter. + */ +static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) +{ + int i; + + if (rcu_fanout_exact) { + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + levelspread[i] = RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } + } +} + +#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 56fd30862122..0b511de7ca4d 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -36,6 +36,7 @@ #include #include +#include #include "rcu.h" static int init_srcu_struct_fields(struct srcu_struct *sp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 844a030c1960..df3527744af8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3951,31 +3951,6 @@ void rcu_scheduler_starting(void) rcu_test_sync_prims(); } -/* - * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on the rcu_fanout_exact boot parameter. - */ -static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) -{ - int i; - - if (rcu_fanout_exact) { - levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - levelspread[i] = RCU_FANOUT; - } else { - int ccur; - int cprv; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = levelcnt[i]; - levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; - } - } -} - /* * Helper function for rcu_init() that initializes one rcu_state structure. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 066c64071a7b..30c5bf89ee58 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -45,6 +45,7 @@ #define RCU_TREE_NONCORE #include "tree.h" +#include "rcu.h" static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) -- cgit v1.2.3-70-g09d2 From 80a7956fe36c2ee40c6ff12c77926d267802b7c8 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 22 Mar 2017 15:26:18 -0700 Subject: srcu: Merge ->srcu_state into ->srcu_gp_seq Updating ->srcu_state and ->srcu_gp_seq will lead to extremely complex race conditions given multiple callback queues, so this commit takes advantage of the two-bit state now available in rcu_seq counters to store the state in the bottom two bits of ->srcu_gp_seq. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 5 +---- kernel/rcu/rcu.h | 10 ++++++++++ kernel/rcu/srcu.c | 55 +++++++++++++++++++++++++++++++++------------------- 3 files changed, 46 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ad154a7bc114..e7dbc01b61a1 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -43,8 +43,7 @@ struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist, ->srcu_state */ - int srcu_state; + spinlock_t queue_lock; /* protect ->srcu_cblist */ struct rcu_segcblist srcu_cblist; struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -56,7 +55,6 @@ struct srcu_struct { #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define SRCU_STATE_DONE 3 #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -85,7 +83,6 @@ void process_srcu(struct work_struct *work); .completed = -300, \ .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_state = SRCU_STATE_IDLE, \ .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ __SRCU_DEP_MAP_INIT(name) \ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 87a0ac95b551..73e16ec4054b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -82,6 +82,16 @@ static inline int rcu_seq_state(unsigned long s) return s & RCU_SEQ_STATE_MASK; } +/* + * Set the state portion of the pointed-to sequence number. + * The caller is responsible for preventing conflicting updates. + */ +static inline void rcu_seq_set_state(unsigned long *sp, int newstate) +{ + WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); + WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); +} + /* Adjust sequence number for start of update-side operation. */ static inline void rcu_seq_start(unsigned long *sp) { diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 1a2dc74bb625..90ffea31b188 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -44,7 +44,6 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) sp->completed = 0; sp->srcu_gp_seq = 0; spin_lock_init(&sp->queue_lock); - sp->srcu_state = SRCU_STATE_IDLE; rcu_segcblist_init(&sp->srcu_cblist); INIT_DELAYED_WORK(&sp->work, process_srcu); sp->per_cpu_ref = alloc_percpu(struct srcu_array); @@ -180,6 +179,9 @@ static bool srcu_readers_active(struct srcu_struct *sp) return sum; } +#define SRCU_CALLBACK_BATCH 10 +#define SRCU_INTERVAL 1 + /** * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. @@ -200,8 +202,10 @@ void cleanup_srcu_struct(struct srcu_struct *sp) if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) return; /* Leakage unless caller handles error. 
*/ flush_delayed_work(&sp->work); - if (WARN_ON(READ_ONCE(sp->srcu_state) != SRCU_STATE_IDLE)) + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE)) { + pr_info("cleanup_srcu_struct: Active srcu_struct %lu CBs %c state: %d\n", rcu_segcblist_n_cbs(&sp->srcu_cblist), ".E"[rcu_segcblist_empty(&sp->srcu_cblist)], rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ + } free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } @@ -253,10 +257,13 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *sp) { + int state; + rcu_segcblist_accelerate(&sp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); rcu_seq_start(&sp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } /* @@ -300,7 +307,6 @@ static void srcu_flip(struct srcu_struct *sp) static void srcu_gp_end(struct srcu_struct *sp) { rcu_seq_end(&sp->srcu_gp_seq); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); spin_lock_irq(&sp->queue_lock); rcu_segcblist_advance(&sp->srcu_cblist, @@ -345,7 +351,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, spin_lock_irqsave(&sp->queue_lock, flags); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { srcu_gp_start(sp); queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } @@ -378,7 +384,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { /* steal the processing owner */ rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); srcu_gp_start(sp); @@ -480,9 +486,6 @@ unsigned long srcu_batches_completed(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(srcu_batches_completed); -#define SRCU_CALLBACK_BATCH 10 -#define SRCU_INTERVAL 1 - /* * Core SRCU state machine. Advance callbacks from ->batch_check0 to * ->batch_check1 and then to ->batch_done as readers drain. @@ -491,28 +494,40 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) { int idx; - WARN_ON_ONCE(sp->srcu_state == SRCU_STATE_IDLE); - /* * Because readers might be delayed for an extended period after * fetching ->completed for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. + * + * The load-acquire ensures that we see the accesses performed + * by the prior grace period. */ + idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + if (idx == SRCU_STATE_IDLE) { + spin_lock_irq(&sp->queue_lock); + if (rcu_segcblist_empty(&sp->srcu_cblist)) { + spin_unlock_irq(&sp->queue_lock); + return; + } + idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (idx == SRCU_STATE_IDLE) + srcu_gp_start(sp); + spin_unlock_irq(&sp->queue_lock); + if (idx != SRCU_STATE_IDLE) + return; /* Someone else started the grace period. 
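/*
 * A simplified user-space model of the encoding this commit relies on:
 * the bottom two bits of ->srcu_gp_seq carry the grace-period phase
 * (IDLE/SCAN1/SCAN2) and the remaining bits count completed grace
 * periods. The constants and helper names below are illustrative
 * stand-ins, not the kernel's rcu_seq_*() implementations.
 */
#include <stdio.h>

#define SEQ_STATE_MASK	0x3UL	/* two state bits, per the commit log */

static unsigned long seq_state(unsigned long s) { return s & SEQ_STATE_MASK; }
static unsigned long seq_ctr(unsigned long s) { return s >> 2; }

int main(void)
{
	unsigned long gp_seq = 0;			/* IDLE, no GPs completed   */

	gp_seq += 1;					/* start: phase -> SCAN1    */
	printf("ctr=%lu phase=%lu\n", seq_ctr(gp_seq), seq_state(gp_seq));

	gp_seq = (gp_seq & ~SEQ_STATE_MASK) + 2;	/* set_state: -> SCAN2      */
	printf("ctr=%lu phase=%lu\n", seq_ctr(gp_seq), seq_state(gp_seq));

	gp_seq = (gp_seq | SEQ_STATE_MASK) + 1;		/* end: ctr++, back to IDLE */
	printf("ctr=%lu phase=%lu\n", seq_ctr(gp_seq), seq_state(gp_seq));
	return 0;
}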
*/ + } - if (sp->srcu_state == SRCU_STATE_DONE) - srcu_gp_start(sp); - - if (sp->srcu_state == SRCU_STATE_SCAN1) { + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { idx = 1 ^ (sp->completed & 1); if (!try_check_zero(sp, idx, trycount)) return; /* readers present, retry after SRCU_INTERVAL */ srcu_flip(sp); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN2); + rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); } - if (sp->srcu_state == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, @@ -563,14 +578,14 @@ static void srcu_invoke_callbacks(struct srcu_struct *sp) static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pending = true; + int state; if (rcu_segcblist_empty(&sp->srcu_cblist)) { spin_lock_irq(&sp->queue_lock); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (rcu_segcblist_empty(&sp->srcu_cblist) && - READ_ONCE(sp->srcu_state) == SRCU_STATE_DONE) { - WRITE_ONCE(sp->srcu_state, SRCU_STATE_IDLE); + state == SRCU_STATE_IDLE) pending = false; - } spin_unlock_irq(&sp->queue_lock); } -- cgit v1.2.3-70-g09d2 From f60d231a87c5c9f23f10e69996f396d46f5bf901 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Mar 2017 13:46:33 -0700 Subject: srcu: Crude control of expedited grace periods SRCU's implementation of expedited grace periods has always assumed that the SRCU instance is idle when the expedited request arrives. This commit improves this a bit by maintaining a count of the number of outstanding expedited requests, thus allowing prior non-expedited grace periods accommodate these requests by shifting to expedited mode. However, any non-expedited wait already in progress will still wait for the full duration. Improved control of expedited grace periods is planned, but one step at a time. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + kernel/rcu/srcu.c | 84 ++++++++++++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e7dbc01b61a1..73a1b6296224 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -42,6 +42,7 @@ struct srcu_array { struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; + atomic_t srcu_exp_cnt; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->srcu_cblist */ struct rcu_segcblist srcu_cblist; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 90ffea31b188..3cfcc59bddf3 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -43,6 +43,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; sp->srcu_gp_seq = 0; + atomic_set(&sp->srcu_exp_cnt, 0); spin_lock_init(&sp->queue_lock); rcu_segcblist_init(&sp->srcu_cblist); INIT_DELAYED_WORK(&sp->work, process_srcu); @@ -179,7 +180,6 @@ static bool srcu_readers_active(struct srcu_struct *sp) return sum; } -#define SRCU_CALLBACK_BATCH 10 #define SRCU_INTERVAL 1 /** @@ -197,6 +197,7 @@ static bool srcu_readers_active(struct srcu_struct *sp) */ void cleanup_srcu_struct(struct srcu_struct *sp) { + WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. 
*/ if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) @@ -244,13 +245,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); * We use an adaptive strategy for synchronize_srcu() and especially for * synchronize_srcu_expedited(). We spin for a fixed time period * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. + * sections. If there are still some readers after a few microseconds, + * we repeatedly block for 1-millisecond time periods. */ #define SRCU_RETRY_CHECK_DELAY 5 -#define SYNCHRONIZE_SRCU_TRYCOUNT 2 -#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 /* * Start an SRCU grace period. @@ -267,16 +265,16 @@ static void srcu_gp_start(struct srcu_struct *sp) } /* - * Wait until all readers counted by array index idx complete, but loop - * a maximum of trycount times. The caller must ensure that ->completed - * is not changed while checking. + * Wait until all readers counted by array index idx complete, but + * loop an additional time if there is an expedited grace period pending. + * The caller must ensure that ->completed is not changed while checking. */ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) { for (;;) { if (srcu_readers_active_idx_check(sp, idx)) return true; - if (--trycount <= 0) + if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -364,7 +362,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -static void __synchronize_srcu(struct srcu_struct *sp, int trycount) +static void __synchronize_srcu(struct srcu_struct *sp) { struct rcu_synchronize rcu; struct rcu_head *head = &rcu.head; @@ -400,6 +398,32 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) smp_mb(); /* Caller's later accesses after GP. */ } +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + bool do_norm = rcu_gp_is_normal(); + + if (!do_norm) { + atomic_inc(&sp->srcu_exp_cnt); + smp_mb__after_atomic(); /* increment before GP. */ + } + __synchronize_srcu(sp); + if (!do_norm) { + smp_mb__before_atomic(); /* GP before decrement. */ + atomic_dec(&sp->srcu_exp_cnt); + } +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + /** * synchronize_srcu - wait for prior SRCU read-side critical-section completion * @sp: srcu_struct with which to synchronize. @@ -441,28 +465,13 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) - ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT - : SYNCHRONIZE_SRCU_TRYCOUNT); + if (rcu_gp_is_expedited()) + synchronize_srcu_expedited(sp); + else + __synchronize_srcu(sp); } EXPORT_SYMBOL_GPL(synchronize_srcu); -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. 
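/*
 * A small sketch of the retry rule added to try_check_zero() above:
 * while at least one expedited request is outstanding, the
 * !!atomic_read() term keeps the expression positive for one more pass,
 * so the reader poll is retried once more before the work item gives up.
 * The helper below is a plain user-space stand-in.
 */
#include <stdbool.h>
#include <stdio.h>

static bool keep_trying(int *trycount, int exp_cnt)
{
	/* mirrors: if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) */
	(*trycount)--;
	return *trycount + !!exp_cnt > 0;
}

int main(void)
{
	int normal = 1, expedited = 1;

	/* With trycount == 1: a normal GP gives up, an expedited one retries. */
	printf("normal:    retry=%d\n", keep_trying(&normal, 0));	/* 0 */
	printf("expedited: retry=%d\n", keep_trying(&expedited, 1));	/* 1 */
	return 0;
}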
- * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @sp: srcu_struct on which to wait for in-flight callbacks. @@ -490,7 +499,7 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed); * Core SRCU state machine. Advance callbacks from ->batch_check0 to * ->batch_check1 and then to ->batch_done as readers drain. */ -static void srcu_advance_batches(struct srcu_struct *sp, int trycount) +static void srcu_advance_batches(struct srcu_struct *sp) { int idx; @@ -521,8 +530,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, trycount)) - return; /* readers present, retry after SRCU_INTERVAL */ + if (!try_check_zero(sp, idx, 1)) + return; /* readers present, retry later. */ srcu_flip(sp); rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); } @@ -534,9 +543,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) * so check at least twice in quick succession after a flip. */ idx = 1 ^ (sp->completed & 1); - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx, trycount)) - return; /* readers present, retry after SRCU_INTERVAL */ + if (!try_check_zero(sp, idx, 2)) + return; /* readers present, retry after later. */ srcu_gp_end(sp); } } @@ -602,8 +610,8 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); - srcu_advance_batches(sp, 1); + srcu_advance_batches(sp); srcu_invoke_callbacks(sp); - srcu_reschedule(sp, SRCU_INTERVAL); + srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); } EXPORT_SYMBOL_GPL(process_srcu); -- cgit v1.2.3-70-g09d2 From d8be81735aa89413b333de488251f0e64e2be591 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Mar 2017 09:59:38 -0700 Subject: srcu: Create a tiny SRCU In response to automated complaints about modifications to SRCU increasing its size, this commit creates a tiny SRCU that is used in SMP=n && PREEMPT=n builds. Signed-off-by: Paul E. 
McKenney --- include/linux/srcu.h | 69 ++------------- include/linux/srcutiny.h | 81 ++++++++++++++++++ include/linux/srcutree.h | 91 ++++++++++++++++++++ init/Kconfig | 12 +++ kernel/rcu/Makefile | 3 +- kernel/rcu/rcutorture.c | 2 + kernel/rcu/srcutiny.c | 215 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 411 insertions(+), 62 deletions(-) create mode 100644 include/linux/srcutiny.h create mode 100644 include/linux/srcutree.h create mode 100644 kernel/rcu/srcutiny.c (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 73a1b6296224..907f09b14eda 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -34,28 +34,7 @@ #include #include -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; -}; - -struct srcu_struct { - unsigned long completed; - unsigned long srcu_gp_seq; - atomic_t srcu_exp_cnt; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist */ - struct rcu_segcblist srcu_cblist; - struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -}; - -/* Values for -> state variable. */ -#define SRCU_STATE_IDLE 0 -#define SRCU_STATE_SCAN1 1 -#define SRCU_STATE_SCAN2 2 +struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -77,42 +56,13 @@ int init_srcu_struct(struct srcu_struct *sp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -void process_srcu(struct work_struct *work); - -#define __SRCU_STRUCT_INIT(name) \ - { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ - __SRCU_DEP_MAP_INIT(name) \ - } - -/* - * Define and initialize a srcu struct at build time. - * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. - * - * Note that although DEFINE_STATIC_SRCU() hides the name from other - * files, the per-CPU variable rules nevertheless require that the - * chosen name be globally unique. These rules also prohibit use of - * DEFINE_STATIC_SRCU() within a function. If these rules are too - * restrictive, declare the srcu_struct manually. For example, in - * each file: - * - * static struct srcu_struct my_srcu; - * - * Then, before the first use of each my_srcu, manually initialize it: - * - * init_srcu_struct(&my_srcu); - * - * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
- */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#ifdef CONFIG_TINY_SRCU +#include +#elif defined(CONFIG_TREE_SRCU) +#include +#else +#error "Unknown SRCU implementation specified to kernel configuration" +#endif /** * call_srcu() - Queue a callback for invocation after an SRCU grace period @@ -138,9 +88,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); -void synchronize_srcu_expedited(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h new file mode 100644 index 000000000000..4f284e4f4d8c --- /dev/null +++ b/include/linux/srcutiny.h @@ -0,0 +1,81 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TINY_H +#define _LINUX_SRCU_TINY_H + +#include + +struct srcu_struct { + int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + struct swait_queue_head srcu_wq; + /* Last srcu_read_unlock() wakes GP. */ + unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ + struct rcu_segcblist srcu_cblist; + /* Pending SRCU callbacks. */ + int srcu_idx; /* Current reader array element. */ + bool srcu_gp_running; /* GP workqueue running? */ + bool srcu_gp_waiting; /* GP waiting for readers? */ + struct work_struct srcu_work; /* For driving grace periods. */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void srcu_drive_gp(struct work_struct *wp); + +#define __SRCU_STRUCT_INIT(name) \ +{ \ + .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ + .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ + __SRCU_DEP_MAP_INIT(name) \ +} + +/* + * This odd _STATIC_ arrangement is needed for API compatibility with + * Tree SRCU, which needs some per-CPU data. 
+ */ +#define DEFINE_SRCU(name) \ + struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_STATIC_SRCU(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name) + +void synchronize_srcu(struct srcu_struct *sp); + +static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +#endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h new file mode 100644 index 000000000000..f2b3bd6c6bc2 --- /dev/null +++ b/include/linux/srcutree.h @@ -0,0 +1,91 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tree variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TREE_H +#define _LINUX_SRCU_TREE_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct srcu_struct { + unsigned long completed; + unsigned long srcu_gp_seq; + atomic_t srcu_exp_cnt; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->srcu_cblist */ + struct rcu_segcblist srcu_cblist; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +/* Values for -> state variable. */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
+ */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif diff --git a/init/Kconfig b/init/Kconfig index a92f27da4a27..d269f2ca17b8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,6 +526,18 @@ config SRCU permits arbitrary sleeping or blocking within RCU read-side critical sections. +config TINY_SRCU + bool + default y if TINY_RCU + help + This option selects the single-CPU non-preemptible version of SRCU. + +config TREE_SRCU + bool + default y if !TINY_RCU + help + This option selects the full-fledged version of SRCU. + config TASKS_RCU bool default n diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c..b853214a2b99 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,7 +3,8 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_SRCU) += srcu.o +obj-$(CONFIG_TREE_SRCU) += srcu.o +obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cccc417a8135..98591e16db1a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -559,6 +559,7 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { +#ifdef CONFIG_TREE_SRCU int cpu; int idx = srcu_ctlp->completed & 0x1; @@ -587,6 +588,7 @@ static void srcu_torture_stats(void) pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); +#endif } static void srcu_torture_synchronize_expedited(void) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 000000000000..b8293527ee18 --- /dev/null +++ b/kernel/rcu/srcutiny.c @@ -0,0 +1,215 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny version for non-preemptible single-CPU use. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "rcu.h" + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->srcu_lock_nesting[0] = 0; + sp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&sp->srcu_wq); + sp->srcu_gp_seq = 0; + rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_gp_running = false; + sp->srcu_gp_waiting = false; + sp->srcu_idx = 0; + INIT_WORK(&sp->srcu_work, srcu_drive_gp); + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + flush_work(&sp->srcu_work); + WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); + WARN_ON(sp->srcu_gp_running); + WARN_ON(sp->srcu_gp_waiting); + WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx); + WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate element of + * the srcu_struct. Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + int newval = sp->srcu_lock_nesting[idx] - 1; + + WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(sp->srcu_gp_waiting)) + swake_up(&sp->srcu_wq); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * Workqueue handler to drive one grace period and invoke any callbacks + * that become ready as a result. Single-CPU and !PREEMPT operation + * means that we get away with murder on synchronization. ;-) + */ +void srcu_drive_gp(struct work_struct *wp) +{ + int idx; + struct rcu_cblist ready_cbs; + struct srcu_struct *sp; + struct rcu_head *rhp; + + sp = container_of(wp, struct srcu_struct, srcu_work); + if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + return; /* Already running or nothing to do. */ + + /* Tag recently arrived callbacks and wait for readers. 
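/*
 * Single-threaded model of the reader accounting used above: readers
 * bump the nesting count for the current index, and the grace period
 * flips the index and then waits for the old index's count to drain.
 * Plain ints stand in for the srcu_struct fields; no concurrency here.
 */
#include <stdio.h>

static int lock_nesting[2];	/* models sp->srcu_lock_nesting[] */
static int cur_idx;		/* models sp->srcu_idx */

static int model_read_lock(void)
{
	int idx = cur_idx;

	lock_nesting[idx]++;
	return idx;
}

static void model_read_unlock(int idx)
{
	lock_nesting[idx]--;	/* last unlock would wake the waiting GP */
}

int main(void)
{
	int idx = model_read_lock();	/* reader enters under index 0 */
	int old = cur_idx;

	cur_idx = !cur_idx;		/* GP flips; new readers use index 1 */
	printf("readers on old index %d: %d\n", old, lock_nesting[old]);
	model_read_unlock(idx);		/* pre-existing reader leaves */
	printf("readers on old index %d: %d\n", old, lock_nesting[old]);
	return 0;
}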
*/ + WRITE_ONCE(sp->srcu_gp_running, true); + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_start(&sp->srcu_gp_seq); + idx = sp->srcu_idx; + WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); + WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + rcu_seq_end(&sp->srcu_gp_seq); + + /* Update callback list based on GP, and invoke ready callbacks. */ + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + rcu_cblist_init(&ready_cbs); + local_irq_disable(); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + local_irq_disable(); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + } + WRITE_ONCE(sp->srcu_gp_running, false); + + /* + * If more callbacks, reschedule ourselves. This can race with + * a call_srcu() at interrupt level, but the ->srcu_gp_running + * checks will straighten that out. + */ + if (!rcu_segcblist_empty(&sp->srcu_cblist)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_drive_gp); + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + rcu_callback_t func) +{ + unsigned long flags; + + head->func = func; + local_irq_save(flags); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + local_irq_restore(flags); + if (!READ_ONCE(sp->srcu_gp_running)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rs; + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + call_srcu(sp, &rs.head, wakeme_after_rcu); + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); -- cgit v1.2.3-70-g09d2 From dad81a2026841b5e2651aab58a7398c13cc05847 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Mar 2017 17:23:44 -0700 Subject: srcu: Introduce CLASSIC_SRCU Kconfig option The TREE_SRCU rewrite is large and a bit on the non-simple side, so this commit helps reduce risk by allowing the old v4.11 SRCU algorithm to be selected using a new CLASSIC_SRCU Kconfig option that depends on RCU_EXPERT. The default is to use the new TREE_SRCU and TINY_SRCU algorithms, in order to help get these the testing that they need. However, if your users do not require the update-side scalability that is to be provided by TREE_SRCU, select RCU_EXPERT and then CLASSIC_SRCU to revert back to the old classic SRCU algorithm. Signed-off-by: Paul E. 
McKenney --- include/linux/srcu.h | 2 + include/linux/srcuclassic.h | 101 ++++++++ init/Kconfig | 21 +- kernel/rcu/Makefile | 3 +- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/srcu.c | 347 ++++++++++++++----------- kernel/rcu/srcutree.c | 613 ++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 934 insertions(+), 155 deletions(-) create mode 100644 include/linux/srcuclassic.h create mode 100644 kernel/rcu/srcutree.c (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 907f09b14eda..167ad8831aaf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -60,6 +60,8 @@ int init_srcu_struct(struct srcu_struct *sp); #include #elif defined(CONFIG_TREE_SRCU) #include +#elif defined(CONFIG_CLASSIC_SRCU) +#include #else #error "Unknown SRCU implementation specified to kernel configuration" #endif diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h new file mode 100644 index 000000000000..41cf99930f34 --- /dev/null +++ b/include/linux/srcuclassic.h @@ -0,0 +1,101 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * classic v4.11 variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_CLASSIC_H +#define _LINUX_SRCU_CLASSIC_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct rcu_batch { + struct rcu_head *head, **tail; +}; + +#define RCU_BATCH_INIT(name) { NULL, &(name.head) } + +struct srcu_struct { + unsigned long completed; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + bool running; + /* callbacks just queued */ + struct rcu_batch batch_queue; + /* callbacks try to do the first check_zero */ + struct rcu_batch batch_check0; + /* callbacks done with the first check_zero and the flip */ + struct rcu_batch batch_check1; + struct rcu_batch batch_done; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .running = false, \ + .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ + .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ + .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ + .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. 
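/*
 * For contrast with the build-time initializer being documented here, a
 * sketch of the dynamic pattern around a hypothetical container object:
 * pair init_srcu_struct() with cleanup_srcu_struct() explicitly, and make
 * sure no readers or pending callbacks remain at teardown time.
 */
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_object {			/* hypothetical container */
	struct srcu_struct srcu;
	int data;
};

static struct my_object *my_object_create(void)
{
	struct my_object *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj && init_srcu_struct(&obj->srcu)) {
		kfree(obj);		/* -ENOMEM from init_srcu_struct() */
		obj = NULL;
	}
	return obj;
}

static void my_object_destroy(struct my_object *obj)
{
	/* Assumes callers can no longer start readers or queue callbacks. */
	cleanup_srcu_struct(&obj->srcu);
	kfree(obj);
}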
+ * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. + */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif diff --git a/init/Kconfig b/init/Kconfig index d269f2ca17b8..558cc3638ab9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,15 +526,32 @@ config SRCU permits arbitrary sleeping or blocking within RCU read-side critical sections. +config CLASSIC_SRCU + bool "Use v4.11 classic SRCU implementation" + default n + depends on RCU_EXPERT && SRCU + help + This option selects the traditional well-tested classic SRCU + implementation from v4.11, as might be desired for enterprise + Linux distributions. Without this option, the shiny new + Tiny SRCU and Tree SRCU implementations are used instead. + At some point, it is hoped that Tiny SRCU and Tree SRCU + will accumulate enough test time and confidence to allow + Classic SRCU to be dropped entirely. + + Say Y if you need a rock-solid SRCU. + + Say N if you would like help test Tree SRCU. + config TINY_SRCU bool - default y if TINY_RCU + default y if TINY_RCU && !CLASSIC_SRCU help This option selects the single-CPU non-preemptible version of SRCU. config TREE_SRCU bool - default y if !TINY_RCU + default y if !TINY_RCU && !CLASSIC_SRCU help This option selects the full-fledged version of SRCU. diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index b853214a2b99..158e6593d58c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,7 +3,8 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_TREE_SRCU) += srcu.o +obj-$(CONFIG_CLASSIC_SRCU) += srcu.o +obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9cbb8a7b909d..6f344b6748a8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -562,7 +562,7 @@ static void srcu_torture_stats(void) int __maybe_unused cpu; int idx; -#ifdef CONFIG_TREE_SRCU +#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) idx = srcu_ctlp->completed & 0x1; pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 3cfcc59bddf3..584d8a983883 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -36,16 +36,75 @@ #include #include -#include #include "rcu.h" +/* + * Initialize an rcu_batch structure to empty. 
+ */ +static inline void rcu_batch_init(struct rcu_batch *b) +{ + b->head = NULL; + b->tail = &b->head; +} + +/* + * Enqueue a callback onto the tail of the specified rcu_batch structure. + */ +static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) +{ + *b->tail = head; + b->tail = &head->next; +} + +/* + * Is the specified rcu_batch structure empty? + */ +static inline bool rcu_batch_empty(struct rcu_batch *b) +{ + return b->tail == &b->head; +} + +/* + * Remove the callback at the head of the specified rcu_batch structure + * and return a pointer to it, or return NULL if the structure is empty. + */ +static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) +{ + struct rcu_head *head; + + if (rcu_batch_empty(b)) + return NULL; + + head = b->head; + b->head = head->next; + if (b->tail == &head->next) + rcu_batch_init(b); + + return head; +} + +/* + * Move all callbacks from the rcu_batch structure specified by "from" to + * the structure specified by "to". + */ +static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) +{ + if (!rcu_batch_empty(from)) { + *to->tail = from->head; + to->tail = from->tail; + rcu_batch_init(from); + } +} + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; - sp->srcu_gp_seq = 0; - atomic_set(&sp->srcu_exp_cnt, 0); spin_lock_init(&sp->queue_lock); - rcu_segcblist_init(&sp->srcu_cblist); + sp->running = false; + rcu_batch_init(&sp->batch_queue); + rcu_batch_init(&sp->batch_check0); + rcu_batch_init(&sp->batch_check1); + rcu_batch_init(&sp->batch_done); INIT_DELAYED_WORK(&sp->work, process_srcu); sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; @@ -180,8 +239,6 @@ static bool srcu_readers_active(struct srcu_struct *sp) return sum; } -#define SRCU_INTERVAL 1 - /** * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. @@ -197,16 +254,8 @@ static bool srcu_readers_active(struct srcu_struct *sp) */ void cleanup_srcu_struct(struct srcu_struct *sp) { - WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ - if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) - return; /* Leakage unless caller handles error. */ - flush_delayed_work(&sp->work); - if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE)) { - pr_info("cleanup_srcu_struct: Active srcu_struct %lu CBs %c state: %d\n", rcu_segcblist_n_cbs(&sp->srcu_cblist), ".E"[rcu_segcblist_empty(&sp->srcu_cblist)], rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); - return; /* Caller forgot to stop doing call_srcu()? */ - } free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } @@ -245,36 +294,26 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); * We use an adaptive strategy for synchronize_srcu() and especially for * synchronize_srcu_expedited(). We spin for a fixed time period * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after a few microseconds, - * we repeatedly block for 1-millisecond time periods. + * sections. If there are still some readers after 10 microseconds, + * we repeatedly block for 1-millisecond time periods. This approach + * has done well in testing, so there is no need for a config parameter. */ #define SRCU_RETRY_CHECK_DELAY 5 +#define SYNCHRONIZE_SRCU_TRYCOUNT 2 +#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 /* - * Start an SRCU grace period. 
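/*
 * User-space model of the rcu_batch helpers restored above: a singly
 * linked list tracked by a head pointer plus a pointer to the last
 * ->next field, which keeps both enqueue and whole-batch moves O(1).
 * The type and function names are local stand-ins, not the kernel's.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cb { struct cb *next; int id; };
struct batch { struct cb *head, **tail; };

static void batch_init(struct batch *b) { b->head = NULL; b->tail = &b->head; }
static bool batch_empty(struct batch *b) { return b->tail == &b->head; }

static void batch_queue(struct batch *b, struct cb *c)
{
	c->next = NULL;
	*b->tail = c;			/* link after the current last element */
	b->tail = &c->next;		/* remember the new last ->next field  */
}

static void batch_move(struct batch *to, struct batch *from)
{
	if (!batch_empty(from)) {
		*to->tail = from->head;	/* splice the whole list in one step */
		to->tail = from->tail;
		batch_init(from);
	}
}

int main(void)
{
	struct batch queue, check0;
	struct cb a = { .id = 1 }, b = { .id = 2 };
	struct cb *c;

	batch_init(&queue);
	batch_init(&check0);
	batch_queue(&queue, &a);
	batch_queue(&queue, &b);
	batch_move(&check0, &queue);	/* what srcu_collect_new() does */

	for (c = check0.head; c; c = c->next)
		printf("callback %d advanced to check0\n", c->id);
	return 0;
}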
- */ -static void srcu_gp_start(struct srcu_struct *sp) -{ - int state; - - rcu_segcblist_accelerate(&sp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); - rcu_seq_start(&sp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); - WARN_ON_ONCE(state != SRCU_STATE_SCAN1); -} - -/* - * Wait until all readers counted by array index idx complete, but - * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->completed is not changed while checking. + * @@@ Wait until all pre-existing readers complete. Such readers + * will have used the index specified by "idx". + * the caller should ensures the ->completed is not changed while checking + * and idx = (->completed & 1) ^ 1 */ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) { for (;;) { if (srcu_readers_active_idx_check(sp, idx)) return true; - if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) + if (--trycount <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -299,19 +338,6 @@ static void srcu_flip(struct srcu_struct *sp) smp_mb(); /* D */ /* Pairs with C. */ } -/* - * End an SRCU grace period. - */ -static void srcu_gp_end(struct srcu_struct *sp) -{ - rcu_seq_end(&sp->srcu_gp_seq); - - spin_lock_irq(&sp->queue_lock); - rcu_segcblist_advance(&sp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - spin_unlock_irq(&sp->queue_lock); -} - /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. @@ -348,24 +374,26 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { - srcu_gp_start(sp); + rcu_batch_queue(&sp->batch_queue, head); + if (!sp->running) { + sp->running = true; queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } EXPORT_SYMBOL_GPL(call_srcu); -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void srcu_advance_batches(struct srcu_struct *sp, int trycount); +static void srcu_reschedule(struct srcu_struct *sp); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -static void __synchronize_srcu(struct srcu_struct *sp) +static void __synchronize_srcu(struct srcu_struct *sp, int trycount) { struct rcu_synchronize rcu; struct rcu_head *head = &rcu.head; + bool done = false; RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || lock_is_held(&rcu_bh_lock_map) || @@ -373,8 +401,6 @@ static void __synchronize_srcu(struct srcu_struct *sp) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; might_sleep(); init_completion(&rcu.completion); @@ -382,47 +408,31 @@ static void __synchronize_srcu(struct srcu_struct *sp) head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. 
*/ - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { + if (!sp->running) { /* steal the processing owner */ - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - srcu_gp_start(sp); + sp->running = true; + rcu_batch_queue(&sp->batch_check0, head); spin_unlock_irq(&sp->queue_lock); + + srcu_advance_batches(sp, trycount); + if (!rcu_batch_empty(&sp->batch_done)) { + BUG_ON(sp->batch_done.head != head); + rcu_batch_dequeue(&sp->batch_done); + done = true; + } /* give the processing owner to work_struct */ - srcu_reschedule(sp, 0); + srcu_reschedule(sp); } else { - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + rcu_batch_queue(&sp->batch_queue, head); spin_unlock_irq(&sp->queue_lock); } - wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. */ -} - -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - bool do_norm = rcu_gp_is_normal(); - - if (!do_norm) { - atomic_inc(&sp->srcu_exp_cnt); - smp_mb__after_atomic(); /* increment before GP. */ - } - __synchronize_srcu(sp); - if (!do_norm) { - smp_mb__before_atomic(); /* GP before decrement. */ - atomic_dec(&sp->srcu_exp_cnt); + if (!done) { + wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ } + } -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** * synchronize_srcu - wait for prior SRCU read-side critical-section completion @@ -465,13 +475,28 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); */ void synchronize_srcu(struct srcu_struct *sp) { - if (rcu_gp_is_expedited()) - synchronize_srcu_expedited(sp); - else - __synchronize_srcu(sp); + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) + ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT + : SYNCHRONIZE_SRCU_TRYCOUNT); } EXPORT_SYMBOL_GPL(synchronize_srcu); +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @sp: srcu_struct on which to wait for in-flight callbacks. @@ -495,13 +520,29 @@ unsigned long srcu_batches_completed(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(srcu_batches_completed); +#define SRCU_CALLBACK_BATCH 10 +#define SRCU_INTERVAL 1 + +/* + * Move any new SRCU callbacks to the first stage of the SRCU grace + * period pipeline. + */ +static void srcu_collect_new(struct srcu_struct *sp) +{ + if (!rcu_batch_empty(&sp->batch_queue)) { + spin_lock_irq(&sp->queue_lock); + rcu_batch_move(&sp->batch_check0, &sp->batch_queue); + spin_unlock_irq(&sp->queue_lock); + } +} + /* * Core SRCU state machine. Advance callbacks from ->batch_check0 to * ->batch_check1 and then to ->batch_done as readers drain. 
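Before the state machine itself, it may help to recall how the interfaces above are used by callers. The sketch below is illustrative only and is not part of this patch: struct foo, gp, foo_mutex, my_srcu and the two functions are invented names, and only the SRCU/RCU primitives are real kernel API. The updater publishes a new version, waits for every reader that might still see the old one, and only then frees it:

#include <linux/srcu.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {
	int a;
};

static struct foo __rcu *gp;
static DEFINE_MUTEX(foo_mutex);
DEFINE_STATIC_SRCU(my_srcu);

static int foo_read(void)
{
	int idx, a = -1;
	struct foo *p;

	idx = srcu_read_lock(&my_srcu);		/* readers may sleep here */
	p = srcu_dereference(gp, &my_srcu);
	if (p)
		a = p->a;
	srcu_read_unlock(&my_srcu, idx);	/* must pass back the same idx */
	return a;
}

static void foo_update(struct foo *newp)
{
	struct foo *old;

	mutex_lock(&foo_mutex);
	old = rcu_dereference_protected(gp, lockdep_is_held(&foo_mutex));
	rcu_assign_pointer(gp, newp);
	mutex_unlock(&foo_mutex);

	synchronize_srcu(&my_srcu);		/* wait for pre-existing readers */
	kfree(old);
}

The idx value returned by srcu_read_lock() records which rank of per-CPU counters was incremented, which is why it must be handed back to srcu_read_unlock() unchanged.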
*/ -static void srcu_advance_batches(struct srcu_struct *sp) +static void srcu_advance_batches(struct srcu_struct *sp, int trycount) { - int idx; + int idx = 1 ^ (sp->completed & 1); /* * Because readers might be delayed for an extended period after @@ -509,44 +550,50 @@ static void srcu_advance_batches(struct srcu_struct *sp) * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. - * - * The load-acquire ensures that we see the accesses performed - * by the prior grace period. */ - idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ - if (idx == SRCU_STATE_IDLE) { - spin_lock_irq(&sp->queue_lock); - if (rcu_segcblist_empty(&sp->srcu_cblist)) { - spin_unlock_irq(&sp->queue_lock); - return; - } - idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); - if (idx == SRCU_STATE_IDLE) - srcu_gp_start(sp); - spin_unlock_irq(&sp->queue_lock); - if (idx != SRCU_STATE_IDLE) - return; /* Someone else started the grace period. */ - } - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, 1)) - return; /* readers present, retry later. */ - srcu_flip(sp); - rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); - } + if (rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_check1)) + return; /* no callbacks need to be advanced */ - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (!try_check_zero(sp, idx, trycount)) + return; /* failed to advance, will try after SRCU_INTERVAL */ - /* - * SRCU read-side critical sections are normally short, - * so check at least twice in quick succession after a flip. - */ - idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, 2)) - return; /* readers present, retry after later. */ - srcu_gp_end(sp); - } + /* + * The callbacks in ->batch_check1 have already done with their + * first zero check and flip back when they were enqueued on + * ->batch_check0 in a previous invocation of srcu_advance_batches(). + * (Presumably try_check_zero() returned false during that + * invocation, leaving the callbacks stranded on ->batch_check1.) + * They are therefore ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); + + if (rcu_batch_empty(&sp->batch_check0)) + return; /* no callbacks need to be advanced */ + srcu_flip(sp); + + /* + * The callbacks in ->batch_check0 just finished their + * first check zero and flip, so move them to ->batch_check1 + * for future checking on the other idx. + */ + rcu_batch_move(&sp->batch_check1, &sp->batch_check0); + + /* + * SRCU read-side critical sections are normally short, so check + * at least twice in quick succession after a flip. + */ + trycount = trycount < 2 ? 2 : trycount; + if (!try_check_zero(sp, idx^1, trycount)) + return; /* failed to advance, will try after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have now waited for all + * pre-existing readers using both idx values. They are therefore + * ready to invoke, so move them to ->batch_done. 
+ */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); } /* @@ -557,48 +604,45 @@ static void srcu_advance_batches(struct srcu_struct *sp) */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { - struct rcu_cblist ready_cbs; - struct rcu_head *rhp; + int i; + struct rcu_head *head; - spin_lock_irq(&sp->queue_lock); - if (!rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { - spin_unlock_irq(&sp->queue_lock); - return; - } - rcu_cblist_init(&ready_cbs); - rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sp->queue_lock); - rhp = rcu_cblist_dequeue(&ready_cbs); - for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { + head = rcu_batch_dequeue(&sp->batch_done); + if (!head) + break; local_bh_disable(); - rhp->func(rhp); + head->func(head); local_bh_enable(); } - spin_lock_irq(&sp->queue_lock); - rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sp->queue_lock); } /* * Finished one round of SRCU grace period. Start another if there are * more SRCU callbacks queued, otherwise put SRCU into not-running state. */ -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +static void srcu_reschedule(struct srcu_struct *sp) { bool pending = true; - int state; - if (rcu_segcblist_empty(&sp->srcu_cblist)) { + if (rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue)) { spin_lock_irq(&sp->queue_lock); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); - if (rcu_segcblist_empty(&sp->srcu_cblist) && - state == SRCU_STATE_IDLE) + if (rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue)) { + sp->running = false; pending = false; + } spin_unlock_irq(&sp->queue_lock); } if (pending) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(system_power_efficient_wq, + &sp->work, SRCU_INTERVAL); } /* @@ -610,8 +654,9 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); - srcu_advance_batches(sp); + srcu_collect_new(sp); + srcu_advance_batches(sp, 1); srcu_invoke_callbacks(sp); - srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); + srcu_reschedule(sp); } EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 000000000000..da676b0d016b --- /dev/null +++ b/kernel/rcu/srcutree.c @@ -0,0 +1,613 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 + * + * Author: Paul McKenney + * Lai Jiangshan + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "rcu.h" + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->completed = 0; + sp->srcu_gp_seq = 0; + atomic_set(&sp->srcu_exp_cnt, 0); + spin_lock_init(&sp->queue_lock); + rcu_segcblist_init(&sp->srcu_cblist); + INIT_DELAYED_WORK(&sp->work, process_srcu); + sp->per_cpu_ref = alloc_percpu(struct srcu_array); + return sp->per_cpu_ref ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * Returns approximate total of the readers' ->lock_count[] values for the + * rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[idx]); + } + return sum; +} + +/* + * Returns approximate total of the readers' ->unlock_count[] values for the + * rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->unlock_count[idx]); + } + return sum; +} + +/* + * Return true if the number of pre-existing readers is determined to + * be zero. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ + unsigned long unlocks; + + unlocks = srcu_readers_unlock_idx(sp, idx); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. Needs to be a smp_mb() as the read side may + * contain a read from a variable that is written to before the + * synchronize_srcu() in the write side. In this case smp_mb()s + * A and B act like the store buffering pattern. + * + * This smp_mb() also pairs with smp_mb() C to prevent accesses + * after the synchronize_srcu() from being executed before the + * grace period ends. + */ + smp_mb(); /* A */ + + /* + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does + * not mean that there are no more readers, as one could have read + * the current index but not have incremented the lock counter yet. 
+ * + * Possible bug: There is no guarantee that there haven't been + * ULONG_MAX increments of ->lock_count[] since the unlocks were + * counted, meaning that this could return true even if there are + * still active readers. Since there are no memory barriers around + * srcu_flip(), the CPU is not required to increment ->completed + * before running srcu_readers_unlock_idx(), which means that there + * could be an arbitrarily large number of critical sections that + * execute after srcu_readers_unlock_idx() but use the old value + * of ->completed. + */ + return srcu_readers_lock_idx(sp, idx) == unlocks; +} + +/** + * srcu_readers_active - returns true if there are readers. and false + * otherwise + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +static bool srcu_readers_active(struct srcu_struct *sp) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[0]); + sum += READ_ONCE(cpuc->lock_count[1]); + sum -= READ_ONCE(cpuc->unlock_count[0]); + sum -= READ_ONCE(cpuc->unlock_count[1]); + } + return sum; +} + +#define SRCU_INTERVAL 1 + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ + if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) + return; /* Leakage unless caller handles error. */ + flush_delayed_work(&sp->work); + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE)) { + pr_info("cleanup_srcu_struct: Active srcu_struct %lu CBs %c state: %d\n", rcu_segcblist_n_cbs(&sp->srcu_cblist), ".E"[rcu_segcblist_empty(&sp->srcu_cblist)], rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + return; /* Caller forgot to stop doing call_srcu()? */ + } + free_percpu(sp->per_cpu_ref); + sp->per_cpu_ref = NULL; +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->completed) & 0x1; + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); + smp_mb(); /* B */ /* Avoid leaking the critical section. */ + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + smp_mb(); /* C */ /* Avoid leaking the critical section. */ + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). 
We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after a few microseconds, + * we repeatedly block for 1-millisecond time periods. + */ +#define SRCU_RETRY_CHECK_DELAY 5 + +/* + * Start an SRCU grace period. + */ +static void srcu_gp_start(struct srcu_struct *sp) +{ + int state; + + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_start(&sp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + WARN_ON_ONCE(state != SRCU_STATE_SCAN1); +} + +/* + * Wait until all readers counted by array index idx complete, but + * loop an additional time if there is an expedited grace period pending. + * The caller must ensure that ->completed is not changed while checking. + */ +static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +{ + for (;;) { + if (srcu_readers_active_idx_check(sp, idx)) + return true; + if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) + return false; + udelay(SRCU_RETRY_CHECK_DELAY); + } +} + +/* + * Increment the ->completed counter so that future SRCU readers will + * use the other rank of the ->(un)lock_count[] arrays. This allows + * us to wait for pre-existing readers in a starvation-free manner. + */ +static void srcu_flip(struct srcu_struct *sp) +{ + WRITE_ONCE(sp->completed, sp->completed + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ +} + +/* + * End an SRCU grace period. + */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + rcu_seq_end(&sp->srcu_gp_seq); + + spin_lock_irq(&sp->queue_lock); + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + spin_unlock_irq(&sp->queue_lock); +} + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing SRCU read-side critical section. On systems with + * more than one CPU, this means that when "func()" is invoked, each CPU + * is guaranteed to have executed a full memory barrier since the end of + * its last corresponding SRCU read-side critical section whose beginning + * preceded the call to call_rcu(). It also means that each CPU executing + * an SRCU read-side critical section that continues beyond the start of + * "func()" must have executed a memory barrier after the call_rcu() + * but before the beginning of that SRCU read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting SRCU callback function "func()", then both CPU A and CPU + * B are guaranteed to execute a full memory barrier during the time + * interval between the call to call_rcu() and the invocation of "func()". + * This guarantee applies even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). 
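A minimal usage sketch of the interface documented here (illustrative only, not part of this patch; struct bar, bar_srcu, bar_free_cb() and bar_retire() are invented names): a structure embeds an rcu_head, and call_srcu() arranges for it to be freed once all readers that might still reference it have finished.

#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>

struct bar {
	int data;
	struct rcu_head rh;
};

DEFINE_STATIC_SRCU(bar_srcu);

static void bar_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct bar, rh));
}

static void bar_retire(struct bar *b)
{
	/* Returns immediately; b is freed after a later SRCU grace period. */
	call_srcu(&bar_srcu, &b->rh, bar_free_cb);
}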
+ * + * Of course, these guarantees apply only for invocations of call_srcu(), + * srcu_read_lock(), and srcu_read_unlock() that are all passed the same + * srcu_struct structure. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + rcu_callback_t func) +{ + unsigned long flags; + + head->next = NULL; + head->func = func; + spin_lock_irqsave(&sp->queue_lock, flags); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { + srcu_gp_start(sp); + queue_delayed_work(system_power_efficient_wq, &sp->work, 0); + } + spin_unlock_irqrestore(&sp->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(call_srcu); + +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); + +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rcu; + struct rcu_head *head = &rcu.head; + + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); + + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; + might_sleep(); + init_completion(&rcu.completion); + + head->next = NULL; + head->func = wakeme_after_rcu; + spin_lock_irq(&sp->queue_lock); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { + /* steal the processing owner */ + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + srcu_gp_start(sp); + spin_unlock_irq(&sp->queue_lock); + /* give the processing owner to work_struct */ + srcu_reschedule(sp, 0); + } else { + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + spin_unlock_irq(&sp->queue_lock); + } + + wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ +} + +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + bool do_norm = rcu_gp_is_normal(); + + if (!do_norm) { + atomic_inc(&sp->srcu_exp_cnt); + smp_mb__after_atomic(); /* increment before GP. */ + } + __synchronize_srcu(sp); + if (!do_norm) { + smp_mb__before_atomic(); /* GP before decrement. */ + atomic_dec(&sp->srcu_exp_cnt); + } +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->completed & 1) ^ 1) to drain to zero at first, + * and then flip the completed and wait for the count of the other index. + * + * Can block; must be called from process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. 
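To make the deadlock warning above concrete, here is a deliberately broken sketch (illustrative only, not part of this patch; ss and broken() are invented names): synchronize_srcu() would wait for all readers of ss, including the one the calling task itself is still inside, so it could never return.

DEFINE_STATIC_SRCU(ss);

static void broken(void)
{
	int idx = srcu_read_lock(&ss);

	synchronize_srcu(&ss);	/* BUG: waits for this very reader to finish */
	srcu_read_unlock(&ss, idx);
}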
+ * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + if (rcu_gp_is_expedited()) + synchronize_srcu_expedited(sp); + else + __synchronize_srcu(sp); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. + */ +void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} +EXPORT_SYMBOL_GPL(srcu_barrier); + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ +unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->completed; +} +EXPORT_SYMBOL_GPL(srcu_batches_completed); + +/* + * Core SRCU state machine. Advance callbacks from ->batch_check0 to + * ->batch_check1 and then to ->batch_done as readers drain. + */ +static void srcu_advance_batches(struct srcu_struct *sp) +{ + int idx; + + /* + * Because readers might be delayed for an extended period after + * fetching ->completed for their index, at any point in time there + * might well be readers using both idx=0 and idx=1. We therefore + * need to wait for readers to clear from both index values before + * invoking a callback. + * + * The load-acquire ensures that we see the accesses performed + * by the prior grace period. + */ + idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + if (idx == SRCU_STATE_IDLE) { + spin_lock_irq(&sp->queue_lock); + if (rcu_segcblist_empty(&sp->srcu_cblist)) { + spin_unlock_irq(&sp->queue_lock); + return; + } + idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (idx == SRCU_STATE_IDLE) + srcu_gp_start(sp); + spin_unlock_irq(&sp->queue_lock); + if (idx != SRCU_STATE_IDLE) + return; /* Someone else started the grace period. 
*/ + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = 1 ^ (sp->completed & 1); + if (!try_check_zero(sp, idx, 1)) + return; /* readers present, retry later. */ + srcu_flip(sp); + rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + + /* + * SRCU read-side critical sections are normally short, + * so check at least twice in quick succession after a flip. + */ + idx = 1 ^ (sp->completed & 1); + if (!try_check_zero(sp, idx, 2)) + return; /* readers present, retry after later. */ + srcu_gp_end(sp); + } +} + +/* + * Invoke a limited number of SRCU callbacks that have passed through + * their grace period. If there are more to do, SRCU will reschedule + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). + */ +static void srcu_invoke_callbacks(struct srcu_struct *sp) +{ + struct rcu_cblist ready_cbs; + struct rcu_head *rhp; + + spin_lock_irq(&sp->queue_lock); + if (!rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + spin_unlock_irq(&sp->queue_lock); + return; + } + rcu_cblist_init(&ready_cbs); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sp->queue_lock); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + spin_lock_irq(&sp->queue_lock); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sp->queue_lock); +} + +/* + * Finished one round of SRCU grace period. Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. + */ +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +{ + bool pending = true; + int state; + + if (rcu_segcblist_empty(&sp->srcu_cblist)) { + spin_lock_irq(&sp->queue_lock); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (rcu_segcblist_empty(&sp->srcu_cblist) && + state == SRCU_STATE_IDLE) + pending = false; + spin_unlock_irq(&sp->queue_lock); + } + + if (pending) + queue_delayed_work(system_power_efficient_wq, &sp->work, delay); +} + +/* + * This is the work-queue function that handles SRCU grace periods. + */ +void process_srcu(struct work_struct *work) +{ + struct srcu_struct *sp; + + sp = container_of(work, struct srcu_struct, work.work); + + srcu_advance_batches(sp); + srcu_invoke_callbacks(sp); + srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); +} +EXPORT_SYMBOL_GPL(process_srcu); -- cgit v1.2.3-70-g09d2 From 5f0d5a3ae7cff0d7fa943c199c3a2e44f23e1fac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jan 2017 02:53:44 -0800 Subject: mm: Rename SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU A group of Linux kernel hackers reported chasing a bug that resulted from their assumption that SLAB_DESTROY_BY_RCU provided an existence guarantee, that is, that no block from such a slab would be reallocated during an RCU read-side critical section. Of course, that is not the case. Instead, SLAB_DESTROY_BY_RCU only prevents freeing of an entire slab of blocks. However, there is a phrase for this, namely "type safety". This commit therefore renames SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU in order to avoid future instances of this sort of confusion. Signed-off-by: Paul E. 
McKenney Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Acked-by: Johannes Weiner Acked-by: Vlastimil Babka [ paulmck: Add comments mentioning the old name, as requested by Eric Dumazet, in order to help people familiar with the old name find the new one. ] Acked-by: David Rientjes --- Documentation/RCU/00-INDEX | 2 +- Documentation/RCU/rculist_nulls.txt | 6 +++--- Documentation/RCU/whatisRCU.txt | 3 ++- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/i915_gem_request.h | 2 +- drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c | 2 +- fs/jbd2/journal.c | 2 +- fs/signalfd.c | 2 +- include/linux/dma-fence.h | 4 ++-- include/linux/slab.h | 6 ++++-- include/net/sock.h | 2 +- kernel/fork.c | 4 ++-- kernel/signal.c | 2 +- mm/kasan/kasan.c | 6 +++--- mm/kmemcheck.c | 2 +- mm/rmap.c | 4 ++-- mm/slab.c | 6 +++--- mm/slab.h | 4 ++-- mm/slab_common.c | 6 +++--- mm/slob.c | 6 +++--- mm/slub.c | 12 ++++++------ net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/llc/af_llc.c | 2 +- net/llc/llc_conn.c | 4 ++-- net/llc/llc_sap.c | 2 +- net/netfilter/nf_conntrack_core.c | 8 ++++---- net/smc/af_smc.c | 2 +- 30 files changed, 57 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index f773a264ae02..1672573b037a 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -17,7 +17,7 @@ rcu_dereference.txt rcubarrier.txt - RCU and Unloadable Modules rculist_nulls.txt - - RCU list primitives for use with SLAB_DESTROY_BY_RCU + - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU rcuref.txt - Reference-count design for elements of lists/arrays protected by RCU rcu.txt diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt index 18f9651ff23d..8151f0195f76 100644 --- a/Documentation/RCU/rculist_nulls.txt +++ b/Documentation/RCU/rculist_nulls.txt @@ -1,5 +1,5 @@ Using hlist_nulls to protect read-mostly linked lists and -objects using SLAB_DESTROY_BY_RCU allocations. +objects using SLAB_TYPESAFE_BY_RCU allocations. Please read the basics in Documentation/RCU/listRCU.txt @@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way to solve following problem : A typical RCU linked list managing objects which are -allocated with SLAB_DESTROY_BY_RCU kmem_cache can +allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can use following algos : 1) Lookup algo @@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock() 3) Remove algo -------------- Nothing special here, we can use a standard RCU hlist deletion. -But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused +But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused very very fast (before the end of RCU grace period) if (put_last_reference_on(obj) { diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 5cbd8b2395b8..91c912e86915 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -925,7 +925,8 @@ d. Do you need RCU grace periods to complete even in the face e. Is your workload too update-intensive for normal use of RCU, but inappropriate for other synchronization mechanisms? - If so, consider SLAB_DESTROY_BY_RCU. But please be careful! + If so, consider SLAB_TYPESAFE_BY_RCU (which was originally + named SLAB_DESTROY_BY_RCU). But please be careful! f. 
Do you need read-side critical sections that are respected even though they are in the middle of the idle loop, during diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 6908123162d1..3b668895ac24 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -4552,7 +4552,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv) dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_DESTROY_BY_RCU); + SLAB_TYPESAFE_BY_RCU); if (!dev_priv->requests) goto err_vmas; diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index ea511f06efaf..9ee2750e1dde 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -493,7 +493,7 @@ static inline struct drm_i915_gem_request * __i915_gem_active_get_rcu(const struct i915_gem_active *active) { /* Performing a lockless retrieval of the active request is super - * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing + * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing * slab of request objects will not be freed whilst we hold the * RCU read lock. It does not guarantee that the request itself * will not be freed and then *reused*. Viz, diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c index 12647af5a336..e7fb47e84a93 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c @@ -1071,7 +1071,7 @@ int ldlm_init(void) ldlm_lock_slab = kmem_cache_create("ldlm_locks", sizeof(struct ldlm_lock), 0, SLAB_HWCACHE_ALIGN | - SLAB_DESTROY_BY_RCU, NULL); + SLAB_TYPESAFE_BY_RCU, NULL); if (!ldlm_lock_slab) { kmem_cache_destroy(ldlm_resource_slab); return -ENOMEM; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a1a359bfcc9c..7f8f962454e5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2340,7 +2340,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, + SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { diff --git a/fs/signalfd.c b/fs/signalfd.c index 270221fcef42..7e3d71109f51 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) /* * The lockless check can race with remove_wait_queue() in progress, * but in this case its caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. */ if (likely(!waitqueue_active(wqh))) return; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..a5195a7d6f77 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence) * * Function returns NULL if no refcount could be obtained, or the fence. * This function handles acquiring a reference to a fence that may be - * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), + * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU), * so long as the caller is using RCU on the pointer to the fence. 
* * An alternative mechanism is to employ a seqlock to protect a bunch of @@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) * have successfully acquire a reference to it. If it no * longer matches, we are holding a reference to some other * reallocated pointer. This is possible if the allocator - * is using a freelist like SLAB_DESTROY_BY_RCU where the + * is using a freelist like SLAB_TYPESAFE_BY_RCU where the * fence remains valid for the RCU grace period, but it * may be reallocated. When using such allocators, we are * responsible for ensuring the reference we get is to diff --git a/include/linux/slab.h b/include/linux/slab.h index 3c37a8c51921..04a7f7993e67 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -28,7 +28,7 @@ #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ /* - * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! + * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() @@ -61,8 +61,10 @@ * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. + * + * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ diff --git a/include/net/sock.h b/include/net/sock.h index 5e5997654db6..59cdccaa30e7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -993,7 +993,7 @@ struct smc_hashinfo; struct module; /* - * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..9330ce24f1bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1313,7 +1313,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* - * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). 
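The sighand_struct code touched below is the canonical user of this flag, and the pattern generalizes as in the following hedged sketch (illustrative only, not derived from this patch; struct obj, obj_cache, obj_lookup() and the hash-list layout are invented). Because SLAB_TYPESAFE_BY_RCU only guarantees that the memory remains an object of the same type during the RCU read-side critical section, a lookup must take a reference with atomic_inc_not_zero() and then re-check the object's identity, since the slot may have been freed and recycled into a different object in the meantime:

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/atomic.h>

struct obj {
	unsigned long key;
	atomic_t refcnt;
	struct hlist_node node;
};

static struct kmem_cache *obj_cache;

static int __init obj_cache_init(void)
{
	obj_cache = kmem_cache_create("obj_cache", sizeof(struct obj), 0,
				      SLAB_TYPESAFE_BY_RCU, NULL);
	return obj_cache ? 0 : -ENOMEM;
}

static struct obj *obj_lookup(struct hlist_head *head, unsigned long key)
{
	struct obj *o;

	rcu_read_lock();
	hlist_for_each_entry_rcu(o, head, node) {
		if (o->key != key)
			continue;
		if (!atomic_inc_not_zero(&o->refcnt))
			continue;		/* object is being freed */
		if (o->key != key) {		/* recycled into another object? */
			atomic_dec(&o->refcnt);	/* real code: use its put path */
			continue;
		}
		rcu_read_unlock();
		return o;			/* caller must drop the reference */
	}
	rcu_read_unlock();
	return NULL;
}

The re-check after taking the reference is what turns the flag's type safety back into identity safety for the caller.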
*/ kmem_cache_free(sighand_cachep, sighand); @@ -2144,7 +2144,7 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, diff --git a/kernel/signal.c b/kernel/signal.c index 7e59ebc2c25e..6df5f72158e4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } /* * This sighand can be already freed and even reused, but - * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b27195e38b..4b20061102f6 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, *size += sizeof(struct kasan_alloc_meta); /* Add free meta. */ - if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || cache->object_size < sizeof(struct kasan_free_meta)) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); @@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return; kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); @@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) s8 shadow_byte; /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 5bf191756a4a..2d5959c5f7c5 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) { /* TODO: RCU freeing is unsupported for now; hide false positives. */ - if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) kmemcheck_mark_freed(object, size); } diff --git a/mm/rmap.c b/mm/rmap.c index 49ed681ccc7b..8ffd59df8a3f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); @@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) * If this page is still mapped, then its anon_vma cannot have been * freed. 
But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: - * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() + * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!page_mapped(page)) { diff --git a/mm/slab.c b/mm/slab.c index 807d86c76908..93c827864862 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&page->rcu_head, kmem_rcu_free); else kmem_freepages(cachep, page); @@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, cachep->num = 0; - if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) return false; left = calculate_slab_order(cachep, size, @@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; - if (!(flags & SLAB_DESTROY_BY_RCU)) + if (!(flags & SLAB_TYPESAFE_BY_RCU)) flags |= SLAB_POISON; #endif #endif diff --git a/mm/slab.h b/mm/slab.h index 65e7c3fcac72..9cfcf099709c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBUG_SLAB) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) @@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * back there or track user information then we can * only use the space before that information. */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; /* * Else we can use all the padding etc for the allocation diff --git a/mm/slab_common.c b/mm/slab_common.c index 09d0e849b07f..01a0fe2eb332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, * Set of flags that will prevent slab merging */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ @@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) struct kmem_cache *s, *s2; /* - * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the + * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the * @slab_caches_to_rcu_destroy list. 
The slab pages are freed * through RCU and and the associated kmem_cache are dereferenced * while freeing the pages, so the kmem_caches should be freed only @@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s) memcg_unlink_cache(s); list_del(&s->list); - if (s->flags & SLAB_DESTROY_BY_RCU) { + if (s->flags & SLAB_TYPESAFE_BY_RCU) { list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { diff --git a/mm/slob.c b/mm/slob.c index eac04d4357ec..1bae78d71096 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp) /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which - * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free + * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free * the block using call_rcu. */ struct slob_rcu { @@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - if (flags & SLAB_DESTROY_BY_RCU) { + if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } @@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); slob_rcu->size = c->size; diff --git a/mm/slub.c b/mm/slub.c index 7f4bc7027ed5..57e5156f02be 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { - if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { struct rcu_head *head; if (need_reserve_slab_rcu) { @@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, * slab_free_freelist_hook() could have put the items into quarantine. * If so, no need to free them. */ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) return; do_slab_free(s, page, head, tail, cnt, addr); } @@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the slab may touch the object after free or before allocation * then we should never poison the object itself. 
*/ - if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && !s->ctor) s->flags |= __OBJECT_POISON; else @@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->inuse = size; - if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor)) { /* * Relocate free pointer after the object if it is not @@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; - if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) @@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma); static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); } SLAB_ATTR_RO(destroy_by_rcu); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 409d0cfd3447..90210a0e3888 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -950,7 +950,7 @@ static struct proto dccp_v4_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 233b57367758..b4019a5e4551 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1012,7 +1012,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9a89b8deafae..82c89abeb989 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2398,7 +2398,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 60a5295a7de6..bdbc4327ebee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1919,7 +1919,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = &tcp_hashinfo, diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 06186d608a27..d096ca563054 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -142,7 +142,7 @@ static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 8bc5a1bd2d45..9b02c13d258b 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -506,7 +506,7 @@ static struct sock 
*__llc_lookup_established(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_estab_match(sap, daddr, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || @@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_listener_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 5404d0d195cc..63b6ab056370 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 071b97fcbefb..fdcdac7916b2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -914,7 +914,7 @@ static unsigned int early_drop_list(struct net *net, continue; /* kill only if still in same netns -- might have moved due to - * SLAB_DESTROY_BY_RCU rules. + * SLAB_TYPESAFE_BY_RCU rules. * * We steal the timer reference. If that fails timer has * already fired or someone else deleted it. Just drop ref @@ -1069,7 +1069,7 @@ __nf_conntrack_alloc(struct net *net, /* * Do not use kmem_cache_zalloc(), as this cache uses - * SLAB_DESTROY_BY_RCU. + * SLAB_TYPESAFE_BY_RCU. */ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) @@ -1114,7 +1114,7 @@ void nf_conntrack_free(struct nf_conn *ct) struct net *net = nf_ct_net(ct); /* A freed object has refcnt == 0, that's - * the golden rule for SLAB_DESTROY_BY_RCU + * the golden rule for SLAB_TYPESAFE_BY_RCU */ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); @@ -1878,7 +1878,7 @@ int nf_conntrack_init_start(void) nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), NFCT_INFOMASK + 1, - SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 85837ab90e89..d34bbd6d8f38 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -101,7 +101,7 @@ struct proto smc_proto = { .unhash = smc_unhash_sk, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto); -- cgit v1.2.3-70-g09d2 From 468d01bec544286bb5283f012b95b5b84636565b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Feb 2017 11:40:15 -0800 Subject: types: Update obsolete callback_head comment The comment header for callback_head (and thus for rcu_head) states that the bottom two bits of a pointer to these structures must be zero. This is obsolete: The new requirement is that only the bottom bit need be zero. This commit therefore updates this comment. Signed-off-by: Paul E. 
McKenney --- include/linux/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 1e7bd24848fc..258099a4ed82 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -209,7 +209,7 @@ struct ustat { * naturally due ABI requirements, but some architectures (like CRIS) have * weird ABI and we need to ask it explicitly. * - * The alignment is required to guarantee that bits 0 and 1 of @next will be + * The alignment is required to guarantee that bit 0 of @next will be * clear under normal conditions -- as long as we use call_rcu(), * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. * -- cgit v1.2.3-70-g09d2 From 48ac34666ff76843d8743db1cc78b303759916f1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 27 Feb 2017 21:14:19 +0200 Subject: hlist_add_tail_rcu disable sparse warning sparse is unhappy about this code in hlist_add_tail_rcu: struct hlist_node *i, *last = NULL; for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) last = i; This is because hlist_next_rcu and hlist_next_rcu return __rcu pointers. It's a false positive - it's a write side primitive and so does not need to be called in a read side critical section. The following trivial patch disables the warning without changing the behaviour in any way. Note: __hlist_for_each_rcu would also remove the warning but it would be confusing since it calls rcu_derefence and is designed to run in the rcu read side critical section. Signed-off-by: Michael S. Tsirkin Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4f7a9561b8c4..b1fd8bf85fdc 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, { struct hlist_node *i, *last = NULL; - for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) + /* Note: write side code, so rcu accessors are not needed. */ + for (i = h->first; i; i = i->next) last = i; if (last) { -- cgit v1.2.3-70-g09d2 From 4a67c9fde04fc1b6752fa68c495310ca3ed29eeb Mon Sep 17 00:00:00 2001 From: RafaÅ‚ MiÅ‚ecki Date: Fri, 31 Mar 2017 11:11:48 +0200 Subject: mtd: use dev_of_node helper in mtd_get_of_node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows better compile-time optimizations with CONFIG_OF disabled. 
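A hedged sketch of the effect (not part of this patch; my_probe(), my_parse_dt() and my_default_setup() are invented names): because dev_of_node() collapses to a constant NULL when CONFIG_OF is disabled, the DT branch below becomes provably dead and the compiler can drop it along with the helper it calls.

#include <linux/of.h>
#include <linux/platform_device.h>

static int my_parse_dt(struct device_node *np)
{
	return 0;	/* placeholder for real DT parsing */
}

static int my_default_setup(struct platform_device *pdev)
{
	return 0;	/* placeholder for non-DT setup */
}

static int my_probe(struct platform_device *pdev)
{
	struct device_node *np = dev_of_node(&pdev->dev);

	if (np)				/* dead code when CONFIG_OF=n */
		return my_parse_dt(np);
	return my_default_setup(pdev);
}

The one-line mtd_get_of_node() change that follows gives MTD users the same constant-folding behavior.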
Signed-off-by: RafaÅ‚ MiÅ‚ecki Acked-by: Boris Brezillon Signed-off-by: Brian Norris --- include/linux/mtd/mtd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index eebdc63cf6af..f8db5b2e4028 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -393,7 +393,7 @@ static inline void mtd_set_of_node(struct mtd_info *mtd, static inline struct device_node *mtd_get_of_node(struct mtd_info *mtd) { - return mtd->dev.of_node; + return dev_of_node(&mtd->dev); } static inline int mtd_oobavail(struct mtd_info *mtd, struct mtd_oob_ops *ops) -- cgit v1.2.3-70-g09d2 From 3f1866779cf8338e1c8bd32e5f6f5424795ef191 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 10 Apr 2017 16:50:58 +0530 Subject: of: dma: Make of_dma_deconfigure() public As part of moving DMA initializing to probe time the of_dma_deconfigure() function will need to be called from different source files. Make it public and move it to drivers/of/device.c where the of_dma_configure() function is. Tested-by: Marek Szyprowski Reviewed-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Laurent Pinchart Signed-off-by: Joerg Roedel --- drivers/of/device.c | 12 ++++++++++++ drivers/of/platform.c | 5 ----- include/linux/of_device.h | 3 +++ 3 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/device.c b/drivers/of/device.c index b1e6bebda3f3..0d378c03e1a4 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -151,6 +151,18 @@ void of_dma_configure(struct device *dev, struct device_node *np) } EXPORT_SYMBOL_GPL(of_dma_configure); +/** + * of_dma_deconfigure - Clean up DMA configuration + * @dev: Device for which to clean up DMA configuration + * + * Clean up all configuration performed by of_dma_configure_ops() and free all + * resources that have been allocated. 
+ */ +void of_dma_deconfigure(struct device *dev) +{ + arch_teardown_dma_ops(dev); +} + int of_device_register(struct platform_device *pdev) { device_initialize(&pdev->dev); diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 5dfcc967dd05..5344db50aa65 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -158,11 +158,6 @@ struct platform_device *of_device_alloc(struct device_node *np, } EXPORT_SYMBOL(of_device_alloc); -static void of_dma_deconfigure(struct device *dev) -{ - arch_teardown_dma_ops(dev); -} - /** * of_platform_device_create_pdata - Alloc, initialize and register an of_device * @np: pointer to node to create device for diff --git a/include/linux/of_device.h b/include/linux/of_device.h index c12dace043f3..af984551cc2b 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -56,6 +56,7 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) } void of_dma_configure(struct device *dev, struct device_node *np); +void of_dma_deconfigure(struct device *dev); #else /* CONFIG_OF */ static inline int of_driver_match_device(struct device *dev, @@ -105,6 +106,8 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) } static inline void of_dma_configure(struct device *dev, struct device_node *np) {} +static inline void of_dma_deconfigure(struct device *dev) +{} #endif /* CONFIG_OF */ #endif /* _LINUX_OF_DEVICE_H */ -- cgit v1.2.3-70-g09d2 From 09515ef5ddad71c7820e5e428da418b709feeb26 Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Mon, 10 Apr 2017 16:51:01 +0530 Subject: of/acpi: Configure dma operations at probe time for platform/amba/pci bus devices Configuring DMA ops at probe time will allow deferring device probe when the IOMMU isn't available yet. The dma_configure for the device is now called from the generic device_attach callback just before the bus/driver probe is called. This way, configuring the DMA ops for the device would be called at the same place for all bus_types, hence the deferred probing mechanism should work for all buses as well. pci_bus_add_devices (platform/amba)(_device_create/driver_register) | | pci_bus_add_device (device_add/driver_register) | | device_attach device_initial_probe | | __device_attach_driver __device_attach_driver | driver_probe_device | really_probe | dma_configure Similarly on the device/driver_unregister path __device_release_driver is called which inturn calls dma_deconfigure. This patch changes the dma ops configuration to probe time for both OF and ACPI based platform/amba/pci bus devices. Tested-by: Marek Szyprowski Tested-by: Hanjun Guo Reviewed-by: Robin Murphy Acked-by: Rob Herring Acked-by: Bjorn Helgaas (drivers/pci part) Acked-by: Rafael J. 
Wysocki Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- drivers/acpi/glue.c | 5 ----- drivers/base/dd.c | 9 +++++++++ drivers/base/dma-mapping.c | 40 ++++++++++++++++++++++++++++++++++++++++ drivers/of/platform.c | 5 +---- drivers/pci/probe.c | 28 ---------------------------- include/linux/dma-mapping.h | 12 ++++++++++++ 6 files changed, 62 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index fb19e1cdb641..c05f24107bfc 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -176,7 +176,6 @@ int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev) struct list_head *physnode_list; unsigned int node_id; int retval = -EINVAL; - enum dev_dma_attr attr; if (has_acpi_companion(dev)) { if (acpi_dev) { @@ -233,10 +232,6 @@ int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev) if (!has_acpi_companion(dev)) ACPI_COMPANION_SET(dev, acpi_dev); - attr = acpi_get_dma_attr(acpi_dev); - if (attr != DEV_DMA_NOT_SUPPORTED) - acpi_dma_configure(dev, attr); - acpi_physnode_link_name(physical_node_name, node_id); retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, physical_node_name); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index a1fbf55c4d3a..4882f06d12df 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -356,6 +357,10 @@ re_probe: if (ret) goto pinctrl_bind_failed; + ret = dma_configure(dev); + if (ret) + goto dma_failed; + if (driver_sysfs_add(dev)) { printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n", __func__, dev_name(dev)); @@ -417,6 +422,8 @@ re_probe: goto done; probe_failed: + dma_deconfigure(dev); +dma_failed: if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_DRIVER_NOT_BOUND, dev); @@ -826,6 +833,8 @@ static void __device_release_driver(struct device *dev, struct device *parent) drv->remove(dev); device_links_driver_cleanup(dev); + dma_deconfigure(dev); + devres_release_all(dev); dev->driver = NULL; dev_set_drvdata(dev, NULL); diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index efd71cf4fdea..449b948c7427 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -7,9 +7,11 @@ * This file is released under the GPLv2. 
*/ +#include #include #include #include +#include #include #include @@ -341,3 +343,41 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) vunmap(cpu_addr); } #endif + +/* + * Common configuration to enable DMA API use for a device + */ +#include + +int dma_configure(struct device *dev) +{ + struct device *bridge = NULL, *dma_dev = dev; + enum dev_dma_attr attr; + + if (dev_is_pci(dev)) { + bridge = pci_get_host_bridge_device(to_pci_dev(dev)); + dma_dev = bridge; + if (IS_ENABLED(CONFIG_OF) && dma_dev->parent && + dma_dev->parent->of_node) + dma_dev = dma_dev->parent; + } + + if (dma_dev->of_node) { + of_dma_configure(dev, dma_dev->of_node); + } else if (has_acpi_companion(dma_dev)) { + attr = acpi_get_dma_attr(to_acpi_device_node(dma_dev->fwnode)); + if (attr != DEV_DMA_NOT_SUPPORTED) + acpi_dma_configure(dev, attr); + } + + if (bridge) + pci_put_host_bridge_device(bridge); + + return 0; +} + +void dma_deconfigure(struct device *dev) +{ + of_dma_deconfigure(dev); + acpi_dma_deconfigure(dev); +} diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 5344db50aa65..2aa4ebbde9cd 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -186,11 +187,9 @@ static struct platform_device *of_platform_device_create_pdata( dev->dev.bus = &platform_bus_type; dev->dev.platform_data = platform_data; - of_dma_configure(&dev->dev, dev->dev.of_node); of_msi_configure(&dev->dev, dev->dev.of_node); if (of_device_add(dev) != 0) { - of_dma_deconfigure(&dev->dev); platform_device_put(dev); goto err_clear_flag; } @@ -248,7 +247,6 @@ static struct amba_device *of_amba_device_create(struct device_node *node, dev_set_name(&dev->dev, "%s", bus_id); else of_device_make_bus_id(&dev->dev); - of_dma_configure(&dev->dev, dev->dev.of_node); /* Allow the HW Peripheral ID to be overridden */ prop = of_get_property(node, "arm,primecell-periphid", NULL); @@ -542,7 +540,6 @@ static int of_platform_device_destroy(struct device *dev, void *data) amba_device_unregister(to_amba_device(dev)); #endif - of_dma_deconfigure(dev); of_node_clear_flag(dev->of_node, OF_POPULATED); of_node_clear_flag(dev->of_node, OF_POPULATED_BUS); return 0; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index dfc9a2794141..5a8dd43db336 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1893,33 +1893,6 @@ static void pci_set_msi_domain(struct pci_dev *dev) dev_set_msi_domain(&dev->dev, d); } -/** - * pci_dma_configure - Setup DMA configuration - * @dev: ptr to pci_dev struct of the PCI device - * - * Function to update PCI devices's DMA configuration using the same - * info from the OF node or ACPI node of host bridge's parent (if any). 
- */ -static void pci_dma_configure(struct pci_dev *dev) -{ - struct device *bridge = pci_get_host_bridge_device(dev); - - if (IS_ENABLED(CONFIG_OF) && - bridge->parent && bridge->parent->of_node) { - of_dma_configure(&dev->dev, bridge->parent->of_node); - } else if (has_acpi_companion(bridge)) { - struct acpi_device *adev = to_acpi_device_node(bridge->fwnode); - enum dev_dma_attr attr = acpi_get_dma_attr(adev); - - if (attr == DEV_DMA_NOT_SUPPORTED) - dev_warn(&dev->dev, "DMA not supported.\n"); - else - acpi_dma_configure(&dev->dev, attr); - } - - pci_put_host_bridge_device(bridge); -} - void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) { int ret; @@ -1933,7 +1906,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) dev->dev.dma_mask = &dev->dma_mask; dev->dev.dma_parms = &dev->dma_parms; dev->dev.coherent_dma_mask = 0xffffffffull; - pci_dma_configure(dev); pci_set_dma_max_seg_size(dev, 65536); pci_set_dma_seg_boundary(dev, 0xffffffff); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0977317c6835..4f3eecedca2d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -728,6 +728,18 @@ dma_mark_declared_memory_occupied(struct device *dev, } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ +#ifdef CONFIG_HAS_DMA +int dma_configure(struct device *dev); +void dma_deconfigure(struct device *dev); +#else +static inline int dma_configure(struct device *dev) +{ + return 0; +} + +static inline void dma_deconfigure(struct device *dev) {} +#endif + /* * Managed DMA API */ -- cgit v1.2.3-70-g09d2 From 7b07cbefb68d486febf47e13b570fed53d9296b4 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 10 Apr 2017 16:51:02 +0530 Subject: iommu: of: Handle IOMMU lookup failure with deferred probing or error Failures to look up an IOMMU when parsing the DT iommus property need to be handled separately from the .of_xlate() failures to support deferred probing. The lack of a registered IOMMU can be caused by the lack of a driver for the IOMMU, the IOMMU device probe not having been performed yet, having been deferred, or having failed. The first case occurs when the device tree describes the bus master and IOMMU topology correctly but no device driver exists for the IOMMU yet or the device driver has not been compiled in. Return NULL, the caller will configure the device without an IOMMU. The second and third cases are handled by deferring the probe of the bus master device which will eventually get reprobed after the IOMMU. The last case is currently handled by deferring the probe of the bus master device as well. A mechanism to either configure the bus master device without an IOMMU or to fail the bus master device probe depending on whether the IOMMU is optional or mandatory would be a good enhancement. 
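In other words, the convention after this change is: a NULL return means "no IOMMU, configure the master without one", while an ERR_PTR (typically -EPROBE_DEFER) is propagated so that the bus master's probe is retried once the IOMMU driver has bound. A rough sketch of the caller side (the real code is in the of_dma_configure() hunk below):

	iommu = of_iommu_configure(dev, np);
	if (IS_ERR(iommu))			/* e.g. -EPROBE_DEFER */
		return PTR_ERR(iommu);		/* retry this probe later */

	/* iommu may legitimately be NULL here: no (or no usable) IOMMU. */
	arch_setup_dma_ops(dev, dma_addr, size, iommu, coherent);
	return 0;
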
Tested-by: Marek Szyprowski Reviewed-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Laurent Pinchart Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- drivers/base/dma-mapping.c | 5 +++-- drivers/iommu/of_iommu.c | 4 ++-- drivers/of/device.c | 9 +++++++-- include/linux/of_device.h | 9 ++++++--- 4 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index 449b948c7427..82bd45ced7ff 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -353,6 +353,7 @@ int dma_configure(struct device *dev) { struct device *bridge = NULL, *dma_dev = dev; enum dev_dma_attr attr; + int ret = 0; if (dev_is_pci(dev)) { bridge = pci_get_host_bridge_device(to_pci_dev(dev)); @@ -363,7 +364,7 @@ int dma_configure(struct device *dev) } if (dma_dev->of_node) { - of_dma_configure(dev, dma_dev->of_node); + ret = of_dma_configure(dev, dma_dev->of_node); } else if (has_acpi_companion(dma_dev)) { attr = acpi_get_dma_attr(to_acpi_device_node(dma_dev->fwnode)); if (attr != DEV_DMA_NOT_SUPPORTED) @@ -373,7 +374,7 @@ int dma_configure(struct device *dev) if (bridge) pci_put_host_bridge_device(bridge); - return 0; + return ret; } void dma_deconfigure(struct device *dev) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index c8be889f5206..9f44ee8ea1bc 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -236,7 +236,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, ops = ERR_PTR(err); } - return IS_ERR(ops) ? NULL : ops; + return ops; } static int __init of_iommu_init(void) @@ -247,7 +247,7 @@ static int __init of_iommu_init(void) for_each_matching_node_and_match(np, matches, &match) { const of_iommu_init_fn init_fn = match->data; - if (init_fn(np)) + if (init_fn && init_fn(np)) pr_err("Failed to initialise IOMMU %s\n", of_node_full_name(np)); } diff --git a/drivers/of/device.c b/drivers/of/device.c index e1ae9e7104a1..8bd3d8c09435 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -82,7 +82,7 @@ int of_device_add(struct platform_device *ofdev) * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. */ -void of_dma_configure(struct device *dev, struct device_node *np) +int of_dma_configure(struct device *dev, struct device_node *np) { u64 dma_addr, paddr, size; int ret; @@ -123,7 +123,7 @@ void of_dma_configure(struct device *dev, struct device_node *np) if (!size) { dev_err(dev, "Adjusted size 0x%llx invalid\n", size); - return; + return -EINVAL; } dev_dbg(dev, "dma_pfn_offset(%#08lx)\n", offset); } @@ -144,10 +144,15 @@ void of_dma_configure(struct device *dev, struct device_node *np) coherent ? " " : " not "); iommu = of_iommu_configure(dev, np); + if (IS_ERR(iommu)) + return PTR_ERR(iommu); + dev_dbg(dev, "device is%sbehind an iommu\n", iommu ?
" " : " not "); arch_setup_dma_ops(dev, dma_addr, size, iommu, coherent); + + return 0; } EXPORT_SYMBOL_GPL(of_dma_configure); diff --git a/include/linux/of_device.h b/include/linux/of_device.h index af984551cc2b..2cacdd81062e 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -55,7 +55,7 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) return of_node_get(cpu_dev->of_node); } -void of_dma_configure(struct device *dev, struct device_node *np); +int of_dma_configure(struct device *dev, struct device_node *np); void of_dma_deconfigure(struct device *dev); #else /* CONFIG_OF */ @@ -104,8 +104,11 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) { return NULL; } -static inline void of_dma_configure(struct device *dev, struct device_node *np) -{} + +static inline int of_dma_configure(struct device *dev, struct device_node *np) +{ + return 0; +} static inline void of_dma_deconfigure(struct device *dev) {} #endif /* CONFIG_OF */ -- cgit v1.2.3-70-g09d2 From 5a1bb638d5677053c7addcb228b56da6fccb5d68 Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Mon, 10 Apr 2017 16:51:03 +0530 Subject: drivers: acpi: Handle IOMMU lookup failure with deferred probing or error This is an equivalent to the DT's handling of the iommu master's probe with deferred probing when the corrsponding iommu is not probed yet. The lack of a registered IOMMU can be caused by the lack of a driver for the IOMMU, the IOMMU device probe not having been performed yet, having been deferred, or having failed. The first case occurs when the firmware describes the bus master and IOMMU topology correctly but no device driver exists for the IOMMU yet or the device driver has not been compiled in. Return NULL, the caller will configure the device without an IOMMU. The second and third cases are handled by deferring the probe of the bus master device which will eventually get reprobed after the IOMMU. The last case is currently handled by deferring the probe of the bus master device as well. A mechanism to either configure the bus master device without an IOMMU or to fail the bus master device probe depending on whether the IOMMU is optional or mandatory would be a good enhancement. Tested-by: Hanjun Guo Reviewed-by: Robin Murphy [Lorenzo: Added fixes for dma_coherent_mask overflow, acpi_dma_configure called multiple times for same device] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/iort.c | 33 ++++++++++++++++++++++++++++++++- drivers/acpi/scan.c | 11 ++++++++--- drivers/base/dma-mapping.c | 2 +- include/acpi/acpi_bus.h | 2 +- include/linux/acpi.h | 7 +++++-- 5 files changed, 47 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 3dd9ec372dae..e323ece0314d 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -543,6 +543,14 @@ static const struct iommu_ops *iort_iommu_xlate(struct device *dev, const struct iommu_ops *ops = NULL; int ret = -ENODEV; struct fwnode_handle *iort_fwnode; + struct iommu_fwspec *fwspec = dev->iommu_fwspec; + + /* + * If we already translated the fwspec there + * is nothing left to do, return the iommu_ops. 
+ */ + if (fwspec && fwspec->ops) + return fwspec->ops; if (node) { iort_fwnode = iort_get_fwnode(node); @@ -550,8 +558,17 @@ static const struct iommu_ops *iort_iommu_xlate(struct device *dev, return NULL; ops = iommu_ops_from_fwnode(iort_fwnode); + /* + * If the ops look-up fails, this means that either + * the SMMU drivers have not been probed yet or that + * the SMMU drivers are not built in the kernel; + * Depending on whether the SMMU drivers are built-in + * in the kernel or not, defer the IOMMU configuration + * or just abort it. + */ if (!ops) - return NULL; + return iort_iommu_driver_enabled(node->type) ? + ERR_PTR(-EPROBE_DEFER) : NULL; ret = arm_smmu_iort_xlate(dev, streamid, iort_fwnode, ops); } @@ -625,12 +642,26 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) while (parent) { ops = iort_iommu_xlate(dev, parent, streamid); + if (IS_ERR_OR_NULL(ops)) + return ops; parent = iort_node_get_id(node, &streamid, IORT_IOMMU_TYPE, i++); } } + /* + * If we have reason to believe the IOMMU driver missed the initial + * add_device callback for dev, replay it to get things in order. + */ + if (!IS_ERR_OR_NULL(ops) && ops->add_device && + dev->bus && !dev->iommu_group) { + int err = ops->add_device(dev); + + if (err) + ops = ERR_PTR(err); + } + return ops; } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 192691880d55..2a513cce332e 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1373,20 +1373,25 @@ enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) * @dev: The pointer to the device * @attr: device dma attributes */ -void acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) +int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { const struct iommu_ops *iommu; + u64 size; iort_set_dma_mask(dev); iommu = iort_iommu_configure(dev); + if (IS_ERR(iommu)) + return PTR_ERR(iommu); + size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1); /* * Assume dma valid range starts at 0 and covers the whole * coherent_dma_mask. 
*/ - arch_setup_dma_ops(dev, 0, dev->coherent_dma_mask + 1, iommu, - attr == DEV_DMA_COHERENT); + arch_setup_dma_ops(dev, 0, size, iommu, attr == DEV_DMA_COHERENT); + + return 0; } EXPORT_SYMBOL_GPL(acpi_dma_configure); diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index 82bd45ced7ff..755a2b5354c5 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -368,7 +368,7 @@ int dma_configure(struct device *dev) } else if (has_acpi_companion(dma_dev)) { attr = acpi_get_dma_attr(to_acpi_device_node(dma_dev->fwnode)); if (attr != DEV_DMA_NOT_SUPPORTED) - acpi_dma_configure(dev, attr); + ret = acpi_dma_configure(dev, attr); } if (bridge) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index ef0ae8aaa567..2a9a5de0fb00 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -575,7 +575,7 @@ struct acpi_pci_root { bool acpi_dma_supported(struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); -void acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); +int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); void acpi_dma_deconfigure(struct device *dev); struct acpi_device *acpi_find_child_device(struct acpi_device *parent, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9b05886f9773..79d06ef654c9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -762,8 +762,11 @@ static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) return DEV_DMA_NOT_SUPPORTED; } -static inline void acpi_dma_configure(struct device *dev, - enum dev_dma_attr attr) { } +static inline int acpi_dma_configure(struct device *dev, + enum dev_dma_attr attr) +{ + return 0; +} static inline void acpi_dma_deconfigure(struct device *dev) { } -- cgit v1.2.3-70-g09d2 From 316ca8804ea84a782d5ba2163711ebb22116ff5a Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Mon, 10 Apr 2017 16:51:06 +0530 Subject: ACPI/IORT: Remove linker section for IORT entries probing The IORT linker section introduced by commit 34ceea275f62 ("ACPI/IORT: Introduce linker section for IORT entries probing") was needed to make sure SMMU drivers are registered (and therefore probed) in the kernel before devices using the SMMU have a chance to probe in turn. Through the introduction of deferred IOMMU configuration the linker section based IORT probing infrastructure is not needed any longer, in that device/SMMU probe dependencies are managed through the probe deferral mechanism, making the IORT linker section infrastructure unused, so that it can be removed. Remove the unused IORT linker section probing infrastructure from the kernel to complete the ACPI IORT IOMMU configure probe deferral mechanism implementation. 
Tested-by: Hanjun Guo Reviewed-by: Robin Murphy Signed-off-by: Lorenzo Pieralisi Cc: Sricharan R Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/iort.c | 2 -- include/asm-generic/vmlinux.lds.h | 1 - include/linux/acpi_iort.h | 3 --- 3 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index e323ece0314d..e7b1940ff13b 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1000,6 +1000,4 @@ void __init acpi_iort_init(void) } iort_init_platform_devices(); - - acpi_probe_device_table(iort); } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0968d13b3885..9faa26c41c14 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -566,7 +566,6 @@ IRQCHIP_OF_MATCH_TABLE() \ ACPI_PROBE_TABLE(irqchip) \ ACPI_PROBE_TABLE(clksrc) \ - ACPI_PROBE_TABLE(iort) \ EARLYCON_TABLE() #define INIT_TEXT \ diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 77e08099e554..f167e1d045ff 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -52,7 +52,4 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) { return NULL; } #endif -#define IORT_ACPI_DECLARE(name, table_id, fn) \ - ACPI_DECLARE_PROBE_ENTRY(iort, name, table_id, 0, NULL, 0, fn) - #endif /* __ACPI_IORT_H__ */ -- cgit v1.2.3-70-g09d2 From 49a57ef7f8492ef985ee1ecdb927ca78a6b2f308 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 12 Apr 2017 00:21:27 -0500 Subject: iommu/omap: Drop legacy-style device support All the supported boards that have OMAP IOMMU devices do support DT boot only now. So, drop the support for the non-DT legacy-style devices from the OMAP IOMMU driver. Couple of the fields from the iommu platform data would no longer be required, so they have also been cleaned up. The IOMMU platform data is still needed though for performing reset management properly in a multi-arch environment. 
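For reference, the platform data that survives carries only what the driver needs to toggle the MMU hard-reset lines through architecture code; roughly, the usage pattern looks like the sketch below (illustrative only, not the exact driver code):

	struct iommu_platform_data *pdata = dev_get_platdata(obj->dev);

	if (pdata && pdata->deassert_reset &&
	    pdata->deassert_reset(pdev, pdata->reset_name))
		dev_err(obj->dev, "deassert_reset failed\n");
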
Signed-off-by: Suman Anna Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 30 ++++++++++++++---------------- include/linux/platform_data/iommu-omap.h | 3 --- 2 files changed, 14 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 54556713c8d1..febd4fbe3445 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -928,28 +928,26 @@ static int omap_iommu_probe(struct platform_device *pdev) int irq; struct omap_iommu *obj; struct resource *res; - struct iommu_platform_data *pdata = dev_get_platdata(&pdev->dev); struct device_node *of = pdev->dev.of_node; + if (!of) { + pr_err("%s: only DT-based devices are supported\n", __func__); + return -ENODEV; + } + obj = devm_kzalloc(&pdev->dev, sizeof(*obj) + MMU_REG_SIZE, GFP_KERNEL); if (!obj) return -ENOMEM; - if (of) { - obj->name = dev_name(&pdev->dev); - obj->nr_tlb_entries = 32; - err = of_property_read_u32(of, "ti,#tlb-entries", - &obj->nr_tlb_entries); - if (err && err != -EINVAL) - return err; - if (obj->nr_tlb_entries != 32 && obj->nr_tlb_entries != 8) - return -EINVAL; - if (of_find_property(of, "ti,iommu-bus-err-back", NULL)) - obj->has_bus_err_back = MMU_GP_REG_BUS_ERR_BACK_EN; - } else { - obj->nr_tlb_entries = pdata->nr_tlb_entries; - obj->name = pdata->name; - } + obj->name = dev_name(&pdev->dev); + obj->nr_tlb_entries = 32; + err = of_property_read_u32(of, "ti,#tlb-entries", &obj->nr_tlb_entries); + if (err && err != -EINVAL) + return err; + if (obj->nr_tlb_entries != 32 && obj->nr_tlb_entries != 8) + return -EINVAL; + if (of_find_property(of, "ti,iommu-bus-err-back", NULL)) + obj->has_bus_err_back = MMU_GP_REG_BUS_ERR_BACK_EN; obj->dev = &pdev->dev; obj->ctx = (void *)obj + sizeof(*obj); diff --git a/include/linux/platform_data/iommu-omap.h b/include/linux/platform_data/iommu-omap.h index 0496d171700a..a40fc0f4f9de 100644 --- a/include/linux/platform_data/iommu-omap.h +++ b/include/linux/platform_data/iommu-omap.h @@ -30,10 +30,7 @@ struct omap_iommu_arch_data { }; struct iommu_platform_data { - const char *name; const char *reset_name; - int nr_tlb_entries; - int (*assert_reset)(struct platform_device *pdev, const char *name); int (*deassert_reset)(struct platform_device *pdev, const char *name); }; -- cgit v1.2.3-70-g09d2 From e73b7afe4e8ca5ec4304a9e1d5009755a85fff91 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 12 Apr 2017 00:21:28 -0500 Subject: iommu/omap: Move data structures to omap-iommu.h The internal data-structures are scattered over various header and C files. Consolidate them in omap-iommu.h. While at this, add the kerneldoc comment for the missing iommu domain variable and revise the iommu_arch_data name. 
Signed-off-by: Joerg Roedel [s-anna@ti.com: revise kerneldoc comments] Signed-off-by: Suman Anna Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 16 ---------------- drivers/iommu/omap-iommu.h | 33 ++++++++++++++++++++++++++++++++ include/linux/platform_data/iommu-omap.h | 17 ---------------- 3 files changed, 33 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index febd4fbe3445..c1739a650654 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -42,22 +42,6 @@ /* bitmap of the page sizes currently supported */ #define OMAP_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M) -/** - * struct omap_iommu_domain - omap iommu domain - * @pgtable: the page table - * @iommu_dev: an omap iommu device attached to this domain. only a single - * iommu device can be attached for now. - * @dev: Device using this domain. - * @lock: domain lock, should be taken when attaching/detaching - */ -struct omap_iommu_domain { - u32 *pgtable; - struct omap_iommu *iommu_dev; - struct device *dev; - spinlock_t lock; - struct iommu_domain domain; -}; - #define MMU_LOCK_BASE_SHIFT 10 #define MMU_LOCK_BASE_MASK (0x1f << MMU_LOCK_BASE_SHIFT) #define MMU_LOCK_BASE(x) \ diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h index 59628e5017b4..3c33608f48ca 100644 --- a/drivers/iommu/omap-iommu.h +++ b/drivers/iommu/omap-iommu.h @@ -14,6 +14,7 @@ #define _OMAP_IOMMU_H #include +#include #define for_each_iotlb_cr(obj, n, __i, cr) \ for (__i = 0; \ @@ -27,6 +28,23 @@ struct iotlb_entry { u32 endian, elsz, mixed; }; +/** + * struct omap_iommu_domain - omap iommu domain + * @pgtable: the page table + * @iommu_dev: an omap iommu device attached to this domain. only a single + * iommu device can be attached for now. + * @dev: Device using this domain. + * @lock: domain lock, should be taken when attaching/detaching + * @domain: generic domain handle used by iommu core code + */ +struct omap_iommu_domain { + u32 *pgtable; + struct omap_iommu *iommu_dev; + struct device *dev; + spinlock_t lock; + struct iommu_domain domain; +}; + struct omap_iommu { const char *name; void __iomem *regbase; @@ -52,6 +70,21 @@ struct omap_iommu { u32 id; }; +/** + * struct omap_iommu_arch_data - omap iommu private data + * @name: name of the iommu device + * @iommu_dev: handle of the iommu device + * + * This is an omap iommu private data object, which binds an iommu user + * to its iommu device. This object should be placed at the iommu user's + * dev_archdata so generic IOMMU API can be used without having to + * utilize omap-specific plumbing anymore. + */ +struct omap_iommu_arch_data { + const char *name; + struct omap_iommu *iommu_dev; +}; + struct cr_regs { u32 cam; u32 ram; diff --git a/include/linux/platform_data/iommu-omap.h b/include/linux/platform_data/iommu-omap.h index a40fc0f4f9de..e8b12dbf6170 100644 --- a/include/linux/platform_data/iommu-omap.h +++ b/include/linux/platform_data/iommu-omap.h @@ -12,23 +12,6 @@ #include -#define MMU_REG_SIZE 256 - -/** - * struct iommu_arch_data - omap iommu private data - * @name: name of the iommu device - * @iommu_dev: handle of the iommu device - * - * This is an omap iommu private data object, which binds an iommu user - * to its iommu device. This object should be placed at the iommu user's - * dev_archdata so generic IOMMU API can be used without having to - * utilize omap-specific plumbing anymore. 
- */ -struct omap_iommu_arch_data { - const char *name; - struct omap_iommu *iommu_dev; -}; - struct iommu_platform_data { const char *reset_name; int (*assert_reset)(struct platform_device *pdev, const char *name); int (*deassert_reset)(struct platform_device *pdev, const char *name); }; -- cgit v1.2.3-70-g09d2 From 518662e0fcb9fa241fe90a337b59bc5066b2a930 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2017 12:22:09 +1000 Subject: NFS: fix usage of mempools. When passed GFP flags that allow sleeping (such as GFP_NOIO), mempool_alloc() will never return NULL, it will wait until memory is available. This means that we don't need to handle failure, but that we do need to ensure one thread doesn't call mempool_alloc() twice on the one pool without queuing or freeing the first allocation. If multiple threads did this during times of high memory pressure, the pool could be exhausted and a deadlock could result. pnfs_generic_alloc_ds_commits() attempts to allocate from the nfs_commit_mempool while already holding an allocation from that pool. This is not safe. So change nfs_commitdata_alloc() to take a flag that indicates whether failure is acceptable. In pnfs_generic_alloc_ds_commits(), accept failure and handle it as we currently do. Elsewhere, do not accept failure, and do not handle it. Even when failure is acceptable, we want to succeed if possible. That means both - using an entry from the pool if there is one - waiting for direct reclaim if there isn't. We call mempool_alloc(GFP_NOWAIT) to achieve the first, then kmem_cache_alloc(GFP_NOIO|__GFP_NORETRY) to achieve the second. Each of these can fail, but together they do the best they can without blocking indefinitely. The objects returned by kmem_cache_alloc() will still be freed by mempool_free(). This is safe as mempool_alloc() uses exactly the same function to allocate objects (since the mempool was created with mempool_create_slab_pool()). The objects returned by mempool_alloc() and kmem_cache_alloc() are indistinguishable so mempool_free() will handle both identically, either adding to the pool or calling kmem_cache_free(). Also, don't test for failure when allocating from nfs_wdata_mempool.
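The mix is safe because the commit mempool is built on top of the same slab cache, so an object obtained either way can go back through mempool_free(). A condensed sketch (pool size and surrounding error handling are assumed; the real allocator is in the write.c hunk below):

	/* Setup: pool and fallback share one slab cache. */
	nfs_cdata_cachep = kmem_cache_create("nfs_commit_data",
					     sizeof(struct nfs_commit_data), 0, 0, NULL);
	nfs_commit_mempool = mempool_create_slab_pool(4 /* assumed minimum */, nfs_cdata_cachep);

	/* Best effort when failure is acceptable: pooled entry, else direct reclaim. */
	p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
	if (!p)
		p = kmem_cache_alloc(nfs_cdata_cachep,
				     GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);

	/* Either object is released the same way. */
	if (p)
		mempool_free(p, nfs_commit_mempool);
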
Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 16 +++++----------- fs/nfs/write.c | 35 +++++++++++++++++++++-------------- include/linux/nfs_fs.h | 2 +- 3 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7250b95549ec..1edf5b84aba5 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -217,7 +217,7 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(); + data = nfs_commitdata_alloc(false); if (!data) break; data->ds_commit_index = i; @@ -283,16 +283,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(); - if (data != NULL) { - data->ds_commit_index = -1; - list_add(&data->pages, &list); - nreq++; - } else { - nfs_retry_commit(mds_pages, NULL, cinfo, 0); - pnfs_generic_retry_commit(cinfo, 0); - return -ENOMEM; - } + data = nfs_commitdata_alloc(true); + data->ds_commit_index = -1; + list_add(&data->pages, &list); + nreq++; } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index abb2c8a3be42..bdfe5a7c5874 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) { - struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + struct nfs_commit_data *p; - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); + if (never_fail) + p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + else { + /* It is OK to do some reclaim, not no safe to wait + * for anything to be returned to the pool. + * mempool_alloc() cannot handle that particular combination, + * so we need two separate attempts. 
+ */ + p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); + if (!p) + p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | + __GFP_NOWARN | __GFP_NORETRY); + if (!p) + return NULL; } + + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); return p; } EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); @@ -82,8 +96,7 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) - memset(p, 0, sizeof(*p)); + memset(p, 0, sizeof(*p)); return p; } @@ -1705,19 +1718,13 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(); - - if (!data) - goto out_bad; + data = nfs_commitdata_alloc(true); /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), data->mds_ops, how, 0); - out_bad: - nfs_retry_commit(head, NULL, cinfo, 0); - return -ENOMEM; } int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 287f34161086..1b29915247b2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -502,7 +502,7 @@ extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(void); +extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); static inline int -- cgit v1.2.3-70-g09d2 From fbe77c30e9abcb3429380dec622439991a718e31 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 19 Apr 2017 10:11:35 -0400 Subject: NFS: move rw_mode to nfs_pageio_header Let's try to have it in a cacheline in nfs4_proc_pgio_rpc_prepare(). 
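The effect in the hot path is to replace a pointer chase through the shared const rw_ops table with a load from the header the rpc_prepare code is already touching; schematically:

	/* Before this change: an extra dereference through the const ops table. */
	fmode_t mode = hdr->rw_ops->rw_mode;

	/* After: the mode is a field of struct nfs_pgio_header itself, filled in
	 * by nfs_readhdr_alloc()/nfs_writehdr_alloc() as FMODE_READ/FMODE_WRITE. */
	fmode_t mode = hdr->rw_mode;
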
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pagelist.c | 8 +++----- fs/nfs/read.c | 9 ++++++--- fs/nfs/write.c | 8 +++++--- include/linux/nfs_page.h | 4 ++-- include/linux/nfs_xdr.h | 1 + 6 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dda19a35ad9e..4e52ac773d1f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4610,7 +4610,7 @@ static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, return 0; if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, - hdr->rw_ops->rw_mode) == -EIO) + hdr->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) return -EIO; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 453255c30fa1..f53610672f03 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -664,11 +664,11 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags) + int io_flags, + gfp_t gfp_flags) { struct nfs_pgio_mirror *new; int i; - gfp_t gfp_flags = GFP_KERNEL; desc->pg_moreio = 0; desc->pg_inode = inode; @@ -688,8 +688,6 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, if (pg_ops->pg_get_mirror_count) { /* until we have a request, we don't have an lseg and no * idea how many mirrors there will be */ - if (desc->pg_rw_ops->rw_mode == FMODE_WRITE) - gfp_flags = GFP_NOIO; new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, sizeof(struct nfs_pgio_mirror), gfp_flags); desc->pg_mirrors_dynamic = new; @@ -753,7 +751,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, if (pagecount <= ARRAY_SIZE(pg_array->page_array)) pg_array->pagevec = pg_array->page_array; else { - if (desc->pg_rw_ops->rw_mode == FMODE_WRITE) + if (hdr->rw_mode == FMODE_WRITE) gfp_flags = GFP_NOIO; pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); if (!pg_array->pagevec) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index defc9233e985..a8421d9dab6a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,7 +35,11 @@ static struct kmem_cache *nfs_rdata_cachep; static struct nfs_pgio_header *nfs_readhdr_alloc(void) { - return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + + if (p) + p->rw_mode = FMODE_READ; + return p; } static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) @@ -64,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0); + server->rsize, 0, GFP_KERNEL); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); @@ -451,7 +455,6 @@ void nfs_destroy_readpagecache(void) } static const struct nfs_rw_ops nfs_rw_read_ops = { - .rw_mode = FMODE_READ, .rw_alloc_header = nfs_readhdr_alloc, .rw_free_header = nfs_readhdr_free, .rw_done = nfs_readpage_done, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e0bccbefbc9e..95ac001b6fdf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -96,7 +96,10 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - memset(p, 0, sizeof(*p)); + if (p) { + memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; + } return p; } @@ -1381,7 +1384,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor 
*pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags); + server->wsize, ioflags, GFP_NOIO); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -2115,7 +2118,6 @@ void nfs_destroy_writepagecache(void) } static const struct nfs_rw_ops nfs_rw_write_ops = { - .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, .rw_done = nfs_writeback_done, diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 957049f72290..6f01e28bba27 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -64,7 +64,6 @@ struct nfs_pageio_ops { }; struct nfs_rw_ops { - const fmode_t rw_mode; struct nfs_pgio_header *(*rw_alloc_header)(void); void (*rw_free_header)(struct nfs_pgio_header *); int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *, @@ -124,7 +123,8 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int how); + int how, + gfp_t gfp_flags); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 348f7c158084..51e27f9746ee 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1427,6 +1427,7 @@ struct nfs_pgio_header { struct list_head pages; struct nfs_page *req; struct nfs_writeverf verf; /* Used for writes */ + fmode_t rw_mode; struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; -- cgit v1.2.3-70-g09d2 From 6ade8694f471d847500c7cec152cc15171cef5d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Apr 2017 17:30:06 -0700 Subject: kvm: Move srcu_struct fields to end of struct kvm Parallelizing SRCU callback handling will increase the size of srcu_struct, which will move the kvm structure's kvm_arch field out of reach of powerpc's current assembly code, which will result in the following sort of build error: arch/powerpc/kvm/book3s_hv_rmhandlers.S:617: Error: operand out of range (0x000000000000b328 is not between 0xffffffffffff8000 and 0x0000000000007fff) This commit moves the srcu_struct fields in the kvm structure to follow the kvm_arch field, which will allow powerpc's assembly code to continue to be able to reach the kvm_arch field. Reported-by: Stephen Rothwell Reported-by: Michael Ellerman Reported-by: kbuild test robot Suggested-by: Paolo Bonzini Signed-off-by: Paul E. McKenney Tested-by: Michael Ellerman Acked-by: Paolo Bonzini [ paulmck: Moved this commit to precede SRCU callback parallelization, and reworded the commit log into future tense, all in the name of bisectability. 
] --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..96c8e29c6442 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -375,8 +375,6 @@ struct kvm { struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; - struct srcu_struct srcu; - struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; /* @@ -429,6 +427,8 @@ struct kvm { struct list_head devices; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; + struct srcu_struct srcu; + struct srcu_struct irq_srcu; }; #define kvm_err(fmt, ...) \ -- cgit v1.2.3-70-g09d2 From da915ad5cf25b5f5d358dd3670c3378d8ae8c03e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Apr 2017 09:01:53 -0700 Subject: srcu: Parallelize callback handling Peter Zijlstra proposed using SRCU to reduce mmap_sem contention [1,2], however, there are workloads that could result in a high volume of concurrent invocations of call_srcu(), which with current SRCU would result in excessive lock contention on the srcu_struct structure's ->queue_lock, which protects SRCU's callback lists. This commit therefore moves SRCU to per-CPU callback lists, thus greatly reducing contention. Because a given SRCU instance no longer has a single centralized callback list, starting grace periods and invoking callbacks are both more complex than in the single-list Classic SRCU implementation. Starting grace periods and handling callbacks are now handled using an srcu_node tree that is in some ways similar to the rcu_node trees used by RCU-bh, RCU-preempt, and RCU-sched (for example, the srcu_node tree shape is controlled by exactly the same Kconfig options and boot parameters that control the shape of the rcu_node tree). In addition, the old per-CPU srcu_array structure is now named srcu_data and contains an rcu_segcblist structure named ->srcu_cblist for its callbacks (and a spinlock to protect this). The srcu_struct gets an srcu_gp_seq that is used to associate callback segments with the corresponding completion-time grace-period number. These completion-time grace-period numbers are propagated up the srcu_node tree so that the grace-period workqueue handler can determine whether additional grace periods are needed on the one hand and where to look for callbacks that are ready to be invoked. The srcu_barrier() function must now wait on all instances of the per-CPU ->srcu_cblist. Because each ->srcu_cblist is protected by ->lock, srcu_barrier() can remotely add the needed callbacks. In theory, it could also remotely start grace periods, but in practice doing so is complex and racy. And interestingly enough, it is never necessary for srcu_barrier() to start a grace period because srcu_barrier() only enqueues a callback when a callback is already present--and it turns out that a grace period has to have already been started for this pre-existing callback. Furthermore, it is only the callback that srcu_barrier() needs to wait on, not any particular grace period. Therefore, a new rcu_segcblist_entrain() function enqueues the srcu_barrier() function's callback into the same segment occupied by the last pre-existing callback in the list. 
The special case where all the pre-existing callbacks are on a different list (because they are in the process of being invoked) is handled by enqueuing srcu_barrier()'s callback into the RCU_DONE_TAIL segment, relying on the done-callbacks check that takes place after all callbacks are inovked. Note that the readers use the same algorithm as before. Note that there is a separate srcu_idx that tells the readers what counter to increment. This unfortunately cannot be combined with srcu_gp_seq because they need to be incremented at different times. This commit introduces some ugly #ifdefs in rcutorture. These will go away when I feel good enough about Tree SRCU to ditch Classic SRCU. Some crude performance comparisons, courtesy of a quickly hacked rcuperf asynchronous-grace-period capability: Callback Queuing Overhead ------------------------- # CPUS Classic SRCU Tree SRCU ------ ------------ --------- 2 0.349 us 0.342 us 16 31.66 us 0.4 us 41 --------- 0.417 us The times are the 90th percentiles, a statistic that was chosen to reject the overheads of the occasional srcu_barrier() call needed to avoid OOMing the test machine. The rcuperf test hangs when running Classic SRCU at 41 CPUs, hence the line of dashes. Despite the hacks to both the rcuperf code and that statistics, this is a convincing demonstration of Tree SRCU's performance and scalability advantages. [1] https://lwn.net/Articles/309030/ [2] https://patchwork.kernel.org/patch/5108281/ Signed-off-by: Paul E. McKenney [ paulmck: Fix initialization if synchronize_srcu_expedited() called first. ] --- include/linux/rcu_segcblist.h | 42 ++- include/linux/srcutree.h | 80 ++++-- kernel/rcu/rcutorture.c | 20 +- kernel/rcu/srcutree.c | 642 +++++++++++++++++++++++++++++++++--------- kernel/rcu/tree.c | 6 + kernel/rcu/tree.h | 8 + 6 files changed, 647 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 74b1e7243955..ced8f313fd05 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -401,6 +401,37 @@ static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; } +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + /* * Extract only the counts from the specified rcu_segcblist structure, * and place them in the specified rcu_cblist structure. 
This function @@ -537,7 +568,8 @@ static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; /* * Find all callbacks whose ->gp_seq numbers indicate that they @@ -582,8 +614,9 @@ static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, * them to complete at the end of the earlier grace period. * * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number at which new callbacks would become - * ready to invoke. + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. */ static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) @@ -591,7 +624,8 @@ static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, int i; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; /* * Find the segment preceding the oldest segment of callbacks diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index f2b3bd6c6bc2..0400e211aa44 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -24,25 +24,75 @@ #ifndef _LINUX_SRCU_TREE_H #define _LINUX_SRCU_TREE_H -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; +#include +#include + +struct srcu_node; +struct srcu_struct; + +/* + * Per-CPU structure feeding into leaf srcu_node, similar in function + * to rcu_node. + */ +struct srcu_data { + /* Read-side state. */ + unsigned long srcu_lock_count[2]; /* Locks per CPU. */ + unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ + + /* Update-side state. */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ + unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + bool srcu_cblist_invoking; /* Invoking these CBs? */ + struct delayed_work work; /* Context for CB invoking. */ + struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct srcu_node *mynode; /* Leaf srcu_node. */ + int cpu; + struct srcu_struct *sp; }; +/* + * Node in SRCU combining tree, similar in function to rcu_data. + */ +struct srcu_node { + spinlock_t lock; + unsigned long srcu_have_cbs[4]; /* GP seq for children */ + /* having CBs, but only */ + /* is > ->srcu_gq_seq. */ + struct srcu_node *srcu_parent; /* Next up in tree. */ + int grplo; /* Least CPU for node. */ + int grphi; /* Biggest CPU for node. */ +}; + +/* + * Per-SRCU-domain structure, similar in function to rcu_state. + */ struct srcu_struct { - unsigned long completed; - unsigned long srcu_gp_seq; - atomic_t srcu_exp_cnt; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist */ - struct rcu_segcblist srcu_cblist; + struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *level[RCU_NUM_LVLS + 1]; + /* First node at each level. */ + struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ + spinlock_t gp_lock; /* protect ->srcu_cblist */ + struct mutex srcu_gp_mutex; /* Serialize GP work. */ + unsigned int srcu_idx; /* Current rdr array element. */ + unsigned long srcu_gp_seq; /* Grace-period seq #. 
*/ + unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ + atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ + struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ + struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ + struct completion srcu_barrier_completion; + /* Awaken barrier rq at end. */ + atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ + /* callback for the barrier */ + /* operation. */ struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; -/* Values for -> state variable. */ +/* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 @@ -51,11 +101,9 @@ void process_srcu(struct work_struct *work); #define __SRCU_STRUCT_INIT(name) \ { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + .sda = &name##_srcu_data, \ + .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ + .srcu_gp_seq_needed = 0 - 1, \ __SRCU_DEP_MAP_INIT(name) \ } @@ -79,7 +127,7 @@ void process_srcu(struct work_struct *work); * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6f344b6748a8..e9d4527cdd43 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -563,17 +563,30 @@ static void srcu_torture_stats(void) int idx; #if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) +#ifdef CONFIG_TREE_SRCU + idx = srcu_ctlp->srcu_idx & 0x1; +#else /* #ifdef CONFIG_TREE_SRCU */ idx = srcu_ctlp->completed & 0x1; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); +#ifdef CONFIG_TREE_SRCU + struct srcu_data *counts; + counts = per_cpu_ptr(srcu_ctlp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ + struct srcu_array *counts; + + counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); u0 = counts->unlock_count[!idx]; u1 = counts->unlock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ /* * Make sure that a lock is always counted if the corresponding @@ -581,8 +594,13 @@ static void srcu_torture_stats(void) */ smp_rmb(); +#ifdef CONFIG_TREE_SRCU + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ l0 = counts->lock_count[!idx]; l1 = counts->lock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index da676b0d016b..12feeca18f46 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -36,19 +36,110 @@ #include #include -#include #include "rcu.h" 
-static int init_srcu_struct_fields(struct srcu_struct *sp) +static void srcu_invoke_callbacks(struct work_struct *work); +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); + +/* + * Initialize SRCU combining tree. Note that statically allocated + * srcu_struct structures might already have srcu_read_lock() and + * srcu_read_unlock() running against them. So if the is_static parameter + * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + */ +static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) { - sp->completed = 0; + int cpu; + int i; + int level = 0; + int levelspread[RCU_NUM_LVLS]; + struct srcu_data *sdp; + struct srcu_node *snp; + struct srcu_node *snp_first; + + /* Work out the overall tree geometry. */ + sp->level[0] = &sp->node[0]; + for (i = 1; i < rcu_num_lvls; i++) + sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); + + /* Each pass through this loop initializes one srcu_node structure. */ + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_init(&snp->lock); + for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) + snp->srcu_have_cbs[i] = 0; + snp->grplo = -1; + snp->grphi = -1; + if (snp == &sp->node[0]) { + /* Root node, special case. */ + snp->srcu_parent = NULL; + continue; + } + + /* Non-root node. */ + if (snp == sp->level[level + 1]) + level++; + snp->srcu_parent = sp->level[level - 1] + + (snp - sp->level[level]) / + levelspread[level - 1]; + } + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + level = rcu_num_lvls - 1; + snp_first = sp->level[level]; + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_init(&sdp->lock); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; + sdp->mynode = &snp_first[cpu / levelspread[level]]; + for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { + if (snp->grplo < 0) + snp->grplo = cpu; + snp->grphi = cpu; + } + sdp->cpu = cpu; + INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + sdp->sp = sp; + if (is_static) + continue; + + /* Dynamically allocated, better be no srcu_read_locks()! */ + for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { + sdp->srcu_lock_count[i] = 0; + sdp->srcu_unlock_count[i] = 0; + } + } +} + +/* + * Initialize non-compile-time initialized fields, including the + * associated srcu_node and srcu_data structures. The is_static + * parameter is passed through to init_srcu_struct_nodes(), and + * also tells us that ->sda has already been wired up to srcu_data. + */ +static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +{ + mutex_init(&sp->srcu_cb_mutex); + mutex_init(&sp->srcu_gp_mutex); + sp->srcu_idx = 0; sp->srcu_gp_seq = 0; atomic_set(&sp->srcu_exp_cnt, 0); - spin_lock_init(&sp->queue_lock); - rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_barrier_seq = 0; + mutex_init(&sp->srcu_barrier_mutex); + atomic_set(&sp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; + if (!is_static) + sp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(sp, is_static); + smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ + return sp->sda ? 
0 : -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -59,7 +150,8 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -75,15 +167,41 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - return init_srcu_struct_fields(sp); + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * Returns approximate total of the readers' ->lock_count[] values for the - * rank of per-CPU counters specified by idx. + * First-use initialization of statically allocated srcu_struct + * structure. Wiring up the combining tree is more than can be + * done with compile-time initialization, so this check is added + * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * compile-time initialized, to resolve races involving multiple + * CPUs trying to garner first-use privileges. + */ +static void check_init_srcu_struct(struct srcu_struct *sp) +{ + unsigned long flags; + + WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); + /* The smp_load_acquire() pairs with the smp_store_release(). */ + if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + return; /* Already initialized. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { + spin_unlock_irqrestore(&sp->gp_lock, flags); + return; + } + init_srcu_struct_fields(sp, true); + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Returns approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx. */ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) { @@ -91,16 +209,16 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); - sum += READ_ONCE(cpuc->lock_count[idx]); + sum += READ_ONCE(cpuc->srcu_lock_count[idx]); } return sum; } /* - * Returns approximate total of the readers' ->unlock_count[] values for the - * rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_unlock_count[] values + * for the rank of per-CPU counters specified by idx. */ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) { @@ -108,9 +226,9 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); - sum += READ_ONCE(cpuc->unlock_count[idx]); + sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); } return sum; } @@ -145,14 +263,14 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * the current index but not have incremented the lock counter yet. 
* * Possible bug: There is no guarantee that there haven't been - * ULONG_MAX increments of ->lock_count[] since the unlocks were + * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were * counted, meaning that this could return true even if there are * still active readers. Since there are no memory barriers around - * srcu_flip(), the CPU is not required to increment ->completed + * srcu_flip(), the CPU is not required to increment ->srcu_idx * before running srcu_readers_unlock_idx(), which means that there * could be an arbitrarily large number of critical sections that * execute after srcu_readers_unlock_idx() but use the old value - * of ->completed. + * of ->srcu_idx. */ return srcu_readers_lock_idx(sp, idx) == unlocks; } @@ -172,12 +290,12 @@ static bool srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); - sum += READ_ONCE(cpuc->lock_count[0]); - sum += READ_ONCE(cpuc->lock_count[1]); - sum -= READ_ONCE(cpuc->unlock_count[0]); - sum -= READ_ONCE(cpuc->unlock_count[1]); + sum += READ_ONCE(cpuc->srcu_lock_count[0]); + sum += READ_ONCE(cpuc->srcu_lock_count[1]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); } return sum; } @@ -193,18 +311,21 @@ static bool srcu_readers_active(struct srcu_struct *sp) */ void cleanup_srcu_struct(struct srcu_struct *sp) { + int cpu; + WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ - if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) - return; /* Leakage unless caller handles error. */ flush_delayed_work(&sp->work); - if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE)) { - pr_info("cleanup_srcu_struct: Active srcu_struct %lu CBs %c state: %d\n", rcu_segcblist_n_cbs(&sp->srcu_cblist), ".E"[rcu_segcblist_empty(&sp->srcu_cblist)], rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + for_each_possible_cpu(cpu) + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(sp))) { + pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; + free_percpu(sp->sda); + sp->sda = NULL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -217,8 +338,8 @@ int __srcu_read_lock(struct srcu_struct *sp) { int idx; - idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); + idx = READ_ONCE(sp->srcu_idx) & 0x1; + __this_cpu_inc(sp->sda->srcu_lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -233,7 +354,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. 
*/ - this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); + this_cpu_inc(sp->sda->srcu_unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -251,19 +372,207 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *sp) { + struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - rcu_segcblist_accelerate(&sp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); + RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), + "Invoked srcu_gp_start() without ->gp_lock!"); + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); rcu_seq_start(&sp->srcu_gp_seq); state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } +/* + * Track online CPUs to guide callback workqueue placement. + */ +DEFINE_PER_CPU(bool, srcu_online); + +void srcu_online_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), true); +} + +void srcu_offline_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), false); +} + +/* + * Place the workqueue handler on the specified CPU if online, otherwise + * just run it whereever. This is useful for placing workqueue handlers + * that are to invoke the specified CPU's callbacks. + */ +static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + bool ret; + + preempt_disable(); + if (READ_ONCE(per_cpu(srcu_online, cpu))) + ret = queue_delayed_work_on(cpu, wq, dwork, delay); + else + ret = queue_delayed_work(wq, dwork, delay); + preempt_enable(); + return ret; +} + +/* + * Schedule callback invocation for the specified srcu_data structure, + * if possible, on the corresponding CPU. + */ +static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) +{ + srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, + &sdp->work, delay); +} + +/* + * Schedule callback invocation for all srcu_data structures associated + * with the specified srcu_node structure, if possible, on the corresponding + * CPUs. + */ +static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp) +{ + int cpu; + + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) + srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), SRCU_INTERVAL); +} + +/* + * Note the end of an SRCU grace period. Initiates callback invocation + * and starts a new grace period if needed. + * + * The ->srcu_cb_mutex acquisition does not protect any data, but + * instead prevents more than one grace period from starting while we + * are initiating callback invocation. This allows the ->srcu_have_cbs[] + * array to have a finite number of elements. + */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + bool cbs; + unsigned long gpseq; + int idx; + int idxnext; + struct srcu_node *snp; + + /* Prevent more than one additional grace period. */ + mutex_lock(&sp->srcu_cb_mutex); + + /* End the current grace period. */ + spin_lock_irq(&sp->gp_lock); + idx = rcu_seq_state(sp->srcu_gp_seq); + WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); + rcu_seq_end(&sp->srcu_gp_seq); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); + /* A new grace period can start at this point. But only one. */ + + /* Initiate callback invocation as needed. 
*/ + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_irq(&snp->lock); + cbs = false; + if (snp >= sp->level[rcu_num_lvls - 1]) + cbs = snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + spin_unlock_irq(&snp->lock); + if (cbs) { + smp_mb(); /* GP end before CB invocation. */ + srcu_schedule_cbs_snp(sp, snp); + } + } + + /* Callback initiation done, allow grace periods after next. */ + mutex_unlock(&sp->srcu_cb_mutex); + + /* Start a new grace period if needed. */ + spin_lock_irq(&sp->gp_lock); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (!rcu_seq_state(gpseq) && + ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { + srcu_gp_start(sp); + spin_unlock_irq(&sp->gp_lock); + /* Throttle expedited grace periods: Should be rare! */ + srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) && + rcu_seq_ctr(gpseq) & 0xf + ? 0 + : SRCU_INTERVAL); + } else { + spin_unlock_irq(&sp->gp_lock); + } +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent grace-period + * requests. The winner has to do the work of actually starting grace + * period s. Losers must either ensure that their desired grace-period + * number is recorded on at least their leaf srcu_node structure, or they + * must take steps to invoke their own callbacks. + */ +static void srcu_funnel_gp_start(struct srcu_struct *sp, + struct srcu_data *sdp, + unsigned long s) +{ + unsigned long flags; + int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); + struct srcu_node *snp = sdp->mynode; + unsigned long snp_seq; + + /* Each pass through the loop does one level of the srcu_node tree. */ + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave(&snp->lock, flags); + if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + snp_seq = snp->srcu_have_cbs[idx]; + spin_unlock_irqrestore(&snp->lock, flags); + if (snp == sdp->mynode && snp_seq != s) { + smp_mb(); /* CBs after GP! */ + srcu_schedule_cbs_sdp(sdp, 0); + } + return; + } + snp->srcu_have_cbs[idx] = s; + spin_unlock_irqrestore(&snp->lock, flags); + } + + /* Top of tree, must ensure the grace period will be started. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { + /* + * Record need for grace period s. Pair with load + * acquire setting up for initialization. + */ + smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ + } + + /* If grace period not already done and none in progress, start it. */ + if (!rcu_seq_done(&sp->srcu_gp_seq, s) && + rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + srcu_gp_start(sp); + queue_delayed_work(system_power_efficient_wq, &sp->work, + atomic_read(&sp->srcu_exp_cnt) + ? 0 + : SRCU_INTERVAL); + } + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->completed is not changed while checking. + * The caller must ensure that ->srcu_idx is not changed while checking. 
*/ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) { @@ -277,13 +586,13 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) } /* - * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->(un)lock_count[] arrays. This allows + * Increment the ->srcu_idx counter so that future SRCU readers will + * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *sp) { - WRITE_ONCE(sp->completed, sp->completed + 1); + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -296,21 +605,9 @@ static void srcu_flip(struct srcu_struct *sp) } /* - * End an SRCU grace period. - */ -static void srcu_gp_end(struct srcu_struct *sp) -{ - rcu_seq_end(&sp->srcu_gp_seq); - - spin_lock_irq(&sp->queue_lock); - rcu_segcblist_advance(&sp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - spin_unlock_irq(&sp->queue_lock); -} - -/* - * Enqueue an SRCU callback on the specified srcu_struct structure, - * initiating grace-period processing if it is not already running. + * Enqueue an SRCU callback on the srcu_data structure associated with + * the current CPU and the specified srcu_struct structure, initiating + * grace-period processing if it is not already running. * * Note that all CPUs must agree that the grace period extended beyond * all pre-existing SRCU read-side critical section. On systems with @@ -335,33 +632,40 @@ static void srcu_gp_end(struct srcu_struct *sp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; - - head->next = NULL; - head->func = func; - spin_lock_irqsave(&sp->queue_lock, flags); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { - srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, 0); + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + + check_init_srcu_struct(sp); + rhp->func = func; + local_irq_save(flags); + sdp = this_cpu_ptr(sp->sda); + spin_lock(&sdp->lock); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + s = rcu_seq_snap(&sp->srcu_gp_seq); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; } - spin_unlock_irqrestore(&sp->queue_lock, flags); + spin_unlock_irqrestore(&sdp->lock, flags); + if (needgp) + srcu_funnel_gp_start(sp, sdp, s); } EXPORT_SYMBOL_GPL(call_srcu); -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); - /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 
*/ static void __synchronize_srcu(struct srcu_struct *sp) { struct rcu_synchronize rcu; - struct rcu_head *head = &rcu.head; RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || lock_is_held(&rcu_bh_lock_map) || @@ -372,26 +676,12 @@ static void __synchronize_srcu(struct srcu_struct *sp) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; might_sleep(); + check_init_srcu_struct(sp); init_completion(&rcu.completion); - - head->next = NULL; - head->func = wakeme_after_rcu; - spin_lock_irq(&sp->queue_lock); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { - /* steal the processing owner */ - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - srcu_gp_start(sp); - spin_unlock_irq(&sp->queue_lock); - /* give the processing owner to work_struct */ - srcu_reschedule(sp, 0); - } else { - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - spin_unlock_irq(&sp->queue_lock); - } - + init_rcu_head_on_stack(&rcu.head); + call_srcu(sp, &rcu.head, wakeme_after_rcu); wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. */ + destroy_rcu_head_on_stack(&rcu.head); } /** @@ -408,6 +698,7 @@ void synchronize_srcu_expedited(struct srcu_struct *sp) { bool do_norm = rcu_gp_is_normal(); + check_init_srcu_struct(sp); if (!do_norm) { atomic_inc(&sp->srcu_exp_cnt); smp_mb__after_atomic(); /* increment before GP. */ @@ -415,7 +706,7 @@ void synchronize_srcu_expedited(struct srcu_struct *sp) __synchronize_srcu(sp); if (!do_norm) { smp_mb__before_atomic(); /* GP before decrement. */ - atomic_dec(&sp->srcu_exp_cnt); + WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0); } } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); @@ -426,8 +717,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->completed & 1) ^ 1) to drain to zero at first, - * and then flip the completed and wait for the count of the other index. + * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, + * and then flip the srcu_idx and wait for the count of the other index. * * Can block; must be called from process context. * @@ -468,13 +759,69 @@ void synchronize_srcu(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/* + * Callback function for srcu_barrier() use. + */ +static void srcu_barrier_cb(struct rcu_head *rhp) +{ + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); + sp = sdp->sp; + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); +} + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @sp: srcu_struct on which to wait for in-flight callbacks. */ void srcu_barrier(struct srcu_struct *sp) { - synchronize_srcu(sp); + int cpu; + struct srcu_data *sdp; + unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + + check_init_srcu_struct(sp); + mutex_lock(&sp->srcu_barrier_mutex); + if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + smp_mb(); /* Force ordering following return. */ + mutex_unlock(&sp->srcu_barrier_mutex); + return; /* Someone else did our work for us. */ + } + rcu_seq_start(&sp->srcu_barrier_seq); + init_completion(&sp->srcu_barrier_completion); + + /* Initial count prevents reaching zero until all CBs are posted. 
*/ + atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + + /* + * Each pass through this loop enqueues a callback, but only + * on CPUs already having callbacks enqueued. Note that if + * a CPU already has callbacks enqueue, it must have already + * registered the need for a future grace period, so all we + * need do is enqueue a callback that will use the same + * grace period as the last callback already in the queue. + */ + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_irq(&sdp->lock); + atomic_inc(&sp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head, 0)) + atomic_dec(&sp->srcu_barrier_cpu_cnt); + spin_unlock_irq(&sdp->lock); + } + + /* Remove the initial count, at which point reaching zero can happen. */ + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); + wait_for_completion(&sp->srcu_barrier_completion); + + rcu_seq_end(&sp->srcu_barrier_seq); + mutex_unlock(&sp->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); @@ -487,21 +834,24 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *sp) { - return sp->completed; + return sp->srcu_idx; } EXPORT_SYMBOL_GPL(srcu_batches_completed); /* - * Core SRCU state machine. Advance callbacks from ->batch_check0 to - * ->batch_check1 and then to ->batch_done as readers drain. + * Core SRCU state machine. Push state bits of ->srcu_gp_seq + * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has + * completed in that state. */ -static void srcu_advance_batches(struct srcu_struct *sp) +static void srcu_advance_state(struct srcu_struct *sp) { int idx; + mutex_lock(&sp->srcu_gp_mutex); + /* * Because readers might be delayed for an extended period after - * fetching ->completed for their index, at any point in time there + * fetching ->srcu_idx for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -511,23 +861,29 @@ static void srcu_advance_batches(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq(&sp->queue_lock); - if (rcu_segcblist_empty(&sp->srcu_cblist)) { - spin_unlock_irq(&sp->queue_lock); + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - spin_unlock_irq(&sp->queue_lock); - if (idx != SRCU_STATE_IDLE) + spin_unlock_irq(&sp->gp_lock); + if (idx != SRCU_STATE_IDLE) { + mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ + } } if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, 1)) + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 1)) { + mutex_unlock(&sp->srcu_gp_mutex); return; /* readers present, retry later. */ + } srcu_flip(sp); rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); } @@ -538,10 +894,12 @@ static void srcu_advance_batches(struct srcu_struct *sp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. 
*/ - idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, 2)) - return; /* readers present, retry after later. */ - srcu_gp_end(sp); + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 2)) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* readers present, retry later. */ + } + srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ } } @@ -551,28 +909,51 @@ static void srcu_advance_batches(struct srcu_struct *sp) * the workqueue. Note that needed memory barriers have been executed * in this task's context by srcu_readers_active_idx_check(). */ -static void srcu_invoke_callbacks(struct srcu_struct *sp) +static void srcu_invoke_callbacks(struct work_struct *work) { + bool more; struct rcu_cblist ready_cbs; struct rcu_head *rhp; + struct srcu_data *sdp; + struct srcu_struct *sp; - spin_lock_irq(&sp->queue_lock); - if (!rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { - spin_unlock_irq(&sp->queue_lock); - return; - } + sdp = container_of(work, struct srcu_data, work.work); + sp = sdp->sp; rcu_cblist_init(&ready_cbs); - rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sp->queue_lock); + spin_lock_irq(&sdp->lock); + smp_mb(); /* Old grace periods before callback invocation! */ + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (sdp->srcu_cblist_invoking || + !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { + spin_unlock_irq(&sdp->lock); + return; /* Someone else on the job or nothing to do. */ + } + + /* We are on the job! Extract and invoke ready callbacks. */ + sdp->srcu_cblist_invoking = true; + rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sdp->lock); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { local_bh_disable(); rhp->func(rhp); local_bh_enable(); } - spin_lock_irq(&sp->queue_lock); - rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sp->queue_lock); + + /* + * Update counts, accelerate new callbacks, and if needed, + * schedule another round of callback invocation. + */ + spin_lock_irq(&sdp->lock); + rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + sdp->srcu_cblist_invoking = false; + more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); + spin_unlock_irq(&sdp->lock); + if (more) + srcu_schedule_cbs_sdp(sdp, 0); } /* @@ -581,19 +962,21 @@ static void srcu_invoke_callbacks(struct srcu_struct *sp) */ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { - bool pending = true; - int state; + bool pushgp = true; - if (rcu_segcblist_empty(&sp->srcu_cblist)) { - spin_lock_irq(&sp->queue_lock); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); - if (rcu_segcblist_empty(&sp->srcu_cblist) && - state == SRCU_STATE_IDLE) - pending = false; - spin_unlock_irq(&sp->queue_lock); + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + /* All requests fulfilled, time to go idle. */ + pushgp = false; + } + } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + /* Outstanding request and no GP. Start one. 
*/ + srcu_gp_start(sp); } + spin_unlock_irq(&sp->gp_lock); - if (pending) + if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); } @@ -606,8 +989,7 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); - srcu_advance_batches(sp); - srcu_invoke_callbacks(sp); + srcu_advance_state(sp); srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); } EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 346948b51b0b..3c23435d2083 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3776,12 +3776,16 @@ int rcutree_online_cpu(unsigned int cpu) { sync_sched_exp_online_cleanup(cpu); rcutree_affinity_setting(cpu, -1); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); return 0; } int rcutree_offline_cpu(unsigned int cpu) { rcutree_affinity_setting(cpu, cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_offline_cpu(cpu); return 0; } @@ -4157,6 +4161,8 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a2a45cb629d6..0e598ab08fea 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -541,6 +541,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +#ifdef CONFIG_SRCU +void srcu_online_cpu(unsigned int cpu); +void srcu_offline_cpu(unsigned int cpu); +#else /* #ifdef CONFIG_SRCU */ +void srcu_online_cpu(unsigned int cpu) { } +void srcu_offline_cpu(unsigned int cpu) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE -- cgit v1.2.3-70-g09d2 From bcbfdd01dce5556a952fae84ef16fd0f12525e7b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Apr 2017 15:50:41 -0700 Subject: rcu: Make non-preemptive schedule be Tasks RCU quiescent state Currently, a call to schedule() acts as a Tasks RCU quiescent state only if a context switch actually takes place. However, just the call to schedule() guarantees that the calling task has moved off of whatever tracing trampoline that it might have been one previously. This commit therefore plumbs schedule()'s "preempt" parameter into rcu_note_context_switch(), which then records the Tasks RCU quiescent state, but only if this call to schedule() was -not- due to a preemption. To avoid adding overhead to the common-case context-switch path, this commit hides the rcu_note_context_switch() check under an existing non-common-case check. Suggested-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 11 ++++++++--- include/linux/rcutiny.h | 13 +++++++++---- include/linux/rcutree.h | 5 +++-- kernel/rcu/tree.c | 22 +++++++++++++++++++++- kernel/rcu/update.c | 1 + kernel/sched/core.c | 2 +- 6 files changed, 43 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e6146d0074f8..f531b29207da 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -363,15 +363,20 @@ static inline void rcu_init_nohz(void) #ifdef CONFIG_TASKS_RCU #define TASKS_RCU(x) x extern struct srcu_struct tasks_rcu_exit_srcu; -#define rcu_note_voluntary_context_switch(t) \ +#define rcu_note_voluntary_context_switch_lite(t) \ do { \ - rcu_all_qs(); \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) +#define rcu_note_voluntary_context_switch(t) \ + do { \ + rcu_all_qs(); \ + rcu_note_voluntary_context_switch_lite(t); \ + } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define rcu_note_voluntary_context_switch_lite(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5219be250f00..74d9c3a1feee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } -static inline void rcu_note_context_switch(void) -{ - rcu_sched_qs(); -} +#define rcu_note_context_switch(preempt) \ + do { \ + rcu_sched_qs(); \ + rcu_note_voluntary_context_switch_lite(current); \ + } while (0) /* * Take advantage of the fact that there is only one CPU, which @@ -242,6 +243,10 @@ static inline bool rcu_is_watching(void) #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_request_urgent_qs_task(struct task_struct *t) +{ +} + static inline void rcu_all_qs(void) { barrier(); /* Avoid RCU read-side critical sections leaking across. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 63a4e4cf40a5..0bacb6b2af69 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,7 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -void rcu_note_context_switch(void); +void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(u64 basem, u64 *nextevt); void rcu_cpu_stall_reset(void); @@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void); */ static inline void rcu_virt_note_context_switch(int cpu) { - rcu_note_context_switch(); + rcu_note_context_switch(false); } void synchronize_rcu_bh(void); @@ -108,6 +108,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_request_urgent_qs_task(struct task_struct *t); void rcu_all_qs(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c23435d2083..891d97109e09 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -458,7 +458,7 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled interrupts. */ -void rcu_note_context_switch(void) +void rcu_note_context_switch(bool preempt) { barrier(); /* Avoid RCU read-side critical sections leaking down. 
*/ trace_rcu_utilization(TPS("Start context switch")); @@ -471,6 +471,8 @@ void rcu_note_context_switch(void) if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_dynticks.rcu_qs_ctr); + if (!preempt) + rcu_note_voluntary_context_switch_lite(current); out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ @@ -1149,6 +1151,24 @@ bool notrace rcu_is_watching(void) } EXPORT_SYMBOL_GPL(rcu_is_watching); +/* + * If a holdout task is actually running, request an urgent quiescent + * state from its CPU. This is unsynchronized, so migrations can cause + * the request to go to the wrong CPU. Which is OK, all that will happen + * is that the CPU's next context switch will be a bit slower and next + * time around this task will generate another request. + */ +void rcu_request_urgent_qs_task(struct task_struct *t) +{ + int cpu; + + barrier(); + cpu = task_cpu(t); + if (!task_curr(t)) + return; /* This task is not running on that CPU. */ + smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); +} + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c5df0d756900..273e869ca21d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -665,6 +665,7 @@ static void check_holdout_task(struct task_struct *t, put_task_struct(t); return; } + rcu_request_urgent_qs_task(t); if (!needreport) return; if (*firstreport) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b31fc05a0f1..2adf7b6c04e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3378,7 +3378,7 @@ static void __sched notrace __schedule(bool preempt) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below -- cgit v1.2.3-70-g09d2 From 50f2112cf7a3e62a8d33838eb205d5fef306457a Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:09 -0400 Subject: locks: Set FL_CLOSE when removing flock locks on close() Set FL_CLOSE in fl_flags as in locks_remove_posix() when clearing locks. NFS will check for this flag to ensure an unlock is sent in a following patch. Fuse handles flock and posix locks differently for FL_CLOSE, and so requires a fixup to retain the existing behavior for flock. 
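As a rough sketch of the distinction this enables (the example_* names below are invented, not part of this patch), a filesystem's lock method can use the FL_CLOSE_POSIX mask added here to recognize a posix unlock that only happens because the file is being closed, while still sending close-time flock unlocks (FL_FLOCK | FL_CLOSE):

#include <linux/fs.h>

/* Invented stand-in for transmitting an unlock request to a server. */
static int example_send_unlock(struct file *file, struct file_lock *fl)
{
	return 0;
}

static int example_setlk(struct file *file, struct file_lock *fl)
{
	/*
	 * A posix unlock issued on close carries FL_POSIX | FL_CLOSE;
	 * skip the wire unlock here because, as in fuse, the ->flush()
	 * method already takes care of it.
	 */
	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
		return 0;
	return example_send_unlock(file, fl);
}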
Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Acked-by: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/fuse/file.c | 2 +- fs/locks.c | 2 +- include/linux/fs.h | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ec238fb5a584..995da8957f6f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2168,7 +2168,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } /* Unlock on close is handled by the flush method */ - if (fl->fl_flags & FL_CLOSE) + if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg); diff --git a/fs/locks.c b/fs/locks.c index 26811321d39b..af2031a1fcff 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2504,7 +2504,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, - .fl_flags = FL_FLOCK, + .fl_flags = FL_FLOCK | FL_CLOSE, .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8..72061aa65405 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -909,6 +909,8 @@ static inline struct file *get_file(struct file *f) #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) + /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. -- cgit v1.2.3-70-g09d2 From 7d6ddf88c4db372689c8aa65ea652d0514d66c06 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:10 -0400 Subject: NFS: Add an iocounter wait function for async RPC tasks By sleeping on a new NFS Unlock-On-Close waitqueue, rpc tasks may wait for a lock context's iocounter to reach zero. The rpc waitqueue is only woken when the open_context has the NFS_CONTEXT_UNLOCK flag set in order to mitigate spurious wake-ups for any iocounter reaching zero. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 1 + fs/nfs/pagelist.c | 34 +++++++++++++++++++++++++++++++++- include/linux/nfs_fs.h | 1 + include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_page.h | 1 + 5 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 3ffbffe8f39f..3e7b2e6a7cfb 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,6 +218,7 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index f53610672f03..ad92b401326c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -102,6 +102,35 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx) TASK_KILLABLE); } +/** + * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O + * to complete + * @task: the rpc_task that should wait + * @l_ctx: nfs_lock_context with io_counter to check + * + * Returns true if there is outstanding I/O to wait on and the + * task has been put to sleep. 
+ */ +bool +nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) +{ + struct inode *inode = d_inode(l_ctx->open_context->dentry); + bool ret = false; + + if (atomic_read(&l_ctx->io_count) > 0) { + rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL); + ret = true; + } + + if (atomic_read(&l_ctx->io_count) == 0) { + rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task); + ret = false; + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); + /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -385,8 +414,11 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - if (atomic_dec_and_test(&l_ctx->io_count)) + if (atomic_dec_and_test(&l_ctx->io_count)) { wake_up_atomic_t(&l_ctx->io_count); + if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) + rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); + } nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1b29915247b2..9aa044e76820 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -76,6 +76,7 @@ struct nfs_open_context { #define NFS_CONTEXT_ERROR_WRITE (0) #define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) +#define NFS_CONTEXT_UNLOCK (3) int error; struct list_head list; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b34097c67848..2a70f34dffe8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -222,6 +222,7 @@ struct nfs_server { u32 mountd_version; unsigned short mountd_port; unsigned short mountd_protocol; + struct rpc_wait_queue uoc_rpcwaitq; }; /* Server capabilities */ diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 6f01e28bba27..247cc3d3498f 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -141,6 +141,7 @@ extern int nfs_page_group_lock(struct nfs_page *, bool); extern void nfs_page_group_lock_wait(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); /* * Lock the page of an asynchronous request -- cgit v1.2.3-70-g09d2 From b1ece737f44f91dca8f4829cf0b442e752e406db Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:11 -0400 Subject: lockd: Introduce nlmclnt_operations NFS would enjoy the ability to modify the behavior of the NLM client's unlock RPC task in order to delay the transmission of the unlock until IO that was submitted under that lock has completed. This ability can ensure that the NLM client will always complete the transmission of an unlock even if the waiting caller has been interrupted with fatal signal. For this purpose, a pointer to a struct nlmclnt_operations can be assigned in a nfs_module's nfs_rpc_ops that will install those nlmclnt_operations on the nlm_host. The struct nlmclnt_operations defines three callback operations that will be used in a following patch: nlmclnt_alloc_call - used to call back after a successful allocation of a struct nlm_rqst in nlmclnt_proc(). nlmclnt_unlock_prepare - used to call back during NLM unlock's rpc_call_prepare. The NLM client defers calling rpc_call_start() until this callback returns false. nlmclnt_release_call - used to call back when the NLM client's struct nlm_rqst is freed. 
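To make the callback contract concrete, here is a minimal hypothetical user of the new operations table (all example_* names are invented; the real NFS hookup arrives in a later patch). The data pointer handed to nlmclnt_proc() is what each callback receives back:

#include <linux/atomic.h>
#include <linux/slab.h>
#include <linux/lockd/bind.h>
#include <linux/sunrpc/sched.h>

/* Invented context passed as the new "data" argument of nlmclnt_proc(). */
struct example_unlock_data {
	atomic_t refcount;
	bool io_done;		/* set once I/O under the lock has drained */
};

/* Invented waitqueue; a real user would rpc_init_wait_queue() it during
 * setup and wake it when io_done becomes true. */
static struct rpc_wait_queue example_uoc_waitqueue;

static void example_alloc_call(void *data)
{
	struct example_unlock_data *d = data;

	atomic_inc(&d->refcount);	/* pin data for the nlm_rqst lifetime */
}

static bool example_unlock_prepare(struct rpc_task *task, void *data)
{
	struct example_unlock_data *d = data;

	if (d->io_done)
		return false;		/* let NLM call rpc_call_start() */
	/* Sleep the task; returning true tells NLM not to start the call. */
	rpc_sleep_on(&example_uoc_waitqueue, task, NULL);
	return true;
}

static void example_release_call(void *data)
{
	struct example_unlock_data *d = data;

	if (atomic_dec_and_test(&d->refcount))
		kfree(d);
}

static const struct nlmclnt_operations example_nlmclnt_ops = {
	.nlmclnt_alloc_call	= example_alloc_call,
	.nlmclnt_unlock_prepare	= example_unlock_prepare,
	.nlmclnt_release_call	= example_release_call,
};

A filesystem would point nlmclnt_initdata.nlmclnt_ops (or, for NFS, nfs_rpc_ops->nlmclnt_ops) at this table and pass its context as the new data argument of nlmclnt_proc().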
Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 1 + fs/lockd/clntproc.c | 26 +++++++++++++++++++++++++- fs/nfs/client.c | 1 + fs/nfs/nfs3proc.c | 2 +- fs/nfs/proc.c | 2 +- include/linux/lockd/bind.h | 24 ++++++++++++++++++++++-- include/linux/lockd/lockd.h | 2 ++ include/linux/nfs_xdr.h | 1 + 8 files changed, 54 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 41e491b8e5d7..27d577dbe51a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -69,6 +69,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) goto out_nobind; + host->h_nlmclnt_ops = nlm_init->nlmclnt_ops; return host; out_nobind: nlmclnt_release_host(host); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 112952037933..066ac313ae5c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -150,17 +150,22 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) * @host: address of a valid nlm_host context representing the NLM server * @cmd: fcntl-style file lock operation to perform * @fl: address of arguments for the lock operation + * @data: address of data to be sent to callback operations * */ -int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data) { struct nlm_rqst *call; int status; + const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops; call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call) + nlmclnt_ops->nlmclnt_alloc_call(data); + nlmclnt_locks_init_private(fl, host); if (!fl->fl_u.nfs_fl.owner) { /* lockowner allocation has failed */ @@ -169,6 +174,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); + call->a_callback_data = data; if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { @@ -214,8 +220,12 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) void nlmclnt_release_call(struct nlm_rqst *call) { + const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; + if (!atomic_dec_and_test(&call->a_count)) return; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) + nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); @@ -687,6 +697,19 @@ out: return status; } +static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops; + bool defer_call = false; + + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare) + defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data); + + if (!defer_call) + rpc_call_start(task); +} + static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { struct nlm_rqst *req = data; @@ -720,6 +743,7 @@ die: } static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_prepare = nlmclnt_unlock_prepare, .rpc_call_done = nlmclnt_unlock_callback, .rpc_release = nlmclnt_rpc_release, }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 3e7b2e6a7cfb..e0302101e18a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -546,6 +546,7 @@ static int nfs_start_lockd(struct nfs_server *server) .noresvport = server->flags & 
NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, + .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, }; if (nlm_init.nfs_version > 3) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index dc925b531f32..03b3c3de28f1 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -870,7 +870,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL); } static int nfs3_have_delegation(struct inode *inode, fmode_t flags) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b7bca8303989..9872cf676a50 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -638,7 +638,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL); } /* Helper functions for NFS lock bounds checking */ diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 140edab64446..05728396a1a1 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -18,6 +18,7 @@ /* Dummy declarations */ struct svc_rqst; +struct rpc_task; /* * This is the set of functions for lockd->nfsd communication @@ -43,6 +44,7 @@ struct nlmclnt_initdata { u32 nfs_version; int noresvport; struct net *net; + const struct nlmclnt_operations *nlmclnt_ops; }; /* @@ -52,8 +54,26 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); -extern int nlmclnt_proc(struct nlm_host *host, int cmd, - struct file_lock *fl); +/* + * NLM client operations provide a means to modify RPC processing of NLM + * requests. Callbacks receive a pointer to data passed into the call to + * nlmclnt_proc(). + */ +struct nlmclnt_operations { + /* Called on successful allocation of nlm_rqst, use for allocation or + * reference counting. */ + void (*nlmclnt_alloc_call)(void *); + + /* Called in rpc_task_prepare for unlock. A return value of true + * indicates the callback has put the task to sleep on a waitqueue + * and NLM should not call rpc_call_start(). 
*/ + bool (*nlmclnt_unlock_prepare)(struct rpc_task*, void *); + + /* Called when the nlm_rqst is freed, callbacks should clean up here */ + void (*nlmclnt_release_call)(void *); +}; + +extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data); extern int lockd_up(struct net *net); extern void lockd_down(struct net *net); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b37dee3acaba..41f7b6a04d69 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -69,6 +69,7 @@ struct nlm_host { char *h_addrbuf; /* address eyecatcher */ struct net *net; /* host net */ char nodename[UNX_MAXNODENAME + 1]; + const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ }; /* @@ -142,6 +143,7 @@ struct nlm_rqst { struct nlm_block * a_block; unsigned int a_retries; /* Retry count */ u8 a_owner[NLMCLNT_OHSIZE]; + void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 51e27f9746ee..677c6b91dfcd 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1551,6 +1551,7 @@ struct nfs_rpc_ops { const struct inode_operations *dir_inode_ops; const struct inode_operations *file_inode_ops; const struct file_operations *file_ops; + const struct nlmclnt_operations *nlmclnt_ops; int (*getroot) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -- cgit v1.2.3-70-g09d2 From b62ea4112ce3746664dcc2f232d03461f0e6f3c7 Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Fri, 21 Apr 2017 16:47:11 +0200 Subject: video: fbdev: imxfb: support AUS mode Some displays require setting AUS mode in the LDCD AUS Mode Control Register to work with the imxfb driver. Like the value of the Panel Configuration Register, the AUS mode setting depends on the display mode. Allow setting AUS mode from the device tree by adding a boolean property. Make this property optional to keep the DT ABI stable. AUS mode can be set only on imx21 and compatible chipsets. Signed-off-by: Martin Kaiser Cc: Sascha Hauer Cc: Rob Herring Cc: Mark Rutland Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/imxfb.c | 17 +++++++++++++++++ include/linux/platform_data/video-imxfb.h | 1 + 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c index 1b0faadb3080..c166e0725be5 100644 --- a/drivers/video/fbdev/imxfb.c +++ b/drivers/video/fbdev/imxfb.c @@ -117,6 +117,9 @@ #define IMXFB_LSCR1_DEFAULT 0x00120300 +#define LCDC_LAUSCR 0x80 +#define LAUSCR_AUS_MODE (1<<31) + /* Used fb-mode. Can be set on kernel command line, therefore file-static. */ static const char *fb_mode; @@ -158,6 +161,7 @@ struct imxfb_info { dma_addr_t dbar2; u_int pcr; + u_int lauscr; u_int pwmr; u_int lscr1; u_int dmacr; @@ -422,6 +426,11 @@ static int imxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) pcr |= imxfb_mode->pcr & ~(0x3f | (7 << 25)); fbi->pcr = pcr; + /* + * The LCDC AUS Mode Control Register does not exist on imx1. 
+ */ + if (!is_imx1_fb(fbi) && imxfb_mode->aus_mode) + fbi->lauscr = LAUSCR_AUS_MODE; /* * Copy the RGB parameters for this display @@ -638,6 +647,9 @@ static int imxfb_activate_var(struct fb_var_screeninfo *var, struct fb_info *inf if (fbi->dmacr) writel(fbi->dmacr, fbi->regs + LCDC_DMACR); + if (fbi->lauscr) + writel(fbi->lauscr, fbi->regs + LCDC_LAUSCR); + return 0; } @@ -734,6 +746,11 @@ static int imxfb_of_read_mode(struct device *dev, struct device_node *np, imxfb_mode->bpp = bpp; imxfb_mode->pcr = pcr; + /* + * fsl,aus-mode is optional + */ + imxfb_mode->aus_mode = of_property_read_bool(np, "fsl,aus-mode"); + return 0; } diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h index a5c0a71ec914..cf9348b376ac 100644 --- a/include/linux/platform_data/video-imxfb.h +++ b/include/linux/platform_data/video-imxfb.h @@ -50,6 +50,7 @@ struct imx_fb_videomode { struct fb_videomode mode; u32 pcr; + bool aus_mode; unsigned char bpp; }; -- cgit v1.2.3-70-g09d2 From 4f757f3cbf54edef7b75c68d6d6d2f1a0ca08d2e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Apr 2017 17:31:22 -0400 Subject: make sure that mntns_install() doesn't end up with referral for root new flag: LOOKUP_DOWN. If the starting point is overmounted, cross into whatever's mounted on top, triggering referrals et.al. Use that instead of follow_down_one() loop in mntns_install(), handle errors properly. Signed-off-by: Al Viro --- fs/namei.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/namespace.c | 18 +++++++++++------- include/linux/namei.h | 1 + 3 files changed, 50 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 60c0a78ebca7..646db9cf2579 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2252,6 +2252,35 @@ static inline int lookup_last(struct nameidata *nd) return walk_component(nd, 0); } +static int handle_lookup_down(struct nameidata *nd) +{ + struct path path = nd->path; + struct inode *inode = nd->inode; + unsigned seq = nd->seq; + int err; + + if (nd->flags & LOOKUP_RCU) { + /* + * don't bother with unlazy_walk on failure - we are + * at the very beginning of walk, so we lose nothing + * if we simply redo everything in non-RCU mode + */ + if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) + return -ECHILD; + } else { + dget(path.dentry); + err = follow_managed(&path, nd); + if (unlikely(err < 0)) + return err; + inode = d_backing_inode(path.dentry); + seq = 0; + } + path_to_nameidata(&path, nd); + nd->inode = inode; + nd->seq = seq; + return 0; +} + /* Returns 0 and nd will be valid on success; Retuns error, otherwise. 
*/ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { @@ -2260,6 +2289,15 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path if (IS_ERR(s)) return PTR_ERR(s); + + if (unlikely(flags & LOOKUP_DOWN)) { + err = handle_lookup_down(nd); + if (unlikely(err < 0)) { + terminate_walk(nd); + return err; + } + } + while (!(err = link_path_walk(s, nd)) && ((err = lookup_last(nd)) > 0)) { s = trailing_symlink(nd); diff --git a/fs/namespace.c b/fs/namespace.c index cc1375eff88c..0886ef28bff6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3465,8 +3465,9 @@ static void mntns_put(struct ns_common *ns) static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct fs_struct *fs = current->fs; - struct mnt_namespace *mnt_ns = to_mnt_ns(ns); + struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct path root; + int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || @@ -3477,15 +3478,18 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return -EINVAL; get_mnt_ns(mnt_ns); - put_mnt_ns(nsproxy->mnt_ns); + old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ - root.mnt = &mnt_ns->root->mnt; - root.dentry = mnt_ns->root->mnt.mnt_root; - path_get(&root); - while(d_mountpoint(root.dentry) && follow_down_one(&root)) - ; + err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, + "/", LOOKUP_DOWN, &root); + if (err) { + /* revert to old namespace */ + nsproxy->mnt_ns = old_mnt_ns; + put_mnt_ns(mnt_ns); + return err; + } /* Update the pwd and root */ set_fs_pwd(fs, &root); diff --git a/include/linux/namei.h b/include/linux/namei.h index f29abda31e6d..8b4794e83196 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -44,6 +44,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 +#define LOOKUP_DOWN 0x8000 extern int path_pts(struct path *path); -- cgit v1.2.3-70-g09d2 From cf0c3e68aa81f992b0301f62e341b710d385bf68 Mon Sep 17 00:00:00 2001 From: Jeroen Hofstee Date: Fri, 21 Apr 2017 15:21:11 +0900 Subject: kbuild: fix asm-offset generation to work with clang KBuild abuses the asm statement to write to a file and clang chokes about these invalid asm statements. Hack it even more by fooling this is actual valid asm code. 
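To illustrate the mechanism (the NR_PAGEFLAGS value below is only an example, borrowed from the note that follows): with the old macros, DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS) made the compiler emit a bare marker line into the generated asm-offsets.s, roughly

    ->NR_PAGEFLAGS $23 __NR_PAGEFLAGS

GCC happily writes this out even though it is not valid assembler input, while clang rejects the asm statement. With the .ascii wrapper the same information is emitted as a legitimate directive,

    .ascii "->NR_PAGEFLAGS $23 __NR_PAGEFLAGS"

and the updated sed script first strips the .ascii quoting, then applies the existing '->' rules, so the generated header still ends up with

    #define NR_PAGEFLAGS 23 /* __NR_PAGEFLAGS */

(The exact immediate prefix, '$' or '#', depends on the target architecture, which is why the script keeps matching and stripping both.)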
[masahiro: Import Jeroen's work for U-Boot: http://patchwork.ozlabs.org/patch/375026/ Tweak sed script a little to avoid garbage '#' for GCC case, like #define NR_PAGEFLAGS 23 /* __NR_PAGEFLAGS # */ ] Signed-off-by: Jeroen Hofstee Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke --- include/linux/kbuild.h | 6 +++--- scripts/Makefile.lib | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kbuild.h b/include/linux/kbuild.h index 22a72198c14b..4e80f3a9ad58 100644 --- a/include/linux/kbuild.h +++ b/include/linux/kbuild.h @@ -2,14 +2,14 @@ #define __LINUX_KBUILD_H #define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val)) -#define BLANK() asm volatile("\n->" : : ) +#define BLANK() asm volatile("\n.ascii \"->\"" : : ) #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)) #define COMMENT(x) \ - asm volatile("\n->#" x) + asm volatile("\n.ascii \"->#" x "\"") #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 8109c133887d..2209ed696fb4 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -413,10 +413,14 @@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ # --------------------------------------------------------------------------- # Default sed regexp - multiline due to syntax constraints +# +# Use [:space:] because LLVM's integrated assembler inserts around +# the .ascii directive whereas GCC keeps the as-is. define sed-offsets - "/^->/{s:->#\(.*\):/* \1 */:; \ + 's:^[[:space:]]*\.ascii[[:space:]]*"\(.*\)".*:\1:; \ + /^->/{s:->#\(.*\):/* \1 */:; \ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" + s:->::; p;}' endef # Use filechk to avoid rebuilds when a header changes, but the resulting file -- cgit v1.2.3-70-g09d2 From f107d7a43923a83d837b3ea3c7b7de58cd014bbd Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 16 Mar 2017 09:02:42 +0100 Subject: mtd: nand: Remove unused chip->write_page() hook The last/only user of the chip->write_page() hook (the Atmel NAND controller driver) has been reworked and is no longer specifying a custom ->write_page() implementation. Drop this hook before someone else start abusing it. 
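For context, an illustrative sketch only (the callback names are hypothetical, not from this patch): a controller driver that used to install a custom chip->write_page() would now hook the per-page ECC callbacks instead, which remain the supported customization point:

    chip->ecc.mode       = NAND_ECC_HW;
    chip->ecc.write_page = my_hw_ecc_write_page;   /* driver-specific helper */
    chip->ecc.read_page  = my_hw_ecc_read_page;    /* driver-specific helper */

nand_write_page() in the core then dispatches to chip->ecc.write_page() or chip->ecc.write_page_raw() as appropriate, so the top-level hook has no remaining purpose.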
Signed-off-by: Boris Brezillon Reviewed-by: Masahiro Yamada --- drivers/mtd/nand/nand_base.c | 12 +++++------- include/linux/mtd/nand.h | 4 ---- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index c4c3386e4c0a..36258e69a389 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2633,7 +2633,7 @@ static int nand_write_page_syndrome(struct mtd_info *mtd, } /** - * nand_write_page - [REPLACEABLE] write one page + * nand_write_page - write one page * @mtd: MTD device structure * @chip: NAND chip descriptor * @offset: address offset within the page @@ -2836,9 +2836,10 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, /* We still need to erase leftover OOB data */ memset(chip->oob_poi, 0xff, mtd->oobsize); } - ret = chip->write_page(mtd, chip, column, bytes, wbuf, - oob_required, page, cached, - (ops->mode == MTD_OPS_RAW)); + + ret = nand_write_page(mtd, chip, column, bytes, wbuf, + oob_required, page, cached, + (ops->mode == MTD_OPS_RAW)); if (ret) break; @@ -4548,9 +4549,6 @@ int nand_scan_tail(struct mtd_info *mtd) } } - if (!chip->write_page) - chip->write_page = nand_write_page; - /* * Check ECC mode, default to software if 3byte/512byte hardware ECC is * selected and we have 256 byte pagesize fallback to software ECC diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index c7de017c7f4c..40657939797c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -828,7 +828,6 @@ struct nand_manufacturer_ops { * @errstat: [OPTIONAL] hardware specific function to perform * additional error status checks (determine if errors are * correctable). - * @write_page: [REPLACEABLE] High-level page write function * @manufacturer: [INTERN] Contains manufacturer information */ @@ -854,9 +853,6 @@ struct nand_chip { int (*scan_bbt)(struct mtd_info *mtd); int (*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page); - int (*write_page)(struct mtd_info *mtd, struct nand_chip *chip, - uint32_t offset, int data_len, const uint8_t *buf, - int oob_required, int page, int cached, int raw); int (*onfi_set_features)(struct mtd_info *mtd, struct nand_chip *chip, int feature_addr, uint8_t *subfeature_para); int (*onfi_get_features)(struct mtd_info *mtd, struct nand_chip *chip, -- cgit v1.2.3-70-g09d2 From 07604686e808cd93d352172806a7828860f048f5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 30 Mar 2017 15:45:47 +0900 Subject: mtd: nand: relax ecc.read_page() return value for uncorrectable ECC The comment for ecc.read_page() requires that it should return "0 if bitflips uncorrectable". Actually, drivers could return positive values when uncorrectable bitflips occur. For example, nand_read_page_swecc() is the case. If ecc.correct() returns -EBADMSG for the first ECC sector, and a positive value for the second one, nand_read_page_swecc() returns a positive max_bitflips and increments ecc_stats.failed for the same page. The requirement can be relaxed by tweaking nand_do_read_ops(). Move the max_bitflips calculation below the retry. 
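A simplified sketch of the software-ECC case described above (locals elided, loosely modelled on nand_read_page_swecc()) shows how a positive page-level return value can coexist with an uncorrectable sector:

    /* per-ECC-sector correction loop */
    for (i = 0; i < eccsteps; i++, p += eccsize) {
            stat = chip->ecc.correct(mtd, p, &ecc_code[i * eccbytes],
                                     &ecc_calc[i * eccbytes]);
            if (stat < 0) {
                    mtd->ecc_stats.failed++;        /* this sector uncorrectable */
            } else {
                    mtd->ecc_stats.corrected += stat;
                    max_bitflips = max_t(unsigned int, max_bitflips, stat);
            }
    }
    return max_bitflips;    /* may be > 0 despite the failed sector */

A positive return value therefore does not by itself mean the page was fully corrected; the caller has to consult ecc_stats.failed as well, which is what the nand_do_read_ops() tweak below accounts for.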
Signed-off-by: Masahiro Yamada Suggested-by: Boris Brezillon Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 3 +-- include/linux/mtd/nand.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 36258e69a389..de6c8045c85b 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1993,8 +1993,6 @@ read_retry: break; } - max_bitflips = max_t(unsigned int, max_bitflips, ret); - /* Transfer not aligned data */ if (use_bufpoi) { if (!NAND_HAS_SUBPAGE_READ(chip) && !oob && @@ -2045,6 +2043,7 @@ read_retry: } buf += bytes; + max_bitflips = max_t(unsigned int, max_bitflips, ret); } else { memcpy(buf, chip->buffers->databuf + col, bytes); buf += bytes; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 40657939797c..9e0c93c44bef 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -516,7 +516,7 @@ static inline void nand_hw_control_init(struct nand_hw_control *nfc) * out-of-band data). * @read_page: function to read a page according to the ECC generator * requirements; returns maximum number of bitflips corrected in - * any single ECC step, 0 if bitflips uncorrectable, -EIO hw error + * any single ECC step, -EIO hw error * @read_subpage: function to read parts of the page covered by ECC; * returns same as read_page() * @write_subpage: function to write parts of the page covered by ECC. -- cgit v1.2.3-70-g09d2 From 477544c62a84d3bacd9f90ba75ffc16c04d78071 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 30 Mar 2017 17:15:05 +0900 Subject: mtd: nand: allow drivers to request minimum alignment for passed buffer In some cases, nand_do_{read,write}_ops is passed with unaligned ops->datbuf. Drivers using DMA will be unhappy about unaligned buffer. The new struct member, buf_align, represents the minimum alignment the driver require for the buffer. If the buffer passed from the upper MTD layer does not have enough alignment, nand_do_*_ops will use bufpoi. 
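As an illustration (the alignment value here is hypothetical, not taken from this patch), a DMA-capable driver advertises its requirement at init time and the core then falls back to the bounce buffer for unsuitable buffers:

    /* driver probe/init: DMA engine needs, say, 4-byte aligned buffers */
    chip->options  |= NAND_USE_BOUNCE_BUFFER;
    chip->buf_align = 4;

    /* core check in nand_do_{read,write}_ops after this patch */
    use_bufpoi = !virt_addr_valid(buf) ||
                 !IS_ALIGNED((unsigned long)buf, chip->buf_align);

Drivers that do not set buf_align keep the old behaviour: nand_set_defaults() initializes it to 1, and IS_ALIGNED(x, 1) is always true.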
Signed-off-by: Masahiro Yamada Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 10 ++++++++-- include/linux/mtd/nand.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index c796d0e4039a..ed49a1d634b0 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1954,7 +1954,9 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, if (!aligned) use_bufpoi = 1; else if (chip->options & NAND_USE_BOUNCE_BUFFER) - use_bufpoi = !virt_addr_valid(buf); + use_bufpoi = !virt_addr_valid(buf) || + !IS_ALIGNED((unsigned long)buf, + chip->buf_align); else use_bufpoi = 0; @@ -2810,7 +2812,9 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, if (part_pagewr) use_bufpoi = 1; else if (chip->options & NAND_USE_BOUNCE_BUFFER) - use_bufpoi = !virt_addr_valid(buf); + use_bufpoi = !virt_addr_valid(buf) || + !IS_ALIGNED((unsigned long)buf, + chip->buf_align); else use_bufpoi = 0; @@ -3429,6 +3433,8 @@ static void nand_set_defaults(struct nand_chip *chip) nand_hw_control_init(chip->controller); } + if (!chip->buf_align) + chip->buf_align = 1; } /* Sanitize ONFI strings so we can safely print them */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9e0c93c44bef..8f67b1581683 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -755,6 +755,7 @@ struct nand_manufacturer_ops { * setting the read-retry mode. Mostly needed for MLC NAND. * @ecc: [BOARDSPECIFIC] ECC control structure * @buffers: buffer structure for read/write + * @buf_align: minimum buffer alignment required by a platform * @hwcontrol: platform-specific hardware control structure * @erase: [REPLACEABLE] erase function * @scan_bbt: [REPLACEABLE] function to scan bad block table @@ -905,6 +906,7 @@ struct nand_chip { struct nand_ecc_ctrl ecc; struct nand_buffers *buffers; + unsigned long buf_align; struct nand_hw_control hwcontrol; uint8_t *bbt; -- cgit v1.2.3-70-g09d2 From 51f567777799c9d85a778302b9eb61cf15214a98 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 6 Apr 2017 22:36:31 -0400 Subject: nfsd: check for oversized NFSv2/v3 arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A client can append random data to the end of an NFSv2 or NFSv3 RPC call without our complaining; we'll just stop parsing at the end of the expected data and ignore the rest. Encoded arguments and replies are stored together in an array of pages, and if a call is too large it could leave inadequate space for the reply. This is normally OK because NFS RPC's typically have either short arguments and long replies (like READ) or long arguments and short replies (like WRITE). But a client that sends an incorrectly long reply can violate those assumptions. This was observed to cause crashes. So, insist that the argument not be any longer than we expect. Also, several operations increment rq_next_page in the decode routine before checking the argument size, which can leave rq_next_page pointing well past the end of the page array, causing trouble later in svc_free_pages. As followup we may also want to rewrite the encoding routines to check more carefully that they aren't running off the end of the page array. Reported-by: Tuomas Haanpää Reported-by: Ari Kauppi Cc: stable@vger.kernel.org Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs3xdr.c | 23 +++++++++++++++++------ fs/nfsd/nfsxdr.c | 13 ++++++++++--- include/linux/sunrpc/svc.h | 3 +-- 3 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 452334694a5d..12feac6ee2fd 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -334,8 +334,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); + + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = min(args->count, max_blocksize); /* set up the kvec */ @@ -349,7 +352,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, v++; } args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -541,9 +544,11 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, p = decode_fh(p, &args->fh); if (!p) return 0; + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -569,10 +574,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); + + if (!xdr_argsize_check(rqstp, p)) + return 0; + args->count = min_t(u32, args->count, PAGE_SIZE); args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -590,6 +599,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->dircount = ntohl(*p++); args->count = ntohl(*p++); + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = args->count = min(args->count, max_blocksize); while (len > 0) { struct page *p = *(rqstp->rq_next_page++); @@ -597,8 +609,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->buffer = page_address(p); len -= PAGE_SIZE; } - - return xdr_argsize_check(rqstp, p); + return 1; } int diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index de07ff625777..6a4947a3f4fa 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -257,6 +257,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, len = args->count = ntohl(*p++); p++; /* totalcount - unused */ + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); /* set up somewhere to store response. 
@@ -272,7 +275,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, v++; } args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -362,9 +365,11 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli p = decode_fh(p, &args->fh); if (!p) return 0; + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -402,9 +407,11 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->cookie = ntohl(*p++); args->count = ntohl(*p++); args->count = min_t(u32, args->count, PAGE_SIZE); + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } /* diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e770abeed32d..6ef19cf658b4 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -336,8 +336,7 @@ xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p) { char *cp = (char *)p; struct kvec *vec = &rqstp->rq_arg.head[0]; - return cp >= (char*)vec->iov_base - && cp <= (char*)vec->iov_base + vec->iov_len; + return cp == (char *)vec->iov_base + vec->iov_len; } static inline int -- cgit v1.2.3-70-g09d2 From 17f5f7f506aaca985b95df7ef7fc2ff49c36a8e9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:36 -0400 Subject: svcrdma: Move send_wr to svc_rdma_op_ctxt Clean up: Move the ib_send_wr off the stack, and move common code to post a Send Work Request into a helper. This is a refactoring change only. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 4 ++ net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 11 +---- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 64 ++++++++++++++++++------------ 3 files changed, 44 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index b105f73e3ca2..287db5c179d8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -85,6 +85,7 @@ struct svc_rdma_op_ctxt { enum dma_data_direction direction; int count; unsigned int mapped_sges; + struct ib_send_wr send_wr; struct ib_sge sge[RPCSVC_MAXPAGES]; struct page *pages[RPCSVC_MAXPAGES]; }; @@ -227,6 +228,9 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); +extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + int num_sge, u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, int); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index ff1df40f0d26..f12f39c189c3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -104,7 +104,6 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct xdr_buf *sndbuf = &rqst->rq_snd_buf; struct svc_rdma_op_ctxt *ctxt; struct svc_rdma_req_map *vec; - struct ib_send_wr send_wr; int ret; vec = svc_rdma_get_req_map(rdma); @@ -132,15 +131,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, } svc_rdma_count_mappings(rdma, ctxt); - memset(&send_wr, 0, sizeof(send_wr)); - ctxt->cqe.done = svc_rdma_wc_send; - send_wr.wr_cqe = &ctxt->cqe; - 
send_wr.sg_list = ctxt->sge; - send_wr.num_sge = 1; - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - - ret = svc_rdma_send(rdma, &send_wr); + ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); if (ret) { ret = -EIO; goto out_unmap; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 515221b16d09..f90b40d0932f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -435,6 +435,43 @@ out_err: return -EIO; } +/** + * svc_rdma_post_send_wr - Set up and post one Send Work Request + * @rdma: controlling transport + * @ctxt: op_ctxt for transmitting the Send WR + * @num_sge: number of SGEs to send + * @inv_rkey: R_key argument to Send With Invalidate, or zero + * + * Returns: + * %0 if the Send* was posted successfully, + * %-ENOTCONN if the connection was lost or dropped, + * %-EINVAL if there was a problem with the Send we built, + * %-ENOMEM if ib_post_send failed. + */ +int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, int num_sge, + u32 inv_rkey) +{ + struct ib_send_wr *send_wr = &ctxt->send_wr; + + dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge); + + send_wr->next = NULL; + ctxt->cqe.done = svc_rdma_wc_send; + send_wr->wr_cqe = &ctxt->cqe; + send_wr->sg_list = ctxt->sge; + send_wr->num_sge = num_sge; + send_wr->send_flags = IB_SEND_SIGNALED; + if (inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + + return svc_rdma_send(rdma, send_wr); +} + /* This function prepares the portion of the RPCRDMA message to be * sent in the RDMA_SEND. This function is called after data sent via * RDMA has already been transmitted. 
There are three cases: @@ -460,7 +497,6 @@ static int send_reply(struct svcxprt_rdma *rdma, u32 inv_rkey) { struct svc_rdma_op_ctxt *ctxt; - struct ib_send_wr send_wr; u32 xdr_off; int sge_no; int sge_bytes; @@ -524,19 +560,8 @@ static int send_reply(struct svcxprt_rdma *rdma, pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; } - memset(&send_wr, 0, sizeof send_wr); - ctxt->cqe.done = svc_rdma_wc_send; - send_wr.wr_cqe = &ctxt->cqe; - send_wr.sg_list = ctxt->sge; - send_wr.num_sge = sge_no; - if (inv_rkey) { - send_wr.opcode = IB_WR_SEND_WITH_INV; - send_wr.ex.invalidate_rkey = inv_rkey; - } else - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - ret = svc_rdma_send(rdma, &send_wr); + ret = svc_rdma_post_send_wr(rdma, ctxt, sge_no, inv_rkey); if (ret) goto err; @@ -652,7 +677,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, int status) { - struct ib_send_wr err_wr; struct page *p; struct svc_rdma_op_ctxt *ctxt; enum rpcrdma_errcode err; @@ -692,17 +716,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, } svc_rdma_count_mappings(xprt, ctxt); - /* Prepare SEND WR */ - memset(&err_wr, 0, sizeof(err_wr)); - ctxt->cqe.done = svc_rdma_wc_send; - err_wr.wr_cqe = &ctxt->cqe; - err_wr.sg_list = ctxt->sge; - err_wr.num_sge = 1; - err_wr.opcode = IB_WR_SEND; - err_wr.send_flags = IB_SEND_SIGNALED; - - /* Post It */ - ret = svc_rdma_send(xprt, &err_wr); + ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); if (ret) { dprintk("svcrdma: Error %d posting send for protocol error\n", ret); -- cgit v1.2.3-70-g09d2 From 6e6092ca305ad785c605d7e313727aad96c228a5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:44 -0400 Subject: svcrdma: Add svc_rdma_map_reply_hdr() Introduce a helper to DMA-map a reply's transport header before sending it. This will in part replace the map vector cache. Signed-off-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 3 ++ net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 36 ++++++------------ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 61 +++++++++++++++++++++++------- 3 files changed, 62 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 287db5c179d8..002a46d1faa1 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -228,6 +228,9 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); +extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, unsigned int len); extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, int num_sge, u32 inv_rkey); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index f12f39c189c3..0305b33d482f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -101,50 +101,36 @@ out_notfound: static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { - struct xdr_buf *sndbuf = &rqst->rq_snd_buf; struct svc_rdma_op_ctxt *ctxt; - struct svc_rdma_req_map *vec; int ret; - vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false); - if (ret) + ctxt = svc_rdma_get_context(rdma); + + /* rpcrdma_bc_send_request builds the transport header and + * the backchannel RPC message in the same buffer. Thus only + * one SGE is needed to send both. + */ + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer, + rqst->rq_snd_buf.len); + if (ret < 0) goto out_err; ret = svc_rdma_repost_recv(rdma, GFP_NOIO); if (ret) goto out_err; - ctxt = svc_rdma_get_context(rdma); - ctxt->pages[0] = virt_to_page(rqst->rq_buffer); - ctxt->count = 1; - - ctxt->direction = DMA_TO_DEVICE; - ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = sndbuf->len; - ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, - sndbuf->len, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { - ret = -EIO; - goto out_unmap; - } - svc_rdma_count_mappings(rdma, ctxt); - ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); - if (ret) { - ret = -EIO; + if (ret) goto out_unmap; - } out_err: - svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: %s returns %d\n", __func__, ret); return ret; out_unmap: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); + ret = -EIO; goto out_err; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f90b40d0932f..a7dc71daa776 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -217,6 +217,49 @@ static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, return 0; } +static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + struct page *page, + unsigned int offset, + unsigned int len) +{ + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; + + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + return -EIO; + + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + 
svc_rdma_count_mappings(rdma, ctxt); + return 0; +} + +/** + * svc_rdma_map_reply_hdr - DMA map the transport header buffer + * @rdma: controlling transport + * @ctxt: op_ctxt for the Send WR + * @rdma_resp: buffer containing transport header + * @len: length of transport header + * + * Returns: + * %0 if the header is DMA mapped, + * %-EIO if DMA mapping failed. + */ +int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, + unsigned int len) +{ + ctxt->direction = DMA_TO_DEVICE; + ctxt->pages[0] = virt_to_page(rdma_resp); + ctxt->count = 1; + return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); +} + /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ @@ -699,22 +742,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, err = ERR_VERS; length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); + /* Map transport header; no RPC message payload */ ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_TO_DEVICE; - ctxt->count = 1; - ctxt->pages[0] = p; - - /* Prepare SGE for local address */ - ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->sge[0].length = length; - ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, - p, 0, length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { - dprintk("svcrdma: Error mapping buffer for protocol error\n"); - svc_rdma_put_context(ctxt, 1); + ret = svc_rdma_map_reply_hdr(xprt, ctxt, &rmsgp->rm_xid, length); + if (ret) { + dprintk("svcrdma: Error %d mapping send for protocol error\n", + ret); return; } - svc_rdma_count_mappings(xprt, ctxt); ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); if (ret) { -- cgit v1.2.3-70-g09d2 From b623589dbacbc786c2fffc85113a1dc1a331e2ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:52 -0400 Subject: svcrdma: Eliminate RPCRDMA_SQ_DEPTH_MULT The Send Queue depth is temporarily reduced to 1 SQE per credit. The new rdma_rw API does an internal computation, during QP creation, to increase the depth of the Send Queue to handle RDMA Read and Write operations. This change has to come before the NFSD code paths are updated to use the rdma_rw API. Without this patch, rdma_rw_init_qp() increases the size of the SQ too much, resulting in memory allocation failures during QP creation. Signed-off-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma.c | 2 -- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 3 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 002a46d1faa1..11d5aa123f17 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -182,7 +182,6 @@ struct svcxprt_rdma { /* The default ORD value is based on two outstanding full-size writes with a * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. 
*/ #define RPCRDMA_ORD (64/4) -#define RPCRDMA_SQ_DEPTH_MULT 8 #define RPCRDMA_MAX_REQUESTS 32 #define RPCRDMA_MAX_REQ_SIZE 4096 diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index c846ca9f1eba..912444174647 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -247,8 +247,6 @@ int svc_rdma_init(void) dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); - dprintk("\tsq_depth : %u\n", - svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index fc8f14c7bfec..e1097cc6d1eb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1014,7 +1014,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) svcrdma_max_bc_requests); newxprt->sc_rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + newxprt->sc_sq_depth = newxprt->sc_rq_depth; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); if (!svc_rdma_prealloc_ctxts(newxprt)) -- cgit v1.2.3-70-g09d2 From f13193f50b64e2e0c87706b838d6b9895626a892 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:16 -0400 Subject: svcrdma: Introduce local rdma_rw API helpers The plan is to replace the local bespoke code that constructs and posts RDMA Read and Write Work Requests with calls to the rdma_rw API. This shares code with other RDMA-enabled ULPs that manages the gory details of buffer registration and posting Work Requests. Some design notes: o The structure of RPC-over-RDMA transport headers is flexible, allowing multiple segments per Reply with arbitrary alignment, each with a unique R_key. Write and Send WRs continue to be built and posted in separate code paths. However, one whole chunk (with one or more RDMA segments apiece) gets exactly one ib_post_send and one work completion. o svc_xprt reference counting is modified, since a chain of rdma_rw_ctx structs generates one completion, no matter how many Write WRs are posted. o The current code builds the transport header as it is construct- ing Write WRs. I've replaced that with marshaling of transport header data items in a separate step. This is because the exact structure of client-provided segments may not align with the components of the server's reply xdr_buf, or the pages in the page list. Thus parts of each client-provided segment may be written at different points in the send path. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 11 + net/sunrpc/Kconfig | 1 + net/sunrpc/xprtrdma/Makefile | 2 +- net/sunrpc/xprtrdma/svc_rdma_rw.c | 512 +++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 + 5 files changed, 529 insertions(+), 1 deletion(-) create mode 100644 net/sunrpc/xprtrdma/svc_rdma_rw.c (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 11d5aa123f17..ca08671fb7e2 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -145,12 +145,15 @@ struct svcxprt_rdma { u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ + u8 sc_port_num; struct ib_pd *sc_pd; spinlock_t sc_ctxt_lock; struct list_head sc_ctxts; int sc_ctxt_used; + spinlock_t sc_rw_ctxt_lock; + struct list_head sc_rw_ctxts; spinlock_t sc_map_lock; struct list_head sc_maps; @@ -224,6 +227,14 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, struct svc_rdma_op_ctxt *, int *, u32 *, u32, u32, u64, bool); +/* svc_rdma_rw.c */ +extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); +extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + __be32 *wr_ch, struct xdr_buf *xdr); +extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + __be32 *rp_ch, bool writelist, + struct xdr_buf *xdr); + /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 04ce2c0b660e..ac09ca803296 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS default SUNRPC && INFINIBAND + select SG_POOL help This option allows the NFS client and server to use RDMA transports (InfiniBand, iWARP, or RoCE). diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index ef19fa42c50f..c1ae8142ab73 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ - module.o + svc_rdma_rw.o module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c new file mode 100644 index 000000000000..0cf620277693 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. + */ + +#include +#include +#include + +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* Each R/W context contains state for one chain of RDMA Read or + * Write Work Requests. + * + * Each WR chain handles a single contiguous server-side buffer, + * because scatterlist entries after the first have to start on + * page alignment. xdr_buf iovecs cannot guarantee alignment. + * + * Each WR chain handles only one R_key. Each RPC-over-RDMA segment + * from a client may contain a unique R_key, so each WR chain moves + * up to one segment at a time. + * + * The scatterlist makes this data structure over 4KB in size. 
To + * make it less likely to fail, and to handle the allocation for + * smaller I/O requests without disabling bottom-halves, these + * contexts are created on demand, but cached and reused until the + * controlling svcxprt_rdma is destroyed. + */ +struct svc_rdma_rw_ctxt { + struct list_head rw_list; + struct rdma_rw_ctx rw_ctx; + int rw_nents; + struct sg_table rw_sg_table; + struct scatterlist rw_first_sgl[0]; +}; + +static inline struct svc_rdma_rw_ctxt * +svc_rdma_next_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt, + rw_list); +} + +static struct svc_rdma_rw_ctxt * +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +{ + struct svc_rdma_rw_ctxt *ctxt; + + spin_lock(&rdma->sc_rw_ctxt_lock); + + ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); + if (ctxt) { + list_del(&ctxt->rw_list); + spin_unlock(&rdma->sc_rw_ctxt_lock); + } else { + spin_unlock(&rdma->sc_rw_ctxt_lock); + ctxt = kmalloc(sizeof(*ctxt) + + SG_CHUNK_SIZE * sizeof(struct scatterlist), + GFP_KERNEL); + if (!ctxt) + goto out; + INIT_LIST_HEAD(&ctxt->rw_list); + } + + ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; + if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, + ctxt->rw_sg_table.sgl)) { + kfree(ctxt); + ctxt = NULL; + } +out: + return ctxt; +} + +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + sg_free_table_chained(&ctxt->rw_sg_table, true); + + spin_lock(&rdma->sc_rw_ctxt_lock); + list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); +} + +/** + * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts + * @rdma: transport about to be destroyed + * + */ +void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { + list_del(&ctxt->rw_list); + kfree(ctxt); + } +} + +/* A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a a set of rdma_rw's that handle data movement + * for all segments of one chunk. + * + * These are small, acquired with a single allocator call, and + * no more than one is needed per chunk. They are allocated on + * demand, and not cached. + */ +struct svc_rdma_chunk_ctxt { + struct ib_cqe cc_cqe; + struct svcxprt_rdma *cc_rdma; + struct list_head cc_rwctxts; + int cc_sqecount; + enum dma_data_direction cc_dir; +}; + +static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) +{ + cc->cc_rdma = rdma; + svc_xprt_get(&rdma->sc_xprt); + + INIT_LIST_HEAD(&cc->cc_rwctxts); + cc->cc_sqecount = 0; + cc->cc_dir = dir; +} + +static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { + list_del(&ctxt->rw_list); + + rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, cc->cc_dir); + svc_rdma_put_rw_ctxt(rdma, ctxt); + } + svc_xprt_put(&rdma->sc_xprt); +} + +/* State for sending a Write or Reply chunk. 
+ * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + unsigned int wi_nsegs; + __be32 *wi_segs; + + /* SGL constructor arguments */ + struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; +}; + +static struct svc_rdma_write_info * +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +{ + struct svc_rdma_write_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_nsegs = be32_to_cpup(++chunk); + info->wi_segs = ++chunk; + svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE); + return info; +} + +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + svc_rdma_cc_release(&info->wi_cc); + kfree(info); +} + +/** + * svc_rdma_write_done - Write chunk completion + * @cq: controlling Completion Queue + * @wc: Work Completion + * + * Pages under I/O are freed by a subsequent Send completion. + */ +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_write_info *info = + container_of(cc, struct svc_rdma_write_info, wi_cc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: write ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_rdma_write_info_free(info); +} + +/* This function sleeps when the transport's Send Queue is congested. + * + * Assumptions: + * - If ib_post_send() succeeds, only one completion is expected, + * even if one or more WRs are flushed. This is true when posting + * an rdma_rw_ctx or when posting a single signaled WR. + */ +static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_xprt *xprt = &rdma->sc_xprt; + struct ib_send_wr *first_wr, *bad_wr; + struct list_head *tmp; + struct ib_cqe *cqe; + int ret; + + first_wr = NULL; + cqe = &cc->cc_cqe; + list_for_each(tmp, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *ctxt; + + ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + + do { + if (atomic_sub_return(cc->cc_sqecount, + &rdma->sc_sq_avail) > 0) { + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + break; + return 0; + } + + atomic_inc(&rdma_stat_sq_starve); + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); + } while (1); + + pr_err("svcrdma: ib_post_send failed (%d)\n", ret); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + + /* If even one was posted, there will be a completion. 
*/ + if (bad_wr != first_wr) + return 0; + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + return -ENOTCONN; +} + +/* Build and DMA-map an SGL that covers one kvec in an xdr_buf + */ +static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) +{ + struct scatterlist *sg = ctxt->rw_sg_table.sgl; + + sg_set_buf(&sg[0], info->wi_base, len); + info->wi_base += len; + + ctxt->rw_nents = 1; +} + +/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. + */ +static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) +{ + unsigned int sge_no, sge_bytes, page_off, page_no; + struct xdr_buf *xdr = info->wi_xdr; + struct scatterlist *sg; + struct page **page; + + page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK; + page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT; + page = xdr->pages + page_no; + info->wi_next_off += remaining; + sg = ctxt->rw_sg_table.sgl; + sge_no = 0; + do { + sge_bytes = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + sg_set_page(sg, *page, sge_bytes, page_off); + + remaining -= sge_bytes; + sg = sg_next(sg); + page_off = 0; + sge_no++; + page++; + } while (remaining); + + ctxt->rw_nents = sge_no; +} + +/* Construct RDMA Write WRs to send a portion of an xdr_buf containing + * an RPC Reply. + */ +static int +svc_rdma_build_writes(struct svc_rdma_write_info *info, + void (*constructor)(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt), + unsigned int remaining) +{ + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + __be32 *seg; + int ret; + + cc->cc_cqe.done = svc_rdma_write_done; + seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; + do { + unsigned int write_len; + u32 seg_length, seg_handle; + u64 seg_offset; + + if (info->wi_seg_no >= info->wi_nsegs) + goto out_overflow; + + seg_handle = be32_to_cpup(seg); + seg_length = be32_to_cpup(seg + 1); + xdr_decode_hyper(seg + 2, &seg_offset); + seg_offset += info->wi_seg_off; + + write_len = min(remaining, seg_length - info->wi_seg_off); + ctxt = svc_rdma_get_rw_ctxt(rdma, + (write_len >> PAGE_SHIFT) + 2); + if (!ctxt) + goto out_noctx; + + constructor(info, write_len, ctxt); + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, 0, seg_offset, + seg_handle, DMA_TO_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + if (write_len == seg_length - info->wi_seg_off) { + seg += 4; + info->wi_seg_no++; + info->wi_seg_off = 0; + } else { + info->wi_seg_off += write_len; + } + remaining -= write_len; + } while (remaining); + + return 0; + +out_overflow: + dprintk("svcrdma: inadequate space in Write chunk (%u)\n", + info->wi_nsegs); + return -E2BIG; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_initerr: + svc_rdma_put_rw_ctxt(rdma, ctxt); + pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + return -EIO; +} + +/* Send one of an xdr_buf's kvecs by itself. To send a Reply + * chunk, the whole RPC Reply is written back to the client. + * This function writes either the head or tail of the xdr_buf + * containing the Reply. 
+ */ +static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, + struct kvec *vec) +{ + info->wi_base = vec->iov_base; + return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + vec->iov_len); +} + +/* Send an xdr_buf's page list by itself. A Write chunk is + * just the page list. a Reply chunk is the head, page list, + * and tail. This function is shared between the two types + * of chunk. + */ +static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, + struct xdr_buf *xdr) +{ + info->wi_xdr = xdr; + info->wi_next_off = 0; + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + xdr->page_len); +} + +/** + * svc_rdma_send_write_chunk - Write all segments in a Write chunk + * @rdma: controlling RDMA transport + * @wr_ch: Write chunk provided by client + * @xdr: xdr_buf containing the data payload + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Write chunk, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, + struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int ret; + + if (!xdr->page_len) + return 0; + + info = svc_rdma_write_info_alloc(rdma, wr_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + return xdr->page_len; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} + +/** + * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * @rdma: controlling RDMA transport + * @rp_ch: Reply chunk provided by client + * @writelist: true if client provided a Write list + * @xdr: xdr_buf containing an RPC Reply + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Reply chunk, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, + bool writelist, struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int consumed, ret; + + info = svc_rdma_write_info_alloc(rdma, rp_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); + if (ret < 0) + goto out_err; + consumed = xdr->head[0].iov_len; + + /* Send the page list in the Reply chunk only if the + * client did not provide Write chunks. 
+ */ + if (!writelist && xdr->page_len) { + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + consumed += xdr->page_len; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); + if (ret < 0) + goto out_err; + consumed += xdr->tail[0].iov_len; + } + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + return consumed; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e1097cc6d1eb..b25c50992a95 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -561,6 +561,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); @@ -568,6 +569,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); spin_lock_init(&cma_xprt->sc_map_lock); /* @@ -999,6 +1001,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt, newxprt->sc_cm_id); dev = newxprt->sc_cm_id->device; + newxprt->sc_port_num = newxprt->sc_cm_id->port_num; /* Qualify the transport resource defaults with the * capabilities of this particular device */ @@ -1248,6 +1251,7 @@ static void __svc_rdma_free(struct work_struct *work) } rdma_dealloc_frmr_q(rdma); + svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_destroy_ctxts(rdma); svc_rdma_destroy_maps(rdma); -- cgit v1.2.3-70-g09d2 From 9a6a180b7867ceceeeab88a6f011bac23174b939 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:25 -0400 Subject: svcrdma: Use rdma_rw API in RPC reply path The current svcrdma sendto code path posts one RDMA Write WR at a time. Each of these Writes typically carries a small number of pages (for instance, up to 30 pages for mlx4 devices). That means a 1MB NFS READ reply requires 9 ib_post_send() calls for the Write WRs, and one for the Send WR carrying the actual RPC Reply message. Instead, use the new rdma_rw API. The details of Write WR chain construction and memory registration are taken care of in the RDMA core. svcrdma can focus on the details of the RPC-over-RDMA protocol. This gives three main benefits: 1. All Write WRs for one RDMA segment are posted in a single chain. As few as one ib_post_send() for each Write chunk. 2. The Write path can now use FRWR to register the Write buffers. If the device's maximum page list depth is large, this means a single Write WR is needed for each RPC's Write chunk data. 3. The new code introduces support for RPCs that carry both a Write list and a Reply chunk. This combination can be used for an NFSv4 READ where the data payload is large, and thus is removed from the Payload Stream, but the Payload Stream is still larger than the inline threshold. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 6 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 696 ++++++++++++++--------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 + 4 files changed, 350 insertions(+), 355 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ca08671fb7e2..599ee03ee3fb 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -212,7 +212,6 @@ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode, __be32 *); -extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, __be32, __be64, u32); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 0305b33d482f..bf185b79c98f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -90,9 +90,9 @@ out_notfound: * Caller holds the connection's mutex and has already marshaled * the RPC/RDMA request. * - * This is similar to svc_rdma_reply, but takes an rpc_rqst - * instead, does not support chunks, and avoids blocking memory - * allocation. + * This is similar to svc_rdma_send_reply_msg, but takes a struct + * rpc_rqst instead, does not support chunks, and avoids blocking + * memory allocation. * * XXX: There is still an opportunity to block in svc_rdma_send() * if there are no SQ entries to post the Send. This may occur if diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 2eb3df698e11..ce62b78e5bc9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * @@ -40,6 +41,63 @@ * Author: Tom Tucker */ +/* Operation + * + * The main entry point is svc_rdma_sendto. This is called by the + * RPC server when an RPC Reply is ready to be transmitted to a client. + * + * The passed-in svc_rqst contains a struct xdr_buf which holds an + * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA + * transport header, post all Write WRs needed for this Reply, then post + * a Send WR conveying the transport header and the RPC message itself to + * the client. + * + * svc_rdma_sendto must fully transmit the Reply before returning, as + * the svc_rqst will be recycled as soon as sendto returns. Remaining + * resources referred to by the svc_rqst are also recycled at that time. + * Therefore any resources that must remain longer must be detached + * from the svc_rqst and released later. + * + * Page Management + * + * The I/O that performs Reply transmission is asynchronous, and may + * complete well after sendto returns. Thus pages under I/O must be + * removed from the svc_rqst before sendto returns. + * + * The logic here depends on Send Queue and completion ordering. Since + * the Send WR is always posted last, it will always complete last. Thus + * when it completes, it is guaranteed that all previous Write WRs have + * also completed. + * + * Write WRs are constructed and posted. 
Each Write segment gets its own + * svc_rdma_rw_ctxt, allowing the Write completion handler to find and + * DMA-unmap the pages under I/O for that Write segment. The Write + * completion handler does not release any pages. + * + * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt. + * The ownership of all of the Reply's pages are transferred into that + * ctxt, the Send WR is posted, and sendto returns. + * + * The svc_rdma_op_ctxt is presented when the Send WR completes. The + * Send completion handler finally releases the Reply's pages. + * + * This mechanism also assumes that completions on the transport's Send + * Completion Queue do not run in parallel. Otherwise a Write completion + * and Send completion running at the same time could release pages that + * are still DMA-mapped. + * + * Error Handling + * + * - If the Send WR is posted successfully, it will either complete + * successfully, or get flushed. Either way, the Send completion + * handler releases the Reply's pages. + * - If the Send WR cannot be not posted, the forward path releases + * the Reply's pages. + * + * This handles the case, without the use of page reference counting, + * where two different Write segments send portions of the same page. + */ + #include #include #include @@ -55,6 +113,133 @@ static u32 xdr_padsize(u32 len) return (len & 3) ? (4 - (len & 3)) : 0; } +/* Returns length of transport header, in bytes. + */ +static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp) +{ + unsigned int nsegs; + __be32 *p; + + p = rdma_resp; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; +} + +/* One Write chunk is copied from Call transport header to Reply + * transport header. Each segment's length field is updated to + * reflect number of bytes consumed in the segment. + * + * Returns number of segments in this chunk. + */ +static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, + unsigned int remaining) +{ + unsigned int i, nsegs; + u32 seg_len; + + /* Write list discriminator */ + *dst++ = *src++; + + /* number of segments in this chunk */ + nsegs = be32_to_cpup(src); + *dst++ = *src++; + + for (i = nsegs; i; i--) { + /* segment's RDMA handle */ + *dst++ = *src++; + + /* bytes returned in this segment */ + seg_len = be32_to_cpu(*src); + if (remaining >= seg_len) { + /* entire segment was consumed */ + *dst = *src; + remaining -= seg_len; + } else { + /* segment only partly filled */ + *dst = cpu_to_be32(remaining); + remaining = 0; + } + dst++; src++; + + /* segment's RDMA offset */ + *dst++ = *src++; + *dst++ = *src++; + } + + return nsegs; +} + +/* The client provided a Write list in the Call message. Fill in + * the segments in the first Write chunk in the Reply's transport + * header with the number of bytes consumed in each segment. + * Remaining chunks are returned unused. + * + * Assumptions: + * - Client has provided only one Write chunk + */ +static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, + unsigned int consumed) +{ + unsigned int nsegs; + __be32 *p, *q; + + /* RPC-over-RDMA V1 replies never have a Read list. 
*/ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + q = wr_ch; + while (*q != xdr_zero) { + nsegs = xdr_encode_write_chunk(p, q, consumed); + q += 2 + nsegs * rpcrdma_segment_maxsz; + p += 2 + nsegs * rpcrdma_segment_maxsz; + consumed = 0; + } + + /* Terminate Write list */ + *p++ = xdr_zero; + + /* Reply chunk discriminator; may be replaced later */ + *p = xdr_zero; +} + +/* The client provided a Reply chunk in the Call message. Fill in + * the segments in the Reply chunk in the Reply message with the + * number of bytes consumed in each segment. + * + * Assumptions: + * - Reply can always fit in the provided Reply chunk + */ +static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, + unsigned int consumed) +{ + __be32 *p; + + /* Find the Reply chunk in the Reply's xprt header. + * RPC-over-RDMA V1 replies never have a Read list. + */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + /* Skip past Write list */ + while (*p++ != xdr_zero) + p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + + xdr_encode_write_chunk(p, rp_ch, consumed); +} + int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec, @@ -123,45 +308,14 @@ int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, return 0; } -static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - u32 xdr_off, size_t len, int dir) -{ - struct page *page; - dma_addr_t dma_addr; - if (xdr_off < xdr->head[0].iov_len) { - /* This offset is in the head */ - xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; - page = virt_to_page(xdr->head[0].iov_base); - } else { - xdr_off -= xdr->head[0].iov_len; - if (xdr_off < xdr->page_len) { - /* This offset is in the page list */ - xdr_off += xdr->page_base; - page = xdr->pages[xdr_off >> PAGE_SHIFT]; - xdr_off &= ~PAGE_MASK; - } else { - /* This offset is in the tail */ - xdr_off -= xdr->page_len; - xdr_off += (unsigned long) - xdr->tail[0].iov_base & ~PAGE_MASK; - page = virt_to_page(xdr->tail[0].iov_base); - } - } - dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off, - min_t(size_t, PAGE_SIZE, len), dir); - return dma_addr; -} - /* Parse the RPC Call's transport header. */ -static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, - struct rpcrdma_write_array **write, - struct rpcrdma_write_array **reply) +static void svc_rdma_get_write_arrays(__be32 *rdma_argp, + __be32 **write, __be32 **reply) { __be32 *p; - p = (__be32 *)&rmsgp->rm_body.rm_chunks[0]; + p = rdma_argp + rpcrdma_fixed_maxsz; /* Read list */ while (*p++ != xdr_zero) @@ -169,7 +323,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, /* Write list */ if (*p != xdr_zero) { - *write = (struct rpcrdma_write_array *)p; + *write = p; while (*p++ != xdr_zero) p += 1 + be32_to_cpu(*p) * 4; } else { @@ -179,7 +333,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, /* Reply chunk */ if (*p != xdr_zero) - *reply = (struct rpcrdma_write_array *)p; + *reply = p; else *reply = NULL; } @@ -210,6 +364,32 @@ static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, return be32_to_cpup(p); } +/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() + * is used during completion to DMA-unmap this memory, and + * it uses ib_dma_unmap_page() exclusively. 
+ */ +static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + unsigned char *base, + unsigned int len) +{ + unsigned long offset = (unsigned long)base & ~PAGE_MASK; + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; + + dma_addr = ib_dma_map_page(dev, virt_to_page(base), + offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + return -EIO; + + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); + return 0; +} + static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, unsigned int sge_no, @@ -253,222 +433,73 @@ int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); } -/* Assumptions: - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE +/* Load the xdr_buf into the ctxt's sge array, and DMA map each + * element as it is added. + * + * Returns the number of sge elements loaded on success, or + * a negative errno on failure. */ -static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, - u32 rmr, u64 to, - u32 xdr_off, int write_len, - struct svc_rdma_req_map *vec) +static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) { - struct ib_rdma_wr write_wr; - struct ib_sge *sge; - int xdr_sge_no; - int sge_no; - int sge_bytes; - int sge_off; - int bc; - struct svc_rdma_op_ctxt *ctxt; + unsigned int len, sge_no, remaining, page_off; + struct page **ppages; + unsigned char *base; + u32 xdr_pad; + int ret; - if (vec->count > RPCSVC_MAXPAGES) { - pr_err("svcrdma: Too many pages (%lu)\n", vec->count); - return -EIO; - } + sge_no = 1; - dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " - "write_len=%d, vec->sge=%p, vec->count=%lu\n", - rmr, (unsigned long long)to, xdr_off, - write_len, vec->sge, vec->count); + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, + xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret < 0) + return ret; - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_TO_DEVICE; - sge = ctxt->sge; - - /* Find the SGE associated with xdr_off */ - for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; - xdr_sge_no++) { - if (vec->sge[xdr_sge_no].iov_len > bc) - break; - bc -= vec->sge[xdr_sge_no].iov_len; - } + /* If a Write chunk is present, the xdr_buf's page list + * is not included inline. However the Upper Layer may + * have added XDR padding in the tail buffer, and that + * should not be included inline. 
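/*
 * Illustrative sketch, not part of the patch above: how an xdr_buf-style
 * layout (head kvec, page list, tail kvec) flattens into a scatter list,
 * which is the job svc_rdma_map_reply_msg() performs with DMA-mapped SGEs.
 * The sketch_* types are invented stand-ins, and the XDR-pad adjustment
 * for the Write-chunk case is omitted for brevity.
 */
#include <stddef.h>

#define SKETCH_PAGE_SIZE 4096

struct sketch_seg { void *addr; size_t len; };

struct sketch_xdr {
        void *head;   size_t head_len;
        void **pages; size_t page_base, page_len;
        void *tail;   size_t tail_len;
};

/* Returns the number of segments written to sg[]: head, pages, tail. */
static int flatten_xdr(const struct sketch_xdr *x, struct sketch_seg *sg,
                       int pages_sent_via_write_chunk)
{
        size_t off = x->page_base % SKETCH_PAGE_SIZE;
        size_t idx = x->page_base / SKETCH_PAGE_SIZE;
        size_t remaining = x->page_len;
        int n = 0;

        sg[n].addr = x->head;
        sg[n].len = x->head_len;
        n++;

        /* When a Write chunk carries the page list, it is not sent inline. */
        while (!pages_sent_via_write_chunk && remaining) {
                size_t len = SKETCH_PAGE_SIZE - off;

                if (len > remaining)
                        len = remaining;
                sg[n].addr = (char *)x->pages[idx++] + off;
                sg[n].len = len;
                n++;
                remaining -= len;
                off = 0;
        }

        if (x->tail_len) {
                sg[n].addr = x->tail;
                sg[n].len = x->tail_len;
                n++;
        }
        return n;
}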
+ */ + if (wr_lst) { + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; + xdr_pad = xdr_padsize(xdr->page_len); - sge_off = bc; - bc = write_len; - sge_no = 0; - - /* Copy the remaining SGE */ - while (bc != 0) { - sge_bytes = min_t(size_t, - bc, vec->sge[xdr_sge_no].iov_len-sge_off); - sge[sge_no].length = sge_bytes; - sge[sge_no].addr = - dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - sge[sge_no].addr)) - goto err; - svc_rdma_count_mappings(xprt, ctxt); - sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->count++; - sge_off = 0; - sge_no++; - xdr_sge_no++; - if (xdr_sge_no > vec->count) { - pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no); - goto err; + if (len && xdr_pad) { + base += xdr_pad; + len -= xdr_pad; } - bc -= sge_bytes; - if (sge_no == xprt->sc_max_sge) - break; - } - - /* Prepare WRITE WR */ - memset(&write_wr, 0, sizeof write_wr); - ctxt->cqe.done = svc_rdma_wc_write; - write_wr.wr.wr_cqe = &ctxt->cqe; - write_wr.wr.sg_list = &sge[0]; - write_wr.wr.num_sge = sge_no; - write_wr.wr.opcode = IB_WR_RDMA_WRITE; - write_wr.wr.send_flags = IB_SEND_SIGNALED; - write_wr.rkey = rmr; - write_wr.remote_addr = to; - - /* Post It */ - atomic_inc(&rdma_stat_write); - if (svc_rdma_send(xprt, &write_wr.wr)) - goto err; - return write_len - bc; - err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - return -EIO; -} -noinline -static int send_write_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_write_array *wr_ary, - struct rpcrdma_msg *rdma_resp, - struct svc_rqst *rqstp, - struct svc_rdma_req_map *vec) -{ - u32 xfer_len = rqstp->rq_res.page_len; - int write_len; - u32 xdr_off; - int chunk_off; - int chunk_no; - int nchunks; - struct rpcrdma_write_array *res_ary; - int ret; - - res_ary = (struct rpcrdma_write_array *) - &rdma_resp->rm_body.rm_chunks[1]; - - /* Write chunks start at the pagelist */ - nchunks = be32_to_cpu(wr_ary->wc_nchunks); - for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; - xfer_len && chunk_no < nchunks; - chunk_no++) { - struct rpcrdma_segment *arg_ch; - u64 rs_offset; - - arg_ch = &wr_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length)); - - /* Prepare the response chunk given the length actually - * written */ - xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset); - svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, - arg_ch->rs_handle, - arg_ch->rs_offset, - write_len); - chunk_off = 0; - while (write_len) { - ret = send_write(xprt, rqstp, - be32_to_cpu(arg_ch->rs_handle), - rs_offset + chunk_off, - xdr_off, - write_len, - vec); - if (ret <= 0) - goto out_err; - chunk_off += ret; - xdr_off += ret; - xfer_len -= ret; - write_len -= ret; - } + goto tail; } - /* Update the req with the number of chunks actually used */ - svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); - return rqstp->rq_res.page_len; + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_off = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - page_off, remaining); -out_err: - pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret); - return -EIO; -} - -noinline -static int send_reply_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_write_array *rp_ary, - struct rpcrdma_msg *rdma_resp, - struct svc_rqst *rqstp, - struct svc_rdma_req_map *vec) -{ - u32 xfer_len = rqstp->rq_res.len; - int write_len; - u32 xdr_off; - int chunk_no; - 
int chunk_off; - int nchunks; - struct rpcrdma_segment *ch; - struct rpcrdma_write_array *res_ary; - int ret; + ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++, + *ppages++, page_off, len); + if (ret < 0) + return ret; - /* XXX: need to fix when reply lists occur with read-list and or - * write-list */ - res_ary = (struct rpcrdma_write_array *) - &rdma_resp->rm_body.rm_chunks[2]; - - /* xdr offset starts at RPC message */ - nchunks = be32_to_cpu(rp_ary->wc_nchunks); - for (xdr_off = 0, chunk_no = 0; - xfer_len && chunk_no < nchunks; - chunk_no++) { - u64 rs_offset; - ch = &rp_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, be32_to_cpu(ch->rs_length)); - - /* Prepare the reply chunk given the length actually - * written */ - xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset); - svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, - ch->rs_handle, ch->rs_offset, - write_len); - chunk_off = 0; - while (write_len) { - ret = send_write(xprt, rqstp, - be32_to_cpu(ch->rs_handle), - rs_offset + chunk_off, - xdr_off, - write_len, - vec); - if (ret <= 0) - goto out_err; - chunk_off += ret; - xdr_off += ret; - xfer_len -= ret; - write_len -= ret; - } + remaining -= len; + page_off = 0; } - /* Update the req with the number of chunks actually used */ - svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); - return rqstp->rq_res.len; + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; +tail: + if (len) { + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len); + if (ret < 0) + return ret; + } -out_err: - pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret); - return -EIO; + return sge_no - 1; } /* The svc_rqst and all resources it owns are released as soon as @@ -525,90 +556,66 @@ int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, return svc_rdma_send(rdma, send_wr); } -/* This function prepares the portion of the RPCRDMA message to be - * sent in the RDMA_SEND. This function is called after data sent via - * RDMA has already been transmitted. There are three cases: - * - The RPCRDMA header, RPC header, and payload are all sent in a - * single RDMA_SEND. This is the "inline" case. - * - The RPCRDMA header and some portion of the RPC header and data - * are sent via this RDMA_SEND and another portion of the data is - * sent via RDMA. - * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC - * header and data are all transmitted via RDMA. - * In all three cases, this function prepares the RPCRDMA header in - * sge[0], the 'type' parameter indicates the type to place in the - * RPCRDMA header, and the 'byte_count' field indicates how much of - * the XDR to include in this RDMA_SEND. NB: The offset of the payload - * to send is zero in the XDR. +/* Prepare the portion of the RPC Reply that will be transmitted + * via RDMA Send. The RPC-over-RDMA transport header is prepared + * in sge[0], and the RPC xdr_buf is prepared in following sges. + * + * Depending on whether a Write list or Reply chunk is present, + * the server may send all, a portion of, or none of the xdr_buf. + * In the latter case, only the transport header (sge[0]) is + * transmitted. + * + * RDMA Send is the last step of transmitting an RPC reply. Pages + * involved in the earlier RDMA Writes are here transferred out + * of the rqstp and into the ctxt's page array. These pages are + * DMA unmapped by each Write completion, but the subsequent Send + * completion finally releases these pages. + * + * Assumptions: + * - The Reply's transport header will never be larger than a page. 
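/*
 * Illustrative sketch, not part of the patch above: the page-ownership
 * handoff just described, reduced to its essentials. Page pointers move
 * from the request's array into the send context's array and are cleared
 * in the source, so nothing releases them until the Send completion runs.
 * The *_sketch types are invented; this is roughly the role played by
 * svc_rdma_save_io_pages() in the code below.
 */
struct page;

struct rq_sketch   { struct page *pages[16]; unsigned int n; };
struct ctxt_sketch { struct page *pages[16]; unsigned int n; };

static void save_io_pages_sketch(struct rq_sketch *rq, struct ctxt_sketch *ctxt)
{
        unsigned int i;

        for (i = 0; i < rq->n; i++) {
                ctxt->pages[ctxt->n++] = rq->pages[i];
                rq->pages[i] = NULL;    /* request no longer owns this page */
        }
        rq->n = 0;
}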
*/ -static int send_reply(struct svcxprt_rdma *rdma, - struct svc_rqst *rqstp, - struct page *page, - struct rpcrdma_msg *rdma_resp, - struct svc_rdma_req_map *vec, - int byte_count, - u32 inv_rkey) +static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, + __be32 *rdma_argp, __be32 *rdma_resp, + struct svc_rqst *rqstp, + __be32 *wr_lst, __be32 *rp_ch) { struct svc_rdma_op_ctxt *ctxt; - u32 xdr_off; - int sge_no; - int sge_bytes; - int ret = -EIO; + u32 inv_rkey; + int ret; + + dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n", + (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"), + rqstp->rq_res.head[0].iov_len, + rqstp->rq_res.page_len, + rqstp->rq_res.tail[0].iov_len); - /* Prepare the context */ ctxt = svc_rdma_get_context(rdma); - ctxt->direction = DMA_TO_DEVICE; - ctxt->pages[0] = page; - ctxt->count = 1; - /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = - svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); - ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, page, 0, - ctxt->sge[0].length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, + svc_rdma_reply_hdr_len(rdma_resp)); + if (ret < 0) goto err; - svc_rdma_count_mappings(rdma, ctxt); - - ctxt->direction = DMA_TO_DEVICE; - /* Map the payload indicated by 'byte_count' */ - xdr_off = 0; - for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); - byte_count -= sge_bytes; - ctxt->sge[sge_no].addr = - dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(rdma->sc_cm_id->device, - ctxt->sge[sge_no].addr)) + if (!rp_ch) { + ret = svc_rdma_map_reply_msg(rdma, ctxt, + &rqstp->rq_res, wr_lst); + if (ret < 0) goto err; - svc_rdma_count_mappings(rdma, ctxt); - ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[sge_no].length = sge_bytes; - } - if (byte_count != 0) { - pr_err("svcrdma: Could not map %d bytes\n", byte_count); - goto err; } svc_rdma_save_io_pages(rqstp, ctxt); - if (sge_no > rdma->sc_max_sge) { - pr_err("svcrdma: Too many sges (%d)\n", sge_no); - goto err; - } - - ret = svc_rdma_post_send_wr(rdma, ctxt, sge_no, inv_rkey); + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); + ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey); if (ret) goto err; return 0; - err: +err: + pr_err("svcrdma: failed to post Send WR (%d)\n", ret); svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); return ret; @@ -618,41 +625,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) { } +/** + * svc_rdma_sendto - Transmit an RPC reply + * @rqstp: processed RPC request, reply XDR already in ::rq_res + * + * Any resources still associated with @rqstp are released upon return. + * If no reply message was possible, the connection is closed. + * + * Returns: + * %0 if an RPC reply has been successfully posted, + * %-ENOMEM if a resource shortage occurred (connection is lost), + * %-ENOTCONN if posting failed (connection is lost). 
+ */ int svc_rdma_sendto(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - struct rpcrdma_msg *rdma_argp; - struct rpcrdma_msg *rdma_resp; - struct rpcrdma_write_array *wr_ary, *rp_ary; - int ret; - int inline_bytes; + __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; + struct xdr_buf *xdr = &rqstp->rq_res; struct page *res_page; - struct svc_rdma_req_map *vec; - u32 inv_rkey; - __be32 *p; - - dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); + int ret; - /* Get the RDMA request header. The receive logic always - * places this at the start of page 0. + /* Find the call's chunk lists to decide how to send the reply. + * Receive places the Call's xprt header at the start of page 0. */ rdma_argp = page_address(rqstp->rq_pages[0]); - svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary); + svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); - inv_rkey = 0; - if (rdma->sc_snd_w_inv) - inv_rkey = svc_rdma_get_inv_rkey(&rdma_argp->rm_xid, - (__be32 *)wr_ary, - (__be32 *)rp_ary); - - /* Build an req vec for the XDR */ - vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); - if (ret) - goto err0; - inline_bytes = rqstp->rq_res.len; + dprintk("svcrdma: preparing response for XID 0x%08x\n", + be32_to_cpup(rdma_argp)); /* Create the RDMA response header. xprt->xpt_mutex, * acquired in svc_send(), serializes RPC replies. The @@ -666,54 +668,46 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err0; rdma_resp = page_address(res_page); - p = &rdma_resp->rm_xid; - *p++ = rdma_argp->rm_xid; - *p++ = rdma_argp->rm_vers; + p = rdma_resp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rp_ary ? rdma_nomsg : rdma_msg; + *p++ = rp_ch ? rdma_nomsg : rdma_msg; /* Start with empty chunks */ *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; - /* Send any write-chunk data and build resp write-list */ - if (wr_ary) { - ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec); + if (wr_lst) { + /* XXX: Presume the client sent only one Write chunk */ + ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); if (ret < 0) goto err1; - inline_bytes -= ret + xdr_padsize(ret); + svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); } - - /* Send any reply-list data and update resp reply-list */ - if (rp_ary) { - ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec); + if (rp_ch) { + ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); if (ret < 0) goto err1; - inline_bytes -= ret; + svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); } - /* Post a fresh Receive buffer _before_ sending the reply */ ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) goto err1; - - ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, - inline_bytes, inv_rkey); + ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp, + wr_lst, rp_ch); if (ret < 0) goto err0; - - svc_rdma_put_req_map(rdma, vec); - dprintk("svcrdma: send_reply returns %d\n", ret); - return ret; + return 0; err1: put_page(res_page); err0: - svc_rdma_put_req_map(rdma, vec); pr_err("svcrdma: Could not send reply, err=%d. 
Closing transport.\n", ret); - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + set_bit(XPT_CLOSE, &xprt->xpt_flags); return -ENOTCONN; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b25c50992a95..237c377c1e06 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1053,6 +1053,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.port_num = newxprt->sc_cm_id->port_num; + qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; -- cgit v1.2.3-70-g09d2 From 6b19cc5ca2f78ebc88f5d39ba6a94197bb392fcc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:33 -0400 Subject: svcrdma: Clean up RDMA_ERROR path Now that svc_rdma_sendto has been renovated, svc_rdma_send_error can be refactored to reduce code duplication and remove C structure- based XDR encoding. It is also relocated to the source file that contains its only caller. This is a refactoring change only. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/rpc_rdma.h | 3 ++ include/linux/sunrpc/svc_rdma.h | 5 ---- net/sunrpc/xprtrdma/svc_rdma_marshal.c | 19 ------------ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 52 ++++++++++++++++++++++++++++++++- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 43 --------------------------- 5 files changed, 54 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 245fc59b7324..b7e85b341a54 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -143,6 +143,9 @@ enum rpcrdma_proc { #define rdma_done cpu_to_be32(RDMA_DONE) #define rdma_error cpu_to_be32(RDMA_ERROR) +#define err_vers cpu_to_be32(ERR_VERS) +#define err_chunk cpu_to_be32(ERR_CHUNK) + /* * Private extension to RPC-over-RDMA Version One. * Message passed during RDMA-CM connection set-up. 
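For orientation, here is the wire layout that the relocated svc_rdma_send_error() produces, as a minimal standalone C sketch. The SK_* constants mirror the RDMA_ERROR, ERR_VERS and ERR_CHUNK values referenced above, and the sketch assumes the caller passes the XID already in network byte order; treat the exact numeric values as something to verify against rpc_rdma.h rather than as authoritative.

#include <stdint.h>
#include <arpa/inet.h>

#define SK_RDMA_ERROR 4         /* message type: transport-level error */
#define SK_ERR_VERS   1         /* version mismatch */
#define SK_ERR_CHUNK  2         /* chunk decoding problem */

/* Returns the encoded header length in bytes. */
static unsigned int encode_error_reply(uint32_t *buf, uint32_t xid_be,
                                       uint32_t credits, int vers_mismatch)
{
        uint32_t *p = buf;

        *p++ = xid_be;                  /* copied verbatim from the Call */
        *p++ = htonl(1);                /* RPC-over-RDMA version one */
        *p++ = htonl(credits);
        *p++ = htonl(SK_RDMA_ERROR);
        if (vers_mismatch) {
                *p++ = htonl(SK_ERR_VERS);
                *p++ = htonl(1);        /* lowest version supported */
                *p++ = htonl(1);        /* highest version supported */
        } else {
                *p++ = htonl(SK_ERR_CHUNK);
        }
        return (unsigned int)((p - buf) * sizeof(uint32_t));
}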
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 599ee03ee3fb..a770d200f607 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -209,9 +209,6 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); -extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, - struct rpcrdma_msg *, - enum rpcrdma_errcode, __be32 *); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, __be32, __be64, u32); @@ -244,8 +241,6 @@ extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, int num_sge, u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); -extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, - int); /* svc_rdma_transport.c */ extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 1c4aabf0f657..ae58a897eca0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -167,25 +167,6 @@ out_inval: return -EINVAL; } -int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err, __be32 *va) -{ - __be32 *startp = va; - - *va++ = rmsgp->rm_xid; - *va++ = rmsgp->rm_vers; - *va++ = xprt->sc_fc_credits; - *va++ = rdma_error; - *va++ = cpu_to_be32(err); - if (err == ERR_VERS) { - *va++ = rpcrdma_version; - *va++ = rpcrdma_version; - } - - return (int)((unsigned long)va - (unsigned long)startp); -} - /** * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header * @rdma_resp: buffer containing Reply transport header diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index f7b2daf72a86..7435cb666f42 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -558,6 +558,56 @@ static void rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.buflen = head->arg.buflen; } +static void svc_rdma_send_error(struct svcxprt_rdma *xprt, + __be32 *rdma_argp, int status) +{ + struct svc_rdma_op_ctxt *ctxt; + __be32 *p, *err_msgp; + unsigned int length; + struct page *page; + int ret; + + ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); + if (ret) + return; + + page = alloc_page(GFP_KERNEL); + if (!page) + return; + err_msgp = page_address(page); + + p = err_msgp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = xprt->sc_fc_credits; + *p++ = rdma_error; + if (status == -EPROTONOSUPPORT) { + *p++ = err_vers; + *p++ = rpcrdma_version; + *p++ = rpcrdma_version; + } else { + *p++ = err_chunk; + } + length = (unsigned long)p - (unsigned long)err_msgp; + + /* Map transport header; no RPC message payload */ + ctxt = svc_rdma_get_context(xprt); + ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); + if (ret) { + dprintk("svcrdma: Error %d mapping send for protocol error\n", + ret); + return; + } + + ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); + if (ret) { + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + } +} + /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. 
This makes * the RPC/RDMA header small and fixed in size, so it is @@ -686,7 +736,7 @@ complete: return ret; out_err: - svc_rdma_send_error(rdma_xprt, rmsgp, ret); + svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret); svc_rdma_put_context(ctxt, 0); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index ce62b78e5bc9..0b646e8f23c7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -710,46 +710,3 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) set_bit(XPT_CLOSE, &xprt->xpt_flags); return -ENOTCONN; } - -void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - int status) -{ - struct page *p; - struct svc_rdma_op_ctxt *ctxt; - enum rpcrdma_errcode err; - __be32 *va; - int length; - int ret; - - ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); - if (ret) - return; - - p = alloc_page(GFP_KERNEL); - if (!p) - return; - va = page_address(p); - - /* XDR encode an error reply */ - err = ERR_CHUNK; - if (status == -EPROTONOSUPPORT) - err = ERR_VERS; - length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); - - /* Map transport header; no RPC message payload */ - ctxt = svc_rdma_get_context(xprt); - ret = svc_rdma_map_reply_hdr(xprt, ctxt, &rmsgp->rm_xid, length); - if (ret) { - dprintk("svcrdma: Error %d mapping send for protocol error\n", - ret); - return; - } - - ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); - if (ret) { - dprintk("svcrdma: Error %d posting send for protocol error\n", - ret); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - } -} -- cgit v1.2.3-70-g09d2 From f5821c76b2c9c2fb98b276c0bf6a101bfe9050a3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:49 -0400 Subject: svcrdma: Clean up RPC-over-RDMA backchannel reply processing Replace C structure-based XDR decoding with pointer arithmetic. Pointer arithmetic is considered more portable. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 18 ++++++++++++++---- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 27 +++++++++++++++------------ 3 files changed, 30 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index a770d200f607..44d642bbfce6 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -204,7 +204,7 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, /* svc_rdma_backchannel.c */ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, - struct rpcrdma_msg *rmsgp, + __be32 *rdma_resp, struct xdr_buf *rcvbuf); /* svc_rdma_marshal.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index bf185b79c98f..c676ed0efb5a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -12,7 +12,17 @@ #undef SVCRDMA_BACKCHANNEL_DEBUG -int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, +/** + * svc_rdma_handle_bc_reply - Process incoming backchannel reply + * @xprt: controlling backchannel transport + * @rdma_resp: pointer to incoming transport header + * @rcvbuf: XDR buffer into which to decode the reply + * + * Returns: + * %0 if @rcvbuf is filled in, xprt_complete_rqst called, + * %-EAGAIN if server should call ->recvfrom again. 
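/*
 * Illustrative sketch, not part of the patch above: the fixed word layout
 * that the pointer-arithmetic decoding in this patch relies on, written as
 * standalone C. Words 0-3 are XID, version, credits and message type;
 * words 4-6 are the Read list, Write list and Reply chunk discriminators;
 * word 7 starts the RPC header (XID again, then direction). The SK_*
 * values are stand-ins; verify them against the real headers.
 */
#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>

#define SK_RDMA_MSG  0          /* inline message type */
#define SK_RPC_CALL  0          /* RPC direction: call */

static bool looks_like_bc_reply(const uint32_t *p)
{
        if (ntohl(p[3]) != SK_RDMA_MSG)
                return false;           /* backchannel replies are rdma_msg */
        if (p[4] || p[5] || p[6])
                return false;           /* all three chunk lists must be empty */
        if (p[7] != p[0])
                return false;           /* RPC XID must match transport XID */
        return ntohl(p[8]) != SK_RPC_CALL;      /* i.e. direction is a reply */
}

/* Credits live in word 2, as in: credits = be32_to_cpup(rdma_resp + 2). */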
+ */ +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, struct xdr_buf *rcvbuf) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, p = (__be32 *)src->iov_base; len = src->iov_len; - xid = rmsgp->rm_xid; + xid = *rdma_resp; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: xid=%08x, length=%zu\n", __func__, be32_to_cpu(xid), len); pr_info("%s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); pr_info("%s: RPC: %*ph\n", __func__, (int)len, p); #endif @@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, goto out_unlock; memcpy(dst->iov_base, p, len); - credits = be32_to_cpu(rmsgp->rm_credit); + credits = be32_to_cpup(rdma_resp + 2); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_bc_max_requests) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 7435cb666f42..27a99bf5b1a6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -613,28 +613,30 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ -static bool -svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, + __be32 *rdma_resp) { - __be32 *p = (__be32 *)rmsgp; + __be32 *p; if (!xprt->xpt_bc_xprt) return false; - if (rmsgp->rm_type != rdma_msg) + p = rdma_resp + 3; + if (*p++ != rdma_msg) return false; - if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + + if (*p++ != xdr_zero) return false; - if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + if (*p++ != xdr_zero) return false; - if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + if (*p++ != xdr_zero) return false; - /* sanity */ - if (p[7] != rmsgp->rm_xid) + /* XID sanity */ + if (*p++ != *rdma_resp) return false; /* call direction */ - if (p[8] == cpu_to_be32(RPC_CALL)) + if (*p == cpu_to_be32(RPC_CALL)) return false; return true; @@ -700,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, + &rmsgp->rm_xid, &rqstp->rq_arg); svc_rdma_put_context(ctxt, 0); if (ret) -- cgit v1.2.3-70-g09d2 From ded8d19641a605232ab48f5d27f542648beba3cc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:57 -0400 Subject: svcrdma: Reduce size of sge array in struct svc_rdma_op_ctxt The sge array in struct svc_rdma_op_ctxt is no longer used for sending RDMA Write WRs. It need only accommodate the construction of Send and Receive WRs. The maximum inline size is the largest payload it needs to handle now. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 9 +++++++-- net/sunrpc/xprtrdma/svc_rdma.c | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 44d642bbfce6..e84b77556784 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -48,6 +48,12 @@ #include #define SVCRDMA_DEBUG +/* Default and maximum inline threshold sizes */ +enum { + RPCRDMA_DEF_INLINE_THRESH = 4096, + RPCRDMA_MAX_INLINE_THRESH = 65536 +}; + /* RPC/RDMA parameters and stats */ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; @@ -86,7 +92,7 @@ struct svc_rdma_op_ctxt { int count; unsigned int mapped_sges; struct ib_send_wr send_wr; - struct ib_sge sge[RPCSVC_MAXPAGES]; + struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; struct page *pages[RPCSVC_MAXPAGES]; }; @@ -186,7 +192,6 @@ struct svcxprt_rdma { * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ #define RPCRDMA_ORD (64/4) #define RPCRDMA_MAX_REQUESTS 32 -#define RPCRDMA_MAX_REQ_SIZE 4096 /* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our * current NFSv4.1 implementation supports one backchannel slot. diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 912444174647..a4a8f6989ee7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; -unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; -static unsigned int min_max_inline = 4096; -static unsigned int max_max_inline = 65536; +unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; atomic_t rdma_stat_recv; atomic_t rdma_stat_read; -- cgit v1.2.3-70-g09d2 From 68cc4636bbbca89b9fedcf46d8b6bee444fc5e4e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:05 -0400 Subject: svcrdma: Remove unused RDMA Write completion handler Clean up. All RDMA Write completions are now handled by svc_rdma_wc_write_ctx. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 ------------------ 2 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index e84b77556784..f58c5349beb7 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -249,7 +249,6 @@ extern int svc_rdma_sendto(struct svc_rqst *); /* svc_rdma_transport.c */ extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); -extern void svc_rdma_wc_write(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 237c377c1e06..1597ca8ba3c0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -473,24 +473,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) svc_rdma_put_context(ctxt, 1); } -/** - * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc) -{ - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - svc_rdma_send_wc_common_put(cq, wc, "write"); - - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); -} - /** * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC * @cq: completion queue -- cgit v1.2.3-70-g09d2 From 2cf32924c68a22783e6f630e1b5345a80aa1a376 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:13 -0400 Subject: svcrdma: Remove the req_map cache req_maps are no longer used by the send path and can thus be removed. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 34 +------------ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 68 -------------------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 84 -------------------------------- 3 files changed, 1 insertion(+), 185 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f58c5349beb7..479bb7f65233 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -96,23 +96,6 @@ struct svc_rdma_op_ctxt { struct page *pages[RPCSVC_MAXPAGES]; }; -/* - * NFS_ requests are mapped on the client side by the chunk lists in - * the RPCRDMA header. During the fetching of the RPC from the client - * and the writing of the reply to the client, the memory in the - * client and the memory in the server must be mapped as contiguous - * vaddr/len for access by the hardware. These data strucures keep - * these mappings. - * - * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the - * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the - * 'ch' field maps the read-list of the RPCRDMA header to the 'sge' - * mapping of the reply. 
- */ -struct svc_rdma_chunk_sge { - int start; /* sge no for this chunk */ - int count; /* sge count for this chunk */ -}; struct svc_rdma_fastreg_mr { struct ib_mr *mr; struct scatterlist *sg; @@ -121,15 +104,7 @@ struct svc_rdma_fastreg_mr { enum dma_data_direction direction; struct list_head frmr_list; }; -struct svc_rdma_req_map { - struct list_head free; - unsigned long count; - union { - struct kvec sge[RPCSVC_MAXPAGES]; - struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; - unsigned long lkey[RPCSVC_MAXPAGES]; - }; -}; + #define RDMACTXT_F_LAST_CTXT 2 #define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ @@ -160,8 +135,6 @@ struct svcxprt_rdma { int sc_ctxt_used; spinlock_t sc_rw_ctxt_lock; struct list_head sc_rw_ctxts; - spinlock_t sc_map_lock; - struct list_head sc_maps; struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; @@ -237,8 +210,6 @@ extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, struct xdr_buf *xdr); /* svc_rdma_sendto.c */ -extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, - struct svc_rdma_req_map *, bool); extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, __be32 *rdma_resp, unsigned int len); @@ -259,9 +230,6 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); -extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *); -extern void svc_rdma_put_req_map(struct svcxprt_rdma *, - struct svc_rdma_req_map *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index e514f6864a93..1736337f3a55 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -240,74 +240,6 @@ static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, xdr_encode_write_chunk(p, rp_ch, consumed); } -int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec, - bool write_chunk_present) -{ - int sge_no; - u32 sge_bytes; - u32 page_bytes; - u32 page_off; - int page_no; - - if (xdr->len != - (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: %s: XDR buffer length error\n", __func__); - return -EIO; - } - - /* Skip the first sge, this is for the RPCRDMA header */ - sge_no = 1; - - /* Head SGE */ - vec->sge[sge_no].iov_base = xdr->head[0].iov_base; - vec->sge[sge_no].iov_len = xdr->head[0].iov_len; - sge_no++; - - /* pages SGE */ - page_no = 0; - page_bytes = xdr->page_len; - page_off = xdr->page_base; - while (page_bytes) { - vec->sge[sge_no].iov_base = - page_address(xdr->pages[page_no]) + page_off; - sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); - page_bytes -= sge_bytes; - vec->sge[sge_no].iov_len = sge_bytes; - - sge_no++; - page_no++; - page_off = 0; /* reset for next time through loop */ - } - - /* Tail SGE */ - if (xdr->tail[0].iov_len) { - unsigned char *base = xdr->tail[0].iov_base; - size_t len = xdr->tail[0].iov_len; - u32 xdr_pad = xdr_padsize(xdr->page_len); - - if (write_chunk_present && xdr_pad) { - base += xdr_pad; - len -= xdr_pad; - } - - if (len) { - vec->sge[sge_no].iov_base = base; - vec->sge[sge_no].iov_len = 
len; - sge_no++; - } - } - - dprintk("svcrdma: %s: sge_no %d page_no %d " - "page_base %u page_len %u head_len %zu tail_len %zu\n", - __func__, sge_no, page_no, xdr->page_base, xdr->page_len, - xdr->head[0].iov_len, xdr->tail[0].iov_len); - - vec->count = sge_no; - return 0; -} - /* Parse the RPC Call's transport header. */ static void svc_rdma_get_write_arrays(__be32 *rdma_argp, diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 1597ca8ba3c0..a9d9cb1ba4c6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) } } -static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) -{ - struct svc_rdma_req_map *map; - - map = kmalloc(sizeof(*map), flags); - if (map) - INIT_LIST_HEAD(&map->free); - return map; -} - -static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) -{ - unsigned int i; - - /* One for each receive buffer on this connection. */ - i = xprt->sc_max_requests; - - while (i--) { - struct svc_rdma_req_map *map; - - map = alloc_req_map(GFP_KERNEL); - if (!map) { - dprintk("svcrdma: No memory for request map\n"); - return false; - } - list_add(&map->free, &xprt->sc_maps); - } - return true; -} - -struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_req_map *map = NULL; - - spin_lock(&xprt->sc_map_lock); - if (list_empty(&xprt->sc_maps)) - goto out_empty; - - map = list_first_entry(&xprt->sc_maps, - struct svc_rdma_req_map, free); - list_del_init(&map->free); - spin_unlock(&xprt->sc_map_lock); - -out: - map->count = 0; - return map; - -out_empty: - spin_unlock(&xprt->sc_map_lock); - - /* Pre-allocation amount was incorrect */ - map = alloc_req_map(GFP_NOIO); - if (map) - goto out; - - WARN_ONCE(1, "svcrdma: empty request map list?\n"); - return NULL; -} - -void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, - struct svc_rdma_req_map *map) -{ - spin_lock(&xprt->sc_map_lock); - list_add(&map->free, &xprt->sc_maps); - spin_unlock(&xprt->sc_map_lock); -} - -static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_maps)) { - struct svc_rdma_req_map *map; - - map = list_first_entry(&xprt->sc_maps, - struct svc_rdma_req_map, free); - list_del(&map->free); - kfree(map); - } -} - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { @@ -544,7 +465,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); @@ -552,7 +472,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); - spin_lock_init(&cma_xprt->sc_map_lock); /* * Note that this implies that the underlying transport support @@ -1004,8 +923,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; - if (!svc_rdma_prealloc_maps(newxprt)) - goto errout; /* * Limit ORD based on client limit, local device limit, and @@ -1237,7 +1154,6 @@ static void __svc_rdma_free(struct work_struct *work) rdma_dealloc_frmr_q(rdma); svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_destroy_ctxts(rdma); - 
svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) -- cgit v1.2.3-70-g09d2 From dadf3e435debb85dfcf28c157012047153a21a97 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:21 -0400 Subject: svcrdma: Clean out old XDR encoders Clean up: These have been replaced and are no longer used. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 4 -- net/sunrpc/xprtrdma/svc_rdma_marshal.c | 70 ---------------------------------- 2 files changed, 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 479bb7f65233..f3787d800ba4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -187,10 +187,6 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); -extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); -extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, - __be32, __be64, u32); -extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index ae58a897eca0..bdcf7d85a3dc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -166,73 +166,3 @@ out_inval: dprintk("svcrdma: failed to parse transport header\n"); return -EINVAL; } - -/** - * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header - * @rdma_resp: buffer containing Reply transport header - * - * Returns length of transport header, in bytes. - */ -unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) -{ - unsigned int nsegs; - __be32 *p; - - p = rdma_resp; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p += rpcrdma_fixed_maxsz + 1; - - /* Skip Write list. */ - while (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - /* Skip Reply chunk. 
*/ - if (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - return (unsigned long)p - (unsigned long)rdma_resp; -} - -void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) -{ - struct rpcrdma_write_array *ary; - - /* no read-list */ - rmsgp->rm_body.rm_chunks[0] = xdr_zero; - - /* write-array discrim */ - ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - ary->wc_discrim = xdr_one; - ary->wc_nchunks = cpu_to_be32(chunks); - - /* write-list terminator */ - ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; - - /* reply-array discriminator */ - ary->wc_array[chunks].wc_target.rs_length = xdr_zero; -} - -void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, - int chunks) -{ - ary->wc_discrim = xdr_one; - ary->wc_nchunks = cpu_to_be32(chunks); -} - -void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, - int chunk_no, - __be32 rs_handle, - __be64 rs_offset, - u32 write_len) -{ - struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; - seg->rs_handle = rs_handle; - seg->rs_offset = rs_offset; - seg->rs_length = cpu_to_be32(write_len); -} -- cgit v1.2.3-70-g09d2 From c373fff7bd252ec36e8a895c58a584088f1d38bc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Apr 2017 12:26:22 -0400 Subject: NFSv4: Don't special case "launder" If the client receives a fatal server error from nfs_pageio_add_request(), then we should always truncate the page on which the error occurred. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 +- fs/nfs/write.c | 27 +++++++++++---------------- include/linux/nfs_fs.h | 14 +------------- 3 files changed, 13 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index bebed885b6e4..5713eb32a45e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -482,7 +482,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_launder_page(inode, page); + return nfs_wb_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2e654940478f..59e21cc0a266 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -586,8 +586,7 @@ nfs_error_is_fatal_on_server(int err) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock, - bool launder) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; @@ -610,13 +609,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* - * Remove the problematic req upon fatal errors - * in launder case, while other dirty pages can - * still be around until they get flushed. 
+ * Remove the problematic req upon fatal errors on the server */ if (nfs_error_is_fatal(ret)) { nfs_context_set_write_error(req->wb_context, ret); - if (launder) + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; } nfs_redirty_request(req); @@ -632,13 +629,12 @@ out_launder: } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio, bool launder) + struct nfs_pageio_descriptor *pgio) { int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, - launder); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -650,8 +646,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, * Write an mmapped page to the server. */ static int nfs_writepage_locked(struct page *page, - struct writeback_control *wbc, - bool launder) + struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -660,7 +655,7 @@ static int nfs_writepage_locked(struct page *page, nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio, launder); + err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -673,7 +668,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc, false); + ret = nfs_writepage_locked(page, wbc); unlock_page(page); return ret; } @@ -682,7 +677,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data, false); + ret = nfs_do_writepage(page, wbc, data); unlock_page(page); return ret; } @@ -2013,7 +2008,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. 
*/ -int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) +int nfs_wb_page(struct inode *inode, struct page *page) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); @@ -2030,7 +2025,7 @@ int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc, launder); + ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; continue; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 9aa044e76820..bb0eb2c9acca 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -500,24 +500,12 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned */ extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); -extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder); +extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); -static inline int -nfs_wb_launder_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, true); -} - -static inline int -nfs_wb_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, false); -} - static inline int nfs_have_writebacks(struct inode *inode) { -- cgit v1.2.3-70-g09d2 From c7e88067c1ae89e7bcbed070fb2c4e30bc39b51f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Apr 2017 16:01:46 -0700 Subject: srcu: Exact tracking of srcu_data structures containing callbacks The current Tree SRCU implementation schedules a workqueue for every srcu_data covered by a given leaf srcu_node structure having callbacks, even if only one of those srcu_data structures actually contains callbacks. This is clearly inefficient for workloads that don't feature callbacks everywhere all the time. This commit therefore adds an array of masks that are used by the leaf srcu_node structures to track exactly which srcu_data structures contain callbacks. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcutree.h | 4 ++++ kernel/rcu/srcutree.c | 29 +++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 0400e211aa44..94515ff226fb 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -47,6 +47,8 @@ struct srcu_data { struct delayed_work work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ struct srcu_node *mynode; /* Leaf srcu_node. */ + unsigned long grpmask; /* Mask for leaf srcu_node */ + /* ->srcu_data_have_cbs[]. */ int cpu; struct srcu_struct *sp; }; @@ -59,6 +61,8 @@ struct srcu_node { unsigned long srcu_have_cbs[4]; /* GP seq for children */ /* having CBs, but only */ /* is > ->srcu_gq_seq. */ + unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ + /* have CBs for given GP? */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ int grphi; /* Biggest CPU for node. 
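/*
 * Illustrative sketch, not part of the patch above: the per-leaf bitmask
 * bookkeeping this commit adds, reduced to standalone C. grplo/grphi and
 * the one-bit-per-CPU mask mirror the srcu_node fields; everything else
 * here is invented for illustration.
 */
#include <stdio.h>

struct leaf_sketch {
        int grplo, grphi;               /* CPU range covered by this leaf */
        unsigned long data_have_cbs;    /* one bit per covered CPU */
};

/* Enqueue side: record that 'cpu' queued callbacks for this grace period. */
static void note_cbs(struct leaf_sketch *snp, int cpu)
{
        snp->data_have_cbs |= 1UL << (cpu - snp->grplo);
}

/* Grace-period end: schedule callback work only for CPUs whose bit is set,
 * instead of for every CPU covered by the leaf. */
static void schedule_cbs(const struct leaf_sketch *snp, unsigned long mask)
{
        int cpu;

        for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
                if (!(mask & (1UL << (cpu - snp->grplo))))
                        continue;
                printf("invoke callbacks for CPU %d\n", cpu);
        }
}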
*/ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9ecf0acc18eb..1c2c1004b3b1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -66,8 +66,12 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { spin_lock_init(&snp->lock); - for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) + WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + ARRAY_SIZE(snp->srcu_data_have_cbs)); + for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { snp->srcu_have_cbs[i] = 0; + snp->srcu_data_have_cbs[i] = 0; + } snp->grplo = -1; snp->grphi = -1; if (snp == &sp->node[0]) { @@ -107,6 +111,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) sdp->cpu = cpu; INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); sdp->sp = sp; + sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) continue; @@ -434,16 +439,21 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) /* * Schedule callback invocation for all srcu_data structures associated - * with the specified srcu_node structure, if possible, on the corresponding - * CPUs. + * with the specified srcu_node structure that have callbacks for the + * just-completed grace period, the one corresponding to idx. If possible, + * schedule this invocation on the corresponding CPUs. */ -static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp) +static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long mask) { int cpu; - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + if (!(mask & (1 << (cpu - snp->grplo)))) + continue; srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); + } } /* @@ -461,6 +471,7 @@ static void srcu_gp_end(struct srcu_struct *sp) unsigned long gpseq; int idx; int idxnext; + unsigned long mask; struct srcu_node *snp; /* Prevent more than one additional grace period. */ @@ -486,10 +497,12 @@ static void srcu_gp_end(struct srcu_struct *sp) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq(&snp->lock); if (cbs) { smp_mb(); /* GP end before CB invocation. */ - srcu_schedule_cbs_snp(sp, snp); + srcu_schedule_cbs_snp(sp, snp, mask); } } @@ -536,6 +549,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, spin_lock_irqsave(&snp->lock, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; + if (snp == sdp->mynode && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore(&snp->lock, flags); if (snp == sdp->mynode && snp_seq != s) { smp_mb(); /* CBs after GP! */ @@ -544,6 +559,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, return; } snp->srcu_have_cbs[idx] = s; + if (snp == sdp->mynode) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore(&snp->lock, flags); } -- cgit v1.2.3-70-g09d2 From 7f6733c3c648ddd6cf459c1b80ad388a95452955 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 18 Apr 2017 17:17:35 -0700 Subject: srcu: Make rcutorture writer stalls print SRCU GP state In the past, SRCU was simple enough that there was little point in making the rcutorture writer stall messages print the SRCU grace-period number state. With the advent of Tree SRCU, this has changed. This commit therefore makes Classic, Tiny, and Tree SRCU report this state to rcutorture as needed. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcuclassic.h | 14 ++++++++++++++ include/linux/srcutiny.h | 12 ++++++++++++ include/linux/srcutree.h | 4 ++++ kernel/rcu/rcutorture.c | 8 +++++--- kernel/rcu/srcutree.c | 13 +++++++++++++ kernel/rcu/tree.c | 12 ++++-------- 6 files changed, 52 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h index 41cf99930f34..5753f7322262 100644 --- a/include/linux/srcuclassic.h +++ b/include/linux/srcuclassic.h @@ -98,4 +98,18 @@ void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); unsigned long srcu_batches_completed(struct srcu_struct *sp); +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->completed; + *gpnum = *completed; + if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head) + (*gpnum)++; +} + #endif diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4f284e4f4d8c..42311ee0334f 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -78,4 +78,16 @@ static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) return 0; } +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_gp_seq; + *gpnum = *completed; +} + #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 94515ff226fb..3865717df124 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -140,4 +140,8 @@ void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); unsigned long srcu_batches_completed(struct srcu_struct *sp); +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e9d4527cdd43..ae6e574d4cf5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1360,12 +1360,14 @@ rcu_torture_stats_print(void) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { - int __maybe_unused flags; - unsigned long __maybe_unused gpnum; - unsigned long __maybe_unused completed; + int __maybe_unused flags = 0; + unsigned long __maybe_unused gpnum = 0; + unsigned long __maybe_unused completed = 0; rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); pr_alert("??? 
Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 1c2c1004b3b1..72b6cce5f591 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1011,3 +1011,16 @@ void process_srcu(struct work_struct *work) srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); } EXPORT_SYMBOL_GPL(process_srcu); + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = rcu_seq_ctr(sp->srcu_gp_seq); + *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); +} +EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 23aa02587d0f..91fff49d5869 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -704,15 +704,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, default: break; } - if (rsp != NULL) { - *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + if (rsp == NULL) return; - } - *flags = 0; - *gpnum = 0; - *completed = 0; + *flags = READ_ONCE(rsp->gp_flags); + *gpnum = READ_ONCE(rsp->gpnum); + *completed = READ_ONCE(rsp->completed); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -- cgit v1.2.3-70-g09d2 From bfd20f1cc85010d2f2d77e544da05cd8c149ba9b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 26 Apr 2017 09:18:35 -0700 Subject: x86, iommu/vt-d: Add an option to disable Intel IOMMU force on IOMMU harms performance significantly when we run very fast networking workloads. It's 40Gb networking doing an XDP test. Software overhead is barely noticeable, but it's the IOTLB miss (based on our analysis) which kills the performance. We observed the same performance issue even with software passthrough (identity mapping); only the hardware passthrough survives. The pps with the IOMMU (with software passthrough) is only about 30% of that without it. This is a limitation in hardware based on our observation, so we'd like to disable the IOMMU force-on, but we do want to use TBOOT and we can sacrifice the DMA security bought by the IOMMU. I must admit I know nothing about TBOOT, but the TBOOT guys (cc-ed) think not enabling the IOMMU is totally ok. So introduce a new boot option to disable the force-on. It's kind of silly that we need to run into intel_iommu_init even without force-on, but we need to disable the TBOOT PMR registers. For systems without the boot option, nothing is changed. Signed-off-by: Shaohua Li Signed-off-by: Joerg Roedel --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++++ arch/x86/kernel/tboot.c | 3 +++ drivers/iommu/intel-iommu.c | 18 ++++++++++++++++++ include/linux/dma_remapping.h | 1 + 4 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2ba45caabada..17135bfade6a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1578,6 +1578,15 @@ extended tables themselves, and also PASID support. With this option set, extended tables will not be used even on hardware which claims to support them. + tboot_noforce [Default Off] + Do not force the Intel IOMMU enabled under tboot.
+ By default, tboot will force Intel IOMMU on, which + could harm performance of some high-throughput + devices like 40GBit network cards, even if identity + mapping is enabled. + Note that using this option lowers the security + provided by tboot because it makes the system + vulnerable to DMA attacks. intel_idle.max_cstate= [KNL,HW,ACPI,X86] 0 disables intel_idle and fall back on acpi_idle. diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..edbdfe6ab60a 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -510,6 +510,9 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; + if (!intel_iommu_tboot_noforce) + return 1; + if (no_iommu || swiotlb || dmar_disabled) pr_warning("Forcing Intel-IOMMU to enabled\n"); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 5f08ba13972b..b0ced1c13713 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -183,6 +183,7 @@ static int rwbf_quirk; * (used when kernel is launched w/ TXT) */ static int force_on = 0; +int intel_iommu_tboot_noforce; /* * 0: Present @@ -607,6 +608,10 @@ static int __init intel_iommu_setup(char *str) "Intel-IOMMU: enable pre-production PASID support\n"); intel_iommu_pasid28 = 1; iommu_identity_mapping |= IDENTMAP_GFX; + } else if (!strncmp(str, "tboot_noforce", 13)) { + printk(KERN_INFO + "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); + intel_iommu_tboot_noforce = 1; } str += strcspn(str, ","); @@ -4850,6 +4855,19 @@ int __init intel_iommu_init(void) } if (no_iommu || dmar_disabled) { + /* + * We exit the function here to ensure IOMMU's remapping and + * mempool aren't setup, which means that the IOMMU's PMRs + * won't be disabled via the call to init_dmars(). So disable + * it explicitly here. The PMRs were setup by tboot prior to + * calling SENTER, but the kernel is expected to reset/tear + * down the PMRs. + */ + if (intel_iommu_tboot_noforce) { + for_each_iommu(iommu, drhd) + iommu_disable_protect_mem_regions(iommu); + } + /* * Make sure the IOMMUs are switched off, even when we * boot into a kexec kernel and the previous kernel left diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 187c10299722..90884072fa73 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -39,6 +39,7 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; extern int intel_iommu_enabled; +extern int intel_iommu_tboot_noforce; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) { -- cgit v1.2.3-70-g09d2 From 1e9a038b7fe9a8c10ef1238f4e695d5fbe0dd594 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Apr 2017 16:02:09 -0700 Subject: srcu: Expedited grace periods with reduced memory contention Commit f60d231a87c5 ("srcu: Crude control of expedited grace periods") introduced a per-srcu_struct atomic counter to track outstanding requests for grace periods. This works, but represents a memory-contention bottleneck. This commit therefore uses the srcu_node combining tree to remove this bottleneck. This commit adds new ->srcu_gp_seq_needed_exp fields to the srcu_data, srcu_node, and srcu_struct structures, which track the farthest-in-the-future grace period that must be expedited, which in turn requires that all nearer-term grace periods also be expedited. 
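A minimal, self-contained sketch of this bookkeeping (illustrative names only; the actual implementation is in the diff below): each level keeps just the farthest-in-the-future expedited request and compares sequence numbers wrap-safely before propagating a new request upward.

#include <stdio.h>
#include <stdbool.h>

/* Wrap-safe "a comes before b" test, in the spirit of ULONG_CMP_LT(). */
#define SEQ_BEFORE(a, b)	((long)((a) - (b)) < 0)

struct exp_node {
	unsigned long gp_seq_needed_exp;	/* farthest future expedited GP */
};

/*
 * Record a request to expedite the grace period ending at sequence
 * number s.  Only a request farther in the future than the one already
 * recorded needs to be stored and propagated to the parent level.
 */
static bool record_exp_request(struct exp_node *np, unsigned long s)
{
	if (!SEQ_BEFORE(np->gp_seq_needed_exp, s))
		return false;		/* already covered, stop here */
	np->gp_seq_needed_exp = s;	/* remember the farther request */
	return true;			/* caller propagates upward */
}

int main(void)
{
	struct exp_node leaf = { .gp_seq_needed_exp = 100 };

	printf("propagate: %d\n", record_exp_request(&leaf, 96));	/* 0: already covered */
	printf("propagate: %d\n", record_exp_request(&leaf, 104));	/* 1: new farthest request */
	return 0;
}

The kernel code added below does the equivalent under the per-node lock, using ULONG_CMP_LT()/ULONG_CMP_GE() on the ->srcu_gp_seq_needed_exp fields.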
Requests for expediting start with the srcu_data structure, run up through the srcu_node tree, and end at the srcu_struct structure. Note that it may be necessary to expedite a grace period that just now started, and this is handled by a new srcu_funnel_exp_start() function, which is invoked when the grace period itself is already in its way, but when that grace period was not marked as expedited. A new srcu_get_delay() function returns zero if there is at least one expedited SRCU grace period in flight, or SRCU_INTERVAL otherwise. This function is used to calculate delays: Normal grace periods are allowed to extend in order to cover more requests with a given grace-period computation, which decreases per-request overhead. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcutree.h | 4 +- kernel/rcu/srcutree.c | 135 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 98 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 3865717df124..86df48d3e97b 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -43,6 +43,7 @@ struct srcu_data { spinlock_t lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ bool srcu_cblist_invoking; /* Invoking these CBs? */ struct delayed_work work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ @@ -63,6 +64,7 @@ struct srcu_node { /* is > ->srcu_gq_seq. */ unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ /* have CBs for given GP? */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ int grphi; /* Biggest CPU for node. */ @@ -81,7 +83,7 @@ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ - atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. 
*/ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 72b6cce5f591..4b98e6f45166 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -72,6 +72,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp->srcu_have_cbs[i] = 0; snp->srcu_data_have_cbs[i] = 0; } + snp->srcu_gp_seq_needed_exp = 0; snp->grplo = -1; snp->grphi = -1; if (snp == &sp->node[0]) { @@ -102,6 +103,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) @@ -135,7 +137,6 @@ static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) mutex_init(&sp->srcu_gp_mutex); sp->srcu_idx = 0; sp->srcu_gp_seq = 0; - atomic_set(&sp->srcu_exp_cnt, 0); sp->srcu_barrier_seq = 0; mutex_init(&sp->srcu_barrier_mutex); atomic_set(&sp->srcu_barrier_cpu_cnt, 0); @@ -143,6 +144,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) if (!is_static) sp->sda = alloc_percpu(struct srcu_data); init_srcu_struct_nodes(sp, is_static); + sp->srcu_gp_seq_needed_exp = 0; smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ return sp->sda ? 0 : -ENOMEM; } @@ -307,6 +309,18 @@ static bool srcu_readers_active(struct srcu_struct *sp) #define SRCU_INTERVAL 1 +/* + * Return grace-period delay, zero if there are expedited grace + * periods pending, SRCU_INTERVAL otherwise. + */ +static unsigned long srcu_get_delay(struct srcu_struct *sp) +{ + if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), + READ_ONCE(sp->srcu_gp_seq_needed_exp))) + return 0; + return SRCU_INTERVAL; +} + /** * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. @@ -318,7 +332,8 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { int cpu; - WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); + if (WARN_ON(!srcu_get_delay(sp))) + return; /* Leakage unless caller handles error. */ if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ flush_delayed_work(&sp->work); @@ -444,15 +459,14 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) * schedule this invocation on the corresponding CPUs. */ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, - unsigned long mask) + unsigned long mask, unsigned long delay) { int cpu; for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { if (!(mask & (1 << (cpu - snp->grplo)))) continue; - srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), - atomic_read(&sp->srcu_exp_cnt) ? 
0 : SRCU_INTERVAL); + srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); } } @@ -467,6 +481,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, */ static void srcu_gp_end(struct srcu_struct *sp) { + unsigned long cbdelay; bool cbs; unsigned long gpseq; int idx; @@ -481,8 +496,11 @@ static void srcu_gp_end(struct srcu_struct *sp) spin_lock_irq(&sp->gp_lock); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); + cbdelay = srcu_get_delay(sp); rcu_seq_end(&sp->srcu_gp_seq); gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) + sp->srcu_gp_seq_needed_exp = gpseq; spin_unlock_irq(&sp->gp_lock); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -497,12 +515,14 @@ static void srcu_gp_end(struct srcu_struct *sp) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq(&snp->lock); if (cbs) { smp_mb(); /* GP end before CB invocation. */ - srcu_schedule_cbs_snp(sp, snp, mask); + srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); } } @@ -517,15 +537,43 @@ static void srcu_gp_end(struct srcu_struct *sp) srcu_gp_start(sp); spin_unlock_irq(&sp->gp_lock); /* Throttle expedited grace periods: Should be rare! */ - srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) && - rcu_seq_ctr(gpseq) & 0xf - ? 0 - : SRCU_INTERVAL); + srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff + ? 0 : SRCU_INTERVAL); } else { spin_unlock_irq(&sp->gp_lock); } } +/* + * Funnel-locking scheme to scalably mediate many concurrent expedited + * grace-period requests. This function is invoked for the first known + * expedited request for a grace period that has already been requested, + * but without expediting. To start a completely new grace period, + * whether expedited or not, use srcu_funnel_gp_start() instead. + */ +static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long s) +{ + unsigned long flags; + + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) || + ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + return; + spin_lock_irqsave(&snp->lock, flags); + if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + spin_unlock_irqrestore(&snp->lock, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore(&snp->lock, flags); + } + spin_lock_irqsave(&sp->gp_lock, flags); + if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + /* * Funnel-locking scheme to scalably mediate many concurrent grace-period * requests. The winner has to do the work of actually starting grace @@ -533,9 +581,8 @@ static void srcu_gp_end(struct srcu_struct *sp) * number is recorded on at least their leaf srcu_node structure, or they * must take steps to invoke their own callbacks. 
*/ -static void srcu_funnel_gp_start(struct srcu_struct *sp, - struct srcu_data *sdp, - unsigned long s) +static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, + unsigned long s, bool do_norm) { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); @@ -554,13 +601,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, spin_unlock_irqrestore(&snp->lock, flags); if (snp == sdp->mynode && snp_seq != s) { smp_mb(); /* CBs after GP! */ - srcu_schedule_cbs_sdp(sdp, 0); + srcu_schedule_cbs_sdp(sdp, do_norm + ? SRCU_INTERVAL + : 0); + return; } + if (!do_norm) + srcu_funnel_exp_start(sp, snp, s); return; } snp->srcu_have_cbs[idx] = s; if (snp == sdp->mynode) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) + snp->srcu_gp_seq_needed_exp = s; spin_unlock_irqrestore(&snp->lock, flags); } @@ -573,6 +627,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, */ smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ } + if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; /* If grace period not already done and none in progress, start it. */ if (!rcu_seq_done(&sp->srcu_gp_seq, s) && @@ -580,9 +636,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); queue_delayed_work(system_power_efficient_wq, &sp->work, - atomic_read(&sp->srcu_exp_cnt) - ? 0 - : SRCU_INTERVAL); + srcu_get_delay(sp)); } spin_unlock_irqrestore(&sp->gp_lock, flags); } @@ -597,7 +651,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) for (;;) { if (srcu_readers_active_idx_check(sp, idx)) return true; - if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) + if (--trycount + !srcu_get_delay(sp) <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -650,10 +704,11 @@ static void srcu_flip(struct srcu_struct *sp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, - rcu_callback_t func) +void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) { unsigned long flags; + bool needexp = false; bool needgp = false; unsigned long s; struct srcu_data *sdp; @@ -672,16 +727,28 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed = s; needgp = true; } + if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { + sdp->srcu_gp_seq_needed_exp = s; + needexp = true; + } spin_unlock_irqrestore(&sdp->lock, flags); if (needgp) - srcu_funnel_gp_start(sp, sdp, s); + srcu_funnel_gp_start(sp, sdp, s, do_norm); + else if (needexp) + srcu_funnel_exp_start(sp, sdp->mynode, s); +} + +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func) +{ + __call_srcu(sp, rhp, func, true); } EXPORT_SYMBOL_GPL(call_srcu); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 
*/ -static void __synchronize_srcu(struct srcu_struct *sp) +static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) { struct rcu_synchronize rcu; @@ -697,7 +764,7 @@ static void __synchronize_srcu(struct srcu_struct *sp) check_init_srcu_struct(sp); init_completion(&rcu.completion); init_rcu_head_on_stack(&rcu.head); - call_srcu(sp, &rcu.head, wakeme_after_rcu); + __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); } @@ -714,18 +781,7 @@ static void __synchronize_srcu(struct srcu_struct *sp) */ void synchronize_srcu_expedited(struct srcu_struct *sp) { - bool do_norm = rcu_gp_is_normal(); - - check_init_srcu_struct(sp); - if (!do_norm) { - atomic_inc(&sp->srcu_exp_cnt); - smp_mb__after_atomic(); /* increment before GP. */ - } - __synchronize_srcu(sp); - if (!do_norm) { - smp_mb__before_atomic(); /* GP before decrement. */ - WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0); - } + __synchronize_srcu(sp, rcu_gp_is_normal()); } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); @@ -773,7 +829,7 @@ void synchronize_srcu(struct srcu_struct *sp) if (rcu_gp_is_expedited()) synchronize_srcu_expedited(sp); else - __synchronize_srcu(sp); + __synchronize_srcu(sp, true); } EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -1008,14 +1064,13 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); srcu_advance_state(sp); - srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); + srcu_reschedule(sp, srcu_get_delay(sp)); } EXPORT_SYMBOL_GPL(process_srcu); void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed) { if (test_type != SRCU_FLAVOR) return; -- cgit v1.2.3-70-g09d2 From 22607d66bbc3e81140d3bcf08894f4378eb36428 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Apr 2017 14:03:11 -0700 Subject: srcu: Specify auto-expedite holdoff time On small systems, in the absence of readers, expedited SRCU grace periods can complete in less than a microsecond. This means that an eight-CPU system can have all CPUs doing synchronize_srcu() in a tight loop and almost always expedite. This might actually be desirable in some situations, but in general it is a good way to needlessly burn CPU cycles. And in those situations where it is desirable, your friend is the function synchronize_srcu_expedited(). For other situations, this commit adds a kernel parameter that specifies a holdoff between completing the last SRCU grace period and auto-expediting the next. If the next grace period starts before the holdoff expires, auto-expediting is disabled. The holdoff is 50 microseconds by default, and can be tuned to the desired number of nanoseconds. A value of zero disables auto-expediting. Signed-off-by: Paul E. 
McKenney Tested-by: Mike Galbraith --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ include/linux/srcutree.h | 1 + kernel/rcu/srcutree.c | 18 +++++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index facc20a3f962..4a4b9266c4de 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3779,6 +3779,14 @@ spia_pedr= spia_peddr= + srcutree.exp_holdoff [KNL] + Specifies how many nanoseconds must elapse + since the end of the last SRCU grace period for + a given srcu_struct until the next normal SRCU + grace period will be considered for automatic + expediting. Set to zero to disable automatic + expediting. + stacktrace [FTRACE] Enabled the stack tracer on boot up. diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 86df48d3e97b..32e86d85fd11 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -84,6 +84,7 @@ struct srcu_struct { unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2286e06fd159..74c283f9d15e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -34,10 +34,14 @@ #include #include #include +#include #include #include "rcu.h" +ulong exp_holdoff = 50 * 1000; /* Holdoff (ns) for auto-expediting. */ +module_param(exp_holdoff, ulong, 0444); + static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); @@ -145,6 +149,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) sp->sda = alloc_percpu(struct srcu_data); init_srcu_struct_nodes(sp, is_static); sp->srcu_gp_seq_needed_exp = 0; + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ return sp->sda ? 0 : -ENOMEM; } @@ -498,6 +503,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); rcu_seq_end(&sp->srcu_gp_seq); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) @@ -700,9 +706,10 @@ static void srcu_flip(struct srcu_struct *sp) */ static bool srcu_might_be_idle(struct srcu_struct *sp) { + unsigned long curseq; unsigned long flags; struct srcu_data *sdp; - unsigned long curseq; + unsigned long t; /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); @@ -718,6 +725,15 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) * Exact information would require acquiring locks, which would * kill scalability, hence the probabalistic nature of the probe. */ + + /* First, see if enough time has passed since the last GP. */ + t = ktime_get_mono_fast_ns(); + if (exp_holdoff == 0 || + time_in_range_open(t, sp->srcu_last_gp_end, + sp->srcu_last_gp_end + exp_holdoff)) + return false; /* Too soon after last GP. 
*/ + + /* Next, check for probable idleness. */ curseq = rcu_seq_current(&sp->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) -- cgit v1.2.3-70-g09d2 From e8245c1b1a3bb8474f91c69ccd13637d3589bb2c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:34:06 +0200 Subject: iommu: Include device.h in iommu.h We make use of 'struct device' in iommu.h, so include device.h to make it available explicitly. Re-order the other headers while at it. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6a6de187ddc0..3b4fe4b79d20 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -19,11 +19,13 @@ #ifndef __LINUX_IOMMU_H #define __LINUX_IOMMU_H +#include +#include +#include #include #include #include -#include -#include + #include #define IOMMU_READ (1 << 0) -- cgit v1.2.3-70-g09d2 From 207c6e36f122ebb1164d611c9f34f128313f47d5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:39:28 +0200 Subject: iommu: Move report_iommu_fault() to iommu.c The function is in no fast-path, there is no need for it to be static inline in a header file. This also removes the need to include iommu trace-points in iommu.h. Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 41 ++--------------------------------------- 2 files changed, 44 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9170fd498f46..4cb5792cbc21 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1655,6 +1655,48 @@ void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr) } EXPORT_SYMBOL_GPL(iommu_domain_window_disable); +/** + * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework + * @domain: the iommu domain where the fault has happened + * @dev: the device where the fault has happened + * @iova: the faulting address + * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...) + * + * This function should be called by the low-level IOMMU implementations + * whenever IOMMU faults happen, to allow high-level users, that are + * interested in such events, to know about them. + * + * This event may be useful for several possible use cases: + * - mere logging of the event + * - dynamic TLB/PTE loading + * - if restarting of the faulting device is required + * + * Returns 0 on success and an appropriate error code otherwise (if dynamic + * PTE/TLB loading will one day be supported, implementations will be able + * to tell whether it succeeded or not according to this return value). + * + * Specifically, -ENOSYS is returned if a fault handler isn't installed + * (though fault handlers can also return -ENOSYS, in case they want to + * elicit the default behavior of the IOMMU drivers). + */ +int report_iommu_fault(struct iommu_domain *domain, struct device *dev, + unsigned long iova, int flags) +{ + int ret = -ENOSYS; + + /* + * if upper layers showed interest and installed a fault handler, + * invoke it. 
+ */ + if (domain->handler) + ret = domain->handler(domain, dev, iova, flags, + domain->handler_token); + + trace_io_page_fault(dev, iova, flags); + return ret; +} +EXPORT_SYMBOL_GPL(report_iommu_fault); + static int __init iommu_init(void) { iommu_group_kset = kset_create_and_add("iommu_groups", diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3b4fe4b79d20..abaa0ca848bc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -330,46 +330,9 @@ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr); -/** - * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework - * @domain: the iommu domain where the fault has happened - * @dev: the device where the fault has happened - * @iova: the faulting address - * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...) - * - * This function should be called by the low-level IOMMU implementations - * whenever IOMMU faults happen, to allow high-level users, that are - * interested in such events, to know about them. - * - * This event may be useful for several possible use cases: - * - mere logging of the event - * - dynamic TLB/PTE loading - * - if restarting of the faulting device is required - * - * Returns 0 on success and an appropriate error code otherwise (if dynamic - * PTE/TLB loading will one day be supported, implementations will be able - * to tell whether it succeeded or not according to this return value). - * - * Specifically, -ENOSYS is returned if a fault handler isn't installed - * (though fault handlers can also return -ENOSYS, in case they want to - * elicit the default behavior of the IOMMU drivers). - */ -static inline int report_iommu_fault(struct iommu_domain *domain, - struct device *dev, unsigned long iova, int flags) -{ - int ret = -ENOSYS; - /* - * if upper layers showed interest and installed a fault handler, - * invoke it. - */ - if (domain->handler) - ret = domain->handler(domain, dev, iova, flags, - domain->handler_token); - - trace_io_page_fault(dev, iova, flags); - return ret; -} +extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, + unsigned long iova, int flags); static inline size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, -- cgit v1.2.3-70-g09d2 From 5af50993850a48ba749b122173d789ea90976c72 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 5 Apr 2017 17:54:56 +1000 Subject: KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller This patch makes KVM capable of using the XIVE interrupt controller to provide the standard PAPR "XICS" style hypercalls. It is necessary for proper operations when the host uses XIVE natively. This has been lightly tested on an actual system, including PCI pass-through with a TG3 device. 
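The recurring shape of the changes below is a runtime dispatch: each XICS-style entry point first checks whether the host runs XIVE and then routes to either the legacy XICS emulation or the new XIVE code. A rough, self-contained sketch of that pattern (hypothetical helper names, not the kernel's own):

#include <stdio.h>

/* Hypothetical stand-ins for the two in-kernel backends. */
static int xics_set_priority(unsigned int irq, unsigned int prio)
{
	printf("XICS emulation: irq %u -> priority %u\n", irq, prio);
	return 0;
}

static int xive_set_priority(unsigned int irq, unsigned int prio)
{
	printf("XIVE native:    irq %u -> priority %u\n", irq, prio);
	return 0;
}

static int host_uses_xive = 1;	/* plays the role of xive_enabled() */

/* Guest-visible behaviour stays XICS-style; only the backend changes. */
static int set_irq_priority(unsigned int irq, unsigned int prio)
{
	if (host_uses_xive)
		return xive_set_priority(irq, prio);
	return xics_set_priority(irq, prio);
}

int main(void)
{
	return set_irq_priority(23, 5);
}

Keeping the PAPR XICS hypercall interface unchanged toward the guest while switching only the backend is what allows existing guests to run unmodified on a XIVE host.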
Signed-off-by: Benjamin Herrenschmidt [mpe: Cleanup pr_xxx(), unsplit pr_xxx() strings, etc., fix build failures by adding KVM_XIVE which depends on KVM_XICS and XIVE, and adding empty stubs for the kvm_xive_xxx() routines, fixup subject, integrate fixes from Paul for building PR=y HV=n] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s_asm.h | 2 + arch/powerpc/include/asm/kvm_host.h | 28 +- arch/powerpc/include/asm/kvm_ppc.h | 74 ++ arch/powerpc/include/asm/xive.h | 9 +- arch/powerpc/kernel/asm-offsets.c | 10 + arch/powerpc/kvm/Kconfig | 5 + arch/powerpc/kvm/Makefile | 4 +- arch/powerpc/kvm/book3s.c | 75 +- arch/powerpc/kvm/book3s_hv.c | 51 +- arch/powerpc/kvm/book3s_hv_builtin.c | 103 ++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 10 +- arch/powerpc/kvm/book3s_hv_rm_xive.c | 47 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 62 +- arch/powerpc/kvm/book3s_rtas.c | 21 +- arch/powerpc/kvm/book3s_xics.c | 35 +- arch/powerpc/kvm/book3s_xics.h | 7 + arch/powerpc/kvm/book3s_xive.c | 1893 +++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_xive.h | 256 ++++ arch/powerpc/kvm/book3s_xive_template.c | 503 ++++++++ arch/powerpc/kvm/irq.h | 1 + arch/powerpc/kvm/powerpc.c | 17 +- arch/powerpc/platforms/powernv/opal.c | 1 + arch/powerpc/sysdev/xive/common.c | 142 ++- arch/powerpc/sysdev/xive/native.c | 86 +- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 4 - 26 files changed, 3358 insertions(+), 89 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_hv_rm_xive.c create mode 100644 arch/powerpc/kvm/book3s_xive.c create mode 100644 arch/powerpc/kvm/book3s_xive.h create mode 100644 arch/powerpc/kvm/book3s_xive_template.c (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 0593d9479f74..b148496ffe36 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -111,6 +111,8 @@ struct kvmppc_host_state { struct kvm_vcpu *kvm_vcpu; struct kvmppc_vcore *kvm_vcore; void __iomem *xics_phys; + void __iomem *xive_tima_phys; + void __iomem *xive_tima_virt; u32 saved_xirr; u64 dabr; u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7bba8f415627..5a8ab4a758f1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -205,6 +205,12 @@ struct kvmppc_spapr_tce_table { /* XICS components, defined in book3s_xics.c */ struct kvmppc_xics; struct kvmppc_icp; +extern struct kvm_device_ops kvm_xics_ops; + +/* XIVE components, defined in book3s_xive.c */ +struct kvmppc_xive; +struct kvmppc_xive_vcpu; +extern struct kvm_device_ops kvm_xive_ops; struct kvmppc_passthru_irqmap; @@ -293,6 +299,7 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; + struct kvmppc_xive *xive; struct kvmppc_passthru_irqmap *pimap; #endif struct kvmppc_ops *kvm_ops; @@ -421,7 +428,7 @@ struct kvmppc_passthru_irqmap { #define KVMPPC_IRQ_DEFAULT 0 #define KVMPPC_IRQ_MPIC 1 -#define KVMPPC_IRQ_XICS 2 +#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ #define MMIO_HPTE_CACHE_SIZE 4 @@ -443,6 +450,21 @@ struct mmio_hpte_cache { struct openpic; +/* W0 and W1 of a XIVE thread management context */ +union xive_tma_w01 { + struct { + u8 nsr; + u8 cppr; + u8 ipb; + u8 lsmfb; + u8 ack; + u8 inc; + u8 age; + u8 pipr; + }; + __be64 w01; +}; + struct kvm_vcpu_arch { ulong host_stack; u32 host_pid; @@ -688,6 +710,10 @@ struct kvm_vcpu_arch { struct 
openpic *mpic; /* KVM_IRQ_MPIC */ #ifdef CONFIG_KVM_XICS struct kvmppc_icp *icp; /* XICS presentation controller */ + struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ + __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ + u32 xive_pushed; /* Is the VP pushed on the physical CPU ? */ + union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c3877992eff9..ed52b13d9ffb 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -225,6 +225,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp); extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu); extern void kvmppc_rtas_tokens_free(struct kvm *kvm); + extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority); extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, @@ -412,6 +413,14 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr; } +static inline void kvmppc_set_xive_tima(int cpu, + unsigned long phys_addr, + void __iomem *virt_addr) +{ + paca[cpu].kvm_hstate.xive_tima_phys = (void __iomem *)phys_addr; + paca[cpu].kvm_hstate.xive_tima_virt = virt_addr; +} + static inline u32 kvmppc_get_xics_latch(void) { u32 xirr; @@ -442,6 +451,11 @@ static inline void __init kvm_cma_reserve(void) static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) {} +static inline void kvmppc_set_xive_tima(int cpu, + unsigned long phys_addr, + void __iomem *virt_addr) +{} + static inline u32 kvmppc_get_xics_latch(void) { return 0; @@ -492,6 +506,10 @@ extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr, struct kvmppc_irq_map *irq_map, struct kvmppc_passthru_irqmap *pimap, bool *again); + +extern int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, + int level, bool line_status); + extern int h_ipi_redirect; #else static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( @@ -509,6 +527,60 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { return 0; } #endif +#ifdef CONFIG_KVM_XIVE +/* + * Below the first "xive" is the "eXternal Interrupt Virtualization Engine" + * ie. P9 new interrupt controller, while the second "xive" is the legacy + * "eXternal Interrupt Vector Entry" which is the configuration of an + * interrupt on the "xics" interrupt controller on P8 and earlier. Those + * two function consume or produce a legacy "XIVE" state from the + * new "XIVE" interrupt controller. 
+ */ +extern int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, + u32 priority); +extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, + u32 *priority); +extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq); +extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq); +extern void kvmppc_xive_init_module(void); +extern void kvmppc_xive_exit_module(void); + +extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu); +extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu); +extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc); +extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc); +extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu); +extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); + +extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, + int level, bool line_status); +#else +static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, + u32 priority) { return -1; } +static inline int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, + u32 *priority) { return -1; } +static inline int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) { return -1; } +static inline int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) { return -1; } +static inline void kvmppc_xive_init_module(void) { } +static inline void kvmppc_xive_exit_module(void) { } + +static inline int kvmppc_xive_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; } +static inline void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { } +static inline int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc) { return -ENODEV; } +static inline int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc) { return -ENODEV; } +static inline u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) { return 0; } +static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { return -ENOENT; } + +static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, + int level, bool line_status) { return -ENODEV; } +#endif /* CONFIG_KVM_XIVE */ + /* * Prototypes for functions called only from assembler code. * Having prototypes reduces sparse errors. 
@@ -546,6 +618,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, unsigned long slb_v, unsigned int status, bool data); unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu); +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server); int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr); int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 3cdbeaeac397..c8a822acf962 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -99,7 +99,6 @@ struct xive_q { #define XIVE_ESB_SET_PQ_01 0xd00 #define XIVE_ESB_SET_PQ_10 0xe00 #define XIVE_ESB_SET_PQ_11 0xf00 -#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01 #define XIVE_ESB_VAL_P 0x2 #define XIVE_ESB_VAL_Q 0x1 @@ -136,11 +135,11 @@ extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, __be32 *qpage, u32 order, bool can_escalate); extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); -extern bool __xive_irq_trigger(struct xive_irq_data *xd); -extern bool __xive_irq_retrigger(struct xive_irq_data *xd); -extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd); - +extern void xive_native_sync_source(u32 hw_irq); extern bool is_xive_irq(struct irq_chip *chip); +extern int xive_native_enable_vp(u32 vp_id); +extern int xive_native_disable_vp(u32 vp_id); +extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); #else diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4367e7df51a1..1822187813dc 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -630,6 +630,8 @@ int main(void) HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); + HSTATE_FIELD(HSTATE_XIVE_TIMA_PHYS, xive_tima_phys); + HSTATE_FIELD(HSTATE_XIVE_TIMA_VIRT, xive_tima_virt); HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); @@ -715,6 +717,14 @@ int main(void) OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6); #endif +#ifdef CONFIG_KVM_XICS + DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu, + arch.xive_saved_state)); + DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, + arch.xive_cam_word)); + DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); +#endif + #ifdef CONFIG_KVM_EXIT_TIMING OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu); OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl); diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 029be26b5a17..b9d66e53b773 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -196,6 +196,11 @@ config KVM_XICS Specification) interrupt controller architecture used on IBM POWER (pSeries) servers. 
+config KVM_XIVE + bool + default y + depends on KVM_XICS && PPC_XIVE_NATIVE && KVM_BOOK3S_HV_POSSIBLE + source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index b87ccde2137a..d91a2604c496 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -74,7 +74,7 @@ kvm-hv-y += \ book3s_64_mmu_radix.o kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ - book3s_hv_rm_xics.o + book3s_hv_rm_xics.o book3s_hv_rm_xive.o ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ @@ -89,6 +89,8 @@ endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o +kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o + kvm-book3s_64-module-objs := \ $(common-objs-y) \ book3s.o \ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index aedacefd961d..cb8009cd688d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "book3s.h" #include "trace.h" @@ -578,11 +579,14 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, break; #ifdef CONFIG_KVM_XICS case KVM_REG_PPC_ICP_STATE: - if (!vcpu->arch.icp) { + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) { r = -ENXIO; break; } - *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); + if (xive_enabled()) + *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu)); + else + *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); break; #endif /* CONFIG_KVM_XICS */ case KVM_REG_PPC_FSCR: @@ -648,12 +652,14 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, #endif /* CONFIG_VSX */ #ifdef CONFIG_KVM_XICS case KVM_REG_PPC_ICP_STATE: - if (!vcpu->arch.icp) { + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) { r = -ENXIO; break; } - r = kvmppc_xics_set_icp(vcpu, - set_reg_val(id, *val)); + if (xive_enabled()) + r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val)); + else + r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); break; #endif /* CONFIG_KVM_XICS */ case KVM_REG_PPC_FSCR: @@ -924,6 +930,50 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) return kvm->arch.kvm_ops->hcall_implemented(hcall); } +#ifdef CONFIG_KVM_XICS +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + if (xive_enabled()) + return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level, + line_status); + else + return kvmppc_xics_set_irq(kvm, irq_source_id, irq, level, + line_status); +} + +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, + level, line_status); +} +static int kvmppc_book3s_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status); +} + +int kvm_irq_map_gsi(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *entries, int gsi) +{ + entries->gsi = gsi; + entries->type = KVM_IRQ_ROUTING_IRQCHIP; + entries->set = kvmppc_book3s_set_irq; + entries->irqchip.irqchip = 0; + entries->irqchip.pin = gsi; + return 1; +} + +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + return pin; +} + +#endif /* CONFIG_KVM_XICS */ + static int kvmppc_book3s_init(void) { int r; @@ -934,12 +984,25 @@ static int kvmppc_book3s_init(void) #ifdef CONFIG_KVM_BOOK3S_32_HANDLER r = kvmppc_book3s_init_pr(); #endif - 
return r; +#ifdef CONFIG_KVM_XICS +#ifdef CONFIG_KVM_XIVE + if (xive_enabled()) { + kvmppc_xive_init_module(); + kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); + } else +#endif + kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); +#endif + return r; } static void kvmppc_book3s_exit(void) { +#ifdef CONFIG_KVM_XICS + if (xive_enabled()) + kvmppc_xive_exit_module(); +#endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kvmppc_book3s_exit_pr(); #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fadb75abfe37..128efb42ec4e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,7 @@ #include #include #include +#include #include "book3s.h" @@ -837,6 +838,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_IPOLL: case H_XIRR_X: if (kvmppc_xics_enabled(vcpu)) { + if (xive_enabled()) { + ret = H_NOT_AVAILABLE; + return RESUME_GUEST; + } ret = kvmppc_xics_hcall(vcpu, req); break; } @@ -2947,8 +2952,12 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - } else if (r == RESUME_PASSTHROUGH) - r = kvmppc_xics_rm_complete(vcpu, 0); + } else if (r == RESUME_PASSTHROUGH) { + if (WARN_ON(xive_enabled())) + r = H_SUCCESS; + else + r = kvmppc_xics_rm_complete(vcpu, 0); + } } while (is_kvmppc_resume_guest(r)); out: @@ -3400,10 +3409,20 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed) * Set HVICE bit to enable hypervisor virtualization interrupts. + * Set HEIC to prevent OS interrupts to go to hypervisor (should + * be unnecessary but better safe than sorry in case we re-enable + * EE in HV mode with this LPCR still set) */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { lpcr &= ~LPCR_VPM0; - lpcr |= LPCR_HVICE; + lpcr |= LPCR_HVICE | LPCR_HEIC; + + /* + * If xive is enabled, we route 0x500 interrupts directly + * to the guest. + */ + if (xive_enabled()) + lpcr |= LPCR_LPES; } /* @@ -3533,7 +3552,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct kvmppc_irq_map *irq_map; struct kvmppc_passthru_irqmap *pimap; struct irq_chip *chip; - int i; + int i, rc = 0; if (!kvm_irq_bypass) return 1; @@ -3558,10 +3577,10 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) /* * For now, we only support interrupts for which the EOI operation * is an OPAL call followed by a write to XIRR, since that's - * what our real-mode EOI code does. 
+ * what our real-mode EOI code does, or a XIVE interrupt */ chip = irq_data_get_irq_chip(&desc->irq_data); - if (!chip || !is_pnv_opal_msi(chip)) { + if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) { pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", host_irq, guest_gsi); mutex_unlock(&kvm->lock); @@ -3603,7 +3622,12 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (i == pimap->n_mapped) pimap->n_mapped++; - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + if (xive_enabled()) + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); + else + kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + if (rc) + irq_map->r_hwirq = 0; mutex_unlock(&kvm->lock); @@ -3614,7 +3638,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) { struct irq_desc *desc; struct kvmppc_passthru_irqmap *pimap; - int i; + int i, rc = 0; if (!kvm_irq_bypass) return 0; @@ -3641,9 +3665,12 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -ENODEV; } - kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); + if (xive_enabled()) + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); + else + kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); - /* invalidate the entry */ + /* invalidate the entry (what do do on error from the above ?) */ pimap->mapped[i].r_hwirq = 0; /* @@ -3652,7 +3679,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) */ mutex_unlock(&kvm->lock); - return 0; + return rc; } static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons, @@ -3930,7 +3957,7 @@ static int kvmppc_book3s_init_hv(void) * indirectly, via OPAL. */ #ifdef CONFIG_SMP - if (!get_paca()->kvm_hstate.xics_phys) { + if (!xive_enabled() && !get_paca()->kvm_hstate.xics_phys) { struct device_node *np; np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a752e29977e0..846b40cb3a62 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -32,6 +32,24 @@ #define KVM_CMA_CHUNK_ORDER 18 +#include "book3s_xics.h" +#include "book3s_xive.h" + +/* + * The XIVE module will populate these when it loads + */ +unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu); +unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server); +int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr); +int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr); +int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr); +EXPORT_SYMBOL_GPL(__xive_vm_h_xirr); +EXPORT_SYMBOL_GPL(__xive_vm_h_ipoll); +EXPORT_SYMBOL_GPL(__xive_vm_h_ipi); +EXPORT_SYMBOL_GPL(__xive_vm_h_cppr); +EXPORT_SYMBOL_GPL(__xive_vm_h_eoi); + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@ -210,6 +228,7 @@ void kvmhv_rm_send_ipi(int cpu) __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); return; } + /* On POWER8 for IPIs to threads in the same core, use msgsnd. 
*/ if (cpu_has_feature(CPU_FTR_ARCH_207S) && cpu_first_thread_sibling(cpu) == @@ -406,6 +425,9 @@ static long kvmppc_read_one_intr(bool *again) u8 host_ipi; int64_t rc; + if (xive_enabled()) + return 1; + /* see if a host IPI is pending */ host_ipi = local_paca->kvm_hstate.host_ipi; if (host_ipi) @@ -490,3 +512,84 @@ static long kvmppc_read_one_intr(bool *again) return kvmppc_check_passthru(xisr, xirr, again); } + +#ifdef CONFIG_KVM_XICS +static inline bool is_rm(void) +{ + return !(mfmsr() & MSR_DR); +} + +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_xirr(vcpu); + if (unlikely(!__xive_vm_h_xirr)) + return H_NOT_AVAILABLE; + return __xive_vm_h_xirr(vcpu); + } else + return xics_rm_h_xirr(vcpu); +} + +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) +{ + vcpu->arch.gpr[5] = get_tb(); + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_xirr(vcpu); + if (unlikely(!__xive_vm_h_xirr)) + return H_NOT_AVAILABLE; + return __xive_vm_h_xirr(vcpu); + } else + return xics_rm_h_xirr(vcpu); +} + +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_ipoll(vcpu, server); + if (unlikely(!__xive_vm_h_ipoll)) + return H_NOT_AVAILABLE; + return __xive_vm_h_ipoll(vcpu, server); + } else + return H_TOO_HARD; +} + +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_ipi(vcpu, server, mfrr); + if (unlikely(!__xive_vm_h_ipi)) + return H_NOT_AVAILABLE; + return __xive_vm_h_ipi(vcpu, server, mfrr); + } else + return xics_rm_h_ipi(vcpu, server, mfrr); +} + +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_cppr(vcpu, cppr); + if (unlikely(!__xive_vm_h_cppr)) + return H_NOT_AVAILABLE; + return __xive_vm_h_cppr(vcpu, cppr); + } else + return xics_rm_h_cppr(vcpu, cppr); +} + +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_eoi(vcpu, xirr); + if (unlikely(!__xive_vm_h_eoi)) + return H_NOT_AVAILABLE; + return __xive_vm_h_eoi(vcpu, xirr); + } else + return xics_rm_h_eoi(vcpu, xirr); +} +#endif /* CONFIG_KVM_XICS */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 3a1a463a039a..f8068801ac36 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -485,7 +485,7 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } -unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu) { union kvmppc_icp_state old_state, new_state; struct kvmppc_xics *xics = vcpu->kvm->arch.xics; @@ -523,8 +523,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) return check_too_hard(xics, icp); } -int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr) +int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) { union kvmppc_icp_state old_state, new_state; struct kvmppc_xics *xics = vcpu->kvm->arch.xics; @@ -610,7 +610,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, return check_too_hard(xics, this_icp); } -int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { union kvmppc_icp_state old_state, new_state; struct kvmppc_xics 
*xics = vcpu->kvm->arch.xics; @@ -730,7 +730,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) return check_too_hard(xics, icp); } -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c new file mode 100644 index 000000000000..abf5f01b6eb1 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c @@ -0,0 +1,47 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "book3s_xive.h" + +/* XXX */ +#include +//#define DBG(fmt...) udbg_printf(fmt) +#define DBG(fmt...) do { } while(0) + +static inline void __iomem *get_tima_phys(void) +{ + return local_paca->kvm_hstate.xive_tima_phys; +} + +#undef XIVE_RUNTIME_CHECKS +#define X_PFX xive_rm_ +#define X_STATIC +#define X_STAT_PFX stat_rm_ +#define __x_tima get_tima_phys() +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) +#define __x_readb __raw_rm_readb +#define __x_writeb __raw_rm_writeb +#define __x_readw __raw_rm_readw +#define __x_readq __raw_rm_readq +#define __x_writeq __raw_rm_writeq + +#include "book3s_xive_template.c" diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7c6477d1840a..bdb3f76ceb6b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -30,6 +30,7 @@ #include #include #include +#include #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) @@ -970,6 +971,23 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) cmpwi r3, 512 /* 1 microsecond */ blt hdec_soon +#ifdef CONFIG_KVM_XICS + /* We are entering the guest on that thread, push VCPU to XIVE */ + ld r10, HSTATE_XIVE_TIMA_PHYS(r13) + cmpldi cr0, r10, r0 + beq no_xive + ld r11, VCPU_XIVE_SAVED_STATE(r4) + li r9, TM_QW1_OS + stdcix r11,r9,r10 + eieio + lwz r11, VCPU_XIVE_CAM_WORD(r4) + li r9, TM_QW1_OS + TM_WORD2 + stwcix r11,r9,r10 + li r9, 1 + stw r9, VCPU_XIVE_PUSHED(r4) +no_xive: +#endif /* CONFIG_KVM_XICS */ + deliver_guest_interrupt: ld r6, VCPU_CTR(r4) ld r7, VCPU_XER(r4) @@ -1307,6 +1325,42 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) blt deliver_guest_interrupt guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ +#ifdef CONFIG_KVM_XICS + /* We are exiting, pull the VP from the XIVE */ + lwz r0, VCPU_XIVE_PUSHED(r9) + cmpwi cr0, r0, 0 + beq 1f + li r7, TM_SPC_PULL_OS_CTX + li r6, TM_QW1_OS + mfmsr r0 + andi. r0, r0, MSR_IR /* in real mode? 
*/ + beq 2f + ld r10, HSTATE_XIVE_TIMA_VIRT(r13) + cmpldi cr0, r10, 0 + beq 1f + /* First load to pull the context, we ignore the value */ + lwzx r11, r7, r10 + eieio + /* Second load to recover the context state (Words 0 and 1) */ + ldx r11, r6, r10 + b 3f +2: ld r10, HSTATE_XIVE_TIMA_PHYS(r13) + cmpldi cr0, r10, 0 + beq 1f + /* First load to pull the context, we ignore the value */ + lwzcix r11, r7, r10 + eieio + /* Second load to recover the context state (Words 0 and 1) */ + ldcix r11, r6, r10 +3: std r11, VCPU_XIVE_SAVED_STATE(r9) + /* Fixup some of the state for the next load */ + li r10, 0 + li r0, 0xff + stw r10, VCPU_XIVE_PUSHED(r9) + stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9) + stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9) +1: +#endif /* CONFIG_KVM_XICS */ /* Save more register state */ mfdar r6 mfdsisr r7 @@ -2011,7 +2065,7 @@ hcall_real_table: .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table - .long 0 /* 0x70 - H_IPOLL */ + .long DOTSYM(kvmppc_rm_h_ipoll) - hcall_real_table .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table #else .long 0 /* 0x64 - H_EOI */ @@ -2181,7 +2235,11 @@ hcall_real_table: .long 0 /* 0x2f0 */ .long 0 /* 0x2f4 */ .long 0 /* 0x2f8 */ - .long 0 /* 0x2fc */ +#ifdef CONFIG_KVM_XICS + .long DOTSYM(kvmppc_rm_h_xirr_x) - hcall_real_table +#else + .long 0 /* 0x2fc - H_XIRR_X*/ +#endif .long DOTSYM(kvmppc_h_random) - hcall_real_table .globl hcall_real_table_end hcall_real_table_end: diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 20528701835b..2d3b2b1cc272 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_KVM_XICS static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) @@ -32,7 +33,10 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) server = be32_to_cpu(args->args[1]); priority = be32_to_cpu(args->args[2]); - rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); + if (xive_enabled()) + rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority); + else + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); if (rc) rc = -3; out: @@ -52,7 +56,10 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); server = priority = 0; - rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); + if (xive_enabled()) + rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority); + else + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); if (rc) { rc = -3; goto out; @@ -76,7 +83,10 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - rc = kvmppc_xics_int_off(vcpu->kvm, irq); + if (xive_enabled()) + rc = kvmppc_xive_int_off(vcpu->kvm, irq); + else + rc = kvmppc_xics_int_off(vcpu->kvm, irq); if (rc) rc = -3; out: @@ -95,7 +105,10 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - rc = kvmppc_xics_int_on(vcpu->kvm, irq); + if (xive_enabled()) + rc = kvmppc_xive_int_on(vcpu->kvm, irq); + else + rc = kvmppc_xics_int_on(vcpu->kvm, irq); if (rc) rc = -3; out: diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index ef4fd528c193..e6829c415bc8 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1307,8 +1307,8 @@ static int xics_set_source(struct 
kvmppc_xics *xics, long irq, u64 addr) return 0; } -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, - bool line_status) +int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) { struct kvmppc_xics *xics = kvm->arch.xics; @@ -1317,14 +1317,6 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, return ics_deliver_irq(xics, irq, level); } -int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, - level, line_status); -} - static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { struct kvmppc_xics *xics = dev->private; @@ -1458,29 +1450,6 @@ void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; } -static int xics_set_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status); -} - -int kvm_irq_map_gsi(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *entries, int gsi) -{ - entries->gsi = gsi; - entries->type = KVM_IRQ_ROUTING_IRQCHIP; - entries->set = xics_set_irq; - entries->irqchip.irqchip = 0; - entries->irqchip.pin = gsi; - return 1; -} - -int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - return pin; -} - void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq, unsigned long host_irq) { diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index ec5474cf70c6..453c9e518c19 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -10,6 +10,7 @@ #ifndef _KVM_PPC_BOOK3S_XICS_H #define _KVM_PPC_BOOK3S_XICS_H +#ifdef CONFIG_KVM_XICS /* * We use a two-level tree to store interrupt source information. * There are up to 1024 ICS nodes, each of which can represent @@ -144,5 +145,11 @@ static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics, return ics; } +extern unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu); +extern int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr); +extern int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); +extern int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); +#endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c new file mode 100644 index 000000000000..7807ee17af4b --- /dev/null +++ b/arch/powerpc/kvm/book3s_xive.c @@ -0,0 +1,1893 @@ +/* + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "xive-kvm: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "book3s_xive.h" + + +/* + * Virtual mode variants of the hcalls for use on radix/radix + * with AIL. They require the VCPU's VP to be "pushed" + * + * We still instanciate them here because we use some of the + * generated utility functions as well in this file. 
+ */ +#define XIVE_RUNTIME_CHECKS +#define X_PFX xive_vm_ +#define X_STATIC static +#define X_STAT_PFX stat_vm_ +#define __x_tima xive_tima +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) +#define __x_readb __raw_readb +#define __x_writeb __raw_writeb +#define __x_readw __raw_readw +#define __x_readq __raw_readq +#define __x_writeq __raw_writeq + +#include "book3s_xive_template.c" + +/* + * We leave a gap of a couple of interrupts in the queue to + * account for the IPI and additional safety guard. + */ +#define XIVE_Q_GAP 2 + +/* + * This is a simple trigger for a generic XIVE IRQ. This must + * only be called for interrupts that support a trigger page + */ +static bool xive_irq_trigger(struct xive_irq_data *xd) +{ + /* This should be only for MSIs */ + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) + return false; + + /* Those interrupts should always have a trigger page */ + if (WARN_ON(!xd->trig_mmio)) + return false; + + out_be64(xd->trig_mmio, 0); + + return true; +} + +static irqreturn_t xive_esc_irq(int irq, void *data) +{ + struct kvm_vcpu *vcpu = data; + + /* We use the existing H_PROD mechanism to wake up the target */ + vcpu->arch.prodded = 1; + smp_mb(); + if (vcpu->arch.ceded) + kvmppc_fast_vcpu_kick(vcpu); + + return IRQ_HANDLED; +} + +static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_q *q = &xc->queues[prio]; + char *name = NULL; + int rc; + + /* Already there ? */ + if (xc->esc_virq[prio]) + return 0; + + /* Hook up the escalation interrupt */ + xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq); + if (!xc->esc_virq[prio]) { + pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n", + prio, xc->server_num); + return -EIO; + } + + /* + * Future improvement: start with them disabled + * and handle DD2 and later scheme of merged escalation + * interrupts + */ + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", + vcpu->kvm->arch.lpid, xc->server_num, prio); + if (!name) { + pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n", + prio, xc->server_num); + rc = -ENOMEM; + goto error; + } + rc = request_irq(xc->esc_virq[prio], xive_esc_irq, + IRQF_NO_THREAD, name, vcpu); + if (rc) { + pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n", + prio, xc->server_num); + goto error; + } + xc->esc_virq_names[prio] = name; + return 0; +error: + irq_dispose_mapping(xc->esc_virq[prio]); + xc->esc_virq[prio] = 0; + kfree(name); + return rc; +} + +static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = xc->xive; + struct xive_q *q = &xc->queues[prio]; + void *qpage; + int rc; + + if (WARN_ON(q->qpage)) + return 0; + + /* Allocate the queue and retrieve infos on current node for now */ + qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order); + if (!qpage) { + pr_err("Failed to allocate queue %d for VCPU %d\n", + prio, xc->server_num); + return -ENOMEM;; + } + memset(qpage, 0, 1 << xive->q_order); + + /* + * Reconfigure the queue. This will set q->qpage only once the + * queue is fully configured. 
This is a requirement for prio 0 + * as we will stop doing EOIs for every IPI as soon as we observe + * qpage being non-NULL, and instead will only EOI when we receive + * corresponding queue 0 entries + */ + rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage, + xive->q_order, true); + if (rc) + pr_err("Failed to configure queue %d for VCPU %d\n", + prio, xc->server_num); + return rc; +} + +/* Called with kvm_lock held */ +static int xive_check_provisioning(struct kvm *kvm, u8 prio) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvm_vcpu *vcpu; + int i, rc; + + lockdep_assert_held(&kvm->lock); + + /* Already provisioned ? */ + if (xive->qmap & (1 << prio)) + return 0; + + pr_devel("Provisioning prio... %d\n", prio); + + /* Provision each VCPU and enable escalations */ + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!vcpu->arch.xive_vcpu) + continue; + rc = xive_provision_queue(vcpu, prio); + if (rc == 0) + xive_attach_escalation(vcpu, prio); + if (rc) + return rc; + } + + /* Order previous stores and mark it as provisioned */ + mb(); + xive->qmap |= (1 << prio); + return 0; +} + +static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio) +{ + struct kvm_vcpu *vcpu; + struct kvmppc_xive_vcpu *xc; + struct xive_q *q; + + /* Locate target server */ + vcpu = kvmppc_xive_find_server(kvm, server); + if (!vcpu) { + pr_warn("%s: Can't find server %d\n", __func__, server); + return; + } + xc = vcpu->arch.xive_vcpu; + if (WARN_ON(!xc)) + return; + + q = &xc->queues[prio]; + atomic_inc(&q->pending_count); +} + +static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_q *q; + u32 max; + + if (WARN_ON(!xc)) + return -ENXIO; + if (!xc->valid) + return -ENXIO; + + q = &xc->queues[prio]; + if (WARN_ON(!q->qpage)) + return -ENXIO; + + /* Calculate max number of interrupts in that queue. */ + max = (q->msk + 1) - XIVE_Q_GAP; + return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY; +} + +static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio) +{ + struct kvm_vcpu *vcpu; + int i, rc; + + /* Locate target server */ + vcpu = kvmppc_xive_find_server(kvm, *server); + if (!vcpu) { + pr_devel("Can't find server %d\n", *server); + return -EINVAL; + } + + pr_devel("Finding irq target on 0x%x/%d...\n", *server, prio); + + /* Try pick it */ + rc = xive_try_pick_queue(vcpu, prio); + if (rc == 0) + return rc; + + pr_devel(" .. failed, looking up candidate...\n"); + + /* Failed, pick another VCPU */ + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!vcpu->arch.xive_vcpu) + continue; + rc = xive_try_pick_queue(vcpu, prio); + if (rc == 0) { + *server = vcpu->arch.xive_vcpu->server_num; + pr_devel(" found on 0x%x/%d\n", *server, prio); + return rc; + } + } + pr_devel(" no available target !\n"); + + /* No available target ! */ + return -EBUSY; +} + +static u8 xive_lock_and_mask(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + struct kvmppc_xive_irq_state *state) +{ + struct xive_irq_data *xd; + u32 hw_num; + u8 old_prio; + u64 val; + + /* + * Take the lock, set masked, try again if racing + * with H_EOI + */ + for (;;) { + arch_spin_lock(&sb->lock); + old_prio = state->guest_priority; + state->guest_priority = MASKED; + mb(); + if (!state->in_eoi) + break; + state->guest_priority = old_prio; + arch_spin_unlock(&sb->lock); + } + + /* No change ? 
Bail */ + if (old_prio == MASKED) + return old_prio; + + /* Get the right irq */ + kvmppc_xive_select_irq(state, &hw_num, &xd); + + /* + * If the interrupt is marked as needing masking via + * firmware, we do it here. Firmware masking however + * is "lossy", it won't return the old p and q bits + * and won't set the interrupt to a state where it will + * record queued ones. If this is an issue we should do + * lazy masking instead. + * + * For now, we work around this in unmask by forcing + * an interrupt whenever we unmask a non-LSI via FW + * (if ever). + */ + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { + xive_native_configure_irq(hw_num, + xive->vp_base + state->act_server, + MASKED, state->number); + /* set old_p so we can track if an H_EOI was done */ + state->old_p = true; + state->old_q = false; + } else { + /* Set PQ to 10, return old P and old Q and remember them */ + val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10); + state->old_p = !!(val & 2); + state->old_q = !!(val & 1); + + /* + * Synchronize hardware to sensure the queues are updated + * when masking + */ + xive_native_sync_source(hw_num); + } + + return old_prio; +} + +static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb, + struct kvmppc_xive_irq_state *state) +{ + /* + * Take the lock try again if racing with H_EOI + */ + for (;;) { + arch_spin_lock(&sb->lock); + if (!state->in_eoi) + break; + arch_spin_unlock(&sb->lock); + } +} + +static void xive_finish_unmask(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + struct kvmppc_xive_irq_state *state, + u8 prio) +{ + struct xive_irq_data *xd; + u32 hw_num; + + /* If we aren't changing a thing, move on */ + if (state->guest_priority != MASKED) + goto bail; + + /* Get the right irq */ + kvmppc_xive_select_irq(state, &hw_num, &xd); + + /* + * See command in xive_lock_and_mask() concerning masking + * via firmware. + */ + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { + xive_native_configure_irq(hw_num, + xive->vp_base + state->act_server, + state->act_priority, state->number); + /* If an EOI is needed, do it here */ + if (!state->old_p) + xive_vm_source_eoi(hw_num, xd); + /* If this is not an LSI, force a trigger */ + if (!(xd->flags & OPAL_XIVE_IRQ_LSI)) + xive_irq_trigger(xd); + goto bail; + } + + /* Old Q set, set PQ to 11 */ + if (state->old_q) + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11); + + /* + * If not old P, then perform an "effective" EOI, + * on the source. This will handle the cases where + * FW EOI is needed. + */ + if (!state->old_p) + xive_vm_source_eoi(hw_num, xd); + + /* Synchronize ordering and mark unmasked */ + mb(); +bail: + state->guest_priority = prio; +} + +/* + * Target an interrupt to a given server/prio, this will fallback + * to another server if necessary and perform the HW targetting + * updates as needed + * + * NOTE: Must be called with the state lock held + */ +static int xive_target_interrupt(struct kvm *kvm, + struct kvmppc_xive_irq_state *state, + u32 server, u8 prio) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + u32 hw_num; + int rc; + + /* + * This will return a tentative server and actual + * priority. The count for that new target will have + * already been incremented. + */ + rc = xive_select_target(kvm, &server, prio); + + /* + * We failed to find a target ? Not much we can do + * at least until we support the GIQ. + */ + if (rc) + return rc; + + /* + * Increment the old queue pending count if there + * was one so that the old queue count gets adjusted later + * when observed to be empty. 
+ */ + if (state->act_priority != MASKED) + xive_inc_q_pending(kvm, + state->act_server, + state->act_priority); + /* + * Update state and HW + */ + state->act_priority = prio; + state->act_server = server; + + /* Get the right irq */ + kvmppc_xive_select_irq(state, &hw_num, NULL); + + return xive_native_configure_irq(hw_num, + xive->vp_base + server, + prio, state->number); +} + +/* + * Targetting rules: In order to avoid losing track of + * pending interrupts accross mask and unmask, which would + * allow queue overflows, we implement the following rules: + * + * - Unless it was never enabled (or we run out of capacity) + * an interrupt is always targetted at a valid server/queue + * pair even when "masked" by the guest. This pair tends to + * be the last one used but it can be changed under some + * circumstances. That allows us to separate targetting + * from masking, we only handle accounting during (re)targetting, + * this also allows us to let an interrupt drain into its target + * queue after masking, avoiding complex schemes to remove + * interrupts out of remote processor queues. + * + * - When masking, we set PQ to 10 and save the previous value + * of P and Q. + * + * - When unmasking, if saved Q was set, we set PQ to 11 + * otherwise we leave PQ to the HW state which will be either + * 10 if nothing happened or 11 if the interrupt fired while + * masked. Effectively we are OR'ing the previous Q into the + * HW Q. + * + * Then if saved P is clear, we do an effective EOI (Q->P->Trigger) + * which will unmask the interrupt and shoot a new one if Q was + * set. + * + * Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11, + * effectively meaning an H_EOI from the guest is still expected + * for that interrupt). + * + * - If H_EOI occurs while masked, we clear the saved P. + * + * - When changing target, we account on the new target and + * increment a separate "pending" counter on the old one. + * This pending counter will be used to decrement the old + * target's count when its queue has been observed empty. + */ + +int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, + u32 priority) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u8 new_act_prio; + int rc = 0; + u16 idx; + + if (!xive) + return -ENODEV; + + pr_devel("set_xive ! irq 0x%x server 0x%x prio %d\n", + irq, server, priority); + + /* First, check provisioning of queues */ + if (priority != MASKED) + rc = xive_check_provisioning(xive->kvm, + xive_prio_from_guest(priority)); + if (rc) { + pr_devel(" provisioning failure %d !\n", rc); + return rc; + } + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + /* + * We first handle masking/unmasking since the locking + * might need to be retried due to EOIs, we'll handle + * targetting changes later. These functions will return + * with the SB lock held. + * + * xive_lock_and_mask() will also set state->guest_priority + * but won't otherwise change other fields of the state. + * + * xive_lock_for_unmask will not actually unmask, this will + * be done later by xive_finish_unmask() once the targetting + * has been done, so we don't try to unmask an interrupt + * that hasn't yet been targetted. + */ + if (priority == MASKED) + xive_lock_and_mask(xive, sb, state); + else + xive_lock_for_unmask(sb, state); + + + /* + * Then we handle targetting. 
+ * + * First calculate a new "actual priority" + */ + new_act_prio = state->act_priority; + if (priority != MASKED) + new_act_prio = xive_prio_from_guest(priority); + + pr_devel(" new_act_prio=%x act_server=%x act_prio=%x\n", + new_act_prio, state->act_server, state->act_priority); + + /* + * Then check if we actually need to change anything, + * + * The condition for re-targetting the interrupt is that + * we have a valid new priority (new_act_prio is not 0xff) + * and either the server or the priority changed. + * + * Note: If act_priority was ff and the new priority is + * also ff, we don't do anything and leave the interrupt + * untargetted. An attempt of doing an int_on on an + * untargetted interrupt will fail. If that is a problem + * we could initialize interrupts with valid default + */ + + if (new_act_prio != MASKED && + (state->act_server != server || + state->act_priority != new_act_prio)) + rc = xive_target_interrupt(kvm, state, server, new_act_prio); + + /* + * Perform the final unmasking of the interrupt source + * if necessary + */ + if (priority != MASKED) + xive_finish_unmask(xive, sb, state, priority); + + /* + * Finally Update saved_priority to match. Only int_on/off + * set this field to a different value. + */ + state->saved_priority = priority; + + arch_spin_unlock(&sb->lock); + return rc; +} + +int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, + u32 *priority) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + if (!xive) + return -ENODEV; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + arch_spin_lock(&sb->lock); + *server = state->guest_server; + *priority = state->guest_priority; + arch_spin_unlock(&sb->lock); + + return 0; +} + +int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + if (!xive) + return -ENODEV; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + pr_devel("int_on(irq=0x%x)\n", irq); + + /* + * Check if interrupt was not targetted + */ + if (state->act_priority == MASKED) { + pr_devel("int_on on untargetted interrupt\n"); + return -EINVAL; + } + + /* If saved_priority is 0xff, do nothing */ + if (state->saved_priority == MASKED) + return 0; + + /* + * Lock and unmask it. + */ + xive_lock_for_unmask(sb, state); + xive_finish_unmask(xive, sb, state, state->saved_priority); + arch_spin_unlock(&sb->lock); + + return 0; +} + +int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + if (!xive) + return -ENODEV; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + pr_devel("int_off(irq=0x%x)\n", irq); + + /* + * Lock and mask + */ + state->saved_priority = xive_lock_and_mask(xive, sb, state); + arch_spin_unlock(&sb->lock); + + return 0; +} + +static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return false; + state = &sb->irq_state[idx]; + if (!state->valid) + return false; + + /* + * Trigger the IPI. 
This assumes we never restore a pass-through + * interrupt which should be safe enough + */ + xive_irq_trigger(&state->ipi_data); + + return true; +} + +u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + if (!xc) + return 0; + + /* Return the per-cpu state for state saving/migration */ + return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT | + (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT; +} + +int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + u8 cppr, mfrr; + u32 xisr; + + if (!xc || !xive) + return -ENOENT; + + /* Grab individual state fields. We don't use pending_pri */ + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & + KVM_REG_PPC_ICP_XISR_MASK; + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; + + pr_devel("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n", + xc->server_num, cppr, mfrr, xisr); + + /* + * We can't update the state of a "pushed" VCPU, but that + * shouldn't happen. + */ + if (WARN_ON(vcpu->arch.xive_pushed)) + return -EIO; + + /* Update VCPU HW saved state */ + vcpu->arch.xive_saved_state.cppr = cppr; + xc->hw_cppr = xc->cppr = cppr; + + /* + * Update MFRR state. If it's not 0xff, we mark the VCPU as + * having a pending MFRR change, which will re-evaluate the + * target. The VCPU will thus potentially get a spurious + * interrupt but that's not a big deal. + */ + xc->mfrr = mfrr; + if (mfrr < cppr) + xive_irq_trigger(&xc->vp_ipi_data); + + /* + * Now saved XIRR is "interesting". It means there's something in + * the legacy "1 element" queue... for an IPI we simply ignore it, + * as the MFRR restore will handle that. For anything else we need + * to force a resend of the source. + * However the source may not have been setup yet. If that's the + * case, we keep that info and increment a counter in the xive to + * tell subsequent xive_set_source() to go look. + */ + if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) { + xc->delayed_irq = xisr; + xive->delayed_irqs++; + pr_devel(" xisr restore delayed\n"); + } + + return 0; +} + +int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + struct irq_data *host_data = irq_desc_get_irq_data(host_desc); + unsigned int host_irq = irq_desc_get_irq(host_desc); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); + u16 idx; + u8 prio; + int rc; + + if (!xive) + return -ENODEV; + + pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n",guest_irq, hw_irq); + + sb = kvmppc_xive_find_source(xive, guest_irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + /* + * Mark the passed-through interrupt as going to a VCPU, + * this will prevent further EOIs and similar operations + * from the XIVE code. It will also mask the interrupt + * to either PQ=10 or 11 state, the latter if the interrupt + * is pending. This will allow us to unmask or retrigger it + * after routing it to the guest with a simple EOI. + * + * The "state" argument is a "token", all it needs is to be + * non-NULL to switch to passed-through or NULL for the + * other way around. We may not yet have an actual VCPU + * target here and we don't really care. 
+ */ + rc = irq_set_vcpu_affinity(host_irq, state); + if (rc) { + pr_err("Failed to set VCPU affinity for irq %d\n", host_irq); + return rc; + } + + /* + * Mask and read state of IPI. We need to know if its P bit + * is set as that means it's potentially already using a + * queue entry in the target + */ + prio = xive_lock_and_mask(xive, sb, state); + pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio, + state->old_p, state->old_q); + + /* Turn the IPI hard off */ + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); + + /* Grab info about irq */ + state->pt_number = hw_irq; + state->pt_data = irq_data_get_irq_handler_data(host_data); + + /* + * Configure the IRQ to match the existing configuration of + * the IPI if it was already targetted. Otherwise this will + * mask the interrupt in a lossy way (act_priority is 0xff) + * which is fine for a never started interrupt. + */ + xive_native_configure_irq(hw_irq, + xive->vp_base + state->act_server, + state->act_priority, state->number); + + /* + * We do an EOI to enable the interrupt (and retrigger if needed) + * if the guest has the interrupt unmasked and the P bit was *not* + * set in the IPI. If it was set, we know a slot may still be in + * use in the target queue thus we have to wait for a guest + * originated EOI + */ + if (prio != MASKED && !state->old_p) + xive_vm_source_eoi(hw_irq, state->pt_data); + + /* Clear old_p/old_q as they are no longer relevant */ + state->old_p = state->old_q = false; + + /* Restore guest prio (unlocks EOI) */ + mb(); + state->guest_priority = prio; + arch_spin_unlock(&sb->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped); + +int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + unsigned int host_irq = irq_desc_get_irq(host_desc); + u16 idx; + u8 prio; + int rc; + + if (!xive) + return -ENODEV; + + pr_devel("clr_mapped girq 0x%lx...\n", guest_irq); + + sb = kvmppc_xive_find_source(xive, guest_irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + /* + * Mask and read state of IRQ. We need to know if its P bit + * is set as that means it's potentially already using a + * queue entry in the target + */ + prio = xive_lock_and_mask(xive, sb, state); + pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio, + state->old_p, state->old_q); + + /* + * If old_p is set, the interrupt is pending, we switch it to + * PQ=11. This will force a resend in the host so the interrupt + * isn't lost to whatver host driver may pick it up + */ + if (state->old_p) + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11); + + /* Release the passed-through interrupt to the host */ + rc = irq_set_vcpu_affinity(host_irq, NULL); + if (rc) { + pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq); + return rc; + } + + /* Forget about the IRQ */ + state->pt_number = 0; + state->pt_data = NULL; + + /* Reconfigure the IPI */ + xive_native_configure_irq(state->ipi_number, + xive->vp_base + state->act_server, + state->act_priority, state->number); + + /* + * If old_p is set (we have a queue entry potentially + * occupied) or the interrupt is masked, we set the IPI + * to PQ=10 state. Otherwise we just re-enable it (PQ=00). 
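For orientation, the ESB PQ values used in this file follow the pattern spelled out in the comments above: 00 re-enables a source, 01 turns it "hard off" (used for the IPI while its interrupt is passed through), 10 masks it while capturing the previous P/Q bits, and 11 masks it with a trigger still queued. A minimal sketch of the bit tests already used by xive_lock_and_mask() above (the ESB load returns the old PQ pair, P in bit 1 and Q in bit 0); the helper names are made up for illustration:

	static inline bool esb_old_p(u64 pq)
	{
		return pq & 2;	/* previous P bit */
	}

	static inline bool esb_old_q(u64 pq)
	{
		return pq & 1;	/* previous Q bit */
	}
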
+ */ + if (prio == MASKED || state->old_p) + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10); + else + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00); + + /* Restore guest prio (unlocks EOI) */ + mb(); + state->guest_priority = prio; + arch_spin_unlock(&sb->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped); + +static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvm *kvm = vcpu->kvm; + struct kvmppc_xive *xive = kvm->arch.xive; + int i, j; + + for (i = 0; i <= xive->max_sbid; i++) { + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; + + if (!sb) + continue; + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) { + struct kvmppc_xive_irq_state *state = &sb->irq_state[j]; + + if (!state->valid) + continue; + if (state->act_priority == MASKED) + continue; + if (state->act_server != xc->server_num) + continue; + + /* Clean it up */ + arch_spin_lock(&sb->lock); + state->act_priority = MASKED; + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(state->ipi_number, 0, MASKED, 0); + if (state->pt_number) { + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(state->pt_number, 0, MASKED, 0); + } + arch_spin_unlock(&sb->lock); + } + } +} + +void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = xc->xive; + int i; + + pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); + + /* Ensure no interrupt is still routed to that VP */ + xc->valid = false; + kvmppc_xive_disable_vcpu_interrupts(vcpu); + + /* Mask the VP IPI */ + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Free the queues & associated interrupts */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + + /* Free the escalation irq */ + if (xc->esc_virq[i]) { + free_irq(xc->esc_virq[i], vcpu); + irq_dispose_mapping(xc->esc_virq[i]); + kfree(xc->esc_virq_names[i]); + } + /* Free the queue */ + xive_native_disable_queue(xc->vp_id, q, i); + if (q->qpage) { + free_pages((unsigned long)q->qpage, + xive->q_page_order); + q->qpage = NULL; + } + } + + /* Free the IPI */ + if (xc->vp_ipi) { + xive_cleanup_irq_data(&xc->vp_ipi_data); + xive_native_free_irq(xc->vp_ipi); + } + /* Free the VP */ + kfree(xc); +} + +int kvmppc_xive_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu) +{ + struct kvmppc_xive *xive = dev->private; + struct kvmppc_xive_vcpu *xc; + int i, r = -EBUSY; + + pr_devel("connect_vcpu(cpu=%d)\n", cpu); + + if (dev->ops != &kvm_xive_ops) { + pr_devel("Wrong ops !\n"); + return -EPERM; + } + if (xive->kvm != vcpu->kvm) + return -EPERM; + if (vcpu->arch.irq_type) + return -EBUSY; + if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { + pr_devel("Duplicate !\n"); + return -EEXIST; + } + if (cpu >= KVM_MAX_VCPUS) { + pr_devel("Out of bounds !\n"); + return -EINVAL; + } + xc = kzalloc(sizeof(*xc), GFP_KERNEL); + if (!xc) + return -ENOMEM; + + /* We need to synchronize with queue provisioning */ + mutex_lock(&vcpu->kvm->lock); + vcpu->arch.xive_vcpu = xc; + xc->xive = xive; + xc->vcpu = vcpu; + xc->server_num = cpu; + xc->vp_id = xive->vp_base + cpu; + xc->mfrr = 0xff; + xc->valid = true; + + r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id); + if (r) + goto bail; + + /* Configure VCPU fields for use by assembly push/pull */ + 
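The fields set up just below (xive_saved_state.w01 and xive_cam_word) are what the guest-entry code in book3s_hv_rmhandlers.S, shown earlier in this patch, pushes to the thread management area; the exit path pulls the state back and clears xive_pushed. A rough C rendering of the entry-time push, for orientation only; the real code is assembly using cache-inhibited stores against the physical TIMA, and the helper name here is made up:

	static void xive_push_vcpu_sketch(struct kvm_vcpu *vcpu,
					  void __iomem *tima)
	{
		if (!tima)	/* no XIVE TIMA known: nothing to push */
			return;
		/* Words 0-1 of the OS ring: saved VCPU interrupt state */
		__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
		/* OS CAM word identifying this vcpu's VP (set up below) */
		__raw_writel(vcpu->arch.xive_cam_word,
			     tima + TM_QW1_OS + TM_WORD2);
		vcpu->arch.xive_pushed = 1;
	}
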
vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000); + vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); + + /* Allocate IPI */ + xc->vp_ipi = xive_native_alloc_irq(); + if (!xc->vp_ipi) { + r = -EIO; + goto bail; + } + pr_devel(" IPI=0x%x\n", xc->vp_ipi); + + r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data); + if (r) + goto bail; + + /* + * Initialize queues. Initially we set them all for no queueing + * and we enable escalation for queue 0 only which we'll use for + * our mfrr change notifications. If the VCPU is hot-plugged, we + * do handle provisioning however. + */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + + /* Is queue already enabled ? Provision it */ + if (xive->qmap & (1 << i)) { + r = xive_provision_queue(vcpu, i); + if (r == 0) + xive_attach_escalation(vcpu, i); + if (r) + goto bail; + } else { + r = xive_native_configure_queue(xc->vp_id, + q, i, NULL, 0, true); + if (r) { + pr_err("Failed to configure queue %d for VCPU %d\n", + i, cpu); + goto bail; + } + } + } + + /* If not done above, attach priority 0 escalation */ + r = xive_attach_escalation(vcpu, 0); + if (r) + goto bail; + + /* Enable the VP */ + r = xive_native_enable_vp(xc->vp_id); + if (r) + goto bail; + + /* Route the IPI */ + r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); + if (!r) + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00); + +bail: + mutex_unlock(&vcpu->kvm->lock); + if (r) { + kvmppc_xive_cleanup_vcpu(vcpu); + return r; + } + + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; + return 0; +} + +/* + * Scanning of queues before/after migration save + */ +static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return; + + state = &sb->irq_state[idx]; + + /* Some sanity checking */ + if (!state->valid) { + pr_err("invalid irq 0x%x in cpu queue!\n", irq); + return; + } + + /* + * If the interrupt is in a queue it should have P set. + * We warn so that gets reported. A backtrace isn't useful + * so no need to use a WARN_ON. + */ + if (!state->saved_p) + pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq); + + /* Set flag */ + state->in_queue = true; +} + +static void xive_pre_save_mask_irq(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + u32 irq) +{ + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq]; + + if (!state->valid) + return; + + /* Mask and save state, this will also sync HW queues */ + state->saved_scan_prio = xive_lock_and_mask(xive, sb, state); + + /* Transfer P and Q */ + state->saved_p = state->old_p; + state->saved_q = state->old_q; + + /* Unlock */ + arch_spin_unlock(&sb->lock); +} + +static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + u32 irq) +{ + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq]; + + if (!state->valid) + return; + + /* + * Lock / exclude EOI (not technically necessary if the + * guest isn't running concurrently. If this becomes a + * performance issue we can probably remove the lock. 
+ */ + xive_lock_for_unmask(sb, state); + + /* Restore mask/prio if it wasn't masked */ + if (state->saved_scan_prio != MASKED) + xive_finish_unmask(xive, sb, state, state->saved_scan_prio); + + /* Unlock */ + arch_spin_unlock(&sb->lock); +} + +static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q) +{ + u32 idx = q->idx; + u32 toggle = q->toggle; + u32 irq; + + do { + irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle); + if (irq > XICS_IPI) + xive_pre_save_set_queued(xive, irq); + } while(irq); +} + +static void xive_pre_save_scan(struct kvmppc_xive *xive) +{ + struct kvm_vcpu *vcpu = NULL; + int i, j; + + /* + * See comment in xive_get_source() about how this + * work. Collect a stable state for all interrupts + */ + for (i = 0; i <= xive->max_sbid; i++) { + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; + if (!sb) + continue; + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) + xive_pre_save_mask_irq(xive, sb, j); + } + + /* Then scan the queues and update the "in_queue" flag */ + kvm_for_each_vcpu(i, vcpu, xive->kvm) { + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + if (!xc) + continue; + for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) { + if (xc->queues[i].qpage) + xive_pre_save_queue(xive, &xc->queues[i]); + } + } + + /* Finally restore interrupt states */ + for (i = 0; i <= xive->max_sbid; i++) { + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; + if (!sb) + continue; + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) + xive_pre_save_unmask_irq(xive, sb, j); + } +} + +static void xive_post_save_scan(struct kvmppc_xive *xive) +{ + u32 i, j; + + /* Clear all the in_queue flags */ + for (i = 0; i <= xive->max_sbid; i++) { + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; + if (!sb) + continue; + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) + sb->irq_state[j].in_queue = false; + } + + /* Next get_source() will do a new scan */ + xive->saved_src_count = 0; +} + +/* + * This returns the source configuration and state to user space. + */ +static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u64 __user *ubufp = (u64 __user *) addr; + u64 val, prio; + u16 idx; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -ENOENT; + + state = &sb->irq_state[idx]; + + if (!state->valid) + return -ENOENT; + + pr_devel("get_source(%ld)...\n", irq); + + /* + * So to properly save the state into something that looks like a + * XICS migration stream we cannot treat interrupts individually. + * + * We need, instead, mask them all (& save their previous PQ state) + * to get a stable state in the HW, then sync them to ensure that + * any interrupt that had already fired hits its queue, and finally + * scan all the queues to collect which interrupts are still present + * in the queues, so we can set the "pending" flag on them and + * they can be resent on restore. + * + * So we do it all when the "first" interrupt gets saved, all the + * state is collected at that point, the rest of xive_get_source() + * will merely collect and convert that state to the expected + * userspace bit mask. 
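The per-source state assembled below is consumed by userspace through the regular KVM device-attribute interface, since this device reuses the XICS group (see xive_get_attr()/xive_has_attr() further down). As a rough illustration, not part of this patch, userspace could fetch one source with something like the following; dev_fd and the helper name are assumptions:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Read the saved state word of one guest interrupt source. */
	static int get_source_state(int dev_fd, uint64_t irq, uint64_t *val)
	{
		struct kvm_device_attr attr = {
			.group = KVM_DEV_XICS_GRP_SOURCES,
			.attr  = irq,
			.addr  = (uint64_t)(uintptr_t)val,
		};

		return ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &attr);
	}
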
+ */ + if (xive->saved_src_count == 0) + xive_pre_save_scan(xive); + xive->saved_src_count++; + + /* Convert saved state into something compatible with xics */ + val = state->guest_server; + prio = state->saved_scan_prio; + + if (prio == MASKED) { + val |= KVM_XICS_MASKED; + prio = state->saved_priority; + } + val |= prio << KVM_XICS_PRIORITY_SHIFT; + if (state->lsi) { + val |= KVM_XICS_LEVEL_SENSITIVE; + if (state->saved_p) + val |= KVM_XICS_PENDING; + } else { + if (state->saved_p) + val |= KVM_XICS_PRESENTED; + + if (state->saved_q) + val |= KVM_XICS_QUEUED; + + /* + * We mark it pending (which will attempt a re-delivery) + * if we are in a queue *or* we were masked and had + * Q set which is equivalent to the XICS "masked pending" + * state + */ + if (state->in_queue || (prio == MASKED && state->saved_q)) + val |= KVM_XICS_PENDING; + } + + /* + * If that was the last interrupt saved, reset the + * in_queue flags + */ + if (xive->saved_src_count == xive->src_count) + xive_post_save_scan(xive); + + /* Copy the result to userspace */ + if (put_user(val, ubufp)) + return -EFAULT; + + return 0; +} + +static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive, + int irq) +{ + struct kvm *kvm = xive->kvm; + struct kvmppc_xive_src_block *sb; + int i, bid; + + bid = irq >> KVMPPC_XICS_ICS_SHIFT; + + mutex_lock(&kvm->lock); + + /* block already exists - somebody else got here first */ + if (xive->src_blocks[bid]) + goto out; + + /* Create the ICS */ + sb = kzalloc(sizeof(*sb), GFP_KERNEL); + if (!sb) + goto out; + + sb->id = bid; + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i; + sb->irq_state[i].guest_priority = MASKED; + sb->irq_state[i].saved_priority = MASKED; + sb->irq_state[i].act_priority = MASKED; + } + smp_wmb(); + xive->src_blocks[bid] = sb; + + if (bid > xive->max_sbid) + xive->max_sbid = bid; + +out: + mutex_unlock(&kvm->lock); + return xive->src_blocks[bid]; +} + +static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq) +{ + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu = NULL; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + if (!xc) + continue; + + if (xc->delayed_irq == irq) { + xc->delayed_irq = 0; + xive->delayed_irqs--; + return true; + } + } + return false; +} + +static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val; + u8 act_prio, guest_prio; + u32 server; + int rc = 0; + + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) + return -ENOENT; + + pr_devel("set_source(irq=0x%lx)\n", irq); + + /* Find the source */ + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) { + pr_devel("No source, creating source block...\n"); + sb = xive_create_src_block(xive, irq); + if (!sb) { + pr_devel("Failed to create block...\n"); + return -ENOMEM; + } + } + state = &sb->irq_state[idx]; + + /* Read user passed data */ + if (get_user(val, ubufp)) { + pr_devel("fault getting user info !\n"); + return -EFAULT; + } + + server = val & KVM_XICS_DESTINATION_MASK; + guest_prio = val >> KVM_XICS_PRIORITY_SHIFT; + + pr_devel(" val=0x016%llx (server=0x%x, guest_prio=%d)\n", + val, server, guest_prio); + /* + * If the source doesn't already have an IPI, allocate + * one and get the corresponding data + */ + if (!state->ipi_number) { + state->ipi_number = 
xive_native_alloc_irq(); + if (state->ipi_number == 0) { + pr_devel("Failed to allocate IPI !\n"); + return -ENOMEM; + } + xive_native_populate_irq_data(state->ipi_number, &state->ipi_data); + pr_devel(" src_ipi=0x%x\n", state->ipi_number); + } + + /* + * We use lock_and_mask() to set us in the right masked + * state. We will override that state from the saved state + * further down, but this will handle the cases of interrupts + * that need FW masking. We set the initial guest_priority to + * 0 before calling it to ensure it actually performs the masking. + */ + state->guest_priority = 0; + xive_lock_and_mask(xive, sb, state); + + /* + * Now, we select a target if we have one. If we don't we + * leave the interrupt untargetted. It means that an interrupt + * can become "untargetted" accross migration if it was masked + * by set_xive() but there is little we can do about it. + */ + + /* First convert prio and mark interrupt as untargetted */ + act_prio = xive_prio_from_guest(guest_prio); + state->act_priority = MASKED; + state->guest_server = server; + + /* + * We need to drop the lock due to the mutex below. Hopefully + * nothing is touching that interrupt yet since it hasn't been + * advertized to a running guest yet + */ + arch_spin_unlock(&sb->lock); + + /* If we have a priority target the interrupt */ + if (act_prio != MASKED) { + /* First, check provisioning of queues */ + mutex_lock(&xive->kvm->lock); + rc = xive_check_provisioning(xive->kvm, act_prio); + mutex_unlock(&xive->kvm->lock); + + /* Target interrupt */ + if (rc == 0) + rc = xive_target_interrupt(xive->kvm, state, + server, act_prio); + /* + * If provisioning or targetting failed, leave it + * alone and masked. It will remain disabled until + * the guest re-targets it. + */ + } + + /* + * Find out if this was a delayed irq stashed in an ICP, + * in which case, treat it as pending + */ + if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) { + val |= KVM_XICS_PENDING; + pr_devel(" Found delayed ! forcing PENDING !\n"); + } + + /* Cleanup the SW state */ + state->old_p = false; + state->old_q = false; + state->lsi = false; + state->asserted = false; + + /* Restore LSI state */ + if (val & KVM_XICS_LEVEL_SENSITIVE) { + state->lsi = true; + if (val & KVM_XICS_PENDING) + state->asserted = true; + pr_devel(" LSI ! Asserted=%d\n", state->asserted); + } + + /* + * Restore P and Q. If the interrupt was pending, we + * force both P and Q, which will trigger a resend. + * + * That means that a guest that had both an interrupt + * pending (queued) and Q set will restore with only + * one instance of that interrupt instead of 2, but that + * is perfectly fine as coalescing interrupts that haven't + * been presented yet is always allowed. + */ + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + state->old_p = true; + if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING) + state->old_q = true; + + pr_devel(" P=%d, Q=%d\n", state->old_p, state->old_q); + + /* + * If the interrupt was unmasked, update guest priority and + * perform the appropriate state transition and do a + * re-trigger if necessary. 
+ */ + if (val & KVM_XICS_MASKED) { + pr_devel(" masked, saving prio\n"); + state->guest_priority = MASKED; + state->saved_priority = guest_prio; + } else { + pr_devel(" unmasked, restoring to prio %d\n", guest_prio); + xive_finish_unmask(xive, sb, state, guest_prio); + state->saved_priority = guest_prio; + } + + /* Increment the number of valid sources and mark this one valid */ + if (!state->valid) + xive->src_count++; + state->valid = true; + + return 0; +} + +int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + if (!xive) + return -ENODEV; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + + /* Perform locklessly .... (we need to do some RCUisms here...) */ + state = &sb->irq_state[idx]; + if (!state->valid) + return -EINVAL; + + /* We don't allow a trigger on a passed-through interrupt */ + if (state->pt_number) + return -EINVAL; + + if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) + state->asserted = 1; + else if (level == 0 || level == KVM_INTERRUPT_UNSET) { + state->asserted = 0; + return 0; + } + + /* Trigger the IPI */ + xive_irq_trigger(&state->ipi_data); + + return 0; +} + +static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xive *xive = dev->private; + + /* We honor the existing XICS ioctl */ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xive_set_source(xive, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xive *xive = dev->private; + + /* We honor the existing XICS ioctl */ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xive_get_source(xive, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + /* We honor the same limits as XICS, at least for now */ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && + attr->attr < KVMPPC_XICS_NR_IRQS) + return 0; + break; + } + return -ENXIO; +} + +static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd) +{ + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(hw_num, 0, MASKED, 0); + xive_cleanup_irq_data(xd); +} + +static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) +{ + int i; + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct kvmppc_xive_irq_state *state = &sb->irq_state[i]; + + if (!state->valid) + continue; + + kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data); + xive_native_free_irq(state->ipi_number); + + /* Pass-through, cleanup too */ + if (state->pt_number) + kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data); + + state->valid = false; + } +} + +static void kvmppc_xive_free(struct kvm_device *dev) +{ + struct kvmppc_xive *xive = dev->private; + struct kvm *kvm = xive->kvm; + int i; + + debugfs_remove(xive->dentry); + + if (kvm) + kvm->arch.xive = NULL; + + /* Mask and free interrupts */ + for (i = 0; i <= xive->max_sbid; i++) { + if (xive->src_blocks[i]) + kvmppc_xive_free_sources(xive->src_blocks[i]); + kfree(xive->src_blocks[i]); + xive->src_blocks[i] = NULL; + } + + if (xive->vp_base != XIVE_INVALID_VP) + xive_native_free_vp_block(xive->vp_base); + + + kfree(xive); + kfree(dev); +} + +static int 
kvmppc_xive_create(struct kvm_device *dev, u32 type) +{ + struct kvmppc_xive *xive; + struct kvm *kvm = dev->kvm; + int ret = 0; + + pr_devel("Creating xive for partition\n"); + + xive = kzalloc(sizeof(*xive), GFP_KERNEL); + if (!xive) + return -ENOMEM; + + dev->private = xive; + xive->dev = dev; + xive->kvm = kvm; + + /* Already there ? */ + if (kvm->arch.xive) + ret = -EEXIST; + else + kvm->arch.xive = xive; + + /* We use the default queue size set by the host */ + xive->q_order = xive_native_default_eq_shift(); + if (xive->q_order < PAGE_SHIFT) + xive->q_page_order = 0; + else + xive->q_page_order = xive->q_order - PAGE_SHIFT; + + /* Allocate a bunch of VPs */ + xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS); + pr_devel("VP_Base=%x\n", xive->vp_base); + + if (xive->vp_base == XIVE_INVALID_VP) + ret = -ENOMEM; + + if (ret) { + kfree(xive); + return ret; + } + + return 0; +} + + +static int xive_debug_show(struct seq_file *m, void *private) +{ + struct kvmppc_xive *xive = m->private; + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + u64 t_rm_h_xirr = 0; + u64 t_rm_h_ipoll = 0; + u64 t_rm_h_cppr = 0; + u64 t_rm_h_eoi = 0; + u64 t_rm_h_ipi = 0; + u64 t_vm_h_xirr = 0; + u64 t_vm_h_ipoll = 0; + u64 t_vm_h_cppr = 0; + u64 t_vm_h_eoi = 0; + u64 t_vm_h_ipi = 0; + unsigned int i; + + if (!kvm) + return 0; + + seq_printf(m, "=========\nVCPU state\n=========\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + if (!xc) + continue; + + seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x" + " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n", + xc->server_num, xc->cppr, xc->hw_cppr, + xc->mfrr, xc->pending, + xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); + + t_rm_h_xirr += xc->stat_rm_h_xirr; + t_rm_h_ipoll += xc->stat_rm_h_ipoll; + t_rm_h_cppr += xc->stat_rm_h_cppr; + t_rm_h_eoi += xc->stat_rm_h_eoi; + t_rm_h_ipi += xc->stat_rm_h_ipi; + t_vm_h_xirr += xc->stat_vm_h_xirr; + t_vm_h_ipoll += xc->stat_vm_h_ipoll; + t_vm_h_cppr += xc->stat_vm_h_cppr; + t_vm_h_eoi += xc->stat_vm_h_eoi; + t_vm_h_ipi += xc->stat_vm_h_ipi; + } + + seq_printf(m, "Hcalls totals\n"); + seq_printf(m, " H_XIRR R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr); + seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll); + seq_printf(m, " H_CPPR R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr); + seq_printf(m, " H_EOI R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi); + seq_printf(m, " H_IPI R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi); + + return 0; +} + +static int xive_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, xive_debug_show, inode->i_private); +} + +static const struct file_operations xive_debug_fops = { + .open = xive_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void xive_debugfs_init(struct kvmppc_xive *xive) +{ + char *name; + + name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive); + if (!name) { + pr_err("%s: no memory for name\n", __func__); + return; + } + + xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xive, &xive_debug_fops); + + pr_debug("%s: created %s\n", __func__, name); + kfree(name); +} + +static void kvmppc_xive_init(struct kvm_device *dev) +{ + struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private; + + /* Register some debug interfaces */ + xive_debugfs_init(xive); +} + +struct kvm_device_ops kvm_xive_ops = { + .name = "kvm-xive", + .create = kvmppc_xive_create, + .init = kvmppc_xive_init, + .destroy = kvmppc_xive_free, + 
.set_attr = xive_set_attr, + .get_attr = xive_get_attr, + .has_attr = xive_has_attr, +}; + +void kvmppc_xive_init_module(void) +{ + __xive_vm_h_xirr = xive_vm_h_xirr; + __xive_vm_h_ipoll = xive_vm_h_ipoll; + __xive_vm_h_ipi = xive_vm_h_ipi; + __xive_vm_h_cppr = xive_vm_h_cppr; + __xive_vm_h_eoi = xive_vm_h_eoi; +} + +void kvmppc_xive_exit_module(void) +{ + __xive_vm_h_xirr = NULL; + __xive_vm_h_ipoll = NULL; + __xive_vm_h_ipi = NULL; + __xive_vm_h_cppr = NULL; + __xive_vm_h_eoi = NULL; +} diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h new file mode 100644 index 000000000000..5938f7644dc1 --- /dev/null +++ b/arch/powerpc/kvm/book3s_xive.h @@ -0,0 +1,256 @@ +/* + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef _KVM_PPC_BOOK3S_XIVE_H +#define _KVM_PPC_BOOK3S_XIVE_H + +#ifdef CONFIG_KVM_XICS +#include "book3s_xics.h" + +/* + * State for one guest irq source. + * + * For each guest source we allocate a HW interrupt in the XIVE + * which we use for all SW triggers. It will be unused for + * pass-through but it's easier to keep around as the same + * guest interrupt can alternatively be emulated or pass-through + * if a physical device is hot unplugged and replaced with an + * emulated one. + * + * This state structure is very similar to the XICS one with + * additional XIVE specific tracking. + */ +struct kvmppc_xive_irq_state { + bool valid; /* Interrupt entry is valid */ + + u32 number; /* Guest IRQ number */ + u32 ipi_number; /* XIVE IPI HW number */ + struct xive_irq_data ipi_data; /* XIVE IPI associated data */ + u32 pt_number; /* XIVE Pass-through number if any */ + struct xive_irq_data *pt_data; /* XIVE Pass-through associated data */ + + /* Targetting as set by guest */ + u32 guest_server; /* Current guest selected target */ + u8 guest_priority; /* Guest set priority */ + u8 saved_priority; /* Saved priority when masking */ + + /* Actual targetting */ + u32 act_server; /* Actual server */ + u8 act_priority; /* Actual priority */ + + /* Various state bits */ + bool in_eoi; /* Synchronize with H_EOI */ + bool old_p; /* P bit state when masking */ + bool old_q; /* Q bit state when masking */ + bool lsi; /* level-sensitive interrupt */ + bool asserted; /* Only for emulated LSI: current state */ + + /* Saved for migration state */ + bool in_queue; + bool saved_p; + bool saved_q; + u8 saved_scan_prio; +}; + +/* Select the "right" interrupt (IPI vs. passthrough) */ +static inline void kvmppc_xive_select_irq(struct kvmppc_xive_irq_state *state, + u32 *out_hw_irq, + struct xive_irq_data **out_xd) +{ + if (state->pt_number) { + if (out_hw_irq) + *out_hw_irq = state->pt_number; + if (out_xd) + *out_xd = state->pt_data; + } else { + if (out_hw_irq) + *out_hw_irq = state->ipi_number; + if (out_xd) + *out_xd = &state->ipi_data; + } +} + +/* + * This corresponds to an "ICS" in XICS terminology, we use it + * as a mean to break up source information into multiple structures. 
+ */ +struct kvmppc_xive_src_block { + arch_spinlock_t lock; + u16 id; + struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; +}; + + +struct kvmppc_xive { + struct kvm *kvm; + struct kvm_device *dev; + struct dentry *dentry; + + /* VP block associated with the VM */ + u32 vp_base; + + /* Blocks of sources */ + struct kvmppc_xive_src_block *src_blocks[KVMPPC_XICS_MAX_ICS_ID + 1]; + u32 max_sbid; + + /* + * For state save, we lazily scan the queues on the first interrupt + * being migrated. We don't have a clean way to reset that flags + * so we keep track of the number of valid sources and how many of + * them were migrated so we can reset when all of them have been + * processed. + */ + u32 src_count; + u32 saved_src_count; + + /* + * Some irqs are delayed on restore until the source is created, + * keep track here of how many of them + */ + u32 delayed_irqs; + + /* Which queues (priorities) are in use by the guest */ + u8 qmap; + + /* Queue orders */ + u32 q_order; + u32 q_page_order; + +}; + +#define KVMPPC_XIVE_Q_COUNT 8 + +struct kvmppc_xive_vcpu { + struct kvmppc_xive *xive; + struct kvm_vcpu *vcpu; + bool valid; + + /* Server number. This is the HW CPU ID from a guest perspective */ + u32 server_num; + + /* + * HW VP corresponding to this VCPU. This is the base of the VP + * block plus the server number. + */ + u32 vp_id; + u32 vp_chip_id; + u32 vp_cam; + + /* IPI used for sending ... IPIs */ + u32 vp_ipi; + struct xive_irq_data vp_ipi_data; + + /* Local emulation state */ + uint8_t cppr; /* guest CPPR */ + uint8_t hw_cppr;/* Hardware CPPR */ + uint8_t mfrr; + uint8_t pending; + + /* Each VP has 8 queues though we only provision some */ + struct xive_q queues[KVMPPC_XIVE_Q_COUNT]; + u32 esc_virq[KVMPPC_XIVE_Q_COUNT]; + char *esc_virq_names[KVMPPC_XIVE_Q_COUNT]; + + /* Stash a delayed irq on restore from migration (see set_icp) */ + u32 delayed_irq; + + /* Stats */ + u64 stat_rm_h_xirr; + u64 stat_rm_h_ipoll; + u64 stat_rm_h_cppr; + u64 stat_rm_h_eoi; + u64 stat_rm_h_ipi; + u64 stat_vm_h_xirr; + u64 stat_vm_h_ipoll; + u64 stat_vm_h_cppr; + u64 stat_vm_h_eoi; + u64 stat_vm_h_ipi; +}; + +static inline struct kvm_vcpu *kvmppc_xive_find_server(struct kvm *kvm, u32 nr) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.xive_vcpu && nr == vcpu->arch.xive_vcpu->server_num) + return vcpu; + } + return NULL; +} + +static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmppc_xive *xive, + u32 irq, u16 *source) +{ + u32 bid = irq >> KVMPPC_XICS_ICS_SHIFT; + u16 src = irq & KVMPPC_XICS_SRC_MASK; + + if (source) + *source = src; + if (bid > KVMPPC_XICS_MAX_ICS_ID) + return NULL; + return xive->src_blocks[bid]; +} + +/* + * Mapping between guest priorities and host priorities + * is as follow. + * + * Guest request for 0...6 are honored. Guest request for anything + * higher results in a priority of 7 being applied. 
+ * + * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb + * in order to match AIX expectations + * + * Similar mapping is done for CPPR values + */ +static inline u8 xive_prio_from_guest(u8 prio) +{ + if (prio == 0xff || prio < 8) + return prio; + return 7; +} + +static inline u8 xive_prio_to_guest(u8 prio) +{ + if (prio == 0xff || prio < 7) + return prio; + return 0xb; +} + +static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle) +{ + u32 cur; + + if (!qpage) + return 0; + cur = be32_to_cpup(qpage + *idx); + if ((cur >> 31) == *toggle) + return 0; + *idx = (*idx + 1) & msk; + if (*idx == 0) + (*toggle) ^= 1; + return cur & 0x7fffffff; +} + +extern unsigned long xive_rm_h_xirr(struct kvm_vcpu *vcpu); +extern unsigned long xive_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server); +extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr); +extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); +extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); + +extern unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu); +extern unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server); +extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr); +extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr); +extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr); + +#endif /* CONFIG_KVM_XICS */ +#endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c new file mode 100644 index 000000000000..023a31133c37 --- /dev/null +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -0,0 +1,503 @@ +/* + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +/* File to be included by other .c files */ + +#define XGLUE(a,b) a##b +#define GLUE(a,b) XGLUE(a,b) + +static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) +{ + u8 cppr; + u16 ack; + + /* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */ + + /* Perform the acknowledge OS to register cycle. */ + ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* XXX Check grouping level */ + + /* Anything ? */ + if (!((ack >> 8) & TM_QW1_NSR_EO)) + return; + + /* Grab CPPR of the most favored pending interrupt */ + cppr = ack & 0xff; + if (cppr < 8) + xc->pending |= 1 << cppr; + +#ifdef XIVE_RUNTIME_CHECKS + /* Check consistency */ + if (cppr >= xc->hw_cppr) + pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->hw_cppr); +#endif + + /* + * Update our image of the HW CPPR. We don't yet modify + * xc->cppr, this will be done as we scan for interrupts + * in the queues. 
+ */ + xc->hw_cppr = cppr; +} + +static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd, u32 offset) +{ + u64 val; + + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) + offset |= offset << 4; + + val =__x_readq(__x_eoi_page(xd) + offset); +#ifdef __LITTLE_ENDIAN__ + val >>= 64-8; +#endif + return (u8)val; +} + + +static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd) +{ + /* If the XIVE supports the new "store EOI facility, use it */ + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + __x_writeq(0, __x_eoi_page(xd)); + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { + opal_int_eoi(hw_irq); + } else { + uint64_t eoi_val; + + /* + * Otherwise for EOI, we use the special MMIO that does + * a clear of both P and Q and returns the old Q, + * except for LSIs where we use the "EOI cycle" special + * load. + * + * This allows us to then do a re-trigger if Q was set + * rather than synthetizing an interrupt in software + * + * For LSIs, using the HW EOI cycle works around a problem + * on P9 DD1 PHBs where the other ESB accesses don't work + * properly. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) + __x_readq(__x_eoi_page(xd)); + else { + eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00); + + /* Re-trigger if needed */ + if ((eoi_val & 1) && __x_trig_page(xd)) + __x_writeq(0, __x_trig_page(xd)); + } + } +} + +enum { + scan_fetch, + scan_poll, + scan_eoi, +}; + +static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc, + u8 pending, int scan_type) +{ + u32 hirq = 0; + u8 prio = 0xff; + + /* Find highest pending priority */ + while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) { + struct xive_q *q; + u32 idx, toggle; + __be32 *qpage; + + /* + * If pending is 0 this will return 0xff which is what + * we want + */ + prio = ffs(pending) - 1; + + /* + * If the most favoured prio we found pending is less + * favored (or equal) than a pending IPI, we return + * the IPI instead. + * + * Note: If pending was 0 and mfrr is 0xff, we will + * not spurriously take an IPI because mfrr cannot + * then be smaller than cppr. + */ + if (prio >= xc->mfrr && xc->mfrr < xc->cppr) { + prio = xc->mfrr; + hirq = XICS_IPI; + break; + } + + /* Don't scan past the guest cppr */ + if (prio >= xc->cppr || prio > 7) + break; + + /* Grab queue and pointers */ + q = &xc->queues[prio]; + idx = q->idx; + toggle = q->toggle; + + /* + * Snapshot the queue page. The test further down for EOI + * must use the same "copy" that was used by __xive_read_eq + * since qpage can be set concurrently and we don't want + * to miss an EOI. + */ + qpage = READ_ONCE(q->qpage); + +skip_ipi: + /* + * Try to fetch from the queue. Will return 0 for a + * non-queueing priority (ie, qpage = 0). + */ + hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle); + + /* + * If this was a signal for an MFFR change done by + * H_IPI we skip it. Additionally, if we were fetching + * we EOI it now, thus re-enabling reception of a new + * such signal. + * + * We also need to do that if prio is 0 and we had no + * page for the queue. In this case, we have non-queued + * IPI that needs to be EOId. + * + * This is safe because if we have another pending MFRR + * change that wasn't observed above, the Q bit will have + * been set and another occurrence of the IPI will trigger. 
+ */ + if (hirq == XICS_IPI || (prio == 0 && !qpage)) { + if (scan_type == scan_fetch) + GLUE(X_PFX,source_eoi)(xc->vp_ipi, + &xc->vp_ipi_data); + /* Loop back on same queue with updated idx/toggle */ +#ifdef XIVE_RUNTIME_CHECKS + WARN_ON(hirq && hirq != XICS_IPI); +#endif + if (hirq) + goto skip_ipi; + } + + /* If fetching, update queue pointers */ + if (scan_type == scan_fetch) { + q->idx = idx; + q->toggle = toggle; + } + + /* Something found, stop searching */ + if (hirq) + break; + + /* Clear the pending bit on the now empty queue */ + pending &= ~(1 << prio); + + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. + */ + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + if (p) { +#ifdef XIVE_RUNTIME_CHECKS + WARN_ON(p > atomic_read(&q->count)); +#endif + atomic_sub(p, &q->count); + } + } + } + + /* If we are just taking a "peek", do nothing else */ + if (scan_type == scan_poll) + return hirq; + + /* Update the pending bits */ + xc->pending = pending; + + /* + * If this is an EOI that's it, no CPPR adjustment done here, + * all we needed was cleanup the stale pending bits and check + * if there's anything left. + */ + if (scan_type == scan_eoi) + return hirq; + + /* + * If we found an interrupt, adjust what the guest CPPR should + * be as if we had just fetched that interrupt from HW. + */ + if (hirq) + xc->cppr = prio; + /* + * If it was an IPI the HW CPPR might have been lowered too much + * as the HW interrupt we use for IPIs is routed to priority 0. + * + * We re-sync it here. + */ + if (xc->cppr != xc->hw_cppr) { + xc->hw_cppr = xc->cppr; + __x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR); + } + + return hirq; +} + +X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 old_cppr; + u32 hirq; + + pr_devel("H_XIRR\n"); + + xc->GLUE(X_STAT_PFX,h_xirr)++; + + /* First collect pending bits from HW */ + GLUE(X_PFX,ack_pending)(xc); + + /* + * Cleanup the old-style bits if needed (they may have been + * set by pull or an escalation interrupts). + */ + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions)) + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, + &vcpu->arch.pending_exceptions); + + pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", + xc->pending, xc->hw_cppr, xc->cppr); + + /* Grab previous CPPR and reverse map it */ + old_cppr = xive_prio_to_guest(xc->cppr); + + /* Scan for actual interrupts */ + hirq = GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_fetch); + + pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n", + hirq, xc->hw_cppr, xc->cppr); + +#ifdef XIVE_RUNTIME_CHECKS + /* That should never hit */ + if (hirq & 0xff000000) + pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq); +#endif + + /* + * XXX We could check if the interrupt is masked here and + * filter it. 
If we chose to do so, we would need to do: + * + * if (masked) { + * lock(); + * if (masked) { + * old_Q = true; + * hirq = 0; + * } + * unlock(); + * } + */ + + /* Return interrupt and old CPPR in GPR4 */ + vcpu->arch.gpr[4] = hirq | (old_cppr << 24); + + return H_SUCCESS; +} + +X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 pending = xc->pending; + u32 hirq; + u8 pipr; + + pr_devel("H_IPOLL(server=%ld)\n", server); + + xc->GLUE(X_STAT_PFX,h_ipoll)++; + + /* Grab the target VCPU if not the current one */ + if (xc->server_num != server) { + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); + if (!vcpu) + return H_PARAMETER; + xc = vcpu->arch.xive_vcpu; + + /* Scan all priorities */ + pending = 0xff; + } else { + /* Grab pending interrupt if any */ + pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + if (pipr < 8) + pending |= 1 << pipr; + } + + hirq = GLUE(X_PFX,scan_interrupts)(xc, pending, scan_poll); + + /* Return interrupt and old CPPR in GPR4 */ + vcpu->arch.gpr[4] = hirq | (xc->cppr << 24); + + return H_SUCCESS; +} + +static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc) +{ + u8 pending, prio; + + pending = xc->pending; + if (xc->mfrr != 0xff) { + if (xc->mfrr < 8) + pending |= 1 << xc->mfrr; + else + pending |= 0x80; + } + if (!pending) + return; + prio = ffs(pending) - 1; + + __x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING); +} + +X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 old_cppr; + + pr_devel("H_CPPR(cppr=%ld)\n", cppr); + + xc->GLUE(X_STAT_PFX,h_cppr)++; + + /* Map CPPR */ + cppr = xive_prio_from_guest(cppr); + + /* Remember old and update SW state */ + old_cppr = xc->cppr; + xc->cppr = cppr; + + /* + * We are masking less, we need to look for pending things + * to deliver and set VP pending bits accordingly to trigger + * a new interrupt otherwise we might miss MFRR changes for + * which we have optimized out sending an IPI signal. + */ + if (cppr > old_cppr) + GLUE(X_PFX,push_pending_to_hw)(xc); + + /* Apply new CPPR */ + xc->hw_cppr = cppr; + __x_writeb(cppr, __x_tima + TM_QW1_OS + TM_CPPR); + + return H_SUCCESS; +} + +X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_irq_data *xd; + u8 new_cppr = xirr >> 24; + u32 irq = xirr & 0x00ffffff, hw_num; + u16 src; + int rc = 0; + + pr_devel("H_EOI(xirr=%08lx)\n", xirr); + + xc->GLUE(X_STAT_PFX,h_eoi)++; + + xc->cppr = xive_prio_from_guest(new_cppr); + + /* + * IPIs are synthetized from MFRR and thus don't need + * any special EOI handling. The underlying interrupt + * used to signal MFRR changes is EOId when fetched from + * the queue. 
+ */ + if (irq == XICS_IPI || irq == 0) + goto bail; + + /* Find interrupt source */ + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) { + pr_devel(" source not found !\n"); + rc = H_PARAMETER; + goto bail; + } + state = &sb->irq_state[src]; + kvmppc_xive_select_irq(state, &hw_num, &xd); + + state->in_eoi = true; + mb(); + +again: + if (state->guest_priority == MASKED) { + arch_spin_lock(&sb->lock); + if (state->guest_priority != MASKED) { + arch_spin_unlock(&sb->lock); + goto again; + } + pr_devel(" EOI on saved P...\n"); + + /* Clear old_p, that will cause unmask to perform an EOI */ + state->old_p = false; + + arch_spin_unlock(&sb->lock); + } else { + pr_devel(" EOI on source...\n"); + + /* Perform EOI on the source */ + GLUE(X_PFX,source_eoi)(hw_num, xd); + + /* If it's an emulated LSI, check level and resend */ + if (state->lsi && state->asserted) + __x_writeq(0, __x_trig_page(xd)); + + } + + mb(); + state->in_eoi = false; +bail: + + /* Re-evaluate pending IRQs and update HW */ + GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_eoi); + GLUE(X_PFX,push_pending_to_hw)(xc); + pr_devel(" after scan pending=%02x\n", xc->pending); + + /* Apply new CPPR */ + xc->hw_cppr = xc->cppr; + __x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR); + + return rc; +} + +X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr); + + xc->GLUE(X_STAT_PFX,h_ipi)++; + + /* Find target */ + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); + if (!vcpu) + return H_PARAMETER; + xc = vcpu->arch.xive_vcpu; + + /* Locklessly write over MFRR */ + xc->mfrr = mfrr; + + /* Shoot the IPI if most favored than target cppr */ + if (mfrr < xc->cppr) + __x_writeq(0, __x_trig_page(&xc->vp_ipi_data)); + + return H_SUCCESS; +} diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h index 5a9a10b90762..3f1be85a83bc 100644 --- a/arch/powerpc/kvm/irq.h +++ b/arch/powerpc/kvm/irq.h @@ -12,6 +12,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm) #endif #ifdef CONFIG_KVM_XICS ret = ret || (kvm->arch.xics != NULL); + ret = ret || (kvm->arch.xive != NULL); #endif smp_rmb(); return ret; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 95c91a9de351..de79bd721ec7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -37,6 +37,8 @@ #include #include #include +#include + #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -699,7 +701,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); break; case KVMPPC_IRQ_XICS: - kvmppc_xics_free_icp(vcpu); + if (xive_enabled()) + kvmppc_xive_cleanup_vcpu(vcpu); + else + kvmppc_xics_free_icp(vcpu); break; } @@ -1219,8 +1224,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EPERM; dev = kvm_device_from_filp(f.file); - if (dev) - r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + if (dev) { + if (xive_enabled()) + r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]); + else + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + } fdput(f); break; @@ -1244,7 +1253,7 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) return true; #endif #ifdef CONFIG_KVM_XICS - if (kvm->arch.xics) + if (kvm->arch.xics || kvm->arch.xive) return true; #endif return false; diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index e0f856bfbfe8..d71cd773d870 100644 
--- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -890,3 +890,4 @@ EXPORT_SYMBOL_GPL(opal_leds_set_ind); EXPORT_SYMBOL_GPL(opal_write_oppanel_async); /* Export this for KVM */ EXPORT_SYMBOL_GPL(opal_int_set_mfrr); +EXPORT_SYMBOL_GPL(opal_int_eoi); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index d9cd7f705f21..496036c93531 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -46,13 +46,15 @@ #endif bool __xive_enabled; +EXPORT_SYMBOL_GPL(__xive_enabled); bool xive_cmdline_disabled; /* We use only one priority for now */ static u8 xive_irq_priority; -/* TIMA */ +/* TIMA exported to KVM */ void __iomem *xive_tima; +EXPORT_SYMBOL_GPL(xive_tima); u32 xive_tima_offset; /* Backend ops */ @@ -345,8 +347,11 @@ static void xive_irq_eoi(struct irq_data *d) DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", d->irq, irqd_to_hwirq(d), xc->pending_prio); - /* EOI the source if it hasn't been disabled */ - if (!irqd_irq_disabled(d)) + /* + * EOI the source if it hasn't been disabled and hasn't + * been passed-through to a KVM guest + */ + if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d)) xive_do_source_eoi(irqd_to_hwirq(d), xd); /* @@ -689,9 +694,14 @@ static int xive_irq_set_affinity(struct irq_data *d, old_target = xd->target; - rc = xive_ops->configure_irq(hw_irq, - get_hard_smp_processor_id(target), - xive_irq_priority, d->irq); + /* + * Only configure the irq if it's not currently passed-through to + * a KVM guest + */ + if (!irqd_is_forwarded_to_vcpu(d)) + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); if (rc < 0) { pr_err("Error %d reconfiguring irq %d\n", rc, d->irq); return rc; @@ -771,6 +781,123 @@ static int xive_irq_retrigger(struct irq_data *d) return 1; } +static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int rc; + u8 pq; + + /* + * We only support this on interrupts that do not require + * firmware calls for masking and unmasking + */ + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) + return -EIO; + + /* + * This is called by KVM with state non-NULL for enabling + * pass-through or NULL for disabling it + */ + if (state) { + irqd_set_forwarded_to_vcpu(d); + + /* Set it to PQ=10 state to prevent further sends */ + pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10); + + /* No target ? nothing to do */ + if (xd->target == XIVE_INVALID_TARGET) { + /* + * An untargetted interrupt should have been + * also masked at the source + */ + WARN_ON(pq & 2); + + return 0; + } + + /* + * If P was set, adjust state to PQ=11 to indicate + * that a resend is needed for the interrupt to reach + * the guest. Also remember the value of P. + * + * This also tells us that it's in flight to a host queue + * or has already been fetched but hasn't been EOIed yet + * by the host. This it's potentially using up a host + * queue slot. This is important to know because as long + * as this is the case, we must not hard-unmask it when + * "returning" that interrupt to the host. + * + * This saved_p is cleared by the host EOI, when we know + * for sure the queue slot is no longer in use. + */ + if (pq & 2) { + pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); + xd->saved_p = true; + + /* + * Sync the XIVE source HW to ensure the interrupt + * has gone through the EAS before we change its + * target to the guest. 
That should guarantee us + * that we *will* eventually get an EOI for it on + * the host. Otherwise there would be a small window + * for P to be seen here but the interrupt going + * to the guest queue. + */ + if (xive_ops->sync_source) + xive_ops->sync_source(hw_irq); + } else + xd->saved_p = false; + } else { + irqd_clr_forwarded_to_vcpu(d); + + /* No host target ? hard mask and return */ + if (xd->target == XIVE_INVALID_TARGET) { + xive_do_source_set_mask(xd, true); + return 0; + } + + /* + * Sync the XIVE source HW to ensure the interrupt + * has gone through the EAS before we change its + * target to the host. + */ + if (xive_ops->sync_source) + xive_ops->sync_source(hw_irq); + + /* + * By convention we are called with the interrupt in + * a PQ=10 or PQ=11 state, ie, it won't fire and will + * have latched in Q whether there's a pending HW + * interrupt or not. + * + * First reconfigure the target. + */ + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + xive_irq_priority, d->irq); + if (rc) + return rc; + + /* + * Then if saved_p is not set, effectively re-enable the + * interrupt with an EOI. If it is set, we know there is + * still a message in a host queue somewhere that will be + * EOId eventually. + * + * Note: We don't check irqd_irq_disabled(). Effectively, + * we *will* let the irq get through even if masked if the + * HW is still firing it in order to deal with the whole + * saved_p business properly. If the interrupt triggers + * while masked, the generic code will re-mask it anyway. + */ + if (!xd->saved_p) + xive_do_source_eoi(hw_irq, xd); + + } + return 0; +} + static struct irq_chip xive_irq_chip = { .name = "XIVE-IRQ", .irq_startup = xive_irq_startup, @@ -781,12 +908,14 @@ static struct irq_chip xive_irq_chip = { .irq_set_affinity = xive_irq_set_affinity, .irq_set_type = xive_irq_set_type, .irq_retrigger = xive_irq_retrigger, + .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, }; bool is_xive_irq(struct irq_chip *chip) { return chip == &xive_irq_chip; } +EXPORT_SYMBOL_GPL(is_xive_irq); void xive_cleanup_irq_data(struct xive_irq_data *xd) { @@ -801,6 +930,7 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd) xd->trig_mmio = NULL; } } +EXPORT_SYMBOL_GPL(xive_cleanup_irq_data); static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) { diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 5fae59186cb2..6feac0a758e1 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "xive-internal.h" @@ -95,6 +96,7 @@ int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) } return 0; } +EXPORT_SYMBOL_GPL(xive_native_populate_irq_data); int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) { @@ -108,6 +110,8 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) } return rc == 0 ? 
0 : -ENXIO; } +EXPORT_SYMBOL_GPL(xive_native_configure_irq); + /* This can be called multiple time to change a queue configuration */ int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, @@ -172,6 +176,7 @@ int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, fail: return rc; } +EXPORT_SYMBOL_GPL(xive_native_configure_queue); static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) { @@ -191,6 +196,7 @@ void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) { __xive_native_disable_queue(vp_id, q, prio); } +EXPORT_SYMBOL_GPL(xive_native_disable_queue); static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) { @@ -261,6 +267,7 @@ static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) } return 0; } +#endif /* CONFIG_SMP */ u32 xive_native_alloc_irq(void) { @@ -276,6 +283,7 @@ u32 xive_native_alloc_irq(void) return 0; return rc; } +EXPORT_SYMBOL_GPL(xive_native_alloc_irq); void xive_native_free_irq(u32 irq) { @@ -286,7 +294,9 @@ void xive_native_free_irq(u32 irq) msleep(1); } } +EXPORT_SYMBOL_GPL(xive_native_free_irq); +#ifdef CONFIG_SMP static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) { s64 rc; @@ -382,7 +392,7 @@ static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) return; /* Enable the pool VP */ - vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + vp = xive_pool_vps + cpu; pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp); for (;;) { rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0); @@ -427,7 +437,7 @@ static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) in_be64(xive_tima + TM_SPC_PULL_POOL_CTX); /* Disable it */ - vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + vp = xive_pool_vps + cpu; for (;;) { rc = opal_xive_set_vp_info(vp, 0, 0); if (rc != OPAL_BUSY) @@ -436,10 +446,11 @@ static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) } } -static void xive_native_sync_source(u32 hw_irq) +void xive_native_sync_source(u32 hw_irq) { opal_xive_sync(XIVE_SYNC_EAS, hw_irq); } +EXPORT_SYMBOL_GPL(xive_native_sync_source); static const struct xive_ops xive_native_ops = { .populate_irq_data = xive_native_populate_irq_data, @@ -500,10 +511,24 @@ static bool xive_parse_provisioning(struct device_node *np) return true; } +static void xive_native_setup_pools(void) +{ + /* Allocate a pool big enough */ + pr_debug("XIVE: Allocating VP block for pool size %d\n", nr_cpu_ids); + + xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids); + if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) + pr_err("XIVE: Failed to allocate pool VP, KVM might not function\n"); + + pr_debug("XIVE: Pool VPs allocated at 0x%x for %d max CPUs\n", + xive_pool_vps, nr_cpu_ids); +} + u32 xive_native_default_eq_shift(void) { return xive_queue_shift; } +EXPORT_SYMBOL_GPL(xive_native_default_eq_shift); bool xive_native_init(void) { @@ -513,7 +538,7 @@ bool xive_native_init(void) struct property *prop; u8 max_prio = 7; const __be32 *p; - u32 val; + u32 val, cpu; s64 rc; if (xive_cmdline_disabled) @@ -549,7 +574,11 @@ bool xive_native_init(void) break; } - /* Grab size of provisioning pages */ + /* Configure Thread Management areas for KVM */ + for_each_possible_cpu(cpu) + kvmppc_set_xive_tima(cpu, r.start, tima); + + /* Grab size of provisionning pages */ xive_parse_provisioning(np); /* Switch the XIVE to exploitation mode */ @@ -559,6 +588,9 @@ bool xive_native_init(void) return false; } + /* Setup some dummy HV pool VPs */ + 
xive_native_setup_pools(); + /* Initialize XIVE core with our backend */ if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS, max_prio)) { @@ -637,3 +669,47 @@ void xive_native_free_vp_block(u32 vp_base) pr_warn("OPAL error %lld freeing VP block\n", rc); } EXPORT_SYMBOL_GPL(xive_native_free_vp_block); + +int xive_native_enable_vp(u32 vp_id) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + return rc ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(xive_native_enable_vp); + +int xive_native_disable_vp(u32 vp_id) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_vp_info(vp_id, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + return rc ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(xive_native_disable_vp); + +int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) +{ + __be64 vp_cam_be; + __be32 vp_chip_id_be; + s64 rc; + + rc = opal_xive_get_vp_info(vp_id, NULL, &vp_cam_be, NULL, &vp_chip_id_be); + if (rc) + return -EIO; + *out_cam_id = be64_to_cpu(vp_cam_be) & 0xffffffffu; + *out_chip_id = be32_to_cpu(vp_chip_id_be); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_vp_info); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..d1a6e554ee68 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1165,7 +1165,6 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; -extern struct kvm_device_ops kvm_xics_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a17d78759727..1b0da5771f71 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2839,10 +2839,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif - -#ifdef CONFIG_KVM_XICS - [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, -#endif }; int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) -- cgit v1.2.3-70-g09d2 From ed6473ddc704a2005b9900ca08e236ebb2d8540a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Apr 2017 11:55:27 -0400 Subject: NFSv4: Fix callback server shutdown We want to use kthread_stop() in order to ensure the threads are shut down before we tear down the nfs_callback_info in nfs_callback_down. Tested-and-reviewed-by: Kinglong Mee Reported-by: Kinglong Mee Fixes: bb6aeba736ba9 ("NFSv4.x: Switch to using svc_set_num_threads()...") Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfs/callback.c | 24 ++++++++++++++++-------- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c5e27ebd8da8..73a1f928226c 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); /* * Listen for a request on the socket */ @@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { - if (try_to_freeze()) - continue; + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - schedule(); + if (!kthread_should_stop()) + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } - flush_signals(current); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -221,14 +229,14 @@ err_bind: static struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6ef19cf658b4..94631026f79c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -473,6 +473,7 @@ void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); +int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_destroy(struct svc_serv *); void svc_shutdown_net(struct svc_serv *, struct net *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 98dc33ae738b..bc0f5a0ecbdc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -795,6 +795,44 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) } EXPORT_SYMBOL_GPL(svc_set_num_threads); +/* destroy old threads */ +static int +svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *task; + unsigned int state = serv->sv_nrthreads-1; + + /* destroy old threads */ + do { + task = choose_victim(serv, pool, &state); + if (task == NULL) + break; + kthread_stop(task); + nrservs++; + } while (nrservs < 0); + return 0; +} + +int +svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + if (nrservs > 
0) + return svc_start_kthreads(serv, pool, nrservs); + if (nrservs < 0) + return svc_stop_kthreads(serv, pool, nrservs); + return 0; +} +EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. -- cgit v1.2.3-70-g09d2 From 461a6946b1f93f6720577fb06aa78e8cbd9291c9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:46:20 +0200 Subject: iommu: Remove pci.h include from trace/events/iommu.h The include file does not need any PCI specifics, so remove that include. Also fix the places that relied on it. Signed-off-by: Joerg Roedel --- arch/arm64/mm/dma-mapping.c | 1 + drivers/infiniband/hw/qedr/main.c | 1 + drivers/iommu/fsl_pamu.h | 1 + drivers/iommu/rockchip-iommu.c | 1 + drivers/iommu/tegra-smmu.c | 1 + include/linux/dma-iommu.h | 1 + include/trace/events/iommu.h | 1 - 7 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 81cdb2e844ed..982f85b81624 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -28,6 +28,7 @@ #include #include #include +#include #include diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index b9b47e5cc8b3..33033624cd9b 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/fsl_pamu.h b/drivers/iommu/fsl_pamu.h index aab723f91f12..c3434f29c967 100644 --- a/drivers/iommu/fsl_pamu.h +++ b/drivers/iommu/fsl_pamu.h @@ -20,6 +20,7 @@ #define __FSL_PAMU_H #include +#include #include diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 9afcbf79f0b0..0ba303a184dd 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 9305964250ac..eeb19f560a05 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 5725c94b1f12..abd946569515 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -20,6 +20,7 @@ #include #ifdef CONFIG_IOMMU_DMA +#include #include #include diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 2c7befb10f13..99254ed89212 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -11,7 +11,6 @@ #define _TRACE_IOMMU_H #include -#include struct device; -- cgit v1.2.3-70-g09d2 From 208480bb273e15f42711bd47f70dc0fbfa2570b8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:49:57 +0200 Subject: iommu: Remove trace-events include from iommu.h It is not needed there anymore. All places needing it are fixed. 
Signed-off-by: Joerg Roedel --- drivers/media/platform/mtk-vpu/mtk_vpu.c | 1 + include/linux/iommu.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/platform/mtk-vpu/mtk_vpu.c b/drivers/media/platform/mtk-vpu/mtk_vpu.c index 463b69c934be..e011c8dc0198 100644 --- a/drivers/media/platform/mtk-vpu/mtk_vpu.c +++ b/drivers/media/platform/mtk-vpu/mtk_vpu.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "mtk_vpu.h" diff --git a/include/linux/iommu.h b/include/linux/iommu.h index abaa0ca848bc..dda8717545e9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -26,8 +26,6 @@ #include #include -#include - #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) #define IOMMU_CACHE (1 << 2) /* DMA cache coherency */ -- cgit v1.2.3-70-g09d2 From 917362135b8a5c0680acf08807e9fc6179eb6c79 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 14 Apr 2017 20:32:49 +0200 Subject: power: supply: max17042_battery: Add default platform_data fallback data Some x86 machines use a max17047 fuel-gauge and x86 might be missing platform_data if not provided by SFI. This commit adds default platform_data as fallback option so that the driver can work on boards where no platform_data is provided. Since not all boards have a thermistor hooked up, set temp_min to 0 and change the health checks from temp <= temp_min to temp < temp_min to not trigger on such boards (where temp reads 0). Signed-off-by: Hans de Goede Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max17042_battery.c | 60 +++++++++++++++++++++++++++++---- include/linux/power/max17042_battery.h | 6 +++- 2 files changed, 58 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index a51b2965cd5c..f0ff6e880ff2 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -150,12 +150,12 @@ static int max17042_get_battery_health(struct max17042_chip *chip, int *health) if (ret < 0) goto health_error; - if (temp <= chip->pdata->temp_min) { + if (temp < chip->pdata->temp_min) { *health = POWER_SUPPLY_HEALTH_COLD; goto out; } - if (temp >= chip->pdata->temp_max) { + if (temp > chip->pdata->temp_max) { *health = POWER_SUPPLY_HEALTH_OVERHEAT; goto out; } @@ -772,8 +772,9 @@ static void max17042_init_worker(struct work_struct *work) #ifdef CONFIG_OF static struct max17042_platform_data * -max17042_get_pdata(struct device *dev) +max17042_get_pdata(struct max17042_chip *chip) { + struct device *dev = &chip->client->dev; struct device_node *np = dev->of_node; u32 prop; struct max17042_platform_data *pdata; @@ -806,10 +807,55 @@ max17042_get_pdata(struct device *dev) return pdata; } #else +static struct max17042_reg_data max17047_default_pdata_init_regs[] = { + /* + * Some firmwares do not set FullSOCThr, Enable End-of-Charge Detection + * when the voltage FG reports 95%, as recommended in the datasheet. 
+ */ + { MAX17047_FullSOCThr, MAX17042_BATTERY_FULL << 8 }, +}; + static struct max17042_platform_data * -max17042_get_pdata(struct device *dev) +max17042_get_pdata(struct max17042_chip *chip) { - return dev->platform_data; + struct device *dev = &chip->client->dev; + struct max17042_platform_data *pdata; + int ret, misc_cfg; + + if (dev->platform_data) + return dev->platform_data; + + /* + * The MAX17047 gets used on x86 where we might not have pdata, assume + * the firmware will already have initialized the fuel-gauge and provide + * default values for the non init bits to make things work. + */ + pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return pdata; + + if (chip->chip_type != MAXIM_DEVICE_TYPE_MAX17042) { + pdata->init_data = max17047_default_pdata_init_regs; + pdata->num_init_data = + ARRAY_SIZE(max17047_default_pdata_init_regs); + } + + ret = regmap_read(chip->regmap, MAX17042_MiscCFG, &misc_cfg); + if (ret < 0) + return NULL; + + /* If bits 0-1 are set to 3 then only Voltage readings are used */ + if ((misc_cfg & 0x3) == 0x3) + pdata->enable_current_sense = false; + else + pdata->enable_current_sense = true; + + pdata->vmin = MAX17042_DEFAULT_VMIN; + pdata->vmax = MAX17042_DEFAULT_VMAX; + pdata->temp_min = MAX17042_DEFAULT_TEMP_MIN; + pdata->temp_max = MAX17042_DEFAULT_TEMP_MAX; + + return pdata; } #endif @@ -858,20 +904,20 @@ static int max17042_probe(struct i2c_client *client, return -ENOMEM; chip->client = client; + chip->chip_type = id->driver_data; chip->regmap = devm_regmap_init_i2c(client, &max17042_regmap_config); if (IS_ERR(chip->regmap)) { dev_err(&client->dev, "Failed to initialize regmap\n"); return -EINVAL; } - chip->pdata = max17042_get_pdata(&client->dev); + chip->pdata = max17042_get_pdata(chip); if (!chip->pdata) { dev_err(&client->dev, "no platform data provided\n"); return -EINVAL; } i2c_set_clientdata(client, chip); - chip->chip_type = id->driver_data; psy_cfg.drv_data = chip; /* When current is not measured, diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 522757ac9cd4..3489fb0f9099 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -24,8 +24,12 @@ #define __MAX17042_BATTERY_H_ #define MAX17042_STATUS_BattAbsent (1 << 3) -#define MAX17042_BATTERY_FULL (100) +#define MAX17042_BATTERY_FULL (95) /* Recommend. FullSOCThr value */ #define MAX17042_DEFAULT_SNS_RESISTOR (10000) +#define MAX17042_DEFAULT_VMIN (3000) +#define MAX17042_DEFAULT_VMAX (4500) /* LiHV cell max */ +#define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ +#define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celcius */ #define MAX17042_CHARACTERIZATION_DATA_SIZE 48 -- cgit v1.2.3-70-g09d2 From a9df22c00d7c2c9c2944c62f1b819de6c214660f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 14 Apr 2017 20:32:51 +0200 Subject: power: supply: max17042_battery: Add support for the STATUS property Userspace prefers the driver having a status property over having to guess itself. Specifically this will properly make the GNOME3 UI (and likely others) properly show discharging / charging / full status, instead of always showing discharging as status. Note that in the case there is no charger driver supplying the max17042, then a status of unknown will get returned. At least upower treats this the same as not having a status attribute, so in this case nothing changes from a userspace pov. 
Signed-off-by: Hans de Goede Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max17042_battery.c | 46 +++++++++++++++++++++++++++++++++ include/linux/power/max17042_battery.h | 3 +++ 2 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index f0ff6e880ff2..62efe7eeb3f8 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -76,6 +76,7 @@ struct max17042_chip { }; static enum power_supply_property max17042_battery_props[] = { + POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_PRESENT, POWER_SUPPLY_PROP_CYCLE_COUNT, POWER_SUPPLY_PROP_VOLTAGE_MAX, @@ -113,6 +114,46 @@ static int max17042_get_temperature(struct max17042_chip *chip, int *temp) return 0; } +static int max17042_get_status(struct max17042_chip *chip, int *status) +{ + int ret, charge_full, charge_now; + + ret = power_supply_am_i_supplied(chip->battery); + if (ret < 0) { + *status = POWER_SUPPLY_STATUS_UNKNOWN; + return 0; + } + if (ret == 0) { + *status = POWER_SUPPLY_STATUS_DISCHARGING; + return 0; + } + + /* + * The MAX170xx has builtin end-of-charge detection and will update + * FullCAP to match RepCap when it detects end of charging. + * + * When this cycle the battery gets charged to a higher (calculated) + * capacity then the previous cycle then FullCAP will get updated + * contineously once end-of-charge detection kicks in, so allow the + * 2 to differ a bit. + */ + + ret = regmap_read(chip->regmap, MAX17042_FullCAP, &charge_full); + if (ret < 0) + return ret; + + ret = regmap_read(chip->regmap, MAX17042_RepCap, &charge_now); + if (ret < 0) + return ret; + + if ((charge_full - charge_now) <= MAX17042_FULL_THRESHOLD) + *status = POWER_SUPPLY_STATUS_FULL; + else + *status = POWER_SUPPLY_STATUS_CHARGING; + + return 0; +} + static int max17042_get_battery_health(struct max17042_chip *chip, int *health) { int temp, vavg, vbatt, ret; @@ -182,6 +223,11 @@ static int max17042_get_property(struct power_supply *psy, return -EAGAIN; switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + ret = max17042_get_status(chip, &val->intval); + if (ret < 0) + return ret; + break; case POWER_SUPPLY_PROP_PRESENT: ret = regmap_read(map, MAX17042_STATUS, &data); if (ret < 0) diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 3489fb0f9099..a7ed29baf44a 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -31,6 +31,9 @@ #define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ #define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celcius */ +/* Consider RepCap which is less then 10 units below FullCAP full */ +#define MAX17042_FULL_THRESHOLD 10 + #define MAX17042_CHARACTERIZATION_DATA_SIZE 48 enum max17042_register { -- cgit v1.2.3-70-g09d2 From 45753c5f315749711b935a2506ee5c10eef5c23d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 May 2017 10:31:18 +0200 Subject: srcu: Debloat the <linux/rcu_segcblist.h> header Linus noticed that the <linux/rcu_segcblist.h> has huge inline functions which should not be inline at all. 
As a first step in cleaning this up, move them all to kernel/rcu/ and only keep an absolute minimum of data type defines in the header: before: -rw-r--r-- 1 mingo mingo 22284 May 2 10:25 include/linux/rcu_segcblist.h after: -rw-r--r-- 1 mingo mingo 3180 May 2 10:22 include/linux/rcu_segcblist.h More can be done, such as uninlining the large functions, which inlining is unjustified even if it's an RCU internal matter. Reported-by: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 628 +--------------------------------------- kernel/rcu/rcu_segcblist.h | 645 ++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/srcutiny.c | 1 + kernel/rcu/srcutree.c | 1 + kernel/rcu/tree.h | 3 +- 5 files changed, 652 insertions(+), 626 deletions(-) create mode 100644 kernel/rcu/rcu_segcblist.h (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index ced8f313fd05..ba4d2621d9ca 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -20,8 +20,8 @@ * Authors: Paul E. McKenney */ -#ifndef __KERNEL_RCU_SEGCBLIST_H -#define __KERNEL_RCU_SEGCBLIST_H +#ifndef __INCLUDE_LINUX_RCU_SEGCBLIST_H +#define __INCLUDE_LINUX_RCU_SEGCBLIST_H /* Simple unsegmented callback lists. */ struct rcu_cblist { @@ -33,102 +33,6 @@ struct rcu_cblist { #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } -/* Initialize simple callback list. */ -static inline void rcu_cblist_init(struct rcu_cblist *rclp) -{ - rclp->head = NULL; - rclp->tail = &rclp->head; - rclp->len = 0; - rclp->len_lazy = 0; -} - -/* Is simple callback list empty? */ -static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) -{ - return !rclp->head; -} - -/* Return number of callbacks in simple callback list. */ -static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) -{ - return rclp->len; -} - -/* Return number of lazy callbacks in simple callback list. */ -static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) -{ - return rclp->len_lazy; -} - -/* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. - */ -static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - -/* - * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. - */ -static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) -{ - struct rcu_head *rhp; - - rhp = rclp->head; - if (!rhp) - return NULL; - rclp->len--; - rclp->head = rhp->next; - if (!rclp->head) - rclp->tail = &rclp->head; - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. 
- */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(rcu_cblist_empty(rclp)); - return rclp->tail; -} - /* Complicated segmented callback lists. ;-) */ /* @@ -183,530 +87,4 @@ struct rcu_segcblist { .tails[RCU_NEXT_TAIL] = &n.head, \ } -/* - * Initialize an rcu_segcblist structure. - */ -static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) -{ - int i; - - BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); - BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); - rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = &rsclp->head; - rsclp->len = 0; - rsclp->len_lazy = 0; -} - -/* - * Is the specified rcu_segcblist structure empty? - * - * But careful! The fact that the ->head field is NULL does not - * necessarily imply that there are no callbacks associated with - * this structure. When callbacks are being invoked, they are - * removed as a group. If callback invocation must be preempted, - * the remaining callbacks will be added back to the list. Either - * way, the counts are updated later. - * - * So it is often the case that rcu_segcblist_n_cbs() should be used - * instead. - */ -static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) -{ - return !rsclp->head; -} - -/* Return number of callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) -{ - return READ_ONCE(rsclp->len); -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len - rsclp->len_lazy; -} - -/* - * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? - */ -static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) -{ - return !!rsclp->tails[RCU_NEXT_TAIL]; -} - -/* - * Disable the specified rcu_segcblist structure, so that callbacks can - * no longer be posted to it. This structure must be empty. - */ -static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); - rsclp->tails[RCU_NEXT_TAIL] = NULL; -} - -/* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - -/* - * Are all segments following the specified segment of the specified - * rcu_segcblist structure empty of callbacks? (The specified - * segment might well contain callbacks.) - */ -static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) -{ - return !*rsclp->tails[seg]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are ready to be invoked? 
- */ -static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are still pending, that is, not yet ready to be invoked? - */ -static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); -} - -/* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. - */ -static inline struct rcu_head * -rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - -/* - * Return a pointer to the first callback in the specified rcu_segcblist - * structure. This is useful for diagnostics. - */ -static inline struct rcu_head * -rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return rsclp->head; - return NULL; -} - -/* - * Return a pointer to the first pending callback in the specified - * rcu_segcblist structure. This is useful just after posting a given - * callback -- if that callback is the first pending callback, then - * you cannot rely on someone else having already started up the required - * grace period. - */ -static inline struct rcu_head * -rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return *rsclp->tails[RCU_DONE_TAIL]; - return NULL; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - -/* - * Enqueue the specified callback onto the specified rcu_segcblist - * structure, updating accounting as needed. Note that the ->len - * field may be accessed locklessly, hence the WRITE_ONCE(). - * The ->len field is used by rcu_barrier() and friends to determine - * if it must post a callback on this structure, and it is OK - * for rcu_barrier() to sometimes post callbacks needlessly, but - * absolutely not OK for it to ever miss posting a callback. - */ -static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) -{ - WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. 
*/ - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is enqueued. */ - rhp->next = NULL; - *rsclp->tails[RCU_NEXT_TAIL] = rhp; - rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; -} - -/* - * Entrain the specified callback onto the specified rcu_segcblist at - * the end of the last non-empty segment. If the entire rcu_segcblist - * is empty, make no change, but return false. - * - * This is intended for use by rcu_barrier()-like primitives, -not- - * for normal grace-period use. IMPORTANT: The callback you enqueue - * will wait for all prior callbacks, NOT necessarily for a grace - * period. You have been warned. - */ -static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) -{ - int i; - - if (rcu_segcblist_n_cbs(rsclp) == 0) - return false; - WRITE_ONCE(rsclp->len, rsclp->len + 1); - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is entrained. */ - rhp->next = NULL; - for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) - break; - *rsclp->tails[i] = rhp; - for (; i <= RCU_NEXT_TAIL; i++) - rsclp->tails[i] = &rhp->next; - return true; -} - -/* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len_lazy += rsclp->len_lazy; - rclp->len += rsclp->len; - rsclp->len_lazy = 0; - WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ -} - -/* - * Extract only those callbacks ready to be invoked from the specified - * rcu_segcblist structure and place them in the specified rcu_cblist - * structure. - */ -static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_ready_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = rsclp->head; - rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - rclp->tail = rsclp->tails[RCU_DONE_TAIL]; - for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) - if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) - rsclp->tails[i] = &rsclp->head; -} - -/* - * Extract only those callbacks still pending (not yet ready to be - * invoked) from the specified rcu_segcblist structure and place them in - * the specified rcu_cblist structure. Note that this loses information - * about any callbacks that might have been partway done waiting for - * their grace period. Too bad! They will have to start over. - */ -static inline void -rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_pend_cbs(rsclp)) - return; /* Nothing to do. 
*/ - *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; - rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Move the entire contents of the specified rcu_segcblist structure, - * counts, callbacks, and all, to the specified rcu_cblist structure. - * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? - * @@@ Memory barrier needed? (Not if only used at boot time...) - */ -static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rcu_segcblist_extract_done_cbs(rsclp, rclp); - rcu_segcblist_extract_pend_cbs(rsclp, rclp); - rcu_segcblist_extract_count(rsclp, rclp); -} - -/* - * Insert counts from the specified rcu_cblist structure in the - * specified rcu_segcblist structure. - */ -static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rsclp->len_lazy += rclp->len_lazy; - /* ->len sampled locklessly. */ - WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); - rclp->len_lazy = 0; - rclp->len = 0; -} - -/* - * Move callbacks from the specified rcu_cblist to the beginning of the - * done-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rclp->head) - return; /* No callbacks to move. */ - *rclp->tail = rsclp->head; - rsclp->head = rclp->head; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) - if (&rsclp->head == rsclp->tails[i]) - rsclp->tails[i] = rclp->tail; - else - break; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Move callbacks from the specified rcu_cblist to the end of the - * new-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - if (!rclp->head) - return; /* Nothing to do. */ - *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; - rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Advance the callbacks in the specified rcu_segcblist structure based - * on the current value passed in for the grace-period counter. - */ -static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i, j; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) - return; - - /* - * Find all callbacks whose ->gp_seq numbers indicate that they - * are ready to invoke, and put them into the RCU_DONE_TAIL segment. - */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - break; - rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; - } - - /* If no callbacks moved, nothing more need be done. */ - if (i == RCU_WAIT_TAIL) - return; - - /* Clean up tail pointers that might have been misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; - - /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. - */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) - break; /* No more callbacks. 
*/ - rsclp->tails[j] = rsclp->tails[i]; - rsclp->gp_seq[j] = rsclp->gp_seq[i]; - } -} - -/* - * "Accelerate" callbacks based on more-accurate grace-period information. - * The reason for this is that RCU does not synchronize the beginnings and - * ends of grace periods, and that callbacks are posted locally. This in - * turn means that the callbacks must be labelled conservatively early - * on, as getting exact information would degrade both performance and - * scalability. When more accurate grace-period information becomes - * available, previously posted callbacks can be "accelerated", marking - * them to complete at the end of the earlier grace period. - * - * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number seq at which new callbacks would become - * ready to invoke. Returns true if there are callbacks that won't be - * ready to invoke until seq, false otherwise. - */ -static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) - return false; - - /* - * Find the segment preceding the oldest segment of callbacks - * whose ->gp_seq[] completion is at or after that passed in via - * "seq", skipping any empty segments. This oldest segment, along - * with any later segments, can be merged in with any newly arrived - * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" - * as their ->gp_seq[] grace-period completion sequence number. - */ - for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && - ULONG_CMP_LT(rsclp->gp_seq[i], seq)) - break; - - /* - * If all the segments contain callbacks that correspond to - * earlier grace-period sequence numbers than "seq", leave. - * Assuming that the rcu_segcblist structure has enough - * segments in its arrays, this can only happen if some of - * the non-done segments contain callbacks that really are - * ready to invoke. This situation will get straightened - * out by the next call to rcu_segcblist_advance(). - * - * Also advance to the oldest segment of callbacks whose - * ->gp_seq[] completion is at or after that passed in via "seq", - * skipping any empty segments. - */ - if (++i >= RCU_NEXT_TAIL) - return false; - - /* - * Merge all later callbacks, including newly arrived callbacks, - * into the segment located by the for-loop above. Assign "seq" - * as the ->gp_seq[] value in order to correctly handle the case - * where there were no pending callbacks in the rcu_segcblist - * structure other than in the RCU_NEXT_TAIL segment. - */ - for (; i < RCU_NEXT_TAIL; i++) { - rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; - rsclp->gp_seq[i] = seq; - } - return true; -} - -/* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. 
- */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; -} - -#endif /* __KERNEL_RCU_SEGCBLIST_H */ +#endif /* __INCLUDE_LINUX_RCU_SEGCBLIST_H */ diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h new file mode 100644 index 000000000000..d98d2f9b8d59 --- /dev/null +++ b/kernel/rcu/rcu_segcblist.h @@ -0,0 +1,645 @@ +/* + * RCU segmented callback lists + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#include + +/* Initialize simple callback list. */ +static inline void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* Is simple callback list empty? */ +static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) +{ + return !rclp->head; +} + +/* Return number of callbacks in simple callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return rclp->len; +} + +/* Return number of lazy callbacks in simple callback list. */ +static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) +{ + return rclp->len_lazy; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. 
+ */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(rcu_cblist_empty(rclp)); + return rclp->tail; +} + +/* + * Initialize an rcu_segcblist structure. + */ +static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? 
+ */ +static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +static inline struct rcu_head * +rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +static inline struct rcu_head * +rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. + */ +static inline struct rcu_head * +rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? + */ +static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. 
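 *
 * (Illustrative sketch only, not taken from this patch: a typical caller
 * posts a callback with interrupts disabled, roughly as in
 *
 *	local_irq_save(flags);
 *	rcu_segcblist_enqueue(&my_rsclp, rhp, false);
 *	local_irq_restore(flags);
 *
 * so that this WRITE_ONCE() pairs with the READ_ONCE() in
 * rcu_segcblist_n_cbs(); "my_rsclp" and "rhp" are placeholder names.)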
*/ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +static inline void +rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. 
*/ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Move the entire contents of the specified rcu_segcblist structure, + * counts, callbacks, and all, to the specified rcu_cblist structure. + * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? + * @@@ Memory barrier needed? (Not if only used at boot time...) + */ +static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rcu_segcblist_extract_done_cbs(rsclp, rclp); + rcu_segcblist_extract_pend_cbs(rsclp, rclp); + rcu_segcblist_extract_count(rsclp, rclp); +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. 
*/ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. + */ +static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. 
+ */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b8293527ee18..36e1f82faed1 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -30,6 +30,7 @@ #include #include +#include "rcu_segcblist.h" #include "rcu.h" static int init_srcu_struct_fields(struct srcu_struct *sp) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 87b070de6371..3ae8474557df 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -38,6 +38,7 @@ #include #include "rcu.h" +#include "rcu_segcblist.h" ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ module_param(exp_holdoff, ulong, 0444); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0e598ab08fea..ba38262c3554 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,9 +30,10 @@ #include #include #include -#include #include +#include "rcu_segcblist.h" + /* * Dynticks per-CPU state. */ -- cgit v1.2.3-70-g09d2 From 9b2bbdb227588455afcc3b03475fa9b0a35d83af Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Mar 2017 18:19:39 +0200 Subject: virtio: wrap find_vqs We are going to add more parameters to find_vqs, let's wrap the call so we don't need to tweak all drivers every time. Signed-off-by: Michael S. Tsirkin --- drivers/block/virtio_blk.c | 3 +-- drivers/char/virtio_console.c | 6 +++--- drivers/crypto/virtio/virtio_crypto_core.c | 3 +-- drivers/gpu/drm/virtio/virtgpu_kms.c | 3 +-- drivers/net/caif/caif_virtio.c | 3 +-- drivers/net/virtio_net.c | 3 +-- drivers/rpmsg/virtio_rpmsg_bus.c | 2 +- drivers/scsi/virtio_scsi.c | 3 +-- drivers/virtio/virtio_balloon.c | 3 +-- drivers/virtio/virtio_input.c | 3 +-- include/linux/virtio_config.h | 9 +++++++++ net/vmw_vsock/virtio_transport.c | 6 +++--- 12 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1d4c9f8bc1e1..c08c30c35035 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -455,8 +455,7 @@ static int init_vq(struct virtio_blk *vblk) } /* Discover virtqueues and write information to configuration. */ - err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names, - &desc); + err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) goto out; diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 87fe111d0be6..d0699c5fec43 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1945,9 +1945,9 @@ static int init_vqs(struct ports_device *portdev) } } /* Find the queues. 
*/ - err = portdev->vdev->config->find_vqs(portdev->vdev, nr_queues, vqs, - io_callbacks, - (const char **)io_names, NULL); + err = virtio_find_vqs(portdev->vdev, nr_queues, vqs, + io_callbacks, + (const char **)io_names, NULL); if (err) goto free; diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c index 21472e427f6f..a111cd72797b 100644 --- a/drivers/crypto/virtio/virtio_crypto_core.c +++ b/drivers/crypto/virtio/virtio_crypto_core.c @@ -119,8 +119,7 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi) names[i] = vi->data_vq[i].name; } - ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks, - names, NULL); + ret = virtio_find_vqs(vi->vdev, total_vqs, vqs, callbacks, names, NULL); if (ret) goto err_find; diff --git a/drivers/gpu/drm/virtio/virtgpu_kms.c b/drivers/gpu/drm/virtio/virtgpu_kms.c index 491866865c33..1e1c90b30d4a 100644 --- a/drivers/gpu/drm/virtio/virtgpu_kms.c +++ b/drivers/gpu/drm/virtio/virtgpu_kms.c @@ -175,8 +175,7 @@ int virtio_gpu_driver_load(struct drm_device *dev, unsigned long flags) DRM_INFO("virgl 3d acceleration not supported by guest\n"); #endif - ret = vgdev->vdev->config->find_vqs(vgdev->vdev, 2, vqs, - callbacks, names, NULL); + ret = virtio_find_vqs(vgdev->vdev, 2, vqs, callbacks, names, NULL); if (ret) { DRM_ERROR("failed to find virt queues\n"); goto err_vqs; diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index bc0eb47eccee..6122768c8644 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -679,8 +679,7 @@ static int cfv_probe(struct virtio_device *vdev) goto err; /* Get the TX virtio ring. This is a "guest side vring". */ - err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names, - NULL); + err = virtio_find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names, NULL); if (err) goto err; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f36584616e7d..71f447ab440e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2079,8 +2079,7 @@ static int virtnet_find_vqs(struct virtnet_info *vi) names[txq2vq(i)] = vi->sq[i].name; } - ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks, - names, NULL); + ret = virtio_find_vqs(vi->vdev, total_vqs, vqs, callbacks, names, NULL); if (ret) goto err_find; diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 5e66e081027e..f7cade09d38a 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -869,7 +869,7 @@ static int rpmsg_probe(struct virtio_device *vdev) init_waitqueue_head(&vrp->sendq); /* We expect two virtqueues, rx and tx (and in this order) */ - err = vdev->config->find_vqs(vdev, 2, vqs, vq_cbs, names, NULL); + err = virtio_find_vqs(vdev, 2, vqs, vq_cbs, names, NULL); if (err) goto free_vrp; diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 939c47df73fa..e9222dcb9707 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -870,8 +870,7 @@ static int virtscsi_init(struct virtio_device *vdev, } /* Discover virtqueues and write information to configuration. 
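 *
 * (Illustrative sketch only: with the wrapper introduced by this patch, a
 * driver now writes
 *
 *	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
 *
 * instead of reaching through vdev->config->find_vqs() itself, which is
 * exactly the conversion made in the hunk that follows.)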
*/ - err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names, - &desc); + err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) goto out; diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 34adf9b9c053..408c174ef0d5 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -418,8 +418,7 @@ static int init_vqs(struct virtio_balloon *vb) * optionally stat. */ nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2; - err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names, - NULL); + err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL); if (err) return err; diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index 79f1293cda93..3a0468f2ceb0 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -173,8 +173,7 @@ static int virtinput_init_vqs(struct virtio_input *vi) static const char * const names[] = { "events", "status" }; int err; - err = vi->vdev->config->find_vqs(vi->vdev, 2, vqs, cbs, names, - NULL); + err = virtio_find_vqs(vi->vdev, 2, vqs, cbs, names, NULL); if (err) return err; vi->evt = vqs[0]; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8355bab175e1..47f3d805c290 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -179,6 +179,15 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, return vq; } +static inline +int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], + struct irq_affinity *desc) +{ + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, desc); +} + /** * virtio_device_ready - enable vq use in probe function * @vdev: the device diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 68675a151f22..97e26e2955e1 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -573,9 +573,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->vdev = vdev; - ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, - vsock->vqs, callbacks, names, - NULL); + ret = virtio_find_vqs(vsock->vdev, VSOCK_VQ_MAX, + vsock->vqs, callbacks, names, + NULL); if (ret < 0) goto out; -- cgit v1.2.3-70-g09d2 From f94682dde5ed23eed13533a37dfce942e60ade4e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Mar 2017 18:32:29 +0200 Subject: virtio: add context flag to find vqs Allows maintaining extra context per vq. For ease of use, passing in NULL is legal and disables the feature for all vqs. Includes fixes by Christian for s390, acked by Cornelia. Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck Signed-off-by: Michael S. 
Tsirkin --- drivers/misc/mic/vop/vop_main.c | 9 ++++++--- drivers/remoteproc/remoteproc_virtio.c | 10 ++++++---- drivers/s390/virtio/kvm_virtio.c | 8 +++++--- drivers/s390/virtio/virtio_ccw.c | 7 ++++--- drivers/virtio/virtio_mmio.c | 8 +++++--- drivers/virtio/virtio_pci_common.c | 17 +++++++++++------ drivers/virtio/virtio_pci_common.h | 4 +++- drivers/virtio/virtio_pci_legacy.c | 4 +++- drivers/virtio/virtio_pci_modern.c | 12 ++++++++---- drivers/virtio/virtio_ring.c | 7 +++++-- include/linux/virtio_config.h | 18 +++++++++++++++--- include/linux/virtio_ring.h | 3 +++ 12 files changed, 74 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/mic/vop/vop_main.c b/drivers/misc/mic/vop/vop_main.c index c2e29d7f0de8..a341938c7e2c 100644 --- a/drivers/misc/mic/vop/vop_main.c +++ b/drivers/misc/mic/vop/vop_main.c @@ -278,7 +278,7 @@ static void vop_del_vqs(struct virtio_device *dev) static struct virtqueue *vop_find_vq(struct virtio_device *dev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct _vop_vdev *vdev = to_vopvdev(dev); struct vop_device *vpdev = vdev->vpdev; @@ -314,6 +314,7 @@ static struct virtqueue *vop_find_vq(struct virtio_device *dev, le16_to_cpu(config.num), MIC_VIRTIO_RING_ALIGN, dev, false, + ctx, (void __force *)va, vop_notify, callback, name); if (!vq) { err = -ENOMEM; @@ -374,7 +375,8 @@ unmap: static int vop_find_vqs(struct virtio_device *dev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc) + const char * const names[], const bool *ctx, + struct irq_affinity *desc) { struct _vop_vdev *vdev = to_vopvdev(dev); struct vop_device *vpdev = vdev->vpdev; @@ -388,7 +390,8 @@ static int vop_find_vqs(struct virtio_device *dev, unsigned nvqs, for (i = 0; i < nvqs; ++i) { dev_dbg(_vop_dev(vdev), "%s: %d: %s\n", __func__, i, names[i]); - vqs[i] = vop_find_vq(dev, i, callbacks[i], names[i]); + vqs[i] = vop_find_vq(dev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error; diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c index 0142cc3f0c91..294634836b32 100644 --- a/drivers/remoteproc/remoteproc_virtio.c +++ b/drivers/remoteproc/remoteproc_virtio.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(rproc_vq_interrupt); static struct virtqueue *rp_find_vq(struct virtio_device *vdev, unsigned int id, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct rproc_vdev *rvdev = vdev_to_rvdev(vdev); struct rproc *rproc = vdev_to_rproc(vdev); @@ -103,8 +103,8 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev, * Create the new vq, and tell virtio we're not interested in * the 'weak' smp barriers, since we're talking with a real device. 
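 *
 * (Illustrative sketch only, not a caller converted by this patch: a
 * driver that wants per-vq context flags can use the
 * virtio_find_vqs_ctx() helper added later in this patch, for example
 *
 *	static const bool ctx[] = { true, false };
 *
 *	err = virtio_find_vqs_ctx(vdev, 2, vqs, callbacks, names, ctx, NULL);
 *
 * while a NULL ctx array, which the plain virtio_find_vqs() wrapper
 * passes, keeps the old behaviour for every queue.)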
*/ - vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, addr, - rproc_virtio_notify, callback, name); + vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, ctx, + addr, rproc_virtio_notify, callback, name); if (!vq) { dev_err(dev, "vring_new_virtqueue %s failed\n", name); rproc_free_vring(rvring); @@ -138,12 +138,14 @@ static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool * ctx, struct irq_affinity *desc) { int i, ret; for (i = 0; i < nvqs; ++i) { - vqs[i] = rp_find_vq(vdev, i, callbacks[i], names[i]); + vqs[i] = rp_find_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) { ret = PTR_ERR(vqs[i]); goto error; diff --git a/drivers/s390/virtio/kvm_virtio.c b/drivers/s390/virtio/kvm_virtio.c index 2ce0b3eb2efe..a99d09a11f05 100644 --- a/drivers/s390/virtio/kvm_virtio.c +++ b/drivers/s390/virtio/kvm_virtio.c @@ -189,7 +189,7 @@ static bool kvm_notify(struct virtqueue *vq) static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct kvm_device *kdev = to_kvmdev(vdev); struct kvm_vqconfig *config; @@ -211,7 +211,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, goto out; vq = vring_new_virtqueue(index, config->num, KVM_S390_VIRTIO_RING_ALIGN, - vdev, true, (void *) config->address, + vdev, true, ctx, (void *) config->address, kvm_notify, callback, name); if (!vq) { err = -ENOMEM; @@ -256,6 +256,7 @@ static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool *ctx, struct irq_affinity *desc) { struct kvm_device *kdev = to_kvmdev(vdev); @@ -266,7 +267,8 @@ static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs, return -ENOENT; for (i = 0; i < nvqs; ++i) { - vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]); + vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) goto error; } diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 0ed209f3d8b0..2a76ea78a0bf 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -484,7 +484,7 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev) static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, int i, vq_callback_t *callback, - const char *name, + const char *name, bool ctx, struct ccw1 *ccw) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); @@ -522,7 +522,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, } vq = vring_new_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev, - true, info->queue, virtio_ccw_kvm_notify, + true, ctx, info->queue, virtio_ccw_kvm_notify, callback, name); if (!vq) { /* For now, we fail if we can't get the requested size. */ @@ -629,6 +629,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool *ctx, struct irq_affinity *desc) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); @@ -642,7 +643,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, for (i = 0; i < nvqs; ++i) { vqs[i] = virtio_ccw_setup_vq(vdev, i, callbacks[i], names[i], - ccw); + ctx ? 
ctx[i] : false, ccw); if (IS_ERR(vqs[i])) { ret = PTR_ERR(vqs[i]); vqs[i] = NULL; diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 78343b8f9034..74dc7170fd35 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -351,7 +351,7 @@ static void vm_del_vqs(struct virtio_device *vdev) static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); struct virtio_mmio_vq_info *info; @@ -388,7 +388,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, /* Create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, - true, true, vm_notify, callback, name); + true, true, ctx, vm_notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; @@ -447,6 +447,7 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool *ctx, struct irq_affinity *desc) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); @@ -459,7 +460,8 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs, return err; for (i = 0; i < nvqs; ++i) { - vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i]); + vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) { vm_del_vqs(vdev); return PTR_ERR(vqs[i]); diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 698d5d06fa03..007a4f366086 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -172,6 +172,7 @@ error: static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -183,7 +184,7 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index, if (!info) return ERR_PTR(-ENOMEM); - vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, + vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; @@ -274,6 +275,7 @@ void vp_del_vqs(struct virtio_device *vdev) static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], bool per_vq_vectors, + const bool *ctx, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -315,6 +317,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, else msix_vec = VP_MSIX_VQ_VECTOR; vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false, msix_vec); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); @@ -345,7 +348,7 @@ error_find: static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[]) + const char * const names[], const bool *ctx) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i, err; @@ -367,6 +370,7 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs, continue; } vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? 
ctx[i] : false, VIRTIO_MSI_NO_VECTOR); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); @@ -383,20 +387,21 @@ out_del_vqs: /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc) + const char * const names[], const bool *ctx, + struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); if (!err) return 0; /* Finally fall back to regular interrupts. */ - return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names); + return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); } const char *vp_bus_name(struct virtio_device *vdev) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index e96334aec1e0..135ee3cf7175 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -102,6 +102,7 @@ struct virtio_pci_device { unsigned idx, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec); void (*del_vq)(struct virtio_pci_vq_info *info); @@ -131,7 +132,8 @@ void vp_del_vqs(struct virtio_device *vdev); /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc); + const char * const names[], const bool *ctx, + struct irq_affinity *desc); const char *vp_bus_name(struct virtio_device *vdev); /* Setup the affinity for a virtqueue: diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index 4bfa48fb1324..2780886e8ba3 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -116,6 +116,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtqueue *vq; @@ -135,7 +136,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, - true, false, vp_notify, callback, name); + true, false, ctx, + vp_notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 8978f109d2d7..2555d80f6eec 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -297,6 +297,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; @@ -328,7 +329,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* create the vring */ vq = vring_create_virtqueue(index, num, SMP_CACHE_BYTES, &vp_dev->vdev, - true, true, vp_notify, callback, name); + true, true, ctx, + vp_notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); @@ -387,12 +389,14 @@ err_map_notify: } static int vp_modern_find_vqs(struct virtio_device 
*vdev, unsigned nvqs, - struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc) + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; - int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, desc); + int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); if (rc) return rc; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 409aeaa49246..b23b5fae468b 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -916,6 +916,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, struct vring vring, struct virtio_device *vdev, bool weak_barriers, + bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) @@ -1019,6 +1020,7 @@ struct virtqueue *vring_create_virtqueue( struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, + bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) @@ -1058,7 +1060,7 @@ struct virtqueue *vring_create_virtqueue( queue_size_in_bytes = vring_size(num, vring_align); vring_init(&vring, num, queue, vring_align); - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, + vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, notify, callback, name); if (!vq) { vring_free_queue(vdev, queue_size_in_bytes, queue, @@ -1079,6 +1081,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, + bool context, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), @@ -1086,7 +1089,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, { struct vring vring; vring_init(&vring, num, pages, vring_align); - return __vring_new_virtqueue(index, vring, vdev, weak_barriers, + return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, notify, callback, name); } EXPORT_SYMBOL_GPL(vring_new_virtqueue); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 47f3d805c290..0133d8a12ccd 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -72,7 +72,8 @@ struct virtio_config_ops { void (*reset)(struct virtio_device *vdev); int (*find_vqs)(struct virtio_device *, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc); + const char * const names[], const bool *ctx, + struct irq_affinity *desc); void (*del_vqs)(struct virtio_device *); u64 (*get_features)(struct virtio_device *vdev); int (*finalize_features)(struct virtio_device *vdev); @@ -173,7 +174,8 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, vq_callback_t *callbacks[] = { c }; const char *names[] = { n }; struct virtqueue *vq; - int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL); + int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL, + NULL); if (err < 0) return ERR_PTR(err); return vq; @@ -185,7 +187,17 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, const char * const names[], struct irq_affinity *desc) { - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, desc); + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc); +} + +static inline +int 
virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) +{ + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, + desc); } /** diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index e8d36938f09a..270cfa81830e 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -71,6 +71,7 @@ struct virtqueue *vring_create_virtqueue(unsigned int index, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, + bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); @@ -80,6 +81,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, struct vring vring, struct virtio_device *vdev, bool weak_barriers, + bool ctx, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name); @@ -93,6 +95,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, + bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), -- cgit v1.2.3-70-g09d2 From 5a08b04f637921e44ba767c07c74b0535504ab71 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 7 Feb 2017 06:15:13 +0200 Subject: virtio: allow extra context per descriptor Allow extra context per descriptor. To avoid slow down for data path, this disables use of indirect descriptors for this vq. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 70 ++++++++++++++++++++++++++++++++++++-------- include/linux/virtio.h | 9 ++++++ 2 files changed, 66 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index b23b5fae468b..5e1b548828e6 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -263,6 +263,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, unsigned int out_sgs, unsigned int in_sgs, void *data, + void *ctx, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -275,6 +276,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, START_USE(vq); BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); if (unlikely(vq->broken)) { END_USE(vq); @@ -389,6 +391,8 @@ static inline int virtqueue_add(struct virtqueue *_vq, vq->desc_state[head].data = data; if (indirect) vq->desc_state[head].indir_desc = desc; + if (ctx) + vq->desc_state[head].indir_desc = ctx; /* Put entry in available array (but don't update avail->idx until they * do sync). 
*/ @@ -461,7 +465,8 @@ int virtqueue_add_sgs(struct virtqueue *_vq, for (sg = sgs[i]; sg; sg = sg_next(sg)) total_sg++; } - return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp); + return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, + data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); @@ -483,7 +488,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 1, 0, data, gfp); + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); @@ -505,10 +510,34 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); +/** + * virtqueue_add_inbuf_ctx - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @ctx: extra context for the token + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + void *ctx, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); + /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue @@ -598,7 +627,8 @@ bool virtqueue_kick(struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_kick); -static void detach_buf(struct vring_virtqueue *vq, unsigned int head) +static void detach_buf(struct vring_virtqueue *vq, unsigned int head, + void **ctx) { unsigned int i, j; __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); @@ -622,10 +652,15 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) /* Plus final descriptor */ vq->vq.num_free++; - /* Free the indirect table, if any, now that it's unmapped. */ - if (vq->desc_state[head].indir_desc) { + if (vq->indirect) { struct vring_desc *indir_desc = vq->desc_state[head].indir_desc; - u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); + u32 len; + + /* Free the indirect table, if any, now that it's unmapped. */ + if (!indir_desc) + return; + + len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); BUG_ON(!(vq->vring.desc[head].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); @@ -634,8 +669,10 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) for (j = 0; j < len / sizeof(struct vring_desc); j++) vring_unmap_one(vq, &indir_desc[j]); - kfree(vq->desc_state[head].indir_desc); + kfree(indir_desc); vq->desc_state[head].indir_desc = NULL; + } else if (ctx) { + *ctx = vq->desc_state[head].indir_desc; } } @@ -660,7 +697,8 @@ static inline bool more_used(const struct vring_virtqueue *vq) * Returns NULL if there are no used buffers, or the "data" token * handed to virtqueue_add_*(). 
*/ -void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, + void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); void *ret; @@ -698,7 +736,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) /* detach_buf clears data, so grab it now. */ ret = vq->desc_state[i].data; - detach_buf(vq, i); + detach_buf(vq, i, ctx); vq->last_used_idx++; /* If we expect an interrupt for the next entry, tell host * by writing event index and flush out the write before @@ -715,8 +753,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) END_USE(vq); return ret; } -EXPORT_SYMBOL_GPL(virtqueue_get_buf); +EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); +void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +{ + return virtqueue_get_buf_ctx(_vq, len, NULL); +} +EXPORT_SYMBOL_GPL(virtqueue_get_buf); /** * virtqueue_disable_cb - disable callbacks * @vq: the struct virtqueue we're talking about. @@ -878,7 +921,7 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) continue; /* detach_buf clears data, so grab it now. */ buf = vq->desc_state[i].data; - detach_buf(vq, i); + detach_buf(vq, i, NULL); vq->avail_idx_shadow--; vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); END_USE(vq); @@ -951,7 +994,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->last_add_time_valid = false; #endif - vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); + vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && + !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); /* No callback? Tell other side not to bother us. */ diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 7edfbdb55a99..ed04753278d4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -44,6 +44,12 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + void *ctx, + gfp_t gfp); + int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, @@ -59,6 +65,9 @@ bool virtqueue_notify(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); +void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, + void **ctx); + void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); -- cgit v1.2.3-70-g09d2 From 74da4a0f574d11ed60dbe50a1e5e942e20476590 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 3 Mar 2017 18:16:07 +0100 Subject: libceph, ceph: always advertise all supported features No reason to hide CephFS-specific features in the rbd case. Recent feature bits mix RADOS and CephFS-specific stuff together anyway. 
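As an illustration of the simplified API after this change, a libceph consumer now creates a client without passing any feature masks; the option/private pointer names below are placeholders for this sketch, not code from the series:

  struct ceph_client *client;

  /* CEPH_FEATURES_SUPPORTED_DEFAULT already advertises the CephFS bits */
  client = ceph_create_client(my_opts, my_private);
  if (IS_ERR(client))
          return PTR_ERR(client);
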
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- fs/ceph/super.c | 7 +------ include/linux/ceph/ceph_features.h | 4 ++++ include/linux/ceph/libceph.h | 5 +---- net/ceph/ceph_common.c | 16 ++++++---------- 5 files changed, 13 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 517838b65964..16010183b703 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -731,7 +731,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); - rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); + rbdc->client = ceph_create_client(ceph_opts, rbdc); if (IS_ERR(rbdc->client)) goto out_rbdc; ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 0ec8d0114e57..8b9f645006da 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -544,10 +544,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const u64 supported_features = - CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; - const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; @@ -556,8 +552,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc, supported_features, - required_features); + fsc->client = ceph_create_client(opt, fsc); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index ae2f66833762..fd8b2953c78f 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -105,8 +105,10 @@ static inline u64 ceph_sanitize_features(u64 features) */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_FLOCK | \ CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ + CEPH_FEATURE_DIRLAYOUTHASH | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ @@ -114,11 +116,13 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_MSG_AUTH | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ + CEPH_FEATURE_MDSENC | \ CEPH_FEATURE_OSDHASHPSPOOL | \ CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER | \ CEPH_FEATURE_OSDMAP_ENC | \ + CEPH_FEATURE_MDS_INLINE_DATA | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 88cd5dc8e238..cecbf5a26e5a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -262,10 +262,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); -extern struct ceph_client *ceph_create_client(struct ceph_options *opt, - void *private, - u64 supported_features, - u64 required_features); +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); diff --git 
a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 108533859a53..4701d067169f 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -596,9 +596,7 @@ EXPORT_SYMBOL(ceph_client_gid); /* * create a fresh client instance */ -struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - u64 supported_features, - u64 required_features) +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; @@ -615,14 +613,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, init_waitqueue_head(&client->auth_wq); client->auth_err = 0; - if (!ceph_test_opt(client, NOMSGAUTH)) - required_features |= CEPH_FEATURE_MSG_AUTH; - client->extra_mon_dispatch = NULL; - client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | - supported_features; - client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | - required_features; + client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT; + + if (!ceph_test_opt(client, NOMSGAUTH)) + client->required_features |= CEPH_FEATURE_MSG_AUTH; /* msgr */ if (ceph_test_opt(client, MYIP)) -- cgit v1.2.3-70-g09d2 From 06dfa96399a9a3280dd81e47f8696aa89f1783e7 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:27 +0200 Subject: libceph: convert ceph_snap_context.nref from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 ++- net/ceph/snapshot.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index cecbf5a26e5a..3229ae6c7846 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -161,7 +162,7 @@ struct ceph_client { * dirtied. 
*/ struct ceph_snap_context { - atomic_t nref; + refcount_t nref; u64 seq; u32 num_snaps; u64 snaps[]; diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c index 705414e78ae0..e14a5d038656 100644 --- a/net/ceph/snapshot.c +++ b/net/ceph/snapshot.c @@ -49,7 +49,7 @@ struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, if (!snapc) return NULL; - atomic_set(&snapc->nref, 1); + refcount_set(&snapc->nref, 1); snapc->num_snaps = snap_count; return snapc; @@ -59,7 +59,7 @@ EXPORT_SYMBOL(ceph_create_snap_context); struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) { if (sc) - atomic_inc(&sc->nref); + refcount_inc(&sc->nref); return sc; } EXPORT_SYMBOL(ceph_get_snap_context); @@ -68,7 +68,7 @@ void ceph_put_snap_context(struct ceph_snap_context *sc) { if (!sc) return; - if (atomic_dec_and_test(&sc->nref)) { + if (refcount_dec_and_test(&sc->nref)) { /*printk(" deleting snap_context %p\n", sc);*/ kfree(sc); } -- cgit v1.2.3-70-g09d2 From 02113a0f14e20bd8e675d7cec16db6eaaf2b2380 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:28 +0200 Subject: libceph: convert ceph_osd.o_ref from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 3 ++- net/ceph/osd_client.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c125b5d9e13c..d6a625e75040 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,7 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); /* a given osd we're communicating with */ struct ceph_osd { - atomic_t o_ref; + refcount_t o_ref; struct ceph_osd_client *o_osdc; int o_osd; int o_incarnation; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e15ea9e4c495..b4500a8ab8b3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1005,7 +1005,7 @@ static bool osd_registered(struct ceph_osd *osd) */ static void osd_init(struct ceph_osd *osd) { - atomic_set(&osd->o_ref, 1); + refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; @@ -1050,9 +1050,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) static struct ceph_osd *get_osd(struct ceph_osd *osd) { - if (atomic_inc_not_zero(&osd->o_ref)) { - dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, - atomic_read(&osd->o_ref)); + if (refcount_inc_not_zero(&osd->o_ref)) { + dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1, + refcount_read(&osd->o_ref)); return osd; } else { dout("get_osd %p FAIL\n", osd); @@ -1062,9 +1062,9 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd) static void put_osd(struct ceph_osd *osd) { - dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), - atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) { + dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref), + refcount_read(&osd->o_ref) - 1); + if (refcount_dec_and_test(&osd->o_ref)) { osd_cleanup(osd); 
kfree(osd); } @@ -4126,7 +4126,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) close_osd(osd); } up_write(&osdc->lock); - WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); + WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1); osd_cleanup(&osdc->homeless_osd); WARN_ON(!list_empty(&osdc->osd_lru)); -- cgit v1.2.3-70-g09d2 From 0e1a5ee6577e43e5be55369d398107080b360941 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:29 +0200 Subject: libceph: convert ceph_pagelist.refcnt from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 +- include/linux/ceph/pagelist.h | 6 +++--- net/ceph/pagelist.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 074490542b4c..b16f1cf552a8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1991,7 +1991,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; - atomic_inc(&pagelist->refcnt); + refcount_inc(&pagelist->refcnt); ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 13d71fe18b0c..75a7db21457d 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -2,7 +2,7 @@ #define __FS_CEPH_PAGELIST_H #include -#include +#include #include #include @@ -13,7 +13,7 @@ struct ceph_pagelist { size_t room; struct list_head free_list; size_t num_pages_free; - atomic_t refcnt; + refcount_t refcnt; }; struct ceph_pagelist_cursor { @@ -30,7 +30,7 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl) pl->room = 0; INIT_LIST_HEAD(&pl->free_list); pl->num_pages_free = 0; - atomic_set(&pl->refcnt, 1); + refcount_set(&pl->refcnt, 1); } extern void ceph_pagelist_release(struct ceph_pagelist *pl); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 6864007e64fc..ce09f73be759 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -16,7 +16,7 @@ static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) void ceph_pagelist_release(struct ceph_pagelist *pl) { - if (!atomic_dec_and_test(&pl->refcnt)) + if (!refcount_dec_and_test(&pl->refcnt)) return; ceph_pagelist_unmap_tail(pl); while (!list_empty(&pl->head)) { -- cgit v1.2.3-70-g09d2 From 76201b6354bb3aa31c7ba2bd42b9cbb8dda71c44 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 28 Mar 2017 17:04:13 +0800 Subject: ceph: allow connecting to mds whose rank >= mdsmap::m_max_mds mdsmap::m_max_mds is the expected count of active mds. It's not the max rank of active mds. User can decrease mdsmap::m_max_mds, but does not stop mds whose rank >= mdsmap::m_max_mds. 
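The core of the fix, condensed from the decode path in the patch below: the client now tracks the real size of the m_info array in a new m_num_mds field and grows the array on demand when a higher rank appears, instead of silently skipping entries whose rank >= m_max_mds:

  if (mds >= m->m_num_mds) {
          int new_num = max(mds + 1, m->m_num_mds * 2);
          void *new_m_info = krealloc(m->m_info,
                                      new_num * sizeof(*m->m_info),
                                      GFP_NOFS | __GFP_ZERO);
          if (!new_m_info)
                  goto nomem;
          m->m_info = new_m_info;
          m->m_num_mds = new_num;
  }

All loops over MDS entries (the debugfs dump, session setup, and laggy checks) are switched from m_max_mds to m_num_mds accordingly.
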
Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 21 ++++++++++----------- fs/ceph/mds_client.c | 10 +++++----- fs/ceph/mdsmap.c | 44 +++++++++++++++++++++++++++++++++++++------- include/linux/ceph/mdsmap.h | 7 ++++--- 4 files changed, 56 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f2ae393e2c31..4107a887ca34 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p) { int i; struct ceph_fs_client *fsc = s->private; + struct ceph_mdsmap *mdsmap; if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); - seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); - seq_printf(s, "session_timeout %d\n", - fsc->mdsc->mdsmap->m_session_timeout); - seq_printf(s, "session_autoclose %d\n", - fsc->mdsc->mdsmap->m_session_autoclose); - for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { - struct ceph_entity_addr *addr = - &fsc->mdsc->mdsmap->m_info[i].addr; - int state = fsc->mdsc->mdsmap->m_info[i].state; - + mdsmap = fsc->mdsc->mdsmap; + seq_printf(s, "epoch %d\n", mdsmap->m_epoch); + seq_printf(s, "root %d\n", mdsmap->m_root); + seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); + seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); + for (i = 0; i < mdsmap->m_num_mds; i++) { + struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; + int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b16f1cf552a8..2733932bf192 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -441,7 +441,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); @@ -1004,7 +1004,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *ts; int i, mds = session->s_mds; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return; mi = &mdsc->mdsmap->m_info[mds]; @@ -3107,7 +3107,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); - for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { if (mdsc->sessions[i] == NULL) continue; s = mdsc->sessions[i]; @@ -3121,7 +3121,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? 
" (laggy)" : "", ceph_session_state_name(s->s_state)); - if (i >= newmap->m_max_mds || + if (i >= newmap->m_num_mds || memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { @@ -3167,7 +3167,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } - for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 5454e2327a5f..1a748cf88535 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -22,11 +22,11 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) int i; /* special case for one mds */ - if (1 == m->m_max_mds && m->m_info[0].state > 0) + if (1 == m->m_num_mds && m->m_info[0].state > 0) return 0; /* count */ - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) if (m->m_info[i].state > 0) n++; if (n == 0) @@ -135,8 +135,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_session_autoclose = ceph_decode_32(p); m->m_max_file_size = ceph_decode_64(p); m->m_max_mds = ceph_decode_32(p); + m->m_num_mds = m->m_max_mds; - m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); if (m->m_info == NULL) goto nomem; @@ -207,9 +208,20 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds < 0 || mds >= m->m_max_mds || state <= 0) + if (mds < 0 || state <= 0) continue; + if (mds >= m->m_num_mds) { + int new_num = max(mds + 1, m->m_num_mds * 2); + void *new_m_info = krealloc(m->m_info, + new_num * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + m->m_num_mds = new_num; + } + info = &m->m_info[mds]; info->global_id = global_id; info->state = state; @@ -229,6 +241,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = NULL; } } + if (m->m_num_mds > m->m_max_mds) { + /* find max up mds */ + for (i = m->m_num_mds; i >= m->m_max_mds; i--) { + if (i == 0 || m->m_info[i-1].state > 0) + break; + } + m->m_num_mds = i; + } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); @@ -270,12 +290,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) for (i = 0; i < n; i++) { s32 mds = ceph_decode_32(p); - if (mds >= 0 && mds < m->m_max_mds) { + if (mds >= 0 && mds < m->m_num_mds) { if (m->m_info[mds].laggy) num_laggy++; } } m->m_num_laggy = num_laggy; + + if (n > m->m_num_mds) { + void *new_m_info = krealloc(m->m_info, + n * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + } + m->m_num_mds = n; } /* inc */ @@ -341,7 +371,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) kfree(m->m_info[i].export_targets); kfree(m->m_info); kfree(m->m_data_pg_pools); @@ -357,7 +387,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_num_laggy > 0) return false; - for (i = 0; i < m->m_max_mds; i++) { + for (i = 0; i < m->m_num_mds; i++) { if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) nr_active++; } diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 8ed5dc505fbb..d5f783f3226a 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,6 +25,7 @@ struct ceph_mdsmap { u32 
m_session_autoclose; /* seconds */ u64 m_max_file_size; u32 m_max_mds; /* size of m_addr, m_state arrays */ + int m_num_mds; struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ @@ -40,7 +41,7 @@ struct ceph_mdsmap { static inline struct ceph_entity_addr * ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) { - if (w >= m->m_max_mds) + if (w >= m->m_num_mds) return NULL; return &m->m_info[w].addr; } @@ -48,14 +49,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) { BUG_ON(w < 0); - if (w >= m->m_max_mds) + if (w >= m->m_num_mds) return CEPH_MDS_STATE_DNE; return m->m_info[w].state; } static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) { - if (w >= 0 && w < m->m_max_mds) + if (w >= 0 && w < m->m_num_mds) return m->m_info[w].laggy; return false; } -- cgit v1.2.3-70-g09d2 From 79162547b76e4979b21ef80c9629ada94a51a59b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 5 Apr 2017 12:54:05 -0400 Subject: ceph: make seeky readdir more efficient Current cephfs client uses string to indicate start position of readdir. The string is last entry of previous readdir reply. This approach does not work for seeky readdir because we can not easily convert the new postion to a string. For seeky readdir, mds needs to return dentries from the beginning. Client keeps retrying if the reply does not contain the dentry it wants. In current version of ceph, mds sorts CDentry in its cache in hash order. Client also uses dentry hash to compose dir postion. For seeky readdir, if client passes the hash part of dir postion to mds. mds can avoid replying useless dentries. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 4 ++++ fs/ceph/inode.c | 17 ++++++++++++----- fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 3 ++- include/linux/ceph/ceph_fs.h | 2 ++ 5 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3e9ad501addf..ae61cdf7d489 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -378,7 +378,11 @@ more: ceph_mdsc_put_request(req); return -ENOMEM; } + } else if (is_hash_order(ctx->pos)) { + req->r_args.readdir.offset_hash = + cpu_to_le32(fpos_hash(ctx->pos)); } + req->r_dir_release_cnt = fi->dir_release_count; req->r_dir_ordered_cnt = fi->dir_ordered_count; req->r_readdir_cache_idx = fi->readdir_cache_idx; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d3119fe3ab45..dcce79b84406 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); - if (rinfo->hash_order && req->r_path2) { - last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - req->r_path2, strlen(req->r_path2)); - last_hash = ceph_frag_value(last_hash); + if (rinfo->hash_order) { + if (req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, + strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } else if (rinfo->offset_hash) { + /* mds understands offset_hash */ + WARN_ON_ONCE(req->r_readdir_offset != 2); + last_hash = le32_to_cpu(rhead->args.readdir.offset_hash); + } } if (rinfo->dir_dir && @@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && - !(rinfo->hash_order && req->r_path2)) { + !(rinfo->hash_order 
&& last_hash)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a22688873ec3..8cc4d4e8b077 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index bbebcd55d79e..3e67dd2169fa 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -83,9 +83,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete; bool dir_end; + bool dir_complete; bool hash_order; + bool offset_hash; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index f4b2ee18f38c..1787e4a8e251 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -365,6 +365,7 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) #define CEPH_READDIR_HASH_ORDER (1<<9) +#define CEPH_READDIR_OFFSET_HASH (1<<10) union ceph_mds_request_args { struct { @@ -384,6 +385,7 @@ union ceph_mds_request_args { __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; __le16 flags; + __le32 offset_hash; } __attribute__ ((packed)) readdir; struct { __le32 mode; -- cgit v1.2.3-70-g09d2 From aa26d662b9d44e7f5b0d109e892e537a23471863 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 4 Apr 2017 08:39:36 -0400 Subject: libceph: remove req->r_replay_version Nothing uses this anymore with the removal of the ack vs. commit code. Remove the field and just encode zeroes into place in the request encoding. 
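To keep the wire format unchanged, the request encoder still emits the reassert_version slot; it is simply zero-filled now instead of being copied from the removed field:

  /* reassert_version: kept on the wire for compatibility, always zero */
  memset(p, 0, sizeof(struct ceph_eversion));
  p += sizeof(struct ceph_eversion);
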
Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 - net/ceph/debugfs.c | 4 +--- net/ceph/osd_client.c | 7 ++++--- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d6a625e75040..3fc9e7754a9b 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -192,7 +192,6 @@ struct ceph_osd_request { unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ int r_attempts; - struct ceph_eversion r_replay_version; /* aka reassert_version */ u32 r_last_force_resend; u32 r_map_dne_bound; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index c62b2b029a6e..d7e63a4f5578 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -177,9 +177,7 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) seq_printf(s, "%llu\t", req->r_tid); dump_target(s, &req->r_t); - seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, - le32_to_cpu(req->r_replay_version.epoch), - le64_to_cpu(req->r_replay_version.version)); + seq_printf(s, "\t%d", req->r_attempts); for (i = 0; i < req->r_num_ops; i++) { struct ceph_osd_req_op *op = &req->r_ops[i]; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b4500a8ab8b3..feb666c22381 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1503,9 +1503,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) ceph_encode_32(&p, req->r_flags); ceph_encode_timespec(p, &req->r_mtime); p += sizeof(struct ceph_timespec); - /* aka reassert_version */ - memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); - p += sizeof(req->r_replay_version); + + /* reassert_version */ + memset(p, 0, sizeof(struct ceph_eversion)); + p += sizeof(struct ceph_eversion); /* oloc */ ceph_start_encoding(&p, 5, 4, -- cgit v1.2.3-70-g09d2 From a1f4020aab10a6bddb2d061c960ebe52cdfa30b5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 4 Apr 2017 08:39:37 -0400 Subject: libceph: allow requests to return immediately on full conditions if caller wishes Usually, when the osd map is flagged as full or the pool is at quota, write requests just hang. This is not what we want for cephfs, where it would be better to simply report -ENOSPC back to userland instead of stalling. If the caller knows that it will want an immediate error return instead of blocking on a full or at-quota error condition then allow it to set a flag to request that behavior. Set that flag in ceph_osdc_new_request (since ceph.ko is the only caller), and on any other write request from ceph.ko. A later patch will deal with requests that were submitted before the new map showing the full condition came in. 
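A rough sketch of how a synchronous caller would see the new behaviour; the error is delivered through request completion rather than from the submit call itself, so the surrounding call pattern here is an assumption for illustration, not code from this patch:

  req->r_abort_on_full = true;              /* fail fast on FULL / pool quota */
  ceph_osdc_start_request(osdc, req, false);
  ret = ceph_osdc_wait_request(osdc, req);  /* -ENOSPC instead of stalling */
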
Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 1 + fs/ceph/file.c | 1 + include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 7 +++++++ 4 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1a3e1b40799a..7e3fae334620 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1892,6 +1892,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_abort_on_full = true; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ab4823e3e0d5..134c978141d0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -781,6 +781,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_callback = ceph_aio_complete_req; req->r_inode = inode; req->r_priv = aio_req; + req->r_abort_on_full = true; ret = ceph_osdc_start_request(req->r_osdc, req, false); out: diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3fc9e7754a9b..8cf644197b1a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -187,6 +187,7 @@ struct ceph_osd_request { struct timespec r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ bool r_linger; /* don't resend on failure */ + bool r_abort_on_full; /* return ENOSPC when full */ /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index feb666c22381..52a2019a2b64 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -961,6 +961,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } + req->r_abort_on_full = true; req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); @@ -1627,6 +1628,7 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } +static void complete_request(struct ceph_osd_request *req, int err); static void send_map_check(struct ceph_osd_request *req); static void __submit_request(struct ceph_osd_request *req, bool wrlocked) @@ -1636,6 +1638,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) enum calc_target_result ct_res; bool need_send = false; bool promoted = false; + bool need_abort = false; WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); @@ -1670,6 +1673,8 @@ again: pr_warn_ratelimited("FULL or reached pool quota\n"); req->r_t.paused = true; maybe_request_map(osdc); + if (req->r_abort_on_full) + need_abort = true; } else if (!osd_homeless(osd)) { need_send = true; } else { @@ -1686,6 +1691,8 @@ again: link_request(osd, req); if (need_send) send_request(req); + else if (need_abort) + complete_request(req, -ENOSPC); mutex_unlock(&osd->lock); if (ct_res == CALC_TARGET_POOL_DNE) -- cgit v1.2.3-70-g09d2 From 58eb7932ae4d671d2a2377a1779eda96a2789b11 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Apr 2017 09:21:16 -0400 Subject: libceph: add an epoch_barrier field to struct ceph_osd_client Cephfs can get cap update requests that contain a new epoch barrier in them. When that happens we want to pause all OSD traffic until the right map epoch arrives. Add an epoch_barrier field to ceph_osd_client that is protected by the osdc->lock rwsem. 
When the barrier is set, and the current OSD map epoch is below that, pause the request target when submitting the request or when revisiting it. Add a way for upper layers (cephfs) to update the epoch_barrier as well. If we get a new map, compare the new epoch against the barrier before kicking requests and request another map if the map epoch is still lower than the one we want. If we get a map with a full pool, or at quota condition, then set the barrier to the current epoch value. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 ++ net/ceph/debugfs.c | 3 +- net/ceph/osd_client.c | 79 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 76 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8cf644197b1a..85650b415e73 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -267,6 +267,7 @@ struct ceph_osd_client { struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; + u32 epoch_barrier; struct ceph_osd homeless_osd; atomic64_t last_tid; /* tid of last request */ u64 last_linger_id; @@ -305,6 +306,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); extern void osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index d7e63a4f5578..71ba13927b3d 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -62,7 +62,8 @@ static int osdmap_show(struct seq_file *s, void *p) return 0; down_read(&osdc->lock); - seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags); + seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, + osdc->epoch_barrier, map->flags); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 55b7585ccefd..9b8f57fd5ee1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1298,8 +1298,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, __pool_full(pi); WARN_ON(pi->id != t->base_oloc.pool); - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || + (osdc->osdmap->epoch < osdc->epoch_barrier); } enum calc_target_result { @@ -1654,8 +1655,13 @@ again: goto promote; } - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { + if (osdc->osdmap->epoch < osdc->epoch_barrier) { + dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, + osdc->epoch_barrier); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); @@ -1807,19 +1813,77 @@ static void abort_request(struct ceph_osd_request *req, int err) complete_request(req, err); } +static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + if (likely(eb > osdc->epoch_barrier)) { + dout("updating epoch_barrier from %u to %u\n", + osdc->epoch_barrier, eb); + osdc->epoch_barrier 
= eb; + /* Request map if we're not to the barrier yet */ + if (eb > osdc->osdmap->epoch) + maybe_request_map(osdc); + } +} + +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + down_read(&osdc->lock); + if (unlikely(eb > osdc->epoch_barrier)) { + up_read(&osdc->lock); + down_write(&osdc->lock); + update_epoch_barrier(osdc, eb); + up_write(&osdc->lock); + } else { + up_read(&osdc->lock); + } +} +EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); + /* * Drop all pending requests that are stalled waiting on a full condition to - * clear, and complete them with ENOSPC as the return code. + * clear, and complete them with ENOSPC as the return code. Set the + * osdc->epoch_barrier to the latest map epoch that we've seen if any were + * cancelled. */ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { struct rb_node *n; + bool victims = false; dout("enter abort_on_full\n"); if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc)) goto out; + /* Scan list and see if there is anything to abort */ + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + struct rb_node *m; + + m = rb_first(&osd->o_requests); + while (m) { + struct ceph_osd_request *req = rb_entry(m, + struct ceph_osd_request, r_node); + m = rb_next(m); + + if (req->r_abort_on_full) { + victims = true; + break; + } + } + if (victims) + break; + } + + if (!victims) + goto out; + + /* + * Update the barrier to current epoch if it's behind that point, + * since we know we have some calls to be aborted in the tree. + */ + update_epoch_barrier(osdc, osdc->osdmap->epoch); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); struct rb_node *m; @@ -1837,7 +1901,7 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) } } out: - dout("return abort_on_full\n"); + dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier); } static void check_pool_dne(struct ceph_osd_request *req) @@ -3293,7 +3357,8 @@ done: pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); - if (was_pauserd || was_pausewr || pauserd || pausewr) + if (was_pauserd || was_pausewr || pauserd || pausewr || + osdc->osdmap->epoch < osdc->epoch_barrier) maybe_request_map(osdc); kick_requests(osdc, &need_resend, &need_resend_linger); -- cgit v1.2.3-70-g09d2 From 14bb211d324d6c8140167bd6b2b8a80757348a2f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Apr 2017 12:17:38 +0200 Subject: rbd: support updating the lock cookie without releasing the lock As we no longer release the lock before potentially raising BLACKLISTED in rbd_reregister_watch(), the "either locked or blacklisted" assert in rbd_queue_workfn() needs to go: we can be both locked and blacklisted at that point now. 
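The reacquire path boils down to the following, condensed from rbd_reacquire_lock() in this patch: try to swap the cookie in place, and fall back to a full release/re-acquire cycle only when the OSD is too old to understand the set_cookie method:

  ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
                            RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE,
                            rbd_dev->lock_cookie, RBD_LOCK_TAG, cookie);
  if (ret) {
          /* older OSDs (-EOPNOTSUPP): manual release, then queue an acquire */
          if (rbd_release_lock(rbd_dev))
                  queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
  } else {
          strcpy(rbd_dev->lock_cookie, cookie);
  }
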
Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 66 ++++++++++++++++++++++-------------- include/linux/ceph/cls_lock_client.h | 5 +++ net/ceph/cls_lock_client.c | 51 ++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5f563db59820..063c8f06fb9c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3820,24 +3820,51 @@ static void rbd_unregister_watch(struct rbd_device *rbd_dev) ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); } +/* + * lock_rwsem must be held for write + */ +static void rbd_reacquire_lock(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + char cookie[32]; + int ret; + + WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); + + format_lock_cookie(rbd_dev, cookie); + ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, RBD_LOCK_NAME, + CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie, + RBD_LOCK_TAG, cookie); + if (ret) { + if (ret != -EOPNOTSUPP) + rbd_warn(rbd_dev, "failed to update lock cookie: %d", + ret); + + /* + * Lock cookie cannot be updated on older OSDs, so do + * a manual release and queue an acquire. + */ + if (rbd_release_lock(rbd_dev)) + queue_delayed_work(rbd_dev->task_wq, + &rbd_dev->lock_dwork, 0); + } else { + strcpy(rbd_dev->lock_cookie, cookie); + } +} + static void rbd_reregister_watch(struct work_struct *work) { struct rbd_device *rbd_dev = container_of(to_delayed_work(work), struct rbd_device, watch_dwork); - bool was_lock_owner = false; - bool need_to_wake = false; int ret; dout("%s rbd_dev %p\n", __func__, rbd_dev); - down_write(&rbd_dev->lock_rwsem); - if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) - was_lock_owner = rbd_release_lock(rbd_dev); - mutex_lock(&rbd_dev->watch_mutex); if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { mutex_unlock(&rbd_dev->watch_mutex); - goto out; + return; } ret = __rbd_register_watch(rbd_dev); @@ -3845,36 +3872,28 @@ static void rbd_reregister_watch(struct work_struct *work) rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); if (ret == -EBLACKLISTED || ret == -ENOENT) { set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); - need_to_wake = true; + wake_requests(rbd_dev, true); } else { queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, RBD_RETRY_DELAY); } mutex_unlock(&rbd_dev->watch_mutex); - goto out; + return; } - need_to_wake = true; rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; mutex_unlock(&rbd_dev->watch_mutex); + down_write(&rbd_dev->lock_rwsem); + if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) + rbd_reacquire_lock(rbd_dev); + up_write(&rbd_dev->lock_rwsem); + ret = rbd_dev_refresh(rbd_dev); if (ret) rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); - - if (was_lock_owner) { - ret = rbd_try_lock(rbd_dev); - if (ret) - rbd_warn(rbd_dev, "reregisteration lock failed: %d", - ret); - } - -out: - up_write(&rbd_dev->lock_rwsem); - if (need_to_wake) - wake_requests(rbd_dev, true); } /* @@ -4052,9 +4071,6 @@ static void rbd_queue_workfn(struct work_struct *work) if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) rbd_wait_state_locked(rbd_dev); - - WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^ - !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { result 
= -EBLACKLISTED; goto err_unlock; diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h index 84884d8d4710..0594d3bba774 100644 --- a/include/linux/ceph/cls_lock_client.h +++ b/include/linux/ceph/cls_lock_client.h @@ -37,6 +37,11 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, struct ceph_object_locator *oloc, char *lock_name, char *cookie, struct ceph_entity_name *locker); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie); void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers); diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index b9233b990399..08ada893f01e 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -179,6 +179,57 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_cls_break_lock); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie) +{ + int cookie_op_buf_size; + int name_len = strlen(lock_name); + int old_cookie_len = strlen(old_cookie); + int tag_len = strlen(tag); + int new_cookie_len = strlen(new_cookie); + void *p, *end; + struct page *cookie_op_page; + int ret; + + cookie_op_buf_size = name_len + sizeof(__le32) + + old_cookie_len + sizeof(__le32) + + tag_len + sizeof(__le32) + + new_cookie_len + sizeof(__le32) + + sizeof(u8) + CEPH_ENCODING_START_BLK_LEN; + if (cookie_op_buf_size > PAGE_SIZE) + return -E2BIG; + + cookie_op_page = alloc_page(GFP_NOIO); + if (!cookie_op_page) + return -ENOMEM; + + p = page_address(cookie_op_page); + end = p + cookie_op_buf_size; + + /* encode cls_lock_set_cookie_op struct */ + ceph_start_encoding(&p, 1, 1, + cookie_op_buf_size - CEPH_ENCODING_START_BLK_LEN); + ceph_encode_string(&p, end, lock_name, name_len); + ceph_encode_8(&p, type); + ceph_encode_string(&p, end, old_cookie, old_cookie_len); + ceph_encode_string(&p, end, tag, tag_len); + ceph_encode_string(&p, end, new_cookie, new_cookie_len); + + dout("%s lock_name %s type %d old_cookie %s tag %s new_cookie %s\n", + __func__, lock_name, type, old_cookie, tag, new_cookie); + ret = ceph_osdc_call(osdc, oid, oloc, "lock", "set_cookie", + CEPH_OSD_FLAG_WRITE, cookie_op_page, + cookie_op_buf_size, NULL, NULL); + + dout("%s: status %d\n", __func__, ret); + __free_page(cookie_op_page); + return ret; +} +EXPORT_SYMBOL(ceph_cls_set_cookie); + void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers) { int i; -- cgit v1.2.3-70-g09d2 From f775ff7d89f33fc9ba63f6f70df3bcc98c2d9828 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 27 Apr 2017 18:34:00 +0200 Subject: ceph: fix file open flags on ppc64 The file open flags (O_foo) are platform specific and should never go out to an interface that is not local to the system. Unfortunately these flags have leaked out onto the wire in the cephfs implementation. That lead to bogus flags getting transmitted on ppc64. This patch converts the kernel view of flags to the ceph view of file open flags. 
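The underlying problem is that the numeric O_* values are per-architecture (on ppc64, for example, O_DIRECTORY and O_NOFOLLOW use different octal values than on x86), so only the translated CEPH_O_* values defined by this patch are safe to put on the wire. A small example of the conversion:

  /* O_DIRECTORY differs between architectures; CEPH_O_DIRECTORY does not */
  __le32 wire = ceph_flags_sys2wire(O_RDWR | O_CREAT | O_DIRECTORY);
  /* == cpu_to_le32(CEPH_O_RDWR | CEPH_O_CREAT | CEPH_O_DIRECTORY) */
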
Fixes: 124e68e74 ("ceph: file operations") Signed-off-by: Alexander Graf Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 34 +++++++++++++++++++++++++++++++++- include/linux/ceph/ceph_fs.h | 12 ++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 39866d6a34b6..9d2eeed9e323 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -13,6 +13,38 @@ #include "mds_client.h" #include "cache.h" +static __le32 ceph_flags_sys2wire(u32 flags) +{ + u32 wire_flags = 0; + + switch (flags & O_ACCMODE) { + case O_RDONLY: + wire_flags |= CEPH_O_RDONLY; + break; + case O_WRONLY: + wire_flags |= CEPH_O_WRONLY; + break; + case O_RDWR: + wire_flags |= CEPH_O_RDWR; + break; + } + +#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } + + ceph_sys2wire(O_CREAT); + ceph_sys2wire(O_EXCL); + ceph_sys2wire(O_TRUNC); + ceph_sys2wire(O_DIRECTORY); + ceph_sys2wire(O_NOFOLLOW); + +#undef ceph_sys2wire + + if (flags) + dout("unused open flags: %x", flags); + + return cpu_to_le32(wire_flags); +} + /* * Ceph file operations * @@ -123,7 +155,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) if (IS_ERR(req)) goto out; req->r_fmode = ceph_flags_to_mode(flags); - req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.flags = ceph_flags_sys2wire(flags); req->r_args.open.mode = cpu_to_le32(create_mode); out: return req; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 1787e4a8e251..ad078ebe25d6 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -367,6 +367,18 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_READDIR_HASH_ORDER (1<<9) #define CEPH_READDIR_OFFSET_HASH (1<<10) +/* + * open request flags + */ +#define CEPH_O_RDONLY 00000000 +#define CEPH_O_WRONLY 00000001 +#define CEPH_O_RDWR 00000002 +#define CEPH_O_CREAT 00000100 +#define CEPH_O_EXCL 00000200 +#define CEPH_O_TRUNC 00001000 +#define CEPH_O_DIRECTORY 00200000 +#define CEPH_O_NOFOLLOW 00400000 + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ -- cgit v1.2.3-70-g09d2 From 44f0aeec203738bf34f4b7e16b745c8c71fe0f06 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 2 Apr 2017 16:50:33 +0200 Subject: dmaengine: pl080: Cut some unused defines There is no in-kernel code using these indexed register defines, and their offsets are clearly defined right below. Cut them. 
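For reference, channel registers remain reachable with the defines that survive; a hypothetical user combines the per-channel base with the PL080_CH_* offsets ('base' and 'i' are placeholders in this sketch):

  void __iomem *ch = base + PL080_Cx_BASE(i);

  writel(src, ch + PL080_CH_SRC_ADDR);
  writel(dst, ch + PL080_CH_DST_ADDR);
  writel(lli, ch + PL080_CH_LLI);
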
Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- include/linux/amba/pl080.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 91b84a7f0539..800429f4b997 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -46,16 +46,8 @@ /* Per channel configuration registers */ -#define PL080_Cx_STRIDE (0x20) +/* Per channel configuration registers */ #define PL080_Cx_BASE(x) ((0x100 + (x * 0x20))) -#define PL080_Cx_SRC_ADDR(x) ((0x100 + (x * 0x20))) -#define PL080_Cx_DST_ADDR(x) ((0x104 + (x * 0x20))) -#define PL080_Cx_LLI(x) ((0x108 + (x * 0x20))) -#define PL080_Cx_CONTROL(x) ((0x10C + (x * 0x20))) -#define PL080_Cx_CONFIG(x) ((0x110 + (x * 0x20))) -#define PL080S_Cx_CONTROL2(x) ((0x110 + (x * 0x20))) -#define PL080S_Cx_CONFIG(x) ((0x114 + (x * 0x20))) - #define PL080_CH_SRC_ADDR (0x00) #define PL080_CH_DST_ADDR (0x04) #define PL080_CH_LLI (0x08) -- cgit v1.2.3-70-g09d2 From ded091fee6806b7120f475d89c151d611758a395 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 2 Apr 2017 16:50:53 +0200 Subject: dmaengine: pl08x: Use the BIT() macro consistently This makes the driver shift bits with BIT() which is used on other places in the driver. Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- drivers/dma/amba-pl08x.c | 10 +++++----- include/linux/amba/pl080.h | 40 ++++++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 8624598530f3..286114ebbbb9 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -420,7 +420,7 @@ static void pl08x_start_next_txd(struct pl08x_dma_chan *plchan) /* Enable the DMA channel */ /* Do not access config register until channel shows as disabled */ - while (readl(pl08x->base + PL080_EN_CHAN) & (1 << phychan->id)) + while (readl(pl08x->base + PL080_EN_CHAN) & BIT(phychan->id)) cpu_relax(); /* Do not access config register until channel shows as inactive */ @@ -487,8 +487,8 @@ static void pl08x_terminate_phy_chan(struct pl08x_driver_data *pl08x, writel(val, ch->reg_config); - writel(1 << ch->id, pl08x->base + PL080_ERR_CLEAR); - writel(1 << ch->id, pl08x->base + PL080_TC_CLEAR); + writel(BIT(ch->id), pl08x->base + PL080_ERR_CLEAR); + writel(BIT(ch->id), pl08x->base + PL080_TC_CLEAR); } static inline u32 get_bytes_in_cctl(u32 cctl) @@ -1837,7 +1837,7 @@ static irqreturn_t pl08x_irq(int irq, void *dev) return IRQ_NONE; for (i = 0; i < pl08x->vd->channels; i++) { - if (((1 << i) & err) || ((1 << i) & tc)) { + if ((BIT(i) & err) || (BIT(i) & tc)) { /* Locate physical channel */ struct pl08x_phy_chan *phychan = &pl08x->phy_chans[i]; struct pl08x_dma_chan *plchan = phychan->serving; @@ -1875,7 +1875,7 @@ static irqreturn_t pl08x_irq(int irq, void *dev) } spin_unlock(&plchan->vc.lock); - mask |= (1 << i); + mask |= BIT(i); } } diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 800429f4b997..580b5323a717 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -38,9 +38,9 @@ #define PL080_SOFT_LSREQ (0x2C) #define PL080_CONFIG (0x30) -#define PL080_CONFIG_M2_BE (1 << 2) -#define PL080_CONFIG_M1_BE (1 << 1) -#define PL080_CONFIG_ENABLE (1 << 0) +#define PL080_CONFIG_M2_BE BIT(2) +#define PL080_CONFIG_M1_BE BIT(1) +#define PL080_CONFIG_ENABLE BIT(0) #define PL080_SYNC (0x34) @@ -58,18 +58,18 @@ #define PL080_LLI_ADDR_MASK (0x3fffffff << 2) 
#define PL080_LLI_ADDR_SHIFT (2) -#define PL080_LLI_LM_AHB2 (1 << 0) +#define PL080_LLI_LM_AHB2 BIT(0) -#define PL080_CONTROL_TC_IRQ_EN (1 << 31) +#define PL080_CONTROL_TC_IRQ_EN BIT(31) #define PL080_CONTROL_PROT_MASK (0x7 << 28) #define PL080_CONTROL_PROT_SHIFT (28) -#define PL080_CONTROL_PROT_CACHE (1 << 30) -#define PL080_CONTROL_PROT_BUFF (1 << 29) -#define PL080_CONTROL_PROT_SYS (1 << 28) -#define PL080_CONTROL_DST_INCR (1 << 27) -#define PL080_CONTROL_SRC_INCR (1 << 26) -#define PL080_CONTROL_DST_AHB2 (1 << 25) -#define PL080_CONTROL_SRC_AHB2 (1 << 24) +#define PL080_CONTROL_PROT_CACHE BIT(30) +#define PL080_CONTROL_PROT_BUFF BIT(29) +#define PL080_CONTROL_PROT_SYS BIT(28) +#define PL080_CONTROL_DST_INCR BIT(27) +#define PL080_CONTROL_SRC_INCR BIT(26) +#define PL080_CONTROL_DST_AHB2 BIT(25) +#define PL080_CONTROL_SRC_AHB2 BIT(24) #define PL080_CONTROL_DWIDTH_MASK (0x7 << 21) #define PL080_CONTROL_DWIDTH_SHIFT (21) #define PL080_CONTROL_SWIDTH_MASK (0x7 << 18) @@ -95,20 +95,20 @@ #define PL080_WIDTH_16BIT (0x1) #define PL080_WIDTH_32BIT (0x2) -#define PL080N_CONFIG_ITPROT (1 << 20) -#define PL080N_CONFIG_SECPROT (1 << 19) -#define PL080_CONFIG_HALT (1 << 18) -#define PL080_CONFIG_ACTIVE (1 << 17) /* RO */ -#define PL080_CONFIG_LOCK (1 << 16) -#define PL080_CONFIG_TC_IRQ_MASK (1 << 15) -#define PL080_CONFIG_ERR_IRQ_MASK (1 << 14) +#define PL080N_CONFIG_ITPROT BIT(20) +#define PL080N_CONFIG_SECPROT BIT(19) +#define PL080_CONFIG_HALT BIT(18) +#define PL080_CONFIG_ACTIVE BIT(17) /* RO */ +#define PL080_CONFIG_LOCK BIT(16) +#define PL080_CONFIG_TC_IRQ_MASK BIT(15) +#define PL080_CONFIG_ERR_IRQ_MASK BIT(14) #define PL080_CONFIG_FLOW_CONTROL_MASK (0x7 << 11) #define PL080_CONFIG_FLOW_CONTROL_SHIFT (11) #define PL080_CONFIG_DST_SEL_MASK (0xf << 6) #define PL080_CONFIG_DST_SEL_SHIFT (6) #define PL080_CONFIG_SRC_SEL_MASK (0xf << 1) #define PL080_CONFIG_SRC_SEL_SHIFT (1) -#define PL080_CONFIG_ENABLE (1 << 0) +#define PL080_CONFIG_ENABLE BIT(0) #define PL080_FLOW_MEM2MEM (0x0) #define PL080_FLOW_MEM2PER (0x1) -- cgit v1.2.3-70-g09d2 From 2be83da85a64773efaa407639de81bd1377f880e Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 4 May 2017 12:34:32 +0100 Subject: thermal: devfreq_cooling: add new interface for direct power read This patch introduces a new interface for device drivers connected to devfreq_cooling in the thermal framework: get_real_power(). Some devices have more sophisticated methods (like power counters) to approximate the actual power that they use. In the previous implementation we had a pre-calculated power table which was then scaled by 'utilization' ('busy_time' and 'total_time' taken from devfreq 'last_status'). With this new interface the driver can provide more precise data regarding actual power to the thermal governor every time the power budget is calculated. We then use this value and calculate the real resource utilization scaling factor. 
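The scaling factor is kept as a simple fixed-point ratio. A condensed sketch of the arithmetic used later in this patch (identifiers match the diff; real_power and budget are placeholder variables for the value returned by get_real_power() and the power budget handed in by the governor, and the small-value guards are omitted):

	#define SCALE_ERROR_MITIGATION	100	/* fixed-point scale to limit rounding error */

	/* devfreq_cooling_get_requested_power(): remember how far the table was off */
	res_util = power_table[capped_state] * SCALE_ERROR_MITIGATION / real_power;

	/* devfreq_cooling_power2state(): apply the same ratio to the requested budget */
	est_power = budget * res_util / SCALE_ERROR_MITIGATION;
	/* est_power is then compared against power_table[] to pick the cooling state */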
Reviewed-by: Chris Diamand Acked-by: Javi Merino Signed-off-by: Lukasz Luba --- drivers/thermal/devfreq_cooling.c | 105 +++++++++++++++++++++++++++++--------- include/linux/devfreq_cooling.h | 19 +++++++ 2 files changed, 101 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index af9d32837a3a..26c31571a12c 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -28,6 +28,8 @@ #include +#define SCALE_ERROR_MITIGATION 100 + static DEFINE_IDA(devfreq_ida); /** @@ -45,6 +47,12 @@ static DEFINE_IDA(devfreq_ida); * @freq_table_size: Size of the @freq_table and @power_table * @power_ops: Pointer to devfreq_cooling_power, used to generate the * @power_table. + * @res_util: Resource utilization scaling factor for the power. + * It is multiplied by 100 to minimize the error. It is used + * for estimation of the power budget instead of using + * 'utilization' (which is 'busy_time / 'total_time'). + * The 'res_util' range is from 100 to (power_table[state] * 100) + * for the corresponding 'state'. */ struct devfreq_cooling_device { int id; @@ -55,6 +63,8 @@ struct devfreq_cooling_device { u32 *freq_table; size_t freq_table_size; struct devfreq_cooling_power *power_ops; + u32 res_util; + int capped_state; }; /** @@ -250,6 +260,16 @@ get_dynamic_power(struct devfreq_cooling_device *dfc, unsigned long freq, return power; } + +static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc, + unsigned long freq, + unsigned long voltage) +{ + return get_static_power(dfc, freq) + get_dynamic_power(dfc, freq, + voltage); +} + + static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev, struct thermal_zone_device *tz, u32 *power) @@ -259,27 +279,55 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd struct devfreq_dev_status *status = &df->last_status; unsigned long state; unsigned long freq = status->current_frequency; - u32 dyn_power, static_power; + unsigned long voltage; + u32 dyn_power = 0; + u32 static_power = 0; + int res; - /* Get dynamic power for state */ state = freq_get_state(dfc, freq); - if (state == THERMAL_CSTATE_INVALID) - return -EAGAIN; + if (state == THERMAL_CSTATE_INVALID) { + res = -EAGAIN; + goto fail; + } - dyn_power = dfc->power_table[state]; + if (dfc->power_ops->get_real_power) { + voltage = get_voltage(df, freq); + if (voltage == 0) { + res = -EINVAL; + goto fail; + } - /* Scale dynamic power for utilization */ - dyn_power = (dyn_power * status->busy_time) / status->total_time; + res = dfc->power_ops->get_real_power(df, power, freq, voltage); + if (!res) { + state = dfc->capped_state; + dfc->res_util = dfc->power_table[state]; + dfc->res_util *= SCALE_ERROR_MITIGATION; - /* Get static power */ - static_power = get_static_power(dfc, freq); + if (*power > 1) + dfc->res_util /= *power; + } else { + goto fail; + } + } else { + dyn_power = dfc->power_table[state]; + + /* Scale dynamic power for utilization */ + dyn_power *= status->busy_time; + dyn_power /= status->total_time; + /* Get static power */ + static_power = get_static_power(dfc, freq); + + *power = dyn_power + static_power; + } trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power, static_power); - *power = dyn_power + static_power; - return 0; +fail: + /* It is safe to set max in this case */ + dfc->res_util = SCALE_ERROR_MITIGATION; + return res; } static int devfreq_cooling_state2power(struct 
thermal_cooling_device *cdev, @@ -312,26 +360,34 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev, unsigned long busy_time; s32 dyn_power; u32 static_power; + s32 est_power; int i; - static_power = get_static_power(dfc, freq); + if (dfc->power_ops->get_real_power) { + /* Scale for resource utilization */ + est_power = power * dfc->res_util; + est_power /= SCALE_ERROR_MITIGATION; + } else { + static_power = get_static_power(dfc, freq); - dyn_power = power - static_power; - dyn_power = dyn_power > 0 ? dyn_power : 0; + dyn_power = power - static_power; + dyn_power = dyn_power > 0 ? dyn_power : 0; - /* Scale dynamic power for utilization */ - busy_time = status->busy_time ?: 1; - dyn_power = (dyn_power * status->total_time) / busy_time; + /* Scale dynamic power for utilization */ + busy_time = status->busy_time ?: 1; + est_power = (dyn_power * status->total_time) / busy_time; + } /* * Find the first cooling state that is within the power * budget for dynamic power. */ for (i = 0; i < dfc->freq_table_size - 1; i++) - if (dyn_power >= dfc->power_table[i]) + if (est_power >= dfc->power_table[i]) break; *state = i; + dfc->capped_state = i; trace_thermal_power_devfreq_limit(cdev, freq, *state, power); return 0; } @@ -387,7 +443,7 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc) } for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) { - unsigned long power_dyn, voltage; + unsigned long power, voltage; struct dev_pm_opp *opp; opp = dev_pm_opp_find_freq_floor(dev, &freq); @@ -400,12 +456,15 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc) dev_pm_opp_put(opp); if (dfc->power_ops) { - power_dyn = get_dynamic_power(dfc, freq, voltage); + if (dfc->power_ops->get_real_power) + power = get_total_power(dfc, freq, voltage); + else + power = get_dynamic_power(dfc, freq, voltage); - dev_dbg(dev, "Dynamic power table: %lu MHz @ %lu mV: %lu = %lu mW\n", - freq / 1000000, voltage, power_dyn, power_dyn); + dev_dbg(dev, "Power table: %lu MHz @ %lu mV: %lu = %lu mW\n", + freq / 1000000, voltage, power, power); - power_table[i] = power_dyn; + power_table[i] = power; } freq_table[i] = freq; diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h index c35d0c0e0ada..4635f95000a4 100644 --- a/include/linux/devfreq_cooling.h +++ b/include/linux/devfreq_cooling.h @@ -34,6 +34,23 @@ * If get_dynamic_power() is NULL, then the * dynamic power is calculated as * @dyn_power_coeff * frequency * voltage^2 + * @get_real_power: When this is set, the framework uses it to ask the + * device driver for the actual power. + * Some devices have more sophisticated methods + * (like power counters) to approximate the actual power + * that they use. + * This function provides more accurate data to the + * thermal governor. When the driver does not provide + * such function, framework just uses pre-calculated + * table and scale the power by 'utilization' + * (based on 'busy_time' and 'total_time' taken from + * devfreq 'last_status'). + * The value returned by this function must be lower + * or equal than the maximum power value + * for the current state + * (which can be found in power_table[state]). + * When this interface is used, the power_table holds + * max total (static + dynamic) power value for each OPP. 
*/ struct devfreq_cooling_power { unsigned long (*get_static_power)(struct devfreq *devfreq, @@ -41,6 +58,8 @@ struct devfreq_cooling_power { unsigned long (*get_dynamic_power)(struct devfreq *devfreq, unsigned long freq, unsigned long voltage); + int (*get_real_power)(struct devfreq *df, u32 *power, + unsigned long freq, unsigned long voltage); unsigned long dyn_power_coeff; }; -- cgit v1.2.3-70-g09d2 From 8a537ece3d946227e4afa81eae0e43fa47439c7d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Apr 2017 23:22:09 +0200 Subject: PM / wakeup: Integrate mechanism to abort transitions in progress The system wakeup framework is not very consistent with respect to the way it handles suspend-to-idle and generally wakeup events occurring during transitions to system low-power states. First off, system transitions in progress are aborted by the event reporting helpers like pm_wakeup_event() only if the wakeup_count sysfs attribute is in use (as documented), but there are cases in which system-wide transitions should be aborted even if that is not the case. For example, a wakeup signal from a designated wakeup device during system-wide PM transition, it should cause the transition to be aborted right away. Moreover, there is a freeze_wake() call in wakeup_source_activate(), but that really is only effective after suspend_freeze_state has been set to FREEZE_STATE_ENTER by freeze_enter(). However, it is very unlikely that wakeup_source_activate() will ever be called at that time, as it could only be triggered by a IRQF_NO_SUSPEND interrupt handler, so wakeups from suspend-to-idle don't really occur in wakeup_source_activate(). At the same time there is a way to abort a system suspend in progress (or wake up the system from suspend-to-idle), which is by calling pm_system_wakeup(), but in turn that doesn't cause any wakeup source objects to be activated, so it will not be covered by wakeup source statistics and will not prevent the system from suspending again immediately (in case autosleep is used, for example). Consequently, if anyone wants to abort system transitions in progress and allow the wakeup_count mechanism to work, they need to use both pm_system_wakeup() and pm_wakeup_event(), say, at the same time which is awkward. For the above reasons, make it possible to trigger pm_system_wakeup() from within wakeup_source_activate() and provide a new pm_wakeup_hard_event() helper to do so within the wakeup framework. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 36 ++++++++++++++++++------------------ include/linux/pm_wakeup.h | 25 +++++++++++++++++++++---- 2 files changed, 39 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 136854970489..84c41c98d6fc 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -512,12 +512,13 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws) /** * wakup_source_activate - Mark given wakeup source as active. * @ws: Wakeup source to handle. + * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM * core of the event by incrementing the counter of of wakeup events being * processed. 
*/ -static void wakeup_source_activate(struct wakeup_source *ws) +static void wakeup_source_activate(struct wakeup_source *ws, bool hard) { unsigned int cec; @@ -525,11 +526,8 @@ static void wakeup_source_activate(struct wakeup_source *ws) "unregistered wakeup source\n")) return; - /* - * active wakeup source should bring the system - * out of PM_SUSPEND_FREEZE state - */ - freeze_wake(); + if (hard) + pm_system_wakeup(); ws->active = true; ws->active_count++; @@ -546,8 +544,9 @@ static void wakeup_source_activate(struct wakeup_source *ws) /** * wakeup_source_report_event - Report wakeup event using the given source. * @ws: Wakeup source to report the event for. + * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. */ -static void wakeup_source_report_event(struct wakeup_source *ws) +static void wakeup_source_report_event(struct wakeup_source *ws, bool hard) { ws->event_count++; /* This is racy, but the counter is approximate anyway. */ @@ -555,7 +554,7 @@ static void wakeup_source_report_event(struct wakeup_source *ws) ws->wakeup_count++; if (!ws->active) - wakeup_source_activate(ws); + wakeup_source_activate(ws, hard); } /** @@ -573,7 +572,7 @@ void __pm_stay_awake(struct wakeup_source *ws) spin_lock_irqsave(&ws->lock, flags); - wakeup_source_report_event(ws); + wakeup_source_report_event(ws, false); del_timer(&ws->timer); ws->timer_expires = 0; @@ -739,9 +738,10 @@ static void pm_wakeup_timer_fn(unsigned long data) } /** - * __pm_wakeup_event - Notify the PM core of a wakeup event. + * pm_wakeup_ws_event - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the event source. * @msec: Anticipated event processing time (in milliseconds). + * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Notify the PM core of a wakeup event whose source is @ws that will take * approximately @msec milliseconds to be processed by the kernel. If @ws is @@ -750,7 +750,7 @@ static void pm_wakeup_timer_fn(unsigned long data) * * It is safe to call this function from interrupt context. */ -void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) +void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard) { unsigned long flags; unsigned long expires; @@ -760,7 +760,7 @@ void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) spin_lock_irqsave(&ws->lock, flags); - wakeup_source_report_event(ws); + wakeup_source_report_event(ws, hard); if (!msec) { wakeup_source_deactivate(ws); @@ -779,17 +779,17 @@ void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) unlock: spin_unlock_irqrestore(&ws->lock, flags); } -EXPORT_SYMBOL_GPL(__pm_wakeup_event); - +EXPORT_SYMBOL_GPL(pm_wakeup_ws_event); /** * pm_wakeup_event - Notify the PM core of a wakeup event. * @dev: Device the wakeup event is related to. * @msec: Anticipated event processing time (in milliseconds). + * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * - * Call __pm_wakeup_event() for the @dev's wakeup source object. + * Call pm_wakeup_ws_event() for the @dev's wakeup source object. 
*/ -void pm_wakeup_event(struct device *dev, unsigned int msec) +void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) { unsigned long flags; @@ -797,10 +797,10 @@ void pm_wakeup_event(struct device *dev, unsigned int msec) return; spin_lock_irqsave(&dev->power.lock, flags); - __pm_wakeup_event(dev->power.wakeup, msec); + pm_wakeup_ws_event(dev->power.wakeup, msec, hard); spin_unlock_irqrestore(&dev->power.lock, flags); } -EXPORT_SYMBOL_GPL(pm_wakeup_event); +EXPORT_SYMBOL_GPL(pm_wakeup_dev_event); void pm_print_active_wakeup_sources(void) { diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index a3447932df1f..4c2cba7ec1d4 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -106,8 +106,8 @@ extern void __pm_stay_awake(struct wakeup_source *ws); extern void pm_stay_awake(struct device *dev); extern void __pm_relax(struct wakeup_source *ws); extern void pm_relax(struct device *dev); -extern void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec); -extern void pm_wakeup_event(struct device *dev, unsigned int msec); +extern void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard); +extern void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard); #else /* !CONFIG_PM_SLEEP */ @@ -182,9 +182,11 @@ static inline void __pm_relax(struct wakeup_source *ws) {} static inline void pm_relax(struct device *dev) {} -static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) {} +static inline void pm_wakeup_ws_event(struct wakeup_source *ws, + unsigned int msec, bool hard) {} -static inline void pm_wakeup_event(struct device *dev, unsigned int msec) {} +static inline void pm_wakeup_dev_event(struct device *dev, unsigned int msec, + bool hard) {} #endif /* !CONFIG_PM_SLEEP */ @@ -201,4 +203,19 @@ static inline void wakeup_source_trash(struct wakeup_source *ws) wakeup_source_drop(ws); } +static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) +{ + return pm_wakeup_ws_event(ws, msec, false); +} + +static inline void pm_wakeup_event(struct device *dev, unsigned int msec) +{ + return pm_wakeup_dev_event(dev, msec, false); +} + +static inline void pm_wakeup_hard_event(struct device *dev) +{ + return pm_wakeup_dev_event(dev, 0, true); +} + #endif /* _LINUX_PM_WAKEUP_H */ -- cgit v1.2.3-70-g09d2 From eed4d47efe9508b855b09754cf6de4325d8a2f0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Apr 2017 23:23:03 +0200 Subject: ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle The ACPI SCI (System Control Interrupt) is set up as a wakeup IRQ during suspend-to-idle transitions and, consequently, any events signaled through it wake up the system from that state. However, on some systems some of the events signaled via the ACPI SCI while suspended to idle should not cause the system to wake up. In fact, quite often they should just be discarded. Arguably, systems should not resume entirely on such events, but in order to decide which events really should cause the system to resume and which are spurious, it is necessary to resume up to the point when ACPI SCIs are actually handled and processed, which is after executing dpm_resume_noirq() in the system resume path. For this reasons, add a loop around freeze_enter() in which the platforms can process events signaled via multiplexed IRQ lines like the ACPI SCI and add suspend-to-idle hooks that can be used for this purpose to struct platform_freeze_ops. 
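In outline, the loop added in kernel/power/suspend.c further below (s2idle_loop()) proceeds as follows; this is only a condensed restatement of that hunk:

	do {
		freeze_enter();			/* suspend to idle until an interrupt fires */

		if (freeze_ops && freeze_ops->wake)
			freeze_ops->wake();	/* platform checks whether the wakeup was genuine */

		dpm_resume_noirq(PMSG_RESUME);	/* let wakeup interrupts be handled */
		if (freeze_ops && freeze_ops->sync)
			freeze_ops->sync();	/* flush pending event processing */

		if (pm_wakeup_pending())
			break;			/* a real wakeup event was reported: resume fully */

		pm_wakeup_clear(false);		/* clear the recorded wakeup IRQ, keep the abort count */
	} while (!dpm_suspend_noirq(PMSG_SUSPEND));	/* otherwise go back to sleep */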
In the ACPI case, the ->wake hook is used for checking if the SCI has triggered while suspended and deferring the interrupt-induced system wakeup until the events signaled through it are actually processed sufficiently to decide whether or not the system should resume. In turn, the ->sync hook allows all of the relevant event queues to be flushed so as to prevent events from being missed due to race conditions. In addition to that, some ACPI code processing wakeup events needs to be modified to use the "hard" version of wakeup triggers, so that it will cause a system resume to happen on device-induced wakeup events even if the "soft" mechanism to prevent the system from suspending is not enabled (that also helps to catch device-induced wakeup events occurring during suspend transitions in progress). Signed-off-by: Rafael J. Wysocki --- drivers/acpi/battery.c | 2 +- drivers/acpi/button.c | 5 +++-- drivers/acpi/device_pm.c | 3 ++- drivers/acpi/sleep.c | 28 ++++++++++++++++++++++++++++ drivers/base/power/main.c | 5 ----- drivers/base/power/wakeup.c | 18 ++++++++++++------ include/linux/suspend.h | 7 +++++-- kernel/power/process.c | 2 +- kernel/power/suspend.c | 29 +++++++++++++++++++++++++---- 9 files changed, 77 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 4ef1e4624b2b..83ab17e4a795 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -776,7 +776,7 @@ static int acpi_battery_update(struct acpi_battery *battery, bool resume) if ((battery->state & ACPI_BATTERY_STATE_CRITICAL) || (test_bit(ACPI_BATTERY_ALARM_PRESENT, &battery->flags) && (battery->capacity_now <= battery->alarm))) - pm_wakeup_event(&battery->device->dev, 0); + pm_wakeup_hard_event(&battery->device->dev); return result; } diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 668137e4a069..b7c2a06963d6 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -216,7 +216,7 @@ static int acpi_lid_notify_state(struct acpi_device *device, int state) } if (state) - pm_wakeup_event(&device->dev, 0); + pm_wakeup_hard_event(&device->dev); ret = blocking_notifier_call_chain(&acpi_lid_notifier, state, device); if (ret == NOTIFY_DONE) @@ -398,7 +398,7 @@ static void acpi_button_notify(struct acpi_device *device, u32 event) } else { int keycode; - pm_wakeup_event(&device->dev, 0); + pm_wakeup_hard_event(&device->dev); if (button->suspended) break; @@ -530,6 +530,7 @@ static int acpi_button_add(struct acpi_device *device) lid_device = device; } + device_init_wakeup(&device->dev, true); printk(KERN_INFO PREFIX "%s [%s]\n", name, acpi_device_bid(device)); return 0; diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 993fd31394c8..798d5003a039 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" @@ -399,7 +400,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used) mutex_lock(&acpi_pm_notifier_lock); if (adev->wakeup.flags.notifier_present) { - __pm_wakeup_event(adev->wakeup.ws, 0); + pm_wakeup_ws_event(adev->wakeup.ws, 0, true); if (adev->wakeup.context.work.func) queue_pm_work(&adev->wakeup.context.work); } diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index a4327af676fe..e84005d642e6 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -662,14 +662,40 @@ static int acpi_freeze_prepare(void) acpi_os_wait_events_complete(); if (acpi_sci_irq_valid()) 
enable_irq_wake(acpi_sci_irq); + return 0; } +static void acpi_freeze_wake(void) +{ + /* + * If IRQD_WAKEUP_ARMED is not set for the SCI at this point, it means + * that the SCI has triggered while suspended, so cancel the wakeup in + * case it has not been a wakeup event (the GPEs will be checked later). + */ + if (acpi_sci_irq_valid() && + !irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) + pm_system_cancel_wakeup(); +} + +static void acpi_freeze_sync(void) +{ + /* + * Process all pending events in case there are any wakeup ones. + * + * The EC driver uses the system workqueue, so that one needs to be + * flushed too. + */ + acpi_os_wait_events_complete(); + flush_scheduled_work(); +} + static void acpi_freeze_restore(void) { acpi_disable_wakeup_devices(ACPI_STATE_S0); if (acpi_sci_irq_valid()) disable_irq_wake(acpi_sci_irq); + acpi_enable_all_runtime_gpes(); } @@ -681,6 +707,8 @@ static void acpi_freeze_end(void) static const struct platform_freeze_ops acpi_freeze_ops = { .begin = acpi_freeze_begin, .prepare = acpi_freeze_prepare, + .wake = acpi_freeze_wake, + .sync = acpi_freeze_sync, .restore = acpi_freeze_restore, .end = acpi_freeze_end, }; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9faee1c893e5..e987a6f55d36 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1091,11 +1091,6 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a if (async_error) goto Complete; - if (pm_wakeup_pending()) { - async_error = -EBUSY; - goto Complete; - } - if (dev->power.syscore || dev->power.direct_complete) goto Complete; diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 84c41c98d6fc..f62082fdd670 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -28,8 +28,8 @@ bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ unsigned int pm_wakeup_irq __read_mostly; -/* If set and the system is suspending, terminate the suspend. */ -static bool pm_abort_suspend __read_mostly; +/* If greater than 0 and the system is suspending, terminate the suspend. */ +static atomic_t pm_abort_suspend __read_mostly; /* * Combined counters of registered wakeup events and wakeup events in progress. 
@@ -856,20 +856,26 @@ bool pm_wakeup_pending(void) pm_print_active_wakeup_sources(); } - return ret || pm_abort_suspend; + return ret || atomic_read(&pm_abort_suspend) > 0; } void pm_system_wakeup(void) { - pm_abort_suspend = true; + atomic_inc(&pm_abort_suspend); freeze_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); -void pm_wakeup_clear(void) +void pm_system_cancel_wakeup(void) +{ + atomic_dec(&pm_abort_suspend); +} + +void pm_wakeup_clear(bool reset) { - pm_abort_suspend = false; pm_wakeup_irq = 0; + if (reset) + atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8be..0b1cf32edfd7 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -189,6 +189,8 @@ struct platform_suspend_ops { struct platform_freeze_ops { int (*begin)(void); int (*prepare)(void); + void (*wake)(void); + void (*sync)(void); void (*restore)(void); void (*end)(void); }; @@ -428,7 +430,8 @@ extern unsigned int pm_wakeup_irq; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); -extern void pm_wakeup_clear(void); +extern void pm_system_cancel_wakeup(void); +extern void pm_wakeup_clear(bool reset); extern void pm_system_irq_wakeup(unsigned int irq_number); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); @@ -478,7 +481,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} -static inline void pm_wakeup_clear(void) {} +static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline void lock_system_sleep(void) {} diff --git a/kernel/power/process.c b/kernel/power/process.c index c7209f060eeb..78672d324a6e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(); + pm_wakeup_clear(true); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..c0248c74d6d4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,6 +72,8 @@ static void freeze_begin(void) static void freeze_enter(void) { + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -98,6 +100,27 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); +} + +static void s2idle_loop(void) +{ + do { + freeze_enter(); + + if (freeze_ops && freeze_ops->wake) + freeze_ops->wake(); + + dpm_resume_noirq(PMSG_RESUME); + if (freeze_ops && freeze_ops->sync) + freeze_ops->sync(); + + if (pm_wakeup_pending()) + break; + + pm_wakeup_clear(false); + } while (!dpm_suspend_noirq(PMSG_SUSPEND)); } void freeze_wake(void) @@ -371,10 +394,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. 
*/ if (state == PM_SUSPEND_FREEZE) { - trace_suspend_resume(TPS("machine_suspend"), state, true); - freeze_enter(); - trace_suspend_resume(TPS("machine_suspend"), state, false); - goto Platform_wake; + s2idle_loop(); + goto Platform_early_resume; } error = disable_nonboot_cpus(); -- cgit v1.2.3-70-g09d2 From 71afe470e20db133b30730cfa856e5d6854312e9 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 13 Apr 2017 09:06:20 +0200 Subject: KVM: arm64: vgic-its: Introduce migration ABI infrastructure We plan to support different migration ABIs, ie. characterizing the ITS table layout format in guest RAM. For example, a new ABI will be needed if vLPIs get supported for nested use case. So let's introduce an array of supported ABIs (at the moment a single ABI is supported though). The following characteristics are foreseen to vary with the ABI: size of table entries, save/restore operation, the way abi settings are applied. By default the MAX_ABI_REV is applied on its creation. In subsequent patches we will introduce a way for the userspace to change the ABI in use. The entry sizes now are set according to the ABI version and not hardcoded anymore. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall --- include/kvm/arm_vgic.h | 3 ++ include/linux/irqchip/arm-gic-v3.h | 5 ++ virt/kvm/arm/vgic/vgic-its.c | 93 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 97 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 26ed4fb896bb..fabcc649e2ce 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -162,6 +162,9 @@ struct vgic_its { u32 creadr; u32 cwriter; + /* migration ABI revision in use */ + u32 abi_rev; + /* Protects the device and collection lists */ struct mutex its_lock; struct list_head device_list; diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 97cbca19430d..81ebe437ccc3 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -132,6 +132,9 @@ #define GIC_BASER_SHAREABILITY(reg, type) \ (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + #define GICR_PROPBASER_SHAREABILITY_SHIFT (10) #define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) #define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) @@ -232,6 +235,7 @@ #define GITS_CTLR_QUIESCENT (1U << 31) #define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) @@ -290,6 +294,7 @@ #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) #define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index bf3ff0931572..4f6ea46c496c 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -33,6 +33,10 @@ #include "vgic.h" #include "vgic-mmio.h" +static int vgic_its_save_tables_v0(struct vgic_its *its); +static int vgic_its_restore_tables_v0(struct vgic_its *its); +static int vgic_its_commit_v0(struct vgic_its *its); + /* * 
Creates a new (reference to a) struct vgic_irq for a given LPI. * If this LPI is already mapped on another ITS, we increase its refcount @@ -123,6 +127,50 @@ struct its_ite { u32 event_id; }; +/** + * struct vgic_its_abi - ITS abi ops and settings + * @cte_esz: collection table entry size + * @dte_esz: device table entry size + * @ite_esz: interrupt translation table entry size + * @save tables: save the ITS tables into guest RAM + * @restore_tables: restore the ITS internal structs from tables + * stored in guest RAM + * @commit: initialize the registers which expose the ABI settings, + * especially the entry sizes + */ +struct vgic_its_abi { + int cte_esz; + int dte_esz; + int ite_esz; + int (*save_tables)(struct vgic_its *its); + int (*restore_tables)(struct vgic_its *its); + int (*commit)(struct vgic_its *its); +}; + +static const struct vgic_its_abi its_table_abi_versions[] = { + [0] = {.cte_esz = 8, .dte_esz = 8, .ite_esz = 8, + .save_tables = vgic_its_save_tables_v0, + .restore_tables = vgic_its_restore_tables_v0, + .commit = vgic_its_commit_v0, + }, +}; + +#define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions) + +inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) +{ + return &its_table_abi_versions[its->abi_rev]; +} + +int vgic_its_set_abi(struct vgic_its *its, int rev) +{ + const struct vgic_its_abi *abi; + + its->abi_rev = rev; + abi = vgic_its_get_abi(its); + return abi->commit(its); +} + /* * Find and returns a device in the device table for an ITS. * Must be called with the its_lock mutex held. @@ -364,6 +412,7 @@ static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { + const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 reg = GITS_TYPER_PLPIS; /* @@ -376,6 +425,7 @@ static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, */ reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT; reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT; + reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; return extract_bytes(reg, addr & 7, len); } @@ -1268,6 +1318,7 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, gpa_t addr, unsigned int len, unsigned long val) { + const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 entry_size, device_type; u64 reg, *regptr, clearbits = 0; @@ -1278,12 +1329,12 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, switch (BASER_INDEX(addr)) { case 0: regptr = &its->baser_device_table; - entry_size = 8; + entry_size = abi->dte_esz; device_type = GITS_BASER_TYPE_DEVICE; break; case 1: regptr = &its->baser_coll_table; - entry_size = 8; + entry_size = abi->cte_esz; device_type = GITS_BASER_TYPE_COLLECTION; clearbits = GITS_BASER_INDIRECT; break; @@ -1425,7 +1476,6 @@ static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its) (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ - ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | \ GITS_BASER_PAGE_SIZE_64K) #define INITIAL_PROPBASER_VALUE \ @@ -1465,7 +1515,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) dev->private = its; - return 0; + return vgic_its_set_abi(its, NR_ITS_ABIS - 1); } static void vgic_its_destroy(struct kvm_device *kvm_dev) @@ -1592,6 +1642,41 @@ out: return ret; } +/** + * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM + * according to v0 ABI + */ +static int vgic_its_save_tables_v0(struct vgic_its *its) +{ + return -ENXIO; +} + +/** 
+ * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM + * to internal data structs according to V0 ABI + * + */ +static int vgic_its_restore_tables_v0(struct vgic_its *its) +{ + return -ENXIO; +} + +static int vgic_its_commit_v0(struct vgic_its *its) +{ + const struct vgic_its_abi *abi; + + abi = vgic_its_get_abi(its); + its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + + its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + + its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + return 0; +} + static int vgic_its_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { -- cgit v1.2.3-70-g09d2 From ab01c6bdacc43c41c6b326889f4358f5afc38bf9 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 23 Mar 2017 15:14:00 +0100 Subject: KVM: arm64: vgic-its: Implement vgic_mmio_uaccess_write_its_iidr The GITS_IIDR revision field is used to encode the migration ABI revision. So we need to restore it to check the table layout is readable by the destination. By writing the IIDR, userspace thus forces the ABI revision to be used and this must be less than or equal to the max revision KVM supports. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic-v3.h | 5 +++++ virt/kvm/arm/vgic/vgic-its.c | 23 ++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 81ebe437ccc3..2eaea308f003 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -242,6 +242,11 @@ #define GITS_TYPER_PTA (1UL << 19) #define GITS_TYPER_HWCOLLCNT_SHIFT 24 +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 + #define GITS_CBASER_VALID (1ULL << 63) #define GITS_CBASER_SHAREABILITY_SHIFT (10) #define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 4f6ea46c496c..9338efb79a54 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -434,7 +434,23 @@ static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { - return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); + u32 val; + + val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK; + val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM; + return val; +} + +static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 rev = GITS_IIDR_REV(val); + + if (rev >= NR_ITS_ABIS) + return -EINVAL; + return vgic_its_set_abi(its, rev); } static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, @@ -1415,8 +1431,9 @@ static struct vgic_register_region its_registers[] = { REGISTER_ITS_DESC(GITS_CTLR, vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_IIDR, - vgic_mmio_read_its_iidr, its_mmio_write_wi, 4, + REGISTER_ITS_DESC_UACCESS(GITS_IIDR, + vgic_mmio_read_its_iidr, its_mmio_write_wi, + vgic_mmio_uaccess_write_its_iidr, 4, VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_TYPER, vgic_mmio_read_its_typer, its_mmio_write_wi, 8, -- cgit v1.2.3-70-g09d2 From 
0d44cdb631ef53ea75be056886cf0541311e48df Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 22 Dec 2016 18:14:14 +0100 Subject: KVM: arm64: vgic-its: Interpret MAPD Size field and check related errors Up to now the MAPD's ITT size field has been ignored. It encodes the number of eventid bit minus 1. It should be used to check the eventid when a MAPTI command is issued on a device. Let's store the number of eventid bits in the its_device and do the check on MAPTI. Also make sure the ITT size field does not exceed the GITS_TYPER IDBITS field. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Reviewed-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 2 ++ virt/kvm/arm/vgic/vgic-its.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 2eaea308f003..be8bad00c419 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -347,9 +347,11 @@ #define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 #define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 #define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 #define E_ITS_MAPC_PROCNUM_OOR 0x010902 #define E_ITS_MAPC_COLLECTION_OOR 0x010903 #define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 #define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 #define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 #define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 9338efb79a54..031f6abd50fd 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -103,6 +103,7 @@ struct its_device { /* the head for the list of ITTEs */ struct list_head itt_head; + u32 num_eventid_bits; u32 device_id; }; @@ -224,6 +225,8 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, #define GIC_LPI_OFFSET 8192 +#define VITS_TYPER_IDBITS 16 + /* * Finds and returns a collection in the ITS collection table. * Must be called with the its_lock mutex held. @@ -424,7 +427,7 @@ static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, * DevBits low - as least for the time being. 
*/ reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT; - reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT; + reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT; reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; return extract_bytes(reg, addr & 7, len); @@ -595,6 +598,7 @@ static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) #define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) #define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) +#define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1) #define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) #define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) #define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) @@ -785,6 +789,9 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, if (!device) return E_ITS_MAPTI_UNMAPPED_DEVICE; + if (event_id >= BIT_ULL(device->num_eventid_bits)) + return E_ITS_MAPTI_ID_OOR; + if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) lpi_nr = its_cmd_get_physical_id(its_cmd); else @@ -865,11 +872,15 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, { u32 device_id = its_cmd_get_deviceid(its_cmd); bool valid = its_cmd_get_validbit(its_cmd); + u8 num_eventid_bits = its_cmd_get_size(its_cmd); struct its_device *device; if (!vgic_its_check_id(its, its->baser_device_table, device_id)) return E_ITS_MAPD_DEVICE_OOR; + if (valid && num_eventid_bits > VITS_TYPER_IDBITS) + return E_ITS_MAPD_ITTSIZE_OOR; + device = find_its_device(its, device_id); /* @@ -892,6 +903,8 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, return -ENOMEM; device->device_id = device_id; + device->num_eventid_bits = num_eventid_bits; + INIT_LIST_HEAD(&device->itt_head); list_add_tail(&device->dev_list, &its->device_list); -- cgit v1.2.3-70-g09d2 From 44de9d683847ba6dbac290bb8c9f1b773cbda746 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 4 May 2017 11:19:52 +0200 Subject: KVM: arm64: vgic-v3: vgic_v3_lpi_sync_pending_status this new helper synchronizes the irq pending_latch with the LPI pending bit status found in rdist pending table. As the status is consumed, we reset the bit in pending table. As we need the PENDBASER_ADDRESS() in vgic-v3, let's move its definition in the irqchip header. We restore the full length of the field, ie [51:16]. Same for PROPBASER_ADDRESS with full field length of [51:12]. 
Signed-off-by: Eric Auger Reviewed-by: Marc Zyngier Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic-v3.h | 2 ++ virt/kvm/arm/vgic/vgic-its.c | 6 ++---- virt/kvm/arm/vgic/vgic-v3.c | 44 ++++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.h | 1 + 4 files changed, 49 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index be8bad00c419..fffb91202bc9 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -159,6 +159,8 @@ #define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) #define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) #define GICR_PENDBASER_SHAREABILITY_SHIFT (10) #define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index bd1362e9c214..3601790d841e 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -221,8 +221,6 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, */ #define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) #define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) -#define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) -#define PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) #define GIC_LPI_OFFSET 8192 @@ -257,7 +255,7 @@ static struct its_collection *find_collection(struct vgic_its *its, int coll_id) static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, struct kvm_vcpu *filter_vcpu) { - u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); + u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); u8 prop; int ret; @@ -369,7 +367,7 @@ static u32 max_lpis_propbaser(u64 propbaser) */ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) { - gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); struct vgic_irq *irq; int last_byte_offset = -1; int ret = 0; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index df1503650300..9ae8adddff69 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -234,6 +234,50 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_hcr = ICH_HCR_EN; } +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) +{ + struct kvm_vcpu *vcpu; + int byte_offset, bit_nr; + gpa_t pendbase, ptr; + bool status; + u8 val; + int ret; + +retry: + vcpu = irq->target_vcpu; + if (!vcpu) + return 0; + + pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + + byte_offset = irq->intid / BITS_PER_BYTE; + bit_nr = irq->intid % BITS_PER_BYTE; + ptr = pendbase + byte_offset; + + ret = kvm_read_guest(kvm, ptr, &val, 1); + if (ret) + return ret; + + status = val & (1 << bit_nr); + + spin_lock(&irq->irq_lock); + if (irq->target_vcpu != vcpu) { + spin_unlock(&irq->irq_lock); + goto retry; + } + irq->pending_latch = status; + vgic_queue_irq_unlock(vcpu->kvm, irq); + + if (status) { + /* clear consumed data */ + val &= ~(1 << bit_nr); + ret = kvm_write_guest(kvm, ptr, &val, 1); + if (ret) + return ret; + } + return 0; +} + /* check for overlapping regions and for regions crossing the end of memory */ static bool vgic_v3_check_base(struct kvm *kvm) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index a48f33b04b2d..79768c801863 100644 --- 
a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -149,6 +149,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); void vgic_v3_load(struct kvm_vcpu *vcpu); -- cgit v1.2.3-70-g09d2 From ef51042472f55b325fd7f2b26a2e29fd89757234 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 8 May 2017 10:55:27 -0700 Subject: block, dax: move "select DAX" from BLOCK to FS_DAX For configurations that do not enable DAX filesystems or drivers, do not require the DAX core to be built. Given that the 'direct_access' method has been removed from 'block_device_operations', we can also go ahead and remove the block-related dax helper functions from fs/block_dev.c to drivers/dax/super.c. This keeps dax details out of the block layer and lets the DAX core be built as a module in the FS_DAX=n case. Filesystems need to include dax.h to call bdev_dax_supported(). Cc: linux-xfs@vger.kernel.org Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Matthew Wilcox Cc: Alexander Viro Cc: "Darrick J. Wong" Cc: Ross Zwisler Reviewed-by: Jan Kara Reported-by: Geert Uytterhoeven Signed-off-by: Dan Williams --- block/Kconfig | 1 - drivers/dax/super.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/Kconfig | 1 + fs/block_dev.c | 66 ----------------------------------------------- fs/ext2/super.c | 1 + fs/ext4/super.c | 1 + fs/xfs/xfs_super.c | 1 + include/linux/blkdev.h | 2 -- include/linux/dax.h | 30 ++++++++++++++++++++-- 9 files changed, 102 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/block/Kconfig b/block/Kconfig index 93da7fc3f254..e9f780f815f5 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -6,7 +6,6 @@ menuconfig BLOCK default y select SBITMAP select SRCU - select DAX help Provide block layer support for the kernel. diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 465dcd7317d5..e8998d15f3cd 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,75 @@ void dax_read_unlock(int id) } EXPORT_SYMBOL_GPL(dax_read_unlock); +int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) +{ + phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; + + if (pgoff) + *pgoff = PHYS_PFN(phys_off); + if (phys_off % PAGE_SIZE || size % PAGE_SIZE) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL(bdev_dax_pgoff); + +/** + * __bdev_dax_supported() - Check if the device supports dax for filesystem + * @sb: The superblock of the device + * @blocksize: The block size of the device + * + * This is a library function for filesystems to check if the block device + * can be mounted with dax option. + * + * Return: negative errno if unsupported, 0 if supported. 
+ */ +int __bdev_dax_supported(struct super_block *sb, int blocksize) +{ + struct block_device *bdev = sb->s_bdev; + struct dax_device *dax_dev; + pgoff_t pgoff; + int err, id; + void *kaddr; + pfn_t pfn; + long len; + + if (blocksize != PAGE_SIZE) { + pr_err("VFS (%s): error: unsupported blocksize for dax\n", + sb->s_id); + return -EINVAL; + } + + err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); + if (err) { + pr_err("VFS (%s): error: unaligned partition for dax\n", + sb->s_id); + return err; + } + + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) { + pr_err("VFS (%s): error: device does not support dax\n", + sb->s_id); + return -EOPNOTSUPP; + } + + id = dax_read_lock(); + len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); + dax_read_unlock(id); + + put_dax(dax_dev); + + if (len < 1) { + pr_err("VFS (%s): error: dax access failed (%ld)", + sb->s_id, len); + return len < 0 ? len : -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(__bdev_dax_supported); + /** * struct dax_device - anchor object for dax services * @inode: core vfs diff --git a/fs/Kconfig b/fs/Kconfig index 83eab52fb3f6..b0e42b6a96b9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -39,6 +39,7 @@ config FS_DAX depends on MMU depends on !(ARM || MIPS || SPARC) select FS_IOMAP + select DAX help Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, diff --git a/fs/block_dev.c b/fs/block_dev.c index 666367e13711..3096ecd48304 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -718,72 +718,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); -int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, - pgoff_t *pgoff) -{ - phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; - - if (pgoff) - *pgoff = PHYS_PFN(phys_off); - if (phys_off % PAGE_SIZE || size % PAGE_SIZE) - return -EINVAL; - return 0; -} -EXPORT_SYMBOL(bdev_dax_pgoff); - -/** - * bdev_dax_supported() - Check if the device supports dax for filesystem - * @sb: The superblock of the device - * @blocksize: The block size of the device - * - * This is a library function for filesystems to check if the block device - * can be mounted with dax option. - * - * Return: negative errno if unsupported, 0 if supported. - */ -int bdev_dax_supported(struct super_block *sb, int blocksize) -{ - struct block_device *bdev = sb->s_bdev; - struct dax_device *dax_dev; - pgoff_t pgoff; - int err, id; - void *kaddr; - pfn_t pfn; - long len; - - if (blocksize != PAGE_SIZE) { - vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); - return -EINVAL; - } - - err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); - if (err) { - vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax"); - return err; - } - - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); - if (!dax_dev) { - vfs_msg(sb, KERN_ERR, "error: device does not support dax"); - return -EOPNOTSUPP; - } - - id = dax_read_lock(); - len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); - dax_read_unlock(id); - - put_dax(dax_dev); - - if (len < 1) { - vfs_msg(sb, KERN_ERR, - "error: dax access failed (%ld)", len); - return len < 0 ? 
len : -EIO; - } - - return 0; -} -EXPORT_SYMBOL_GPL(bdev_dax_supported); - /* * pseudo-fs */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9e25a71fe1a2..d07773b81da9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9448db1cf7e..bf6bb8997124 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 685c042a120f..f5c58d6dcafb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -52,6 +52,7 @@ #include "xfs_reflink.h" #include +#include #include #include #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 848f87eb1905..e4d9899755a7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1940,8 +1940,6 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -extern int bdev_dax_supported(struct super_block *, int); -int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/dax.h b/include/linux/dax.h index d3158e74a59e..7fdf1d710042 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -18,12 +18,38 @@ struct dax_operations { void **, pfn_t *); }; +int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); +#if IS_ENABLED(CONFIG_FS_DAX) +int __bdev_dax_supported(struct super_block *sb, int blocksize); +static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + return __bdev_dax_supported(sb, blocksize); +} +#else +static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + return -EOPNOTSUPP; +} +#endif + +#if IS_ENABLED(CONFIG_DAX) +struct dax_device *dax_get_by_host(const char *host); +void put_dax(struct dax_device *dax_dev); +#else +static inline struct dax_device *dax_get_by_host(const char *host) +{ + return NULL; +} + +static inline void put_dax(struct dax_device *dax_dev) +{ +} +#endif + int dax_read_lock(void); void dax_read_unlock(int id); -struct dax_device *dax_get_by_host(const char *host); struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops); -void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); -- cgit v1.2.3-70-g09d2 From e092693443b995c8e3a565a73b5fdb05f1260f9b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 8 May 2017 18:02:24 -0400 Subject: NFS append COMMIT after synchronous COPY Instead of messing with the commit path which has been causing issues, add a COMMIT op after the COPY and ask for stable copies in the first space. It saves a round trip, since after the COPY, the client sends a COMMIT anyway. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 1 - fs/nfs/nfs42proc.c | 21 +++++++++++++++------ fs/nfs/nfs42xdr.c | 22 ++++++++++++++++++++-- fs/nfs/write.c | 30 ------------------------------ include/linux/nfs_xdr.h | 1 + 5 files changed, 36 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 31b26cf1b476..e9b4c3320e37 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -495,7 +495,6 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 87f5b7b971ca..929d09a5310a 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -167,23 +167,29 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (status) return status; + res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + if (!res->commit_res.verf) + return -ENOMEM; status = nfs4_call_sync(server->client, server, &msg, &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) - return status; + goto out; - if (res->write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res->write_res.verifier.verifier); - if (status) - return status; + if (!nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + &res->commit_res.verf->verifier)) { + status = -EAGAIN; + goto out; } truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - return res->write_res.count; + status = res->write_res.count; +out: + kfree(res->commit_res.verf); + return status; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -240,6 +246,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; + } if (err == -EAGAIN) { + dst_exception.retry = 1; + continue; } err2 = nfs4_handle_exception(server, err, &src_exception); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6c7296454bbc..528362f69cc1 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -66,12 +66,14 @@ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_copy_maxsz) + encode_copy_maxsz + \ + encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_copy_maxsz) + decode_copy_maxsz + \ + decode_commit_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -222,6 +224,18 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } +static void encode_copy_commit(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->dst_pos); + *p = cpu_to_be32(args->count); +} + /* * Encode COPY request */ @@ -239,6 +253,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->dst_fh, &hdr); encode_copy(xdr, args, &hdr); + encode_copy_commit(xdr, args, &hdr); encode_nops(&hdr); } @@ -481,6 
+496,9 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, if (status) goto out; status = decode_copy(xdr, res); + if (status) + goto out; + status = decode_commit(xdr, &res->commit_res); out: return status; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 59e21cc0a266..85bfa416fcc8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1742,36 +1742,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, data->mds_ops, how, 0); } -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) -{ - struct inode *inode = file_inode(file); - struct nfs_open_context *open; - struct nfs_commit_info cinfo; - struct nfs_page *req; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - goto out_put; - } - - nfs_init_cinfo_from_inode(&cinfo, inode); - - memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); - nfs_request_add_commit_list(req, &cinfo); - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret > 0) - ret = 0; - - nfs_free_request(req); -out_put: - put_nfs_open_context(open); - return ret; -} -EXPORT_SYMBOL_GPL(nfs_commit_file); - /* * COMMIT call returned */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 677c6b91dfcd..b28c83475ee8 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1383,6 +1383,7 @@ struct nfs42_copy_res { struct nfs42_write_res write_res; bool consecutive; bool synchronous; + struct nfs_commitres commit_res; }; struct nfs42_seek_args { -- cgit v1.2.3-70-g09d2 From 497d72d80a789501501cccabdad6b145f9e31371 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 8 May 2017 20:38:40 +0200 Subject: KVM: Add kvm_vcpu_get_idx to get vcpu index in kvm->vcpus There are occasional needs to use the index of vcpu in the kvm->vcpus array to map something related to a VCPU. For example, unlike the vcpu->vcpu_id, the vcpu index is guaranteed to not be sparse across all vcpus which is useful when allocating a memory area for each vcpu. Signed-off-by: Christoffer Dall Reviewed-by: Eric Auger --- include/linux/kvm_host.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..12eb26d665e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -490,6 +490,17 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } +static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *tmp; + int idx; + + kvm_for_each_vcpu(idx, tmp, vcpu->kvm) + if (tmp == vcpu) + return idx; + BUG(); +} + #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ -- cgit v1.2.3-70-g09d2 From ea47dd191d543f81e0912b5dc0471b48346b016e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 19 Apr 2017 05:12:18 +1000 Subject: of/fdt: introduce of_scan_flat_dt_subnodes and of_get_flat_dt_phandle Introduce primitives for FDT parsing. These will be used for powerpc cpufeatures node scanning, which has quite complex structure but should be processed early. 
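As a rough usage sketch of the new primitives (the node layout, the "enable" property and the callback below are hypothetical; of_scan_flat_dt_subnodes(), of_get_flat_dt_phandle() and the pre-existing of_get_flat_dt_prop() are the only pieces taken from the patch), an early-boot scan over the subnodes of a parent node could look like this:

#include <linux/init.h>
#include <linux/of_fdt.h>
#include <linux/printk.h>

/* Hypothetical callback: called once for each subnode of the parent passed
 * to of_scan_flat_dt_subnodes(); a non-zero return stops the walk early. */
static int __init demo_scan_one_feature(unsigned long node, const char *uname,
					void *data)
{
	/* Properties are still read with the existing flat-DT helpers. */
	if (!of_get_flat_dt_prop(node, "enable", NULL))
		return 0;	/* property absent: keep scanning */

	pr_info("feature node %s: phandle %u\n",
		uname, of_get_flat_dt_phandle(node));
	return 0;
}

static void __init demo_scan_features(unsigned long parent)
{
	of_scan_flat_dt_subnodes(parent, demo_scan_one_feature, NULL);
}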
Cc: devicetree@vger.kernel.org Acked-by: Rob Herring Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- drivers/of/fdt.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/of_fdt.h | 6 ++++++ 2 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index e5ce4b59e162..961ca97072a9 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -753,6 +753,36 @@ int __init of_scan_flat_dt(int (*it)(unsigned long node, return rc; } +/** + * of_scan_flat_dt_subnodes - scan sub-nodes of a node call callback on each. + * @it: callback function + * @data: context data pointer + * + * This function is used to scan sub-nodes of a node. + */ +int __init of_scan_flat_dt_subnodes(unsigned long parent, + int (*it)(unsigned long node, + const char *uname, + void *data), + void *data) +{ + const void *blob = initial_boot_params; + int node; + + fdt_for_each_subnode(node, blob, parent) { + const char *pathp; + int rc; + + pathp = fdt_get_name(blob, node, NULL); + if (*pathp == '/') + pathp = kbasename(pathp); + rc = it(node, pathp, data); + if (rc) + return rc; + } + return 0; +} + /** * of_get_flat_dt_subnode_by_name - get the subnode by given name * @@ -812,6 +842,14 @@ int __init of_flat_dt_match(unsigned long node, const char *const *compat) return of_fdt_match(initial_boot_params, node, compat); } +/** + * of_get_flat_dt_prop - Given a node in the flat blob, return the phandle + */ +uint32_t __init of_get_flat_dt_phandle(unsigned long node) +{ + return fdt_get_phandle(initial_boot_params, node); +} + struct fdt_scan_status { const char *name; int namelen; diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 271b3fdf0070..1dfbfd0d8040 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -54,6 +54,11 @@ extern char __dtb_end[]; extern int of_scan_flat_dt(int (*it)(unsigned long node, const char *uname, int depth, void *data), void *data); +extern int of_scan_flat_dt_subnodes(unsigned long node, + int (*it)(unsigned long node, + const char *uname, + void *data), + void *data); extern int of_get_flat_dt_subnode_by_name(unsigned long node, const char *uname); extern const void *of_get_flat_dt_prop(unsigned long node, const char *name, @@ -62,6 +67,7 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern int of_flat_dt_match(unsigned long node, const char *const *matches); extern unsigned long of_get_flat_dt_root(void); extern int of_get_flat_dt_size(void); +extern uint32_t of_get_flat_dt_phandle(unsigned long node); extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data); -- cgit v1.2.3-70-g09d2 From 9ea762a5ae45235eb3e5ec9c05c33cf37db78d70 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 30 Mar 2017 13:13:33 +0200 Subject: virtio: virtio_driver doc Add comments for the virtio_driver members that were not documented. Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index ed04753278d4..28b0e965360f 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -165,9 +165,13 @@ int virtio_device_restore(struct virtio_device *dev); * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. * @probe: the function to call when a device is found. 
Returns 0 or -errno. + * @scan: optional function to call after successful probe; intended + * for virtio-scsi to invoke a scan. * @remove: the function to call when a device is removed. * @config_changed: optional function to call when the device configuration * changes; may be called in interrupt context. + * @freeze: optional function to call during suspend/hibernation. + * @restore: optional function to call on resume. */ struct virtio_driver { struct device_driver driver; -- cgit v1.2.3-70-g09d2 From fb9de9704775d6190c204f4ddf8da4cfdac26be1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 7 Apr 2017 08:25:09 +0300 Subject: ptr_ring: batch ring zeroing A known weakness in ptr_ring design is that it does not handle well the situation when the ring is almost full: as entries are consumed they are immediately used again by the producer, so consumer and producer are writing to a shared cache line. To fix this, add batching to consume calls: as entries are consumed do not write NULL into the ring until we get a multiple (in current implementation 2x) of cache lines away from the producer. At that point, write them all out. We do the write out in the reverse order to keep producer from sharing cache with consumer for as long as possible. Writeout also triggers when the ring wraps around - there's no special reason to do this but it helps keep the code a bit simpler. What should we do if getting away from producer by 2 cache lines would mean we are keeping the ring more than half empty? Maybe we should reduce the batching in this case; the current patch simply reduces the batching. Notes: - it is no longer true that a call to consume guarantees that the following call to produce will succeed. No users seem to assume that. - batching can also in theory reduce the signalling rate: users that would previously send interrupts to the producer to wake it up after consuming each entry would now only need to do this once in a batch. Doing this would be easy by returning a flag to the caller. No users seem to do signalling on consume yet, so this was not implemented. Signed-off-by: Michael S.
Tsirkin Reviewed-by: Jesper Dangaard Brouer Acked-by: Jason Wang --- include/linux/ptr_ring.h | 63 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6c70444da3b9..6b2e0dd88569 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -34,11 +34,13 @@ struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; - int consumer ____cacheline_aligned_in_smp; + int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ + int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ + int batch; /* number of entries to consume in a batch */ void **queue; }; @@ -170,7 +172,7 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) - return r->queue[r->consumer]; + return r->queue[r->consumer_head]; return NULL; } @@ -231,9 +233,38 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r) /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { - r->queue[r->consumer++] = NULL; - if (unlikely(r->consumer >= r->size)) - r->consumer = 0; + /* Fundamentally, what we want to do is update consumer + * index and zero out the entry so producer can reuse it. + * Doing it naively at each consume would be as simple as: + * r->queue[r->consumer++] = NULL; + * if (unlikely(r->consumer >= r->size)) + * r->consumer = 0; + * but that is suboptimal when the ring is full as producer is writing + * out new entries in the same cache line. Defer these updates until a + * batch of entries has been consumed. + */ + int head = r->consumer_head++; + + /* Once we have processed enough entries invalidate them in + * the ring all at once so producer can reuse their space in the ring. + * We also do this when we reach end of the ring - not mandatory + * but helps keep the implementation simple. + */ + if (unlikely(r->consumer_head - r->consumer_tail >= r->batch || + r->consumer_head >= r->size)) { + /* Zero out entries in the reverse order: this way we touch the + * cache line that producer might currently be reading the last; + * producer won't make progress and touch other cache lines + * besides the first one until we write out all entries. + */ + while (likely(head >= r->consumer_tail)) + r->queue[head--] = NULL; + r->consumer_tail = r->consumer_head; + } + if (unlikely(r->consumer_head >= r->size)) { + r->consumer_head = 0; + r->consumer_tail = 0; + } } static inline void *__ptr_ring_consume(struct ptr_ring *r) @@ -345,14 +376,27 @@ static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp) return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp); } +static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) +{ + r->size = size; + r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); + /* We need to set batch at least to 1 to make logic + * in __ptr_ring_discard_one work correctly. + * Batching too much (because ring is small) would cause a lot of + * burstiness. Needs tuning, for now disable batching. 
+ */ + if (r->batch > r->size / 2 || !r->batch) + r->batch = 1; +} + static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc(size, gfp); if (!r->queue) return -ENOMEM; - r->size = size; - r->producer = r->consumer = 0; + __ptr_ring_set_size(r, size); + r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); @@ -373,9 +417,10 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, else if (destroy) destroy(ptr); - r->size = size; + __ptr_ring_set_size(r, size); r->producer = producer; - r->consumer = 0; + r->consumer_head = 0; + r->consumer_tail = 0; old = r->queue; r->queue = queue; -- cgit v1.2.3-70-g09d2 From 3ae3d67ba705c754a3c91ac009f9ce73a0e7286a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 10 May 2017 15:01:30 -0600 Subject: libnvdimm: add an atomic vs process context flag to rw_bytes nsio_rw_bytes can clear media errors, but this cannot be done while we are in an atomic context due to locking within ACPI. From the BTT, ->rw_bytes may be called either from atomic or process context depending on whether the calls happen during initialization or during IO. During init, we want to ensure error clearing happens, and the flag marking process context allows nsio_rw_bytes to do that. When called during IO, we're in atomic context, and error clearing can be skipped. Cc: Dan Williams Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/blk.c | 3 ++- drivers/nvdimm/btt.c | 67 +++++++++++++++++++++++++---------------------- drivers/nvdimm/btt_devs.c | 2 +- drivers/nvdimm/claim.c | 6 +++-- drivers/nvdimm/nd.h | 1 + drivers/nvdimm/pfn_devs.c | 4 +-- include/linux/nd.h | 12 +++++---- 7 files changed, 53 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 9faaa9694d87..822198a75e96 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -218,7 +218,8 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) } static int nsblk_rw_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *iobuf, size_t n, int rw) + resource_size_t offset, void *iobuf, size_t n, int rw, + unsigned long flags) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev); struct nd_blk_region *ndbr = to_ndbr(nsblk); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 368795aad5c9..aa977cd4869d 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -32,25 +32,25 @@ enum log_ent_request { }; static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, - void *buf, size_t n) + void *buf, size_t n, unsigned long flags) { struct nd_btt *nd_btt = arena->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; /* arena offsets are 4K from the base of the device */ offset += SZ_4K; - return nvdimm_read_bytes(ndns, offset, buf, n); + return nvdimm_read_bytes(ndns, offset, buf, n, flags); } static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, - void *buf, size_t n) + void *buf, size_t n, unsigned long flags) { struct nd_btt *nd_btt = arena->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; /* arena offsets are 4K from the base of the device */ offset += SZ_4K; - return nvdimm_write_bytes(ndns, offset, buf, n); + return nvdimm_write_bytes(ndns, offset, buf, n, flags); } static int btt_info_write(struct arena_info *arena, struct btt_sb *super) @@ -58,19 +58,19 @@ 
static int btt_info_write(struct arena_info *arena, struct btt_sb *super) int ret; ret = arena_write_bytes(arena, arena->info2off, super, - sizeof(struct btt_sb)); + sizeof(struct btt_sb), 0); if (ret) return ret; return arena_write_bytes(arena, arena->infooff, super, - sizeof(struct btt_sb)); + sizeof(struct btt_sb), 0); } static int btt_info_read(struct arena_info *arena, struct btt_sb *super) { WARN_ON(!super); return arena_read_bytes(arena, arena->infooff, super, - sizeof(struct btt_sb)); + sizeof(struct btt_sb), 0); } /* @@ -79,16 +79,17 @@ static int btt_info_read(struct arena_info *arena, struct btt_sb *super) * mapping is in little-endian * mapping contains 'E' and 'Z' flags as desired */ -static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping) +static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping, + unsigned long flags) { u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); WARN_ON(lba >= arena->external_nlba); - return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE); + return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags); } static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, - u32 z_flag, u32 e_flag) + u32 z_flag, u32 e_flag, unsigned long rwb_flags) { u32 ze; __le32 mapping_le; @@ -127,11 +128,11 @@ static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, } mapping_le = cpu_to_le32(mapping); - return __btt_map_write(arena, lba, mapping_le); + return __btt_map_write(arena, lba, mapping_le, rwb_flags); } static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, - int *trim, int *error) + int *trim, int *error, unsigned long rwb_flags) { int ret; __le32 in; @@ -140,7 +141,7 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, WARN_ON(lba >= arena->external_nlba); - ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE); + ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags); if (ret) return ret; @@ -189,7 +190,7 @@ static int btt_log_read_pair(struct arena_info *arena, u32 lane, WARN_ON(!ent); return arena_read_bytes(arena, arena->logoff + (2 * lane * LOG_ENT_SIZE), ent, - 2 * LOG_ENT_SIZE); + 2 * LOG_ENT_SIZE, 0); } static struct dentry *debugfs_root; @@ -335,7 +336,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane, * btt_flog_write is the wrapper for updating the freelist elements */ static int __btt_log_write(struct arena_info *arena, u32 lane, - u32 sub, struct log_entry *ent) + u32 sub, struct log_entry *ent, unsigned long flags) { int ret; /* @@ -350,13 +351,13 @@ static int __btt_log_write(struct arena_info *arena, u32 lane, void *src = ent; /* split the 16B write into atomic, durable halves */ - ret = arena_write_bytes(arena, ns_off, src, log_half); + ret = arena_write_bytes(arena, ns_off, src, log_half, flags); if (ret) return ret; ns_off += log_half; src += log_half; - return arena_write_bytes(arena, ns_off, src, log_half); + return arena_write_bytes(arena, ns_off, src, log_half, flags); } static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, @@ -364,7 +365,7 @@ static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, { int ret; - ret = __btt_log_write(arena, lane, sub, ent); + ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC); if (ret) return ret; @@ -397,7 +398,7 @@ static int btt_map_init(struct arena_info *arena) size_t size = min(mapsize, chunk_size); ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, - size); + size, 0); if (ret) goto 
free; @@ -428,10 +429,10 @@ static int btt_log_init(struct arena_info *arena) log.old_map = cpu_to_le32(arena->external_nlba + i); log.new_map = cpu_to_le32(arena->external_nlba + i); log.seq = cpu_to_le32(LOG_SEQ_INIT); - ret = __btt_log_write(arena, i, 0, &log); + ret = __btt_log_write(arena, i, 0, &log, 0); if (ret) return ret; - ret = __btt_log_write(arena, i, 1, &zerolog); + ret = __btt_log_write(arena, i, 1, &zerolog, 0); if (ret) return ret; } @@ -470,7 +471,7 @@ static int btt_freelist_init(struct arena_info *arena) /* Check if map recovery is needed */ ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry, - NULL, NULL); + NULL, NULL, 0); if (ret) return ret; if ((le32_to_cpu(log_new.new_map) != map_entry) && @@ -480,7 +481,7 @@ static int btt_freelist_init(struct arena_info *arena) * to complete the map write. So fix up the map. */ ret = btt_map_write(arena, le32_to_cpu(log_new.lba), - le32_to_cpu(log_new.new_map), 0, 0); + le32_to_cpu(log_new.new_map), 0, 0, 0); if (ret) return ret; } @@ -875,7 +876,7 @@ static int btt_data_read(struct arena_info *arena, struct page *page, u64 nsoff = to_namespace_offset(arena, lba); void *mem = kmap_atomic(page); - ret = arena_read_bytes(arena, nsoff, mem + off, len); + ret = arena_read_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC); kunmap_atomic(mem); return ret; @@ -888,7 +889,7 @@ static int btt_data_write(struct arena_info *arena, u32 lba, u64 nsoff = to_namespace_offset(arena, lba); void *mem = kmap_atomic(page); - ret = arena_write_bytes(arena, nsoff, mem + off, len); + ret = arena_write_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC); kunmap_atomic(mem); return ret; @@ -931,10 +932,12 @@ static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, mem = kmap_atomic(bv.bv_page); if (rw) ret = arena_write_bytes(arena, meta_nsoff, - mem + bv.bv_offset, cur_len); + mem + bv.bv_offset, cur_len, + NVDIMM_IO_ATOMIC); else ret = arena_read_bytes(arena, meta_nsoff, - mem + bv.bv_offset, cur_len); + mem + bv.bv_offset, cur_len, + NVDIMM_IO_ATOMIC); kunmap_atomic(mem); if (ret) @@ -976,7 +979,8 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, cur_len = min(btt->sector_size, len); - ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag); + ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag, + NVDIMM_IO_ATOMIC); if (ret) goto out_lane; @@ -1006,7 +1010,7 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, barrier(); ret = btt_map_read(arena, premap, &new_map, &t_flag, - &e_flag); + &e_flag, NVDIMM_IO_ATOMIC); if (ret) goto out_rtt; @@ -1093,7 +1097,8 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, } lock_map(arena, premap); - ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL); + ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL, + NVDIMM_IO_ATOMIC); if (ret) goto out_map; if (old_postmap >= arena->internal_nlba) { @@ -1110,7 +1115,7 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, if (ret) goto out_map; - ret = btt_map_write(arena, premap, new_postmap, 0, 0); + ret = btt_map_write(arena, premap, new_postmap, 0, 0, 0); if (ret) goto out_map; diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 4b76af2b8715..ae00dc0d9791 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -273,7 +273,7 @@ static int __nd_btt_probe(struct nd_btt *nd_btt, if (!btt_sb || !ndns || !nd_btt) return -ENODEV; - if (nvdimm_read_bytes(ndns, SZ_4K, 
btt_sb, sizeof(*btt_sb))) + if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0)) return -ENXIO; if (nvdimm_namespace_capacity(ndns) < SZ_16M) diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 93d128da1c92..7ceb5fa4f2a1 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -228,7 +228,8 @@ u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) EXPORT_SYMBOL(nd_sb_checksum); static int nsio_rw_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size, int rw) + resource_size_t offset, void *buf, size_t size, int rw, + unsigned long flags) { struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); @@ -259,7 +260,8 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, * work around this collision. */ if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) - && (!ndns->claim || !is_nd_btt(ndns->claim))) { + && !(flags & NVDIMM_IO_ATOMIC) + && !ndns->claim) { long cleared; cleared = nvdimm_clear_poison(&ndns->dev, diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 77d032192bf7..03852d738eec 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -31,6 +31,7 @@ enum { ND_MAX_LANES = 256, SECTOR_SHIFT = 9, INT_LBASIZE_ALIGNMENT = 64, + NVDIMM_IO_ATOMIC = 1, }; struct nd_poison { diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 335c8175410b..a6c403600d19 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -357,7 +357,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) if (!is_nd_pmem(nd_pfn->dev.parent)) return -ENODEV; - if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb))) + if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0)) return -ENXIO; if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0) @@ -662,7 +662,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); - return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)); + return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0); } /* diff --git a/include/linux/nd.h b/include/linux/nd.h index fa66aeed441a..194b8e002ea7 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -48,7 +48,7 @@ struct nd_namespace_common { struct device dev; struct device *claim; int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, - void *buf, size_t size, int rw); + void *buf, size_t size, int rw, unsigned long flags); }; static inline struct nd_namespace_common *to_ndns(struct device *dev) @@ -134,9 +134,10 @@ static inline struct nd_namespace_blk *to_nd_namespace_blk(const struct device * * @buf is up-to-date upon return from this routine. */ static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size) + resource_size_t offset, void *buf, size_t size, + unsigned long flags) { - return ndns->rw_bytes(ndns, offset, buf, size, READ); + return ndns->rw_bytes(ndns, offset, buf, size, READ, flags); } /** @@ -152,9 +153,10 @@ static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, * to media is handled internal to the @ndns driver, if at all. 
*/ static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size) + resource_size_t offset, void *buf, size_t size, + unsigned long flags) { - return ndns->rw_bytes(ndns, offset, buf, size, WRITE); + return ndns->rw_bytes(ndns, offset, buf, size, WRITE, flags); } #define MODULE_ALIAS_ND_DEVICE(type) \ -- cgit v1.2.3-70-g09d2 From 572e0ca9b909339fbe017aaff1a225efb6db3b61 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Fri, 12 May 2017 15:46:29 -0700 Subject: time: delete current_fs_time() All uses of the current_fs_time() function have been replaced by other time interfaces. And, its use cases can be fulfilled by current_time() or ktime_get_* variants. Link: http://lkml.kernel.org/r/1491613030-11599-13-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - kernel/time/time.c | 14 -------------- 2 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0ad325ed71e8..803e5a9b2654 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1431,7 +1431,6 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } -extern struct timespec current_fs_time(struct super_block *sb); extern struct timespec current_time(struct inode *inode); /* diff --git a/kernel/time/time.c b/kernel/time/time.c index 6574bba44b55..49c73c6ed648 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } -/** - * current_fs_time - Return FS time - * @sb: Superblock. - * - * Return the current time truncated to the time granularity supported by - * the fs. - */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - /* * Convert jiffies to milliseconds and back. * -- cgit v1.2.3-70-g09d2 From 8594a21cf7a8318baedbedc3fcd2967a17ddeec0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 12 May 2017 15:46:41 -0700 Subject: mm, vmalloc: fix vmalloc users tracking properly Commit 1f5307b1e094 ("mm, vmalloc: properly track vmalloc users") has pulled asm/pgtable.h include dependency to linux/vmalloc.h and that turned out to be a bad idea for some architectures. E.g. m68k fails with In file included from arch/m68k/include/asm/pgtable_mm.h:145:0, from arch/m68k/include/asm/pgtable.h:4, from include/linux/vmalloc.h:9, from arch/m68k/kernel/module.c:9: arch/m68k/include/asm/mcf_pgtable.h: In function 'nocache_page': >> arch/m68k/include/asm/mcf_pgtable.h:339:43: error: 'init_mm' undeclared (first use in this function) #define pgd_offset_k(address) pgd_offset(&init_mm, address) as spotted by kernel build bot. 
nios2 fails for other reason In file included from include/asm-generic/io.h:767:0, from arch/nios2/include/asm/io.h:61, from include/linux/io.h:25, from arch/nios2/include/asm/pgtable.h:18, from include/linux/mm.h:70, from include/linux/pid_namespace.h:6, from include/linux/ptrace.h:9, from arch/nios2/include/uapi/asm/elf.h:23, from arch/nios2/include/asm/elf.h:22, from include/linux/elf.h:4, from include/linux/module.h:15, from init/main.c:16: include/linux/vmalloc.h: In function '__vmalloc_node_flags': include/linux/vmalloc.h:99:40: error: 'PAGE_KERNEL' undeclared (first use in this function); did you mean 'GFP_KERNEL'? which is due to the newly added #include , which on nios2 includes and thus and which again includes . Tweaking that around just turns out a bigger headache than necessary. This patch reverts 1f5307b1e094 and reimplements the original fix in a different way. __vmalloc_node_flags can stay static inline which will cover vmalloc* functions. We only have one external user (kvmalloc_node) and we can export __vmalloc_node_flags_caller and provide the caller directly. This is much simpler and it doesn't really need any games with header files. [akpm@linux-foundation.org: coding-style fixes] [mhocko@kernel.org: revert old comment] Link: http://lkml.kernel.org/r/20170509211054.GB16325@dhcp22.suse.cz Fixes: 1f5307b1e094 ("mm, vmalloc: properly track vmalloc users") Link: http://lkml.kernel.org/r/20170509153702.GR6481@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Tobias Klauser Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 21 ++++++--------------- mm/util.c | 3 ++- mm/vmalloc.c | 19 ++++++++++++++++++- 3 files changed, 26 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0328ce003992..2d92dd002abd 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -6,7 +6,6 @@ #include #include #include /* pgprot_t */ -#include /* PAGE_KERNEL */ #include struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -83,22 +82,14 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller); #ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); -#else -extern void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); - -/* - * We really want to have this inlined due to caller tracking. This - * function is used by the highlevel vmalloc apis and so we want to track - * their callers and inlining will achieve that. 
- */ -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) +static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, + gfp_t flags, void *caller) { - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node_flags(size, node, flags); } +#else +extern void *__vmalloc_node_flags_caller(unsigned long size, + int node, gfp_t flags, void *caller); #endif extern void vfree(const void *addr); diff --git a/mm/util.c b/mm/util.c index 718154debc87..464df3489903 100644 --- a/mm/util.c +++ b/mm/util.c @@ -382,7 +382,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags(size, node, flags); + return __vmalloc_node_flags_caller(size, node, flags, + __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 194c22eccb9d..34a1c3e46ed7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1649,6 +1649,9 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -1791,7 +1794,7 @@ fail: * with mm people. * */ -void *__vmalloc_node(unsigned long size, unsigned long align, +static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller) { @@ -1806,6 +1809,20 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + + +void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, + void *caller) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size -- cgit v1.2.3-70-g09d2 From 4636e70bb0a8b871998b6841a2e4b205cf2bc863 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 12 May 2017 15:46:47 -0700 Subject: dax: prevent invalidation of mapped DAX entries Patch series "mm,dax: Fix data corruption due to mmap inconsistency", v4. This series fixes data corruption that can happen for DAX mounts when page faults race with write(2) and as a result page tables get out of sync with block mappings in the filesystem and thus data seen through mmap is different from data seen through read(2). The series passes testing with t_mmap_stale test program from Ross and also other mmap related tests on DAX filesystem. This patch (of 4): dax_invalidate_mapping_entry() currently removes DAX exceptional entries only if they are clean and unlocked. This is done via: invalidate_mapping_pages() invalidate_exceptional_entry() dax_invalidate_mapping_entry() However, for page cache pages removed in invalidate_mapping_pages() there is an additional criteria which is that the page must not be mapped. This is noted in the comments above invalidate_mapping_pages() and is checked in invalidate_inode_page(). For DAX entries this means that we can can end up in a situation where a DAX exceptional entry, either a huge zero page or a regular DAX entry, could end up mapped but without an associated radix tree entry. 
This is inconsistent with the rest of the DAX code and with what happens in the page cache case. We aren't able to unmap the DAX exceptional entry because according to its comments invalidate_mapping_pages() isn't allowed to block, and unmap_mapping_range() takes a write lock on the mapping->i_mmap_rwsem. Since we essentially never have unmapped DAX entries to evict from the radix tree, just remove dax_invalidate_mapping_entry(). Fixes: c6dcf52c23d2 ("mm: Invalidate DAX radix tree entries only if appropriate") Link: http://lkml.kernel.org/r/20170510085419.27601-2-jack@suse.cz Signed-off-by: Ross Zwisler Signed-off-by: Jan Kara Reported-by: Jan Kara Cc: Dan Williams Cc: [4.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 29 ----------------------------- include/linux/dax.h | 1 - mm/truncate.c | 9 +++------ 3 files changed, 3 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 66d79067eedf..38deebb8c86e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -460,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) return ret; } -/* - * Invalidate exceptional DAX entry if easily possible. This handles DAX - * entries for invalidate_inode_pages() so we evict the entry only if we can - * do so without blocking. - */ -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - int ret = 0; - void *entry, **slot; - struct radix_tree_root *page_tree = &mapping->page_tree; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || - slot_locked(mapping, slot)) - goto out; - if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto out; - radix_tree_delete(page_tree, index); - mapping->nrexceptional--; - ret = 1; -out: - spin_unlock_irq(&mapping->tree_lock); - if (ret) - dax_wake_mapping_entry_waiter(mapping, index, entry, true); - return ret; -} - /* * Invalidate exceptional DAX entry if it is clean. */ diff --git a/include/linux/dax.h b/include/linux/dax.h index d3158e74a59e..d1236d16ef00 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -63,7 +63,6 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); void dax_wake_mapping_entry_waiter(struct address_space *mapping, diff --git a/mm/truncate.c b/mm/truncate.c index 83a059e8cd1d..706cff171a15 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -67,17 +67,14 @@ static void truncate_exceptional_entry(struct address_space *mapping, /* * Invalidate exceptional entry if easily possible. This handles exceptional - * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and - * clean entries. + * entries for invalidate_inode_pages(). */ static int invalidate_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { - /* Handled by shmem itself */ - if (shmem_mapping(mapping)) + /* Handled by shmem itself, or for DAX we do nothing. 
*/ + if (shmem_mapping(mapping) || dax_mapping(mapping)) return 1; - if (dax_mapping(mapping)) - return dax_invalidate_mapping_entry(mapping, index); clear_shadow_entry(mapping, index, entry); return 1; } -- cgit v1.2.3-70-g09d2
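Looping back to the DAX API change at the start of this series, the bdev_dax_supported() wrapper introduced there is what a filesystem calls on its mount path when the user requests the dax option. A hypothetical mount-time check follows (the helper and the warning text are illustrative; the bdev_dax_supported() signature and its 0-or-negative-errno return convention are from the patch):

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/printk.h>

/* Hypothetical helper: reject (or let the caller fall back from) the dax
 * mount option when the underlying block device cannot support it. */
static int demo_check_dax_option(struct super_block *sb, bool want_dax)
{
	int err;

	if (!want_dax)
		return 0;

	err = bdev_dax_supported(sb, PAGE_SIZE);
	if (err)
		pr_warn("%s: dax requested, but the block device does not support DAX (%d)\n",
			sb->s_id, err);
	return err;
}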