From f3b19aa5cab65f7e73613aa37f6851ce56b794d1 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Fri, 27 Feb 2015 17:54:14 +0200
Subject: ARM: OMAP2+: clock: export driver API to setup/get clock features

As most of the clock driver support code is going to be moved under
drivers/clk/ti, an API for setting / getting the SoC specific clock
features is needed. This patch provides this API and changes the
existing code to use it.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 include/linux/clk/ti.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 79b76e13d904..1a7f86a68f62 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -338,6 +338,22 @@ int am43xx_dt_clk_init(void);
 int omap2420_dt_clk_init(void);
 int omap2430_dt_clk_init(void);
 
+struct ti_clk_features {
+	u32 flags;
+	long fint_min;
+	long fint_max;
+	long fint_band1_max;
+	long fint_band2_min;
+	u8 dpll_bypass_vals;
+	u8 cm_idlest_val;
+};
+
+#define TI_CLK_DPLL_HAS_FREQSEL			BIT(0)
+#define TI_CLK_DPLL4_DENY_REPROGRAM		BIT(1)
+
+void ti_clk_setup_features(struct ti_clk_features *features);
+const struct ti_clk_features *ti_clk_get_features(void);
+
 #ifdef CONFIG_OF
 void of_ti_clk_allow_autoidle_all(void);
 void of_ti_clk_deny_autoidle_all(void);
-- 
cgit v1.2.3-70-g09d2


From b138b0283d35bed0cd3353d7e39add8ac493eb37 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Mon, 2 Mar 2015 09:57:28 +0200
Subject: clk: ti: move generic OMAP DPLL implementation under drivers/clk

With the legacy clock data now gone, we can start moving OMAP clock
type implementations under clock driver. Start this with moving the
generic OMAP DPLL clock type under TI clock driver.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/Makefile    |   2 +-
 arch/arm/mach-omap2/clkt_dpll.c | 370 ----------------------------------------
 drivers/clk/ti/Makefile         |   3 +-
 drivers/clk/ti/clkt_dpll.c      | 369 +++++++++++++++++++++++++++++++++++++++
 drivers/clk/ti/clock.h          |   2 +
 include/linux/clk/ti.h          |   1 -
 6 files changed, 374 insertions(+), 373 deletions(-)
 delete mode 100644 arch/arm/mach-omap2/clkt_dpll.c
 create mode 100644 drivers/clk/ti/clkt_dpll.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index ec002bd4af77..fcb5d47f88ca 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -13,7 +13,7 @@ obj-y := id.o io.o control.o mux.o devices.o fb.o serial.o timer.o pm.o \
 hwmod-common				= omap_hwmod.o omap_hwmod_reset.o \
 					  omap_hwmod_common_data.o
 clock-common				= clock.o clock_common_data.o \
-					  clkt_dpll.o clkt_clksel.o
+					  clkt_clksel.o
 secure-common				= omap-smc.o omap-secure.o
 
 obj-$(CONFIG_ARCH_OMAP2) += $(omap-2-3-common) $(hwmod-common)
diff --git a/arch/arm/mach-omap2/clkt_dpll.c b/arch/arm/mach-omap2/clkt_dpll.c
deleted file mode 100644
index 82f0600c35f4..000000000000
--- a/arch/arm/mach-omap2/clkt_dpll.c
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * OMAP2/3/4 DPLL clock functions
- *
- * Copyright (C) 2005-2008 Texas Instruments, Inc.
- * Copyright (C) 2004-2010 Nokia Corporation
- *
- * Contacts:
- * Richard Woodruff <r-woodruff2@ti.com>
- * Paul Walmsley
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#undef DEBUG
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/clk-provider.h>
-#include <linux/io.h>
-
-#include <asm/div64.h>
-
-#include "clock.h"
-
-/* DPLL rate rounding: minimum DPLL multiplier, divider values */
-#define DPLL_MIN_MULTIPLIER		2
-#define DPLL_MIN_DIVIDER		1
-
-/* Possible error results from _dpll_test_mult */
-#define DPLL_MULT_UNDERFLOW		-1
-
-/*
- * Scale factor to mitigate roundoff errors in DPLL rate rounding.
- * The higher the scale factor, the greater the risk of arithmetic overflow,
- * but the closer the rounded rate to the target rate.  DPLL_SCALE_FACTOR
- * must be a power of DPLL_SCALE_BASE.
- */
-#define DPLL_SCALE_FACTOR		64
-#define DPLL_SCALE_BASE			2
-#define DPLL_ROUNDING_VAL		((DPLL_SCALE_BASE / 2) * \
-					 (DPLL_SCALE_FACTOR / DPLL_SCALE_BASE))
-
-/*
- * DPLL valid Fint frequency range for OMAP36xx and OMAP4xxx.
- * From device data manual section 4.3 "DPLL and DLL Specifications".
- */
-#define OMAP3PLUS_DPLL_FINT_JTYPE_MIN	500000
-#define OMAP3PLUS_DPLL_FINT_JTYPE_MAX	2500000
-
-/* _dpll_test_fint() return codes */
-#define DPLL_FINT_UNDERFLOW		-1
-#define DPLL_FINT_INVALID		-2
-
-/* Private functions */
-
-/*
- * _dpll_test_fint - test whether an Fint value is valid for the DPLL
- * @clk: DPLL struct clk to test
- * @n: divider value (N) to test
- *
- * Tests whether a particular divider @n will result in a valid DPLL
- * internal clock frequency Fint. See the 34xx TRM 4.7.6.2 "DPLL Jitter
- * Correction".  Returns 0 if OK, -1 if the enclosing loop can terminate
- * (assuming that it is counting N upwards), or -2 if the enclosing loop
- * should skip to the next iteration (again assuming N is increasing).
- */
-static int _dpll_test_fint(struct clk_hw_omap *clk, unsigned int n)
-{
-	struct dpll_data *dd;
-	long fint, fint_min, fint_max;
-	int ret = 0;
-
-	dd = clk->dpll_data;
-
-	/* DPLL divider must result in a valid jitter correction val */
-	fint = __clk_get_rate(__clk_get_parent(clk->hw.clk)) / n;
-
-	if (dd->flags & DPLL_J_TYPE) {
-		fint_min = OMAP3PLUS_DPLL_FINT_JTYPE_MIN;
-		fint_max = OMAP3PLUS_DPLL_FINT_JTYPE_MAX;
-	} else {
-		fint_min = ti_clk_get_features()->fint_min;
-		fint_max = ti_clk_get_features()->fint_max;
-	}
-
-	if (!fint_min || !fint_max) {
-		WARN(1, "No fint limits available!\n");
-		return DPLL_FINT_INVALID;
-	}
-
-	if (fint < ti_clk_get_features()->fint_min) {
-		pr_debug("rejecting n=%d due to Fint failure, lowering max_divider\n",
-			 n);
-		dd->max_divider = n;
-		ret = DPLL_FINT_UNDERFLOW;
-	} else if (fint > ti_clk_get_features()->fint_max) {
-		pr_debug("rejecting n=%d due to Fint failure, boosting min_divider\n",
-			 n);
-		dd->min_divider = n;
-		ret = DPLL_FINT_INVALID;
-	} else if (fint > ti_clk_get_features()->fint_band1_max &&
-		   fint < ti_clk_get_features()->fint_band2_min) {
-		pr_debug("rejecting n=%d due to Fint failure\n", n);
-		ret = DPLL_FINT_INVALID;
-	}
-
-	return ret;
-}
-
-static unsigned long _dpll_compute_new_rate(unsigned long parent_rate,
-					    unsigned int m, unsigned int n)
-{
-	unsigned long long num;
-
-	num = (unsigned long long)parent_rate * m;
-	do_div(num, n);
-	return num;
-}
-
-/*
- * _dpll_test_mult - test a DPLL multiplier value
- * @m: pointer to the DPLL m (multiplier) value under test
- * @n: current DPLL n (divider) value under test
- * @new_rate: pointer to storage for the resulting rounded rate
- * @target_rate: the desired DPLL rate
- * @parent_rate: the DPLL's parent clock rate
- *
- * This code tests a DPLL multiplier value, ensuring that the
- * resulting rate will not be higher than the target_rate, and that
- * the multiplier value itself is valid for the DPLL.  Initially, the
- * integer pointed to by the m argument should be prescaled by
- * multiplying by DPLL_SCALE_FACTOR.  The code will replace this with
- * a non-scaled m upon return.  This non-scaled m will result in a
- * new_rate as close as possible to target_rate (but not greater than
- * target_rate) given the current (parent_rate, n, prescaled m)
- * triple. Returns DPLL_MULT_UNDERFLOW in the event that the
- * non-scaled m attempted to underflow, which can allow the calling
- * function to bail out early; or 0 upon success.
- */
-static int _dpll_test_mult(int *m, int n, unsigned long *new_rate,
-			   unsigned long target_rate,
-			   unsigned long parent_rate)
-{
-	int r = 0, carry = 0;
-
-	/* Unscale m and round if necessary */
-	if (*m % DPLL_SCALE_FACTOR >= DPLL_ROUNDING_VAL)
-		carry = 1;
-	*m = (*m / DPLL_SCALE_FACTOR) + carry;
-
-	/*
-	 * The new rate must be <= the target rate to avoid programming
-	 * a rate that is impossible for the hardware to handle
-	 */
-	*new_rate = _dpll_compute_new_rate(parent_rate, *m, n);
-	if (*new_rate > target_rate) {
-		(*m)--;
-		*new_rate = 0;
-	}
-
-	/* Guard against m underflow */
-	if (*m < DPLL_MIN_MULTIPLIER) {
-		*m = DPLL_MIN_MULTIPLIER;
-		*new_rate = 0;
-		r = DPLL_MULT_UNDERFLOW;
-	}
-
-	if (*new_rate == 0)
-		*new_rate = _dpll_compute_new_rate(parent_rate, *m, n);
-
-	return r;
-}
-
-/**
- * _omap2_dpll_is_in_bypass - check if DPLL is in bypass mode or not
- * @v: bitfield value of the DPLL enable
- *
- * Checks given DPLL enable bitfield to see whether the DPLL is in bypass
- * mode or not. Returns 1 if the DPLL is in bypass, 0 otherwise.
- */
-static int _omap2_dpll_is_in_bypass(u32 v)
-{
-	u8 mask, val;
-
-	mask = ti_clk_get_features()->dpll_bypass_vals;
-
-	/*
-	 * Each set bit in the mask corresponds to a bypass value equal
-	 * to the bitshift. Go through each set-bit in the mask and
-	 * compare against the given register value.
-	 */
-	while (mask) {
-		val = __ffs(mask);
-		mask ^= (1 << val);
-		if (v == val)
-			return 1;
-	}
-
-	return 0;
-}
-
-/* Public functions */
-u8 omap2_init_dpll_parent(struct clk_hw *hw)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	u32 v;
-	struct dpll_data *dd;
-
-	dd = clk->dpll_data;
-	if (!dd)
-		return -EINVAL;
-
-	v = omap2_clk_readl(clk, dd->control_reg);
-	v &= dd->enable_mask;
-	v >>= __ffs(dd->enable_mask);
-
-	/* Reparent the struct clk in case the dpll is in bypass */
-	if (_omap2_dpll_is_in_bypass(v))
-		return 1;
-
-	return 0;
-}
-
-/**
- * omap2_get_dpll_rate - returns the current DPLL CLKOUT rate
- * @clk: struct clk * of a DPLL
- *
- * DPLLs can be locked or bypassed - basically, enabled or disabled.
- * When locked, the DPLL output depends on the M and N values.  When
- * bypassed, on OMAP2xxx, the output rate is either the 32KiHz clock
- * or sys_clk.  Bypass rates on OMAP3 depend on the DPLL: DPLLs 1 and
- * 2 are bypassed with dpll1_fclk and dpll2_fclk respectively
- * (generated by DPLL3), while DPLL 3, 4, and 5 bypass rates are sys_clk.
- * Returns the current DPLL CLKOUT rate (*not* CLKOUTX2) if the DPLL is
- * locked, or the appropriate bypass rate if the DPLL is bypassed, or 0
- * if the clock @clk is not a DPLL.
- */
-unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk)
-{
-	long long dpll_clk;
-	u32 dpll_mult, dpll_div, v;
-	struct dpll_data *dd;
-
-	dd = clk->dpll_data;
-	if (!dd)
-		return 0;
-
-	/* Return bypass rate if DPLL is bypassed */
-	v = omap2_clk_readl(clk, dd->control_reg);
-	v &= dd->enable_mask;
-	v >>= __ffs(dd->enable_mask);
-
-	if (_omap2_dpll_is_in_bypass(v))
-		return __clk_get_rate(dd->clk_bypass);
-
-	v = omap2_clk_readl(clk, dd->mult_div1_reg);
-	dpll_mult = v & dd->mult_mask;
-	dpll_mult >>= __ffs(dd->mult_mask);
-	dpll_div = v & dd->div1_mask;
-	dpll_div >>= __ffs(dd->div1_mask);
-
-	dpll_clk = (long long) __clk_get_rate(dd->clk_ref) * dpll_mult;
-	do_div(dpll_clk, dpll_div + 1);
-
-	return dpll_clk;
-}
-
-/* DPLL rate rounding code */
-
-/**
- * omap2_dpll_round_rate - round a target rate for an OMAP DPLL
- * @clk: struct clk * for a DPLL
- * @target_rate: desired DPLL clock rate
- *
- * Given a DPLL and a desired target rate, round the target rate to a
- * possible, programmable rate for this DPLL.  Attempts to select the
- * minimum possible n.  Stores the computed (m, n) in the DPLL's
- * dpll_data structure so set_rate() will not need to call this
- * (expensive) function again.  Returns ~0 if the target rate cannot
- * be rounded, or the rounded rate upon success.
- */
-long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
-		unsigned long *parent_rate)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	int m, n, r, scaled_max_m;
-	int min_delta_m = INT_MAX, min_delta_n = INT_MAX;
-	unsigned long scaled_rt_rp;
-	unsigned long new_rate = 0;
-	struct dpll_data *dd;
-	unsigned long ref_rate;
-	long delta;
-	long prev_min_delta = LONG_MAX;
-	const char *clk_name;
-
-	if (!clk || !clk->dpll_data)
-		return ~0;
-
-	dd = clk->dpll_data;
-
-	ref_rate = __clk_get_rate(dd->clk_ref);
-	clk_name = __clk_get_name(hw->clk);
-	pr_debug("clock: %s: starting DPLL round_rate, target rate %lu\n",
-		 clk_name, target_rate);
-
-	scaled_rt_rp = target_rate / (ref_rate / DPLL_SCALE_FACTOR);
-	scaled_max_m = dd->max_multiplier * DPLL_SCALE_FACTOR;
-
-	dd->last_rounded_rate = 0;
-
-	for (n = dd->min_divider; n <= dd->max_divider; n++) {
-
-		/* Is the (input clk, divider) pair valid for the DPLL? */
-		r = _dpll_test_fint(clk, n);
-		if (r == DPLL_FINT_UNDERFLOW)
-			break;
-		else if (r == DPLL_FINT_INVALID)
-			continue;
-
-		/* Compute the scaled DPLL multiplier, based on the divider */
-		m = scaled_rt_rp * n;
-
-		/*
-		 * Since we're counting n up, a m overflow means we
-		 * can bail out completely (since as n increases in
-		 * the next iteration, there's no way that m can
-		 * increase beyond the current m)
-		 */
-		if (m > scaled_max_m)
-			break;
-
-		r = _dpll_test_mult(&m, n, &new_rate, target_rate,
-				    ref_rate);
-
-		/* m can't be set low enough for this n - try with a larger n */
-		if (r == DPLL_MULT_UNDERFLOW)
-			continue;
-
-		/* skip rates above our target rate */
-		delta = target_rate - new_rate;
-		if (delta < 0)
-			continue;
-
-		if (delta < prev_min_delta) {
-			prev_min_delta = delta;
-			min_delta_m = m;
-			min_delta_n = n;
-		}
-
-		pr_debug("clock: %s: m = %d: n = %d: new_rate = %lu\n",
-			 clk_name, m, n, new_rate);
-
-		if (delta == 0)
-			break;
-	}
-
-	if (prev_min_delta == LONG_MAX) {
-		pr_debug("clock: %s: cannot round to rate %lu\n",
-			 clk_name, target_rate);
-		return ~0;
-	}
-
-	dd->last_rounded_m = min_delta_m;
-	dd->last_rounded_n = min_delta_n;
-	dd->last_rounded_rate = target_rate - prev_min_delta;
-
-	return dd->last_rounded_rate;
-}
-
diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile
index 105ffd0f5e79..62dae2ad3c69 100644
--- a/drivers/clk/ti/Makefile
+++ b/drivers/clk/ti/Makefile
@@ -1,6 +1,7 @@
 obj-y					+= clk.o autoidle.o clockdomain.o
 clk-common				= dpll.o composite.o divider.o gate.o \
-					  fixed-factor.o mux.o apll.o
+					  fixed-factor.o mux.o apll.o \
+					  clkt_dpll.o
 obj-$(CONFIG_SOC_AM33XX)		+= $(clk-common) clk-33xx.o
 obj-$(CONFIG_SOC_TI81XX)		+= $(clk-common) fapll.o clk-816x.o
 obj-$(CONFIG_ARCH_OMAP2)		+= $(clk-common) interface.o clk-2xxx.o
diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c
new file mode 100644
index 000000000000..a01fc7f305c1
--- /dev/null
+++ b/drivers/clk/ti/clkt_dpll.c
@@ -0,0 +1,369 @@
+/*
+ * OMAP2/3/4 DPLL clock functions
+ *
+ * Copyright (C) 2005-2008 Texas Instruments, Inc.
+ * Copyright (C) 2004-2010 Nokia Corporation
+ *
+ * Contacts:
+ * Richard Woodruff <r-woodruff2@ti.com>
+ * Paul Walmsley
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/clk-provider.h>
+#include <linux/io.h>
+#include <linux/clk/ti.h>
+
+#include <asm/div64.h>
+
+#include "clock.h"
+
+/* DPLL rate rounding: minimum DPLL multiplier, divider values */
+#define DPLL_MIN_MULTIPLIER		2
+#define DPLL_MIN_DIVIDER		1
+
+/* Possible error results from _dpll_test_mult */
+#define DPLL_MULT_UNDERFLOW		-1
+
+/*
+ * Scale factor to mitigate roundoff errors in DPLL rate rounding.
+ * The higher the scale factor, the greater the risk of arithmetic overflow,
+ * but the closer the rounded rate to the target rate.  DPLL_SCALE_FACTOR
+ * must be a power of DPLL_SCALE_BASE.
+ */
+#define DPLL_SCALE_FACTOR		64
+#define DPLL_SCALE_BASE			2
+#define DPLL_ROUNDING_VAL		((DPLL_SCALE_BASE / 2) * \
+					 (DPLL_SCALE_FACTOR / DPLL_SCALE_BASE))
+
+/*
+ * DPLL valid Fint frequency range for OMAP36xx and OMAP4xxx.
+ * From device data manual section 4.3 "DPLL and DLL Specifications".
+ */
+#define OMAP3PLUS_DPLL_FINT_JTYPE_MIN	500000
+#define OMAP3PLUS_DPLL_FINT_JTYPE_MAX	2500000
+
+/* _dpll_test_fint() return codes */
+#define DPLL_FINT_UNDERFLOW		-1
+#define DPLL_FINT_INVALID		-2
+
+/* Private functions */
+
+/*
+ * _dpll_test_fint - test whether an Fint value is valid for the DPLL
+ * @clk: DPLL struct clk to test
+ * @n: divider value (N) to test
+ *
+ * Tests whether a particular divider @n will result in a valid DPLL
+ * internal clock frequency Fint. See the 34xx TRM 4.7.6.2 "DPLL Jitter
+ * Correction".  Returns 0 if OK, -1 if the enclosing loop can terminate
+ * (assuming that it is counting N upwards), or -2 if the enclosing loop
+ * should skip to the next iteration (again assuming N is increasing).
+ */
+static int _dpll_test_fint(struct clk_hw_omap *clk, unsigned int n)
+{
+	struct dpll_data *dd;
+	long fint, fint_min, fint_max;
+	int ret = 0;
+
+	dd = clk->dpll_data;
+
+	/* DPLL divider must result in a valid jitter correction val */
+	fint = __clk_get_rate(__clk_get_parent(clk->hw.clk)) / n;
+
+	if (dd->flags & DPLL_J_TYPE) {
+		fint_min = OMAP3PLUS_DPLL_FINT_JTYPE_MIN;
+		fint_max = OMAP3PLUS_DPLL_FINT_JTYPE_MAX;
+	} else {
+		fint_min = ti_clk_get_features()->fint_min;
+		fint_max = ti_clk_get_features()->fint_max;
+	}
+
+	if (!fint_min || !fint_max) {
+		WARN(1, "No fint limits available!\n");
+		return DPLL_FINT_INVALID;
+	}
+
+	if (fint < ti_clk_get_features()->fint_min) {
+		pr_debug("rejecting n=%d due to Fint failure, lowering max_divider\n",
+			 n);
+		dd->max_divider = n;
+		ret = DPLL_FINT_UNDERFLOW;
+	} else if (fint > ti_clk_get_features()->fint_max) {
+		pr_debug("rejecting n=%d due to Fint failure, boosting min_divider\n",
+			 n);
+		dd->min_divider = n;
+		ret = DPLL_FINT_INVALID;
+	} else if (fint > ti_clk_get_features()->fint_band1_max &&
+		   fint < ti_clk_get_features()->fint_band2_min) {
+		pr_debug("rejecting n=%d due to Fint failure\n", n);
+		ret = DPLL_FINT_INVALID;
+	}
+
+	return ret;
+}
+
+static unsigned long _dpll_compute_new_rate(unsigned long parent_rate,
+					    unsigned int m, unsigned int n)
+{
+	unsigned long long num;
+
+	num = (unsigned long long)parent_rate * m;
+	do_div(num, n);
+	return num;
+}
+
+/*
+ * _dpll_test_mult - test a DPLL multiplier value
+ * @m: pointer to the DPLL m (multiplier) value under test
+ * @n: current DPLL n (divider) value under test
+ * @new_rate: pointer to storage for the resulting rounded rate
+ * @target_rate: the desired DPLL rate
+ * @parent_rate: the DPLL's parent clock rate
+ *
+ * This code tests a DPLL multiplier value, ensuring that the
+ * resulting rate will not be higher than the target_rate, and that
+ * the multiplier value itself is valid for the DPLL.  Initially, the
+ * integer pointed to by the m argument should be prescaled by
+ * multiplying by DPLL_SCALE_FACTOR.  The code will replace this with
+ * a non-scaled m upon return.  This non-scaled m will result in a
+ * new_rate as close as possible to target_rate (but not greater than
+ * target_rate) given the current (parent_rate, n, prescaled m)
+ * triple. Returns DPLL_MULT_UNDERFLOW in the event that the
+ * non-scaled m attempted to underflow, which can allow the calling
+ * function to bail out early; or 0 upon success.
+ */
+static int _dpll_test_mult(int *m, int n, unsigned long *new_rate,
+			   unsigned long target_rate,
+			   unsigned long parent_rate)
+{
+	int r = 0, carry = 0;
+
+	/* Unscale m and round if necessary */
+	if (*m % DPLL_SCALE_FACTOR >= DPLL_ROUNDING_VAL)
+		carry = 1;
+	*m = (*m / DPLL_SCALE_FACTOR) + carry;
+
+	/*
+	 * The new rate must be <= the target rate to avoid programming
+	 * a rate that is impossible for the hardware to handle
+	 */
+	*new_rate = _dpll_compute_new_rate(parent_rate, *m, n);
+	if (*new_rate > target_rate) {
+		(*m)--;
+		*new_rate = 0;
+	}
+
+	/* Guard against m underflow */
+	if (*m < DPLL_MIN_MULTIPLIER) {
+		*m = DPLL_MIN_MULTIPLIER;
+		*new_rate = 0;
+		r = DPLL_MULT_UNDERFLOW;
+	}
+
+	if (*new_rate == 0)
+		*new_rate = _dpll_compute_new_rate(parent_rate, *m, n);
+
+	return r;
+}
+
+/**
+ * _omap2_dpll_is_in_bypass - check if DPLL is in bypass mode or not
+ * @v: bitfield value of the DPLL enable
+ *
+ * Checks given DPLL enable bitfield to see whether the DPLL is in bypass
+ * mode or not. Returns 1 if the DPLL is in bypass, 0 otherwise.
+ */
+static int _omap2_dpll_is_in_bypass(u32 v)
+{
+	u8 mask, val;
+
+	mask = ti_clk_get_features()->dpll_bypass_vals;
+
+	/*
+	 * Each set bit in the mask corresponds to a bypass value equal
+	 * to the bitshift. Go through each set-bit in the mask and
+	 * compare against the given register value.
+	 */
+	while (mask) {
+		val = __ffs(mask);
+		mask ^= (1 << val);
+		if (v == val)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Public functions */
+u8 omap2_init_dpll_parent(struct clk_hw *hw)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	u32 v;
+	struct dpll_data *dd;
+
+	dd = clk->dpll_data;
+	if (!dd)
+		return -EINVAL;
+
+	v = ti_clk_ll_ops->clk_readl(dd->control_reg);
+	v &= dd->enable_mask;
+	v >>= __ffs(dd->enable_mask);
+
+	/* Reparent the struct clk in case the dpll is in bypass */
+	if (_omap2_dpll_is_in_bypass(v))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * omap2_get_dpll_rate - returns the current DPLL CLKOUT rate
+ * @clk: struct clk * of a DPLL
+ *
+ * DPLLs can be locked or bypassed - basically, enabled or disabled.
+ * When locked, the DPLL output depends on the M and N values.  When
+ * bypassed, on OMAP2xxx, the output rate is either the 32KiHz clock
+ * or sys_clk.  Bypass rates on OMAP3 depend on the DPLL: DPLLs 1 and
+ * 2 are bypassed with dpll1_fclk and dpll2_fclk respectively
+ * (generated by DPLL3), while DPLL 3, 4, and 5 bypass rates are sys_clk.
+ * Returns the current DPLL CLKOUT rate (*not* CLKOUTX2) if the DPLL is
+ * locked, or the appropriate bypass rate if the DPLL is bypassed, or 0
+ * if the clock @clk is not a DPLL.
+ */
+unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk)
+{
+	long long dpll_clk;
+	u32 dpll_mult, dpll_div, v;
+	struct dpll_data *dd;
+
+	dd = clk->dpll_data;
+	if (!dd)
+		return 0;
+
+	/* Return bypass rate if DPLL is bypassed */
+	v = ti_clk_ll_ops->clk_readl(dd->control_reg);
+	v &= dd->enable_mask;
+	v >>= __ffs(dd->enable_mask);
+
+	if (_omap2_dpll_is_in_bypass(v))
+		return __clk_get_rate(dd->clk_bypass);
+
+	v = ti_clk_ll_ops->clk_readl(dd->mult_div1_reg);
+	dpll_mult = v & dd->mult_mask;
+	dpll_mult >>= __ffs(dd->mult_mask);
+	dpll_div = v & dd->div1_mask;
+	dpll_div >>= __ffs(dd->div1_mask);
+
+	dpll_clk = (long long)__clk_get_rate(dd->clk_ref) * dpll_mult;
+	do_div(dpll_clk, dpll_div + 1);
+
+	return dpll_clk;
+}
+
+/* DPLL rate rounding code */
+
+/**
+ * omap2_dpll_round_rate - round a target rate for an OMAP DPLL
+ * @clk: struct clk * for a DPLL
+ * @target_rate: desired DPLL clock rate
+ *
+ * Given a DPLL and a desired target rate, round the target rate to a
+ * possible, programmable rate for this DPLL.  Attempts to select the
+ * minimum possible n.  Stores the computed (m, n) in the DPLL's
+ * dpll_data structure so set_rate() will not need to call this
+ * (expensive) function again.  Returns ~0 if the target rate cannot
+ * be rounded, or the rounded rate upon success.
+ */
+long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
+			   unsigned long *parent_rate)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	int m, n, r, scaled_max_m;
+	int min_delta_m = INT_MAX, min_delta_n = INT_MAX;
+	unsigned long scaled_rt_rp;
+	unsigned long new_rate = 0;
+	struct dpll_data *dd;
+	unsigned long ref_rate;
+	long delta;
+	long prev_min_delta = LONG_MAX;
+	const char *clk_name;
+
+	if (!clk || !clk->dpll_data)
+		return ~0;
+
+	dd = clk->dpll_data;
+
+	ref_rate = __clk_get_rate(dd->clk_ref);
+	clk_name = __clk_get_name(hw->clk);
+	pr_debug("clock: %s: starting DPLL round_rate, target rate %lu\n",
+		 clk_name, target_rate);
+
+	scaled_rt_rp = target_rate / (ref_rate / DPLL_SCALE_FACTOR);
+	scaled_max_m = dd->max_multiplier * DPLL_SCALE_FACTOR;
+
+	dd->last_rounded_rate = 0;
+
+	for (n = dd->min_divider; n <= dd->max_divider; n++) {
+		/* Is the (input clk, divider) pair valid for the DPLL? */
+		r = _dpll_test_fint(clk, n);
+		if (r == DPLL_FINT_UNDERFLOW)
+			break;
+		else if (r == DPLL_FINT_INVALID)
+			continue;
+
+		/* Compute the scaled DPLL multiplier, based on the divider */
+		m = scaled_rt_rp * n;
+
+		/*
+		 * Since we're counting n up, a m overflow means we
+		 * can bail out completely (since as n increases in
+		 * the next iteration, there's no way that m can
+		 * increase beyond the current m)
+		 */
+		if (m > scaled_max_m)
+			break;
+
+		r = _dpll_test_mult(&m, n, &new_rate, target_rate,
+				    ref_rate);
+
+		/* m can't be set low enough for this n - try with a larger n */
+		if (r == DPLL_MULT_UNDERFLOW)
+			continue;
+
+		/* skip rates above our target rate */
+		delta = target_rate - new_rate;
+		if (delta < 0)
+			continue;
+
+		if (delta < prev_min_delta) {
+			prev_min_delta = delta;
+			min_delta_m = m;
+			min_delta_n = n;
+		}
+
+		pr_debug("clock: %s: m = %d: n = %d: new_rate = %lu\n",
+			 clk_name, m, n, new_rate);
+
+		if (delta == 0)
+			break;
+	}
+
+	if (prev_min_delta == LONG_MAX) {
+		pr_debug("clock: %s: cannot round to rate %lu\n",
+			 clk_name, target_rate);
+		return ~0;
+	}
+
+	dd->last_rounded_m = min_delta_m;
+	dd->last_rounded_n = min_delta_n;
+	dd->last_rounded_rate = target_rate - prev_min_delta;
+
+	return dd->last_rounded_rate;
+}
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 404158d2d7f8..05ed10a81ace 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -169,4 +169,6 @@ void ti_clk_patch_legacy_clks(struct ti_clk **patch);
 struct clk *ti_clk_register_clk(struct ti_clk *setup);
 int ti_clk_register_legacy_clks(struct ti_clk_alias *clks);
 
+u8 omap2_init_dpll_parent(struct clk_hw *hw);
+
 #endif
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 1a7f86a68f62..886b2e9d2204 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -286,7 +286,6 @@ long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw,
 					unsigned long max_rate,
 					unsigned long *best_parent_rate,
 					struct clk_hw **best_parent_clk);
-u8 omap2_init_dpll_parent(struct clk_hw *hw);
 unsigned long omap3_dpll_recalc(struct clk_hw *hw, unsigned long parent_rate);
 long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
 			   unsigned long *parent_rate);
-- 
cgit v1.2.3-70-g09d2


From 59245ce01a2e3ded836172266e3ac2e576a03333 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Mon, 2 Mar 2015 11:07:35 +0200
Subject: clk: ti: move OMAP4+ DPLL implementation under drivers/clk

With the legacy clock support gone, the OMAP4 specific DPLL implementations
can be moved under the clock driver. Change some of the function prototypes
to be static at the same time, and remove some exports from the global TI
clock driver header.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/Makefile   |   6 +-
 arch/arm/mach-omap2/clock.h    |   4 -
 arch/arm/mach-omap2/dpll44xx.c | 232 ----------------------------------------
 drivers/clk/ti/Makefile        |   6 +-
 drivers/clk/ti/clock.h         |  14 +++
 drivers/clk/ti/dpll44xx.c      | 233 +++++++++++++++++++++++++++++++++++++++++
 include/linux/clk/ti.h         |  13 +--
 7 files changed, 254 insertions(+), 254 deletions(-)
 delete mode 100644 arch/arm/mach-omap2/dpll44xx.c
 create mode 100644 drivers/clk/ti/dpll44xx.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index fcb5d47f88ca..5bcd282f04b3 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -193,12 +193,12 @@ obj-$(CONFIG_ARCH_OMAP3)		+= clock3517.o clock36xx.o
 obj-$(CONFIG_ARCH_OMAP3)		+= dpll3xxx.o
 obj-$(CONFIG_ARCH_OMAP3)		+= clkt_iclk.o
 obj-$(CONFIG_ARCH_OMAP4)		+= $(clock-common)
-obj-$(CONFIG_ARCH_OMAP4)		+= dpll3xxx.o dpll44xx.o
+obj-$(CONFIG_ARCH_OMAP4)		+= dpll3xxx.o
 obj-$(CONFIG_SOC_AM33XX)		+= $(clock-common) dpll3xxx.o
 obj-$(CONFIG_SOC_OMAP5)			+= $(clock-common)
-obj-$(CONFIG_SOC_OMAP5)			+= dpll3xxx.o dpll44xx.o
+obj-$(CONFIG_SOC_OMAP5)			+= dpll3xxx.o
 obj-$(CONFIG_SOC_DRA7XX)		+= $(clock-common)
-obj-$(CONFIG_SOC_DRA7XX)		+= dpll3xxx.o dpll44xx.o
+obj-$(CONFIG_SOC_DRA7XX)		+= dpll3xxx.o
 obj-$(CONFIG_SOC_AM43XX)		+= $(clock-common) dpll3xxx.o
 
 # OMAP2 clock rate set data (old "OPP" data)
diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h
index ac21856d245d..d7ed2446057c 100644
--- a/arch/arm/mach-omap2/clock.h
+++ b/arch/arm/mach-omap2/clock.h
@@ -183,8 +183,6 @@ struct clksel {
 u32 omap3_dpll_autoidle_read(struct clk_hw_omap *clk);
 void omap3_dpll_allow_idle(struct clk_hw_omap *clk);
 void omap3_dpll_deny_idle(struct clk_hw_omap *clk);
-void omap4_dpllmx_allow_gatectrl(struct clk_hw_omap *clk);
-void omap4_dpllmx_deny_gatectrl(struct clk_hw_omap *clk);
 
 void __init omap2_clk_disable_clkdm_control(void);
 
@@ -204,8 +202,6 @@ int omap2_clksel_set_parent(struct clk_hw *hw, u8 field_val);
 extern void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk);
 extern void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk);
 
-unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk);
-
 void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
 				   void __iomem **other_reg,
 				   u8 *other_bit);
diff --git a/arch/arm/mach-omap2/dpll44xx.c b/arch/arm/mach-omap2/dpll44xx.c
deleted file mode 100644
index f231be05b9a6..000000000000
--- a/arch/arm/mach-omap2/dpll44xx.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * OMAP4-specific DPLL control functions
- *
- * Copyright (C) 2011 Texas Instruments, Inc.
- * Rajendra Nayak
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/bitops.h>
-
-#include "clock.h"
-
-/*
- * Maximum DPLL input frequency (FINT) and output frequency (FOUT) that
- * can supported when using the DPLL low-power mode. Frequencies are
- * defined in OMAP4430/60 Public TRM section 3.6.3.3.2 "Enable Control,
- * Status, and Low-Power Operation Mode".
- */
-#define OMAP4_DPLL_LP_FINT_MAX	1000000
-#define OMAP4_DPLL_LP_FOUT_MAX	100000000
-
-/*
- * Bitfield declarations
- */
-#define OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK		(1 << 8)
-#define OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK		(1 << 10)
-#define OMAP4430_DPLL_REGM4XEN_MASK			(1 << 11)
-
-/* Static rate multiplier for OMAP4 REGM4XEN clocks */
-#define OMAP4430_REGM4XEN_MULT				4
-
-void omap4_dpllmx_allow_gatectrl(struct clk_hw_omap *clk)
-{
-	u32 v;
-	u32 mask;
-
-	if (!clk || !clk->clksel_reg)
-		return;
-
-	mask = clk->flags & CLOCK_CLKOUTX2 ?
-			OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK :
-			OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK;
-
-	v = omap2_clk_readl(clk, clk->clksel_reg);
-	/* Clear the bit to allow gatectrl */
-	v &= ~mask;
-	omap2_clk_writel(v, clk, clk->clksel_reg);
-}
-
-void omap4_dpllmx_deny_gatectrl(struct clk_hw_omap *clk)
-{
-	u32 v;
-	u32 mask;
-
-	if (!clk || !clk->clksel_reg)
-		return;
-
-	mask = clk->flags & CLOCK_CLKOUTX2 ?
-			OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK :
-			OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK;
-
-	v = omap2_clk_readl(clk, clk->clksel_reg);
-	/* Set the bit to deny gatectrl */
-	v |= mask;
-	omap2_clk_writel(v, clk, clk->clksel_reg);
-}
-
-const struct clk_hw_omap_ops clkhwops_omap4_dpllmx = {
-	.allow_idle	= omap4_dpllmx_allow_gatectrl,
-	.deny_idle      = omap4_dpllmx_deny_gatectrl,
-};
-
-/**
- * omap4_dpll_lpmode_recalc - compute DPLL low-power setting
- * @dd: pointer to the dpll data structure
- *
- * Calculates if low-power mode can be enabled based upon the last
- * multiplier and divider values calculated. If low-power mode can be
- * enabled, then the bit to enable low-power mode is stored in the
- * last_rounded_lpmode variable. This implementation is based upon the
- * criteria for enabling low-power mode as described in the OMAP4430/60
- * Public TRM section 3.6.3.3.2 "Enable Control, Status, and Low-Power
- * Operation Mode".
- */
-static void omap4_dpll_lpmode_recalc(struct dpll_data *dd)
-{
-	long fint, fout;
-
-	fint = __clk_get_rate(dd->clk_ref) / (dd->last_rounded_n + 1);
-	fout = fint * dd->last_rounded_m;
-
-	if ((fint < OMAP4_DPLL_LP_FINT_MAX) && (fout < OMAP4_DPLL_LP_FOUT_MAX))
-		dd->last_rounded_lpmode = 1;
-	else
-		dd->last_rounded_lpmode = 0;
-}
-
-/**
- * omap4_dpll_regm4xen_recalc - compute DPLL rate, considering REGM4XEN bit
- * @clk: struct clk * of the DPLL to compute the rate for
- *
- * Compute the output rate for the OMAP4 DPLL represented by @clk.
- * Takes the REGM4XEN bit into consideration, which is needed for the
- * OMAP4 ABE DPLL.  Returns the DPLL's output rate (before M-dividers)
- * upon success, or 0 upon error.
- */
-unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw,
-			unsigned long parent_rate)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	u32 v;
-	unsigned long rate;
-	struct dpll_data *dd;
-
-	if (!clk || !clk->dpll_data)
-		return 0;
-
-	dd = clk->dpll_data;
-
-	rate = omap2_get_dpll_rate(clk);
-
-	/* regm4xen adds a multiplier of 4 to DPLL calculations */
-	v = omap2_clk_readl(clk, dd->control_reg);
-	if (v & OMAP4430_DPLL_REGM4XEN_MASK)
-		rate *= OMAP4430_REGM4XEN_MULT;
-
-	return rate;
-}
-
-/**
- * omap4_dpll_regm4xen_round_rate - round DPLL rate, considering REGM4XEN bit
- * @clk: struct clk * of the DPLL to round a rate for
- * @target_rate: the desired rate of the DPLL
- *
- * Compute the rate that would be programmed into the DPLL hardware
- * for @clk if set_rate() were to be provided with the rate
- * @target_rate.  Takes the REGM4XEN bit into consideration, which is
- * needed for the OMAP4 ABE DPLL.  Returns the rounded rate (before
- * M-dividers) upon success, -EINVAL if @clk is null or not a DPLL, or
- * ~0 if an error occurred in omap2_dpll_round_rate().
- */
-long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw,
-				    unsigned long target_rate,
-				    unsigned long *parent_rate)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	struct dpll_data *dd;
-	long r;
-
-	if (!clk || !clk->dpll_data)
-		return -EINVAL;
-
-	dd = clk->dpll_data;
-
-	dd->last_rounded_m4xen = 0;
-
-	/*
-	 * First try to compute the DPLL configuration for
-	 * target rate without using the 4X multiplier.
-	 */
-	r = omap2_dpll_round_rate(hw, target_rate, NULL);
-	if (r != ~0)
-		goto out;
-
-	/*
-	 * If we did not find a valid DPLL configuration, try again, but
-	 * this time see if using the 4X multiplier can help. Enabling the
-	 * 4X multiplier is equivalent to dividing the target rate by 4.
-	 */
-	r = omap2_dpll_round_rate(hw, target_rate / OMAP4430_REGM4XEN_MULT,
-				  NULL);
-	if (r == ~0)
-		return r;
-
-	dd->last_rounded_rate *= OMAP4430_REGM4XEN_MULT;
-	dd->last_rounded_m4xen = 1;
-
-out:
-	omap4_dpll_lpmode_recalc(dd);
-
-	return dd->last_rounded_rate;
-}
-
-/**
- * omap4_dpll_regm4xen_determine_rate - determine rate for a DPLL
- * @hw: pointer to the clock to determine rate for
- * @rate: target rate for the DPLL
- * @best_parent_rate: pointer for returning best parent rate
- * @best_parent_clk: pointer for returning best parent clock
- *
- * Determines which DPLL mode to use for reaching a desired rate.
- * Checks whether the DPLL shall be in bypass or locked mode, and if
- * locked, calculates the M,N values for the DPLL via round-rate.
- * Returns a positive clock rate with success, negative error value
- * in failure.
- */
-long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	struct dpll_data *dd;
-
-	if (!hw || !rate)
-		return -EINVAL;
-
-	dd = clk->dpll_data;
-	if (!dd)
-		return -EINVAL;
-
-	if (__clk_get_rate(dd->clk_bypass) == rate &&
-	    (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) {
-		*best_parent_clk = __clk_get_hw(dd->clk_bypass);
-	} else {
-		rate = omap4_dpll_regm4xen_round_rate(hw, rate,
-						      best_parent_rate);
-		*best_parent_clk = __clk_get_hw(dd->clk_ref);
-	}
-
-	*best_parent_rate = rate;
-
-	return rate;
-}
diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile
index 62dae2ad3c69..c3ec3014fb2d 100644
--- a/drivers/clk/ti/Makefile
+++ b/drivers/clk/ti/Makefile
@@ -7,10 +7,10 @@ obj-$(CONFIG_SOC_TI81XX)		+= $(clk-common) fapll.o clk-816x.o
 obj-$(CONFIG_ARCH_OMAP2)		+= $(clk-common) interface.o clk-2xxx.o
 obj-$(CONFIG_ARCH_OMAP3)		+= $(clk-common) interface.o \
 					   clk-3xxx.o
-obj-$(CONFIG_ARCH_OMAP4)		+= $(clk-common) clk-44xx.o
-obj-$(CONFIG_SOC_OMAP5)			+= $(clk-common) clk-54xx.o
+obj-$(CONFIG_ARCH_OMAP4)		+= $(clk-common) clk-44xx.o dpll44xx.o
+obj-$(CONFIG_SOC_OMAP5)			+= $(clk-common) clk-54xx.o dpll44xx.o
 obj-$(CONFIG_SOC_DRA7XX)		+= $(clk-common) clk-7xx.o \
-					   clk-dra7-atl.o
+					   clk-dra7-atl.o dpll44xx.o
 obj-$(CONFIG_SOC_AM43XX)		+= $(clk-common) clk-43xx.o
 
 ifdef CONFIG_ATAGS
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 05ed10a81ace..c75d4b44cbef 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -169,6 +169,20 @@ void ti_clk_patch_legacy_clks(struct ti_clk **patch);
 struct clk *ti_clk_register_clk(struct ti_clk *setup);
 int ti_clk_register_legacy_clks(struct ti_clk_alias *clks);
 
+extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx;
+
 u8 omap2_init_dpll_parent(struct clk_hw *hw);
 
+unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw,
+					 unsigned long parent_rate);
+long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw,
+				    unsigned long target_rate,
+				    unsigned long *parent_rate);
+long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw,
+					unsigned long rate,
+					unsigned long min_rate,
+					unsigned long max_rate,
+					unsigned long *best_parent_rate,
+					struct clk_hw **best_parent_clk);
+
 #endif
diff --git a/drivers/clk/ti/dpll44xx.c b/drivers/clk/ti/dpll44xx.c
new file mode 100644
index 000000000000..ef1a5b43d01f
--- /dev/null
+++ b/drivers/clk/ti/dpll44xx.c
@@ -0,0 +1,233 @@
+/*
+ * OMAP4-specific DPLL control functions
+ *
+ * Copyright (C) 2011 Texas Instruments, Inc.
+ * Rajendra Nayak
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/bitops.h>
+#include <linux/clk/ti.h>
+
+#include "clock.h"
+
+/*
+ * Maximum DPLL input frequency (FINT) and output frequency (FOUT) that
+ * can supported when using the DPLL low-power mode. Frequencies are
+ * defined in OMAP4430/60 Public TRM section 3.6.3.3.2 "Enable Control,
+ * Status, and Low-Power Operation Mode".
+ */
+#define OMAP4_DPLL_LP_FINT_MAX	1000000
+#define OMAP4_DPLL_LP_FOUT_MAX	100000000
+
+/*
+ * Bitfield declarations
+ */
+#define OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK		BIT(8)
+#define OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK		BIT(10)
+#define OMAP4430_DPLL_REGM4XEN_MASK			BIT(11)
+
+/* Static rate multiplier for OMAP4 REGM4XEN clocks */
+#define OMAP4430_REGM4XEN_MULT				4
+
+static void omap4_dpllmx_allow_gatectrl(struct clk_hw_omap *clk)
+{
+	u32 v;
+	u32 mask;
+
+	if (!clk || !clk->clksel_reg)
+		return;
+
+	mask = clk->flags & CLOCK_CLKOUTX2 ?
+			OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK :
+			OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK;
+
+	v = ti_clk_ll_ops->clk_readl(clk->clksel_reg);
+	/* Clear the bit to allow gatectrl */
+	v &= ~mask;
+	ti_clk_ll_ops->clk_writel(v, clk->clksel_reg);
+}
+
+static void omap4_dpllmx_deny_gatectrl(struct clk_hw_omap *clk)
+{
+	u32 v;
+	u32 mask;
+
+	if (!clk || !clk->clksel_reg)
+		return;
+
+	mask = clk->flags & CLOCK_CLKOUTX2 ?
+			OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK :
+			OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK;
+
+	v = ti_clk_ll_ops->clk_readl(clk->clksel_reg);
+	/* Set the bit to deny gatectrl */
+	v |= mask;
+	ti_clk_ll_ops->clk_writel(v, clk->clksel_reg);
+}
+
+const struct clk_hw_omap_ops clkhwops_omap4_dpllmx = {
+	.allow_idle	= omap4_dpllmx_allow_gatectrl,
+	.deny_idle      = omap4_dpllmx_deny_gatectrl,
+};
+
+/**
+ * omap4_dpll_lpmode_recalc - compute DPLL low-power setting
+ * @dd: pointer to the dpll data structure
+ *
+ * Calculates if low-power mode can be enabled based upon the last
+ * multiplier and divider values calculated. If low-power mode can be
+ * enabled, then the bit to enable low-power mode is stored in the
+ * last_rounded_lpmode variable. This implementation is based upon the
+ * criteria for enabling low-power mode as described in the OMAP4430/60
+ * Public TRM section 3.6.3.3.2 "Enable Control, Status, and Low-Power
+ * Operation Mode".
+ */
+static void omap4_dpll_lpmode_recalc(struct dpll_data *dd)
+{
+	long fint, fout;
+
+	fint = __clk_get_rate(dd->clk_ref) / (dd->last_rounded_n + 1);
+	fout = fint * dd->last_rounded_m;
+
+	if ((fint < OMAP4_DPLL_LP_FINT_MAX) && (fout < OMAP4_DPLL_LP_FOUT_MAX))
+		dd->last_rounded_lpmode = 1;
+	else
+		dd->last_rounded_lpmode = 0;
+}
+
+/**
+ * omap4_dpll_regm4xen_recalc - compute DPLL rate, considering REGM4XEN bit
+ * @clk: struct clk * of the DPLL to compute the rate for
+ *
+ * Compute the output rate for the OMAP4 DPLL represented by @clk.
+ * Takes the REGM4XEN bit into consideration, which is needed for the
+ * OMAP4 ABE DPLL.  Returns the DPLL's output rate (before M-dividers)
+ * upon success, or 0 upon error.
+ */
+unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw,
+					 unsigned long parent_rate)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	u32 v;
+	unsigned long rate;
+	struct dpll_data *dd;
+
+	if (!clk || !clk->dpll_data)
+		return 0;
+
+	dd = clk->dpll_data;
+
+	rate = omap2_get_dpll_rate(clk);
+
+	/* regm4xen adds a multiplier of 4 to DPLL calculations */
+	v = ti_clk_ll_ops->clk_readl(dd->control_reg);
+	if (v & OMAP4430_DPLL_REGM4XEN_MASK)
+		rate *= OMAP4430_REGM4XEN_MULT;
+
+	return rate;
+}
+
+/**
+ * omap4_dpll_regm4xen_round_rate - round DPLL rate, considering REGM4XEN bit
+ * @clk: struct clk * of the DPLL to round a rate for
+ * @target_rate: the desired rate of the DPLL
+ *
+ * Compute the rate that would be programmed into the DPLL hardware
+ * for @clk if set_rate() were to be provided with the rate
+ * @target_rate.  Takes the REGM4XEN bit into consideration, which is
+ * needed for the OMAP4 ABE DPLL.  Returns the rounded rate (before
+ * M-dividers) upon success, -EINVAL if @clk is null or not a DPLL, or
+ * ~0 if an error occurred in omap2_dpll_round_rate().
+ */
+long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw,
+				    unsigned long target_rate,
+				    unsigned long *parent_rate)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	struct dpll_data *dd;
+	long r;
+
+	if (!clk || !clk->dpll_data)
+		return -EINVAL;
+
+	dd = clk->dpll_data;
+
+	dd->last_rounded_m4xen = 0;
+
+	/*
+	 * First try to compute the DPLL configuration for
+	 * target rate without using the 4X multiplier.
+	 */
+	r = omap2_dpll_round_rate(hw, target_rate, NULL);
+	if (r != ~0)
+		goto out;
+
+	/*
+	 * If we did not find a valid DPLL configuration, try again, but
+	 * this time see if using the 4X multiplier can help. Enabling the
+	 * 4X multiplier is equivalent to dividing the target rate by 4.
+	 */
+	r = omap2_dpll_round_rate(hw, target_rate / OMAP4430_REGM4XEN_MULT,
+				  NULL);
+	if (r == ~0)
+		return r;
+
+	dd->last_rounded_rate *= OMAP4430_REGM4XEN_MULT;
+	dd->last_rounded_m4xen = 1;
+
+out:
+	omap4_dpll_lpmode_recalc(dd);
+
+	return dd->last_rounded_rate;
+}
+
+/**
+ * omap4_dpll_regm4xen_determine_rate - determine rate for a DPLL
+ * @hw: pointer to the clock to determine rate for
+ * @rate: target rate for the DPLL
+ * @best_parent_rate: pointer for returning best parent rate
+ * @best_parent_clk: pointer for returning best parent clock
+ *
+ * Determines which DPLL mode to use for reaching a desired rate.
+ * Checks whether the DPLL shall be in bypass or locked mode, and if
+ * locked, calculates the M,N values for the DPLL via round-rate.
+ * Returns a positive clock rate with success, negative error value
+ * in failure.
+ */
+long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, unsigned long rate,
+					unsigned long min_rate,
+					unsigned long max_rate,
+					unsigned long *best_parent_rate,
+					struct clk_hw **best_parent_clk)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	struct dpll_data *dd;
+
+	if (!hw || !rate)
+		return -EINVAL;
+
+	dd = clk->dpll_data;
+	if (!dd)
+		return -EINVAL;
+
+	if (__clk_get_rate(dd->clk_bypass) == rate &&
+	    (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) {
+		*best_parent_clk = __clk_get_hw(dd->clk_bypass);
+	} else {
+		rate = omap4_dpll_regm4xen_round_rate(hw, rate,
+						      best_parent_rate);
+		*best_parent_clk = __clk_get_hw(dd->clk_ref);
+	}
+
+	*best_parent_rate = rate;
+
+	return rate;
+}
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 886b2e9d2204..ee59e076340f 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -275,17 +275,6 @@ long omap3_noncore_dpll_determine_rate(struct clk_hw *hw,
 				       unsigned long max_rate,
 				       unsigned long *best_parent_rate,
 				       struct clk_hw **best_parent_clk);
-unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw,
-					 unsigned long parent_rate);
-long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw,
-				    unsigned long target_rate,
-				    unsigned long *parent_rate);
-long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw,
-					unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk);
 unsigned long omap3_dpll_recalc(struct clk_hw *hw, unsigned long parent_rate);
 long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
 			   unsigned long *parent_rate);
@@ -314,6 +303,7 @@ int omap2_reprogram_dpllcore(struct clk_hw *clk, unsigned long rate,
 			     unsigned long parent_rate);
 void omap2xxx_clkt_dpllcore_init(struct clk_hw *hw);
 void omap2xxx_clkt_vps_init(void);
+unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk);
 
 void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index);
 void ti_dt_clocks_register(struct ti_dt_clk *oclks);
@@ -364,7 +354,6 @@ static inline void of_ti_clk_deny_autoidle_all(void) { }
 extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll;
 extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3_dpll;
-extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx;
 extern const struct clk_hw_omap_ops clkhwops_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
-- 
cgit v1.2.3-70-g09d2


From ef14db0977547b1982d4f6eaa305e1a22eb95778 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Mon, 2 Mar 2015 14:33:54 +0200
Subject: clk: ti: move interface clock implementation under drivers/clk

With the legacy clock support gone, the OMAP interface clock implementation
can be moved under the clock driver. Some temporary header file tweaks are
also needed to make this change work properly.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/Makefile    |  3 +-
 arch/arm/mach-omap2/clkt_iclk.c | 68 -----------------------------------------
 arch/arm/mach-omap2/clock.h     | 11 -------
 drivers/clk/ti/Makefile         |  2 +-
 drivers/clk/ti/clkt_iclk.c      | 66 +++++++++++++++++++++++++++++++++++++++
 drivers/clk/ti/clock.h          |  2 ++
 include/linux/clk/ti.h          | 10 ++++--
 7 files changed, 78 insertions(+), 84 deletions(-)
 delete mode 100644 arch/arm/mach-omap2/clkt_iclk.c
 create mode 100644 drivers/clk/ti/clkt_iclk.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index 5bcd282f04b3..a2f51564e8d4 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -185,13 +185,12 @@ obj-$(CONFIG_SOC_DRA7XX)		+= clockdomains7xx_data.o
 obj-$(CONFIG_ARCH_OMAP2)		+= $(clock-common) clock2xxx.o
 obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_dpllcore.o
 obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_virt_prcm_set.o
-obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_dpll.o clkt_iclk.o
+obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_dpll.o
 obj-$(CONFIG_SOC_OMAP2430)		+= clock2430.o
 obj-$(CONFIG_ARCH_OMAP3)		+= $(clock-common) clock3xxx.o
 obj-$(CONFIG_ARCH_OMAP3)		+= clock34xx.o clkt34xx_dpll3m2.o
 obj-$(CONFIG_ARCH_OMAP3)		+= clock3517.o clock36xx.o
 obj-$(CONFIG_ARCH_OMAP3)		+= dpll3xxx.o
-obj-$(CONFIG_ARCH_OMAP3)		+= clkt_iclk.o
 obj-$(CONFIG_ARCH_OMAP4)		+= $(clock-common)
 obj-$(CONFIG_ARCH_OMAP4)		+= dpll3xxx.o
 obj-$(CONFIG_SOC_AM33XX)		+= $(clock-common) dpll3xxx.o
diff --git a/arch/arm/mach-omap2/clkt_iclk.c b/arch/arm/mach-omap2/clkt_iclk.c
deleted file mode 100644
index 55eb579aeae1..000000000000
--- a/arch/arm/mach-omap2/clkt_iclk.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * OMAP2/3 interface clock control
- *
- * Copyright (C) 2011 Nokia Corporation
- * Paul Walmsley
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#undef DEBUG
-
-#include <linux/kernel.h>
-#include <linux/clk-provider.h>
-#include <linux/io.h>
-
-#include "clock.h"
-
-/* Register offsets */
-#define CM_AUTOIDLE			0x30
-#define CM_ICLKEN			0x10
-
-/* Private functions */
-
-/* XXX */
-void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk)
-{
-	u32 v;
-	void __iomem *r;
-
-	r = (__force void __iomem *)
-		((__force u32)clk->enable_reg ^ (CM_AUTOIDLE ^ CM_ICLKEN));
-
-	v = omap2_clk_readl(clk, r);
-	v |= (1 << clk->enable_bit);
-	omap2_clk_writel(v, clk, r);
-}
-
-/* XXX */
-void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk)
-{
-	u32 v;
-	void __iomem *r;
-
-	r = (__force void __iomem *)
-		((__force u32)clk->enable_reg ^ (CM_AUTOIDLE ^ CM_ICLKEN));
-
-	v = omap2_clk_readl(clk, r);
-	v &= ~(1 << clk->enable_bit);
-	omap2_clk_writel(v, clk, r);
-}
-
-/* Public data */
-
-const struct clk_hw_omap_ops clkhwops_iclk = {
-	.allow_idle	= omap2_clkt_iclk_allow_idle,
-	.deny_idle	= omap2_clkt_iclk_deny_idle,
-};
-
-const struct clk_hw_omap_ops clkhwops_iclk_wait = {
-	.allow_idle	= omap2_clkt_iclk_allow_idle,
-	.deny_idle	= omap2_clkt_iclk_deny_idle,
-	.find_idlest	= omap2_clk_dflt_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
-
-
-
diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h
index d7ed2446057c..ca8c42c70db5 100644
--- a/arch/arm/mach-omap2/clock.h
+++ b/arch/arm/mach-omap2/clock.h
@@ -198,16 +198,6 @@ int omap2_clksel_set_rate(struct clk_hw *hw, unsigned long rate,
 				unsigned long parent_rate);
 int omap2_clksel_set_parent(struct clk_hw *hw, u8 field_val);
 
-/* clkt_iclk.c public functions */
-extern void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk);
-extern void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk);
-
-void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
-				   void __iomem **other_reg,
-				   u8 *other_bit);
-void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk,
-				void __iomem **idlest_reg,
-				u8 *idlest_bit, u8 *idlest_val);
 int omap2_clk_enable_autoidle_all(void);
 int omap2_clk_allow_idle(struct clk *clk);
 int omap2_clk_deny_idle(struct clk *clk);
@@ -231,7 +221,6 @@ extern const struct clksel_rate gpt_sys_rates[];
 extern const struct clksel_rate gfx_l3_rates[];
 extern const struct clksel_rate dsp_ick_rates[];
 
-extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
 extern const struct clk_hw_omap_ops clkhwops_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_ssi_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile
index c3ec3014fb2d..23cd72638970 100644
--- a/drivers/clk/ti/Makefile
+++ b/drivers/clk/ti/Makefile
@@ -1,7 +1,7 @@
 obj-y					+= clk.o autoidle.o clockdomain.o
 clk-common				= dpll.o composite.o divider.o gate.o \
 					  fixed-factor.o mux.o apll.o \
-					  clkt_dpll.o
+					  clkt_dpll.o clkt_iclk.o
 obj-$(CONFIG_SOC_AM33XX)		+= $(clk-common) clk-33xx.o
 obj-$(CONFIG_SOC_TI81XX)		+= $(clk-common) fapll.o clk-816x.o
 obj-$(CONFIG_ARCH_OMAP2)		+= $(clk-common) interface.o clk-2xxx.o
diff --git a/drivers/clk/ti/clkt_iclk.c b/drivers/clk/ti/clkt_iclk.c
new file mode 100644
index 000000000000..a03919df00ef
--- /dev/null
+++ b/drivers/clk/ti/clkt_iclk.c
@@ -0,0 +1,66 @@
+/*
+ * OMAP2/3 interface clock control
+ *
+ * Copyright (C) 2011 Nokia Corporation
+ * Paul Walmsley
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/clk-provider.h>
+#include <linux/io.h>
+#include <linux/clk/ti.h>
+
+#include "clock.h"
+
+/* Register offsets */
+#define CM_AUTOIDLE			0x30
+#define CM_ICLKEN			0x10
+
+/* Private functions */
+
+/* XXX */
+void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk)
+{
+	u32 v;
+	void __iomem *r;
+
+	r = (__force void __iomem *)
+		((__force u32)clk->enable_reg ^ (CM_AUTOIDLE ^ CM_ICLKEN));
+
+	v = ti_clk_ll_ops->clk_readl(r);
+	v |= (1 << clk->enable_bit);
+	ti_clk_ll_ops->clk_writel(v, r);
+}
+
+/* XXX */
+void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk)
+{
+	u32 v;
+	void __iomem *r;
+
+	r = (__force void __iomem *)
+		((__force u32)clk->enable_reg ^ (CM_AUTOIDLE ^ CM_ICLKEN));
+
+	v = ti_clk_ll_ops->clk_readl(r);
+	v &= ~(1 << clk->enable_bit);
+	ti_clk_ll_ops->clk_writel(v, r);
+}
+
+/* Public data */
+
+const struct clk_hw_omap_ops clkhwops_iclk = {
+	.allow_idle	= omap2_clkt_iclk_allow_idle,
+	.deny_idle	= omap2_clkt_iclk_deny_idle,
+};
+
+const struct clk_hw_omap_ops clkhwops_iclk_wait = {
+	.allow_idle	= omap2_clkt_iclk_allow_idle,
+	.deny_idle	= omap2_clkt_iclk_deny_idle,
+	.find_idlest	= omap2_clk_dflt_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index c75d4b44cbef..a7256a98201d 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -170,6 +170,8 @@ struct clk *ti_clk_register_clk(struct ti_clk *setup);
 int ti_clk_register_legacy_clks(struct ti_clk_alias *clks);
 
 extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx;
+extern const struct clk_hw_omap_ops clkhwops_iclk;
+extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
 
 u8 omap2_init_dpll_parent(struct clk_hw *hw);
 
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index ee59e076340f..79e143dfc793 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -296,6 +296,14 @@ int omap3_dpll4_set_rate_and_parent(struct clk_hw *hw, unsigned long rate,
 int omap2_dflt_clk_enable(struct clk_hw *hw);
 void omap2_dflt_clk_disable(struct clk_hw *hw);
 int omap2_dflt_clk_is_enabled(struct clk_hw *hw);
+void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk);
+void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk);
+void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
+				   void __iomem **other_reg,
+				   u8 *other_bit);
+void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk,
+				void __iomem **idlest_reg,
+				u8 *idlest_bit, u8 *idlest_val);
 void omap3_clk_lock_dpll5(void);
 unsigned long omap2_dpllcore_recalc(struct clk_hw *hw,
 				    unsigned long parent_rate);
@@ -358,8 +366,6 @@ extern const struct clk_hw_omap_ops clkhwops_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait;
-extern const struct clk_hw_omap_ops clkhwops_iclk;
-extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait;
-- 
cgit v1.2.3-70-g09d2


From bf22bae794d696e411acfcac39b415e160e93834 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Mon, 2 Mar 2015 19:06:54 +0200
Subject: clk: ti: autoidle: move generic autoidle handling code to clock
 driver

This is no longer needed in platform directory, as the legacy clock data
is gone, so move it under TI clock driver. Some static functions are
renamed also.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c   | 104 ------------------------------------
 arch/arm/mach-omap2/clock.h   |   3 --
 drivers/clk/ti/autoidle.c     | 119 +++++++++++++++++++++++++++++++++++++++---
 drivers/clk/ti/clock.h        |   3 ++
 drivers/clk/ti/fixed-factor.c |   2 +
 include/linux/clk/ti.h        |  13 ++---
 6 files changed, 119 insertions(+), 125 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index cbc65b3a3b62..42ce860e1d4c 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -69,8 +69,6 @@ u16 cpu_mask;
  */
 static bool clkdm_control = true;
 
-static LIST_HEAD(clk_hw_omap_clocks);
-
 struct clk_iomap {
 	struct regmap *regmap;
 	void __iomem *mem;
@@ -578,108 +576,6 @@ static int __init omap_clk_setup(char *str)
 }
 __setup("mpurate=", omap_clk_setup);
 
-/**
- * omap2_init_clk_hw_omap_clocks - initialize an OMAP clock
- * @clk: struct clk * to initialize
- *
- * Add an OMAP clock @clk to the internal list of OMAP clocks.  Used
- * temporarily for autoidle handling, until this support can be
- * integrated into the common clock framework code in some way.  No
- * return value.
- */
-void omap2_init_clk_hw_omap_clocks(struct clk *clk)
-{
-	struct clk_hw_omap *c;
-
-	if (__clk_get_flags(clk) & CLK_IS_BASIC)
-		return;
-
-	c = to_clk_hw_omap(__clk_get_hw(clk));
-	list_add(&c->node, &clk_hw_omap_clocks);
-}
-
-/**
- * omap2_clk_enable_autoidle_all - enable autoidle on all OMAP clocks that
- * support it
- *
- * Enable clock autoidle on all OMAP clocks that have allow_idle
- * function pointers associated with them.  This function is intended
- * to be temporary until support for this is added to the common clock
- * code.  Returns 0.
- */
-int omap2_clk_enable_autoidle_all(void)
-{
-	struct clk_hw_omap *c;
-
-	list_for_each_entry(c, &clk_hw_omap_clocks, node)
-		if (c->ops && c->ops->allow_idle)
-			c->ops->allow_idle(c);
-
-	of_ti_clk_allow_autoidle_all();
-
-	return 0;
-}
-
-/**
- * omap2_clk_disable_autoidle_all - disable autoidle on all OMAP clocks that
- * support it
- *
- * Disable clock autoidle on all OMAP clocks that have allow_idle
- * function pointers associated with them.  This function is intended
- * to be temporary until support for this is added to the common clock
- * code.  Returns 0.
- */
-int omap2_clk_disable_autoidle_all(void)
-{
-	struct clk_hw_omap *c;
-
-	list_for_each_entry(c, &clk_hw_omap_clocks, node)
-		if (c->ops && c->ops->deny_idle)
-			c->ops->deny_idle(c);
-
-	of_ti_clk_deny_autoidle_all();
-
-	return 0;
-}
-
-/**
- * omap2_clk_deny_idle - disable autoidle on an OMAP clock
- * @clk: struct clk * to disable autoidle for
- *
- * Disable autoidle on an OMAP clock.
- */
-int omap2_clk_deny_idle(struct clk *clk)
-{
-	struct clk_hw_omap *c;
-
-	if (__clk_get_flags(clk) & CLK_IS_BASIC)
-		return -EINVAL;
-
-	c = to_clk_hw_omap(__clk_get_hw(clk));
-	if (c->ops && c->ops->deny_idle)
-		c->ops->deny_idle(c);
-	return 0;
-}
-
-/**
- * omap2_clk_allow_idle - enable autoidle on an OMAP clock
- * @clk: struct clk * to enable autoidle for
- *
- * Enable autoidle on an OMAP clock.
- */
-int omap2_clk_allow_idle(struct clk *clk)
-{
-	struct clk_hw_omap *c;
-
-	if (__clk_get_flags(clk) & CLK_IS_BASIC)
-		return -EINVAL;
-
-	c = to_clk_hw_omap(__clk_get_hw(clk));
-	if (c->ops && c->ops->allow_idle)
-		c->ops->allow_idle(c);
-	return 0;
-}
-
 /**
  * omap2_clk_enable_init_clocks - prepare & enable a list of clocks
  * @clk_names: ptr to an array of strings of clock names to enable
diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h
index b71d43051c26..950a17ae4f36 100644
--- a/arch/arm/mach-omap2/clock.h
+++ b/arch/arm/mach-omap2/clock.h
@@ -186,9 +186,6 @@ void omap3_dpll_deny_idle(struct clk_hw_omap *clk);
 
 void __init omap2_clk_disable_clkdm_control(void);
 
-int omap2_clk_enable_autoidle_all(void);
-int omap2_clk_allow_idle(struct clk *clk);
-int omap2_clk_deny_idle(struct clk *clk);
 int omap2_clk_switch_mpurate_at_boot(const char *mpurate_ck_name);
 void omap2_clk_print_new_rates(const char *hfclkin_ck_name,
 			       const char *core_ck_name,
diff --git a/drivers/clk/ti/autoidle.c b/drivers/clk/ti/autoidle.c
index e75c64c9e81c..3dbcc3681058 100644
--- a/drivers/clk/ti/autoidle.c
+++ b/drivers/clk/ti/autoidle.c
@@ -33,8 +33,47 @@ struct clk_ti_autoidle {
 #define AUTOIDLE_LOW		0x1
 
 static LIST_HEAD(autoidle_clks);
+static LIST_HEAD(clk_hw_omap_clocks);
 
-static void ti_allow_autoidle(struct clk_ti_autoidle *clk)
+/**
+ * omap2_clk_deny_idle - disable autoidle on an OMAP clock
+ * @clk: struct clk * to disable autoidle for
+ *
+ * Disable autoidle on an OMAP clock.
+ */
+int omap2_clk_deny_idle(struct clk *clk)
+{
+	struct clk_hw_omap *c;
+
+	if (__clk_get_flags(clk) & CLK_IS_BASIC)
+		return -EINVAL;
+
+	c = to_clk_hw_omap(__clk_get_hw(clk));
+	if (c->ops && c->ops->deny_idle)
+		c->ops->deny_idle(c);
+	return 0;
+}
+
+/**
+ * omap2_clk_allow_idle - enable autoidle on an OMAP clock
+ * @clk: struct clk * to enable autoidle for
+ *
+ * Enable autoidle on an OMAP clock.
+ */
+int omap2_clk_allow_idle(struct clk *clk)
+{
+	struct clk_hw_omap *c;
+
+	if (__clk_get_flags(clk) & CLK_IS_BASIC)
+		return -EINVAL;
+
+	c = to_clk_hw_omap(__clk_get_hw(clk));
+	if (c->ops && c->ops->allow_idle)
+		c->ops->allow_idle(c);
+	return 0;
+}
+
+static void _allow_autoidle(struct clk_ti_autoidle *clk)
 {
 	u32 val;
 
@@ -48,7 +87,7 @@ static void ti_allow_autoidle(struct clk_ti_autoidle *clk)
 	ti_clk_ll_ops->clk_writel(val, clk->reg);
 }
 
-static void ti_deny_autoidle(struct clk_ti_autoidle *clk)
+static void _deny_autoidle(struct clk_ti_autoidle *clk)
 {
 	u32 val;
 
@@ -63,31 +102,31 @@ static void ti_deny_autoidle(struct clk_ti_autoidle *clk)
 }
 
 /**
- * of_ti_clk_allow_autoidle_all - enable autoidle for all clocks
+ * _clk_generic_allow_autoidle_all - enable autoidle for all clocks
  *
  * Enables hardware autoidle for all registered DT clocks, which have
  * the feature.
  */
-void of_ti_clk_allow_autoidle_all(void)
+static void _clk_generic_allow_autoidle_all(void)
 {
 	struct clk_ti_autoidle *c;
 
 	list_for_each_entry(c, &autoidle_clks, node)
-		ti_allow_autoidle(c);
+		_allow_autoidle(c);
 }
 
 /**
- * of_ti_clk_deny_autoidle_all - disable autoidle for all clocks
+ * _clk_generic_deny_autoidle_all - disable autoidle for all clocks
  *
  * Disables hardware autoidle for all registered DT clocks, which have
  * the feature.
  */
-void of_ti_clk_deny_autoidle_all(void)
+static void _clk_generic_deny_autoidle_all(void)
 {
 	struct clk_ti_autoidle *c;
 
 	list_for_each_entry(c, &autoidle_clks, node)
-		ti_deny_autoidle(c);
+		_deny_autoidle(c);
 }
 
 /**
@@ -131,3 +170,67 @@ int __init of_ti_clk_autoidle_setup(struct device_node *node)
 
 	return 0;
 }
+
+/**
+ * omap2_init_clk_hw_omap_clocks - initialize an OMAP clock
+ * @clk: struct clk * to initialize
+ *
+ * Add an OMAP clock @clk to the internal list of OMAP clocks.  Used
+ * temporarily for autoidle handling, until this support can be
+ * integrated into the common clock framework code in some way.  No
+ * return value.
+ */
+void omap2_init_clk_hw_omap_clocks(struct clk *clk)
+{
+	struct clk_hw_omap *c;
+
+	if (__clk_get_flags(clk) & CLK_IS_BASIC)
+		return;
+
+	c = to_clk_hw_omap(__clk_get_hw(clk));
+	list_add(&c->node, &clk_hw_omap_clocks);
+}
+
+/**
+ * omap2_clk_enable_autoidle_all - enable autoidle on all OMAP clocks that
+ * support it
+ *
+ * Enable clock autoidle on all OMAP clocks that have allow_idle
+ * function pointers associated with them.  This function is intended
+ * to be temporary until support for this is added to the common clock
+ * code.  Returns 0.
+ */
+int omap2_clk_enable_autoidle_all(void)
+{
+	struct clk_hw_omap *c;
+
+	list_for_each_entry(c, &clk_hw_omap_clocks, node)
+		if (c->ops && c->ops->allow_idle)
+			c->ops->allow_idle(c);
+
+	_clk_generic_allow_autoidle_all();
+
+	return 0;
+}
+
+/**
+ * omap2_clk_disable_autoidle_all - disable autoidle on all OMAP clocks that
+ * support it
+ *
+ * Disable clock autoidle on all OMAP clocks that have allow_idle
+ * function pointers associated with them.  This function is intended
+ * to be temporary until support for this is added to the common clock
+ * code.  Returns 0.
+ */
+int omap2_clk_disable_autoidle_all(void)
+{
+	struct clk_hw_omap *c;
+
+	list_for_each_entry(c, &clk_hw_omap_clocks, node)
+		if (c->ops && c->ops->deny_idle)
+			c->ops->deny_idle(c);
+
+	_clk_generic_deny_autoidle_all();
+
+	return 0;
+}
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index a7256a98201d..9b51021f509a 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -169,6 +169,9 @@ void ti_clk_patch_legacy_clks(struct ti_clk **patch);
 struct clk *ti_clk_register_clk(struct ti_clk *setup);
 int ti_clk_register_legacy_clks(struct ti_clk_alias *clks);
 
+void omap2_init_clk_hw_omap_clocks(struct clk *clk);
+int of_ti_clk_autoidle_setup(struct device_node *node);
+
 extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx;
 extern const struct clk_hw_omap_ops clkhwops_iclk;
 extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
diff --git a/drivers/clk/ti/fixed-factor.c b/drivers/clk/ti/fixed-factor.c
index c2c8a287408c..3cd406768909 100644
--- a/drivers/clk/ti/fixed-factor.c
+++ b/drivers/clk/ti/fixed-factor.c
@@ -22,6 +22,8 @@
 #include <linux/of_address.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 #undef pr_fmt
 #define pr_fmt(fmt) "%s: " fmt, __func__
 
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 79e143dfc793..320e107f9a7a 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -259,7 +259,6 @@ extern const struct clk_ops ti_clk_mux_ops;
 
 #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw)
 
-void omap2_init_clk_hw_omap_clocks(struct clk *clk);
 int omap3_noncore_dpll_enable(struct clk_hw *hw);
 void omap3_noncore_dpll_disable(struct clk_hw *hw);
 int omap3_noncore_dpll_set_parent(struct clk_hw *hw, u8 index);
@@ -288,6 +287,9 @@ long omap3_clkoutx2_round_rate(struct clk_hw *hw, unsigned long rate,
 int omap2_clkops_enable_clkdm(struct clk_hw *hw);
 void omap2_clkops_disable_clkdm(struct clk_hw *hw);
 int omap2_clk_disable_autoidle_all(void);
+int omap2_clk_enable_autoidle_all(void);
+int omap2_clk_allow_idle(struct clk *clk);
+int omap2_clk_deny_idle(struct clk *clk);
 void omap2_clk_enable_init_clocks(const char **clk_names, u8 num_clocks);
 int omap3_dpll4_set_rate(struct clk_hw *clk, unsigned long rate,
 			 unsigned long parent_rate);
@@ -320,7 +322,6 @@ void ti_dt_clk_init_retry_clks(void);
 void ti_dt_clockdomains_setup(void);
 int ti_clk_retry_init(struct device_node *node, struct clk_hw *hw,
 		      ti_of_clk_init_cb_t func);
-int of_ti_clk_autoidle_setup(struct device_node *node);
 int ti_clk_add_component(struct device_node *node, struct clk_hw *hw, int type);
 
 int omap3430_dt_clk_init(void);
@@ -351,14 +352,6 @@ struct ti_clk_features {
 void ti_clk_setup_features(struct ti_clk_features *features);
 const struct ti_clk_features *ti_clk_get_features(void);
 
-#ifdef CONFIG_OF
-void of_ti_clk_allow_autoidle_all(void);
-void of_ti_clk_deny_autoidle_all(void);
-#else
-static inline void of_ti_clk_allow_autoidle_all(void) { }
-static inline void of_ti_clk_deny_autoidle_all(void) { }
-#endif
-
 extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll;
 extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3_dpll;
-- 
cgit v1.2.3-70-g09d2


From a5aa8a603efa25dd41220bff990da025c93b632b Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 10:51:01 +0200
Subject: clk: ti: move omap2_clk_enable_init_clocks under clock driver

This is no longer used outside clock driver, so move it under the driver
and remove the export for it from the global header file.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c | 24 ------------------------
 drivers/clk/ti/clk-2xxx.c   |  2 ++
 drivers/clk/ti/clk-33xx.c   |  2 ++
 drivers/clk/ti/clk-3xxx.c   |  1 +
 drivers/clk/ti/clk-816x.c   |  2 ++
 drivers/clk/ti/clk.c        | 24 ++++++++++++++++++++++++
 drivers/clk/ti/clock.h      |  1 +
 include/linux/clk/ti.h      |  1 -
 8 files changed, 32 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index 42ce860e1d4c..234cedf8967d 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -576,30 +576,6 @@ static int __init omap_clk_setup(char *str)
 }
 __setup("mpurate=", omap_clk_setup);
 
-/**
- * omap2_clk_enable_init_clocks - prepare & enable a list of clocks
- * @clk_names: ptr to an array of strings of clock names to enable
- * @num_clocks: number of clock names in @clk_names
- *
- * Prepare and enable a list of clocks, named by @clk_names.  No
- * return value. XXX Deprecated; only needed until these clocks are
- * properly claimed and enabled by the drivers or core code that uses
- * them.  XXX What code disables & calls clk_put on these clocks?
- */
-void omap2_clk_enable_init_clocks(const char **clk_names, u8 num_clocks)
-{
-	struct clk *init_clk;
-	int i;
-
-	for (i = 0; i < num_clocks; i++) {
-		init_clk = clk_get(NULL, clk_names[i]);
-		if (WARN(IS_ERR(init_clk), "could not find init clock %s\n",
-				clk_names[i]))
-			continue;
-		clk_prepare_enable(init_clk);
-	}
-}
-
 const struct clk_hw_omap_ops clkhwops_wait = {
 	.find_idlest	= omap2_clk_dflt_find_idlest,
 	.find_companion	= omap2_clk_dflt_find_companion,
diff --git a/drivers/clk/ti/clk-2xxx.c b/drivers/clk/ti/clk-2xxx.c
index c808ab3d2bb2..bd8790be2ab1 100644
--- a/drivers/clk/ti/clk-2xxx.c
+++ b/drivers/clk/ti/clk-2xxx.c
@@ -19,6 +19,8 @@
 #include <linux/clk-provider.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 static struct ti_dt_clk omap2xxx_clks[] = {
 	DT_CLK(NULL, "func_32k_ck", "func_32k_ck"),
 	DT_CLK(NULL, "secure_32k_ck", "secure_32k_ck"),
diff --git a/drivers/clk/ti/clk-33xx.c b/drivers/clk/ti/clk-33xx.c
index 028b33783d38..733f9d374d0f 100644
--- a/drivers/clk/ti/clk-33xx.c
+++ b/drivers/clk/ti/clk-33xx.c
@@ -19,6 +19,8 @@
 #include <linux/clk-provider.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 static struct ti_dt_clk am33xx_clks[] = {
 	DT_CLK(NULL, "clk_32768_ck", "clk_32768_ck"),
 	DT_CLK(NULL, "clk_rc32k_ck", "clk_rc32k_ck"),
diff --git a/drivers/clk/ti/clk-3xxx.c b/drivers/clk/ti/clk-3xxx.c
index 757636d166cf..bb3b88359daf 100644
--- a/drivers/clk/ti/clk-3xxx.c
+++ b/drivers/clk/ti/clk-3xxx.c
@@ -19,6 +19,7 @@
 #include <linux/clk-provider.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
 
 static struct ti_dt_clk omap3xxx_clks[] = {
 	DT_CLK(NULL, "apb_pclk", "dummy_apb_pclk"),
diff --git a/drivers/clk/ti/clk-816x.c b/drivers/clk/ti/clk-816x.c
index 9451e651a1ff..c69352b24dba 100644
--- a/drivers/clk/ti/clk-816x.c
+++ b/drivers/clk/ti/clk-816x.c
@@ -14,6 +14,8 @@
 #include <linux/clk-provider.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 static struct ti_dt_clk dm816x_clks[] = {
 	DT_CLK(NULL, "sys_clkin", "sys_clkin_ck"),
 	DT_CLK(NULL, "timer_sys_ck", "sys_clkin_ck"),
diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c
index e65ae4acff9c..5baea03cfc92 100644
--- a/drivers/clk/ti/clk.c
+++ b/drivers/clk/ti/clk.c
@@ -336,3 +336,27 @@ const struct ti_clk_features *ti_clk_get_features(void)
 {
 	return &ti_clk_features;
 }
+
+/**
+ * omap2_clk_enable_init_clocks - prepare & enable a list of clocks
+ * @clk_names: ptr to an array of strings of clock names to enable
+ * @num_clocks: number of clock names in @clk_names
+ *
+ * Prepare and enable a list of clocks, named by @clk_names.  No
+ * return value. XXX Deprecated; only needed until these clocks are
+ * properly claimed and enabled by the drivers or core code that uses
+ * them.  XXX What code disables & calls clk_put on these clocks?
+ */
+void omap2_clk_enable_init_clocks(const char **clk_names, u8 num_clocks)
+{
+	struct clk *init_clk;
+	int i;
+
+	for (i = 0; i < num_clocks; i++) {
+		init_clk = clk_get(NULL, clk_names[i]);
+		if (WARN(IS_ERR(init_clk), "could not find init clock %s\n",
+			 clk_names[i]))
+			continue;
+		clk_prepare_enable(init_clk);
+	}
+}
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 9b51021f509a..4b26af8a273d 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -171,6 +171,7 @@ int ti_clk_register_legacy_clks(struct ti_clk_alias *clks);
 
 void omap2_init_clk_hw_omap_clocks(struct clk *clk);
 int of_ti_clk_autoidle_setup(struct device_node *node);
+void omap2_clk_enable_init_clocks(const char **clk_names, u8 num_clocks);
 
 extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx;
 extern const struct clk_hw_omap_ops clkhwops_iclk;
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 320e107f9a7a..61deace552ec 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -290,7 +290,6 @@ int omap2_clk_disable_autoidle_all(void);
 int omap2_clk_enable_autoidle_all(void);
 int omap2_clk_allow_idle(struct clk *clk);
 int omap2_clk_deny_idle(struct clk *clk);
-void omap2_clk_enable_init_clocks(const char **clk_names, u8 num_clocks);
 int omap3_dpll4_set_rate(struct clk_hw *clk, unsigned long rate,
 			 unsigned long parent_rate);
 int omap3_dpll4_set_rate_and_parent(struct clk_hw *hw, unsigned long rate,
-- 
cgit v1.2.3-70-g09d2


From 9a356d622e8e559eff50b298e574bbc34e860aba Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 11:14:31 +0200
Subject: ARM: OMAP2+: clock: add support for clkdm ops to the low level clk
 ops

Clock driver requires access to certain clockdomain handling ops once
the code is being moved over under clock driver. Example of this is
clk_enable / clk_disable under omap3 DPLL code. The required clkdm
APIs are now exported through the ti_clk_ll_ops struct.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c |  2 ++
 include/linux/clk/ti.h      | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index 94a4949be9b0..d6afc1291fe9 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -120,6 +120,8 @@ u32 omap2_clk_readl(struct clk_hw_omap *clk, void __iomem *reg)
 static struct ti_clk_ll_ops omap_clk_ll_ops = {
 	.clk_readl = clk_memmap_readl,
 	.clk_writel = clk_memmap_writel,
+	.clkdm_clk_enable = clkdm_clk_enable,
+	.clkdm_clk_disable = clkdm_clk_disable,
 };
 
 /**
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 61deace552ec..fcf91844e94b 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -238,18 +238,24 @@ struct clk_omap_reg {
 };
 
 /**
- * struct ti_clk_ll_ops - low-level register access ops for a clock
+ * struct ti_clk_ll_ops - low-level ops for clocks
  * @clk_readl: pointer to register read function
  * @clk_writel: pointer to register write function
+ * @clkdm_clk_enable: pointer to clockdomain enable function
+ * @clkdm_clk_disable: pointer to clockdomain disable function
  *
- * Low-level register access ops are generally used by the basic clock types
- * (clk-gate, clk-mux, clk-divider etc.) to provide support for various
- * low-level hardware interfaces (direct MMIO, regmap etc.), but can also be
- * used by other hardware-specific clock drivers if needed.
+ * Low-level ops are generally used by the basic clock types (clk-gate,
+ * clk-mux, clk-divider etc.) to provide support for various low-level
+ * hadrware interfaces (direct MMIO, regmap etc.), and is initialized
+ * by board code. Low-level ops also contain some other platform specific
+ * operations not provided directly by clock drivers.
  */
 struct ti_clk_ll_ops {
 	u32	(*clk_readl)(void __iomem *reg);
 	void	(*clk_writel)(u32 val, void __iomem *reg);
+	int	(*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk);
+	int	(*clkdm_clk_disable)(struct clockdomain *clkdm,
+				     struct clk *clk);
 };
 
 extern struct ti_clk_ll_ops *ti_clk_ll_ops;
-- 
cgit v1.2.3-70-g09d2


From 192383d87b876ea9879d8b598af593809a25b7d2 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 13:47:08 +0200
Subject: ARM: OMAP2+: clock: add support for specific CM ops to ti_clk_ll_ops

Clock driver requires access to some CM API functions once the code
is being moved under the clock driver from the platform directory.
Gate type clock requires access to cm_wait_module_ready and
cm_split_idlest_reg functions, which are both used for waiting until
the module being clocked has been successfully activated. These CM
APIs are now exported through the ti_clk_ll_ops struct.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c | 2 ++
 include/linux/clk/ti.h      | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index d6afc1291fe9..7a5713df54b3 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -122,6 +122,8 @@ static struct ti_clk_ll_ops omap_clk_ll_ops = {
 	.clk_writel = clk_memmap_writel,
 	.clkdm_clk_enable = clkdm_clk_enable,
 	.clkdm_clk_disable = clkdm_clk_disable,
+	.cm_wait_module_ready = omap_cm_wait_module_ready,
+	.cm_split_idlest_reg = cm_split_idlest_reg,
 };
 
 /**
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index fcf91844e94b..25eea896627a 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -243,6 +243,8 @@ struct clk_omap_reg {
  * @clk_writel: pointer to register write function
  * @clkdm_clk_enable: pointer to clockdomain enable function
  * @clkdm_clk_disable: pointer to clockdomain disable function
+ * @cm_wait_module_ready: pointer to CM module wait ready function
+ * @cm_split_idlest_reg: pointer to CM module function to split idlest reg
  *
  * Low-level ops are generally used by the basic clock types (clk-gate,
  * clk-mux, clk-divider etc.) to provide support for various low-level
@@ -256,6 +258,10 @@ struct ti_clk_ll_ops {
 	int	(*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk);
 	int	(*clkdm_clk_disable)(struct clockdomain *clkdm,
 				     struct clk *clk);
+	int	(*cm_wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg,
+					u8 idlest_shift);
+	int	(*cm_split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst,
+				       u8 *idlest_reg_id);
 };
 
 extern struct ti_clk_ll_ops *ti_clk_ll_ops;
-- 
cgit v1.2.3-70-g09d2


From 0565fb168d63f89591ce7dcb85438cb19d939a92 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 13:27:48 +0200
Subject: clk: ti: dpll: move omap3 DPLL functionality to clock driver

With the legacy clock support gone, OMAP3 generic DPLL code can now be
moved over to the clock driver also. A few un-unused clkoutx2 functions
are also removed at the same time.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/Makefile    |   8 +-
 arch/arm/mach-omap2/clock.h     |   4 -
 arch/arm/mach-omap2/clock3xxx.c |  77 ----
 arch/arm/mach-omap2/dpll3xxx.c  | 818 ---------------------------------------
 drivers/clk/ti/Makefile         |  14 +-
 drivers/clk/ti/clk-3xxx.c       |  31 ++
 drivers/clk/ti/clock.h          |  27 ++
 drivers/clk/ti/dpll3xxx.c       | 825 ++++++++++++++++++++++++++++++++++++++++
 include/linux/clk/ti.h          |  30 --
 9 files changed, 893 insertions(+), 941 deletions(-)
 delete mode 100644 arch/arm/mach-omap2/dpll3xxx.c
 create mode 100644 drivers/clk/ti/dpll3xxx.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index bf5d71d9fd2b..f9d4ccf39cea 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -189,15 +189,11 @@ obj-$(CONFIG_SOC_OMAP2430)		+= clock2430.o
 obj-$(CONFIG_ARCH_OMAP3)		+= $(clock-common) clock3xxx.o
 obj-$(CONFIG_ARCH_OMAP3)		+= clock34xx.o clkt34xx_dpll3m2.o
 obj-$(CONFIG_ARCH_OMAP3)		+= clock3517.o
-obj-$(CONFIG_ARCH_OMAP3)		+= dpll3xxx.o
 obj-$(CONFIG_ARCH_OMAP4)		+= $(clock-common)
-obj-$(CONFIG_ARCH_OMAP4)		+= dpll3xxx.o
-obj-$(CONFIG_SOC_AM33XX)		+= $(clock-common) dpll3xxx.o
+obj-$(CONFIG_SOC_AM33XX)		+= $(clock-common)
 obj-$(CONFIG_SOC_OMAP5)			+= $(clock-common)
-obj-$(CONFIG_SOC_OMAP5)			+= dpll3xxx.o
 obj-$(CONFIG_SOC_DRA7XX)		+= $(clock-common)
-obj-$(CONFIG_SOC_DRA7XX)		+= dpll3xxx.o
-obj-$(CONFIG_SOC_AM43XX)		+= $(clock-common) dpll3xxx.o
+obj-$(CONFIG_SOC_AM43XX)		+= $(clock-common)
 
 # OMAP2 clock rate set data (old "OPP" data)
 obj-$(CONFIG_SOC_OMAP2420)		+= opp2420_data.o
diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h
index e2781b4aaeb4..d60691d5626a 100644
--- a/arch/arm/mach-omap2/clock.h
+++ b/arch/arm/mach-omap2/clock.h
@@ -180,10 +180,6 @@ struct clksel {
 #define OMAP4XXX_EN_DPLL_FRBYPASS		0x6
 #define OMAP4XXX_EN_DPLL_LOCKED			0x7
 
-u32 omap3_dpll_autoidle_read(struct clk_hw_omap *clk);
-void omap3_dpll_allow_idle(struct clk_hw_omap *clk);
-void omap3_dpll_deny_idle(struct clk_hw_omap *clk);
-
 void __init omap2_clk_disable_clkdm_control(void);
 
 void omap2_clk_print_new_rates(const char *hfclkin_ck_name,
diff --git a/arch/arm/mach-omap2/clock3xxx.c b/arch/arm/mach-omap2/clock3xxx.c
index 4bd61222aa33..0b0e3a8777d3 100644
--- a/arch/arm/mach-omap2/clock3xxx.c
+++ b/arch/arm/mach-omap2/clock3xxx.c
@@ -29,82 +29,5 @@
 #include "cm2xxx_3xxx.h"
 #include "cm-regbits-34xx.h"
 
-/*
- * DPLL5_FREQ_FOR_USBHOST: USBHOST and USBTLL are the only clocks
- * that are sourced by DPLL5, and both of these require this clock
- * to be at 120 MHz for proper operation.
- */
-#define DPLL5_FREQ_FOR_USBHOST		120000000
-
 /* needed by omap3_core_dpll_m2_set_rate() */
 struct clk *sdrc_ick_p, *arm_fck_p;
-
-/**
- * omap3_dpll4_set_rate - set rate for omap3 per-dpll
- * @hw: clock to change
- * @rate: target rate for clock
- * @parent_rate: rate of the parent clock
- *
- * Check if the current SoC supports the per-dpll reprogram operation
- * or not, and then do the rate change if supported. Returns -EINVAL
- * if not supported, 0 for success, and potential error codes from the
- * clock rate change.
- */
-int omap3_dpll4_set_rate(struct clk_hw *hw, unsigned long rate,
-				unsigned long parent_rate)
-{
-	/*
-	 * According to the 12-5 CDP code from TI, "Limitation 2.5"
-	 * on 3430ES1 prevents us from changing DPLL multipliers or dividers
-	 * on DPLL4.
-	 */
-	if (ti_clk_get_features()->flags & TI_CLK_DPLL4_DENY_REPROGRAM) {
-		pr_err("clock: DPLL4 cannot change rate due to silicon 'Limitation 2.5' on 3430ES1.\n");
-		return -EINVAL;
-	}
-
-	return omap3_noncore_dpll_set_rate(hw, rate, parent_rate);
-}
-
-/**
- * omap3_dpll4_set_rate_and_parent - set rate and parent for omap3 per-dpll
- * @hw: clock to change
- * @rate: target rate for clock
- * @parent_rate: rate of the parent clock
- * @index: parent index, 0 - reference clock, 1 - bypass clock
- *
- * Check if the current SoC support the per-dpll reprogram operation
- * or not, and then do the rate + parent change if supported. Returns
- * -EINVAL if not supported, 0 for success, and potential error codes
- * from the clock rate change.
- */
-int omap3_dpll4_set_rate_and_parent(struct clk_hw *hw, unsigned long rate,
-				    unsigned long parent_rate, u8 index)
-{
-	if (ti_clk_get_features()->flags & TI_CLK_DPLL4_DENY_REPROGRAM) {
-		pr_err("clock: DPLL4 cannot change rate due to silicon 'Limitation 2.5' on 3430ES1.\n");
-		return -EINVAL;
-	}
-
-	return omap3_noncore_dpll_set_rate_and_parent(hw, rate, parent_rate,
-						      index);
-}
-
-void __init omap3_clk_lock_dpll5(void)
-{
-	struct clk *dpll5_clk;
-	struct clk *dpll5_m2_clk;
-
-	dpll5_clk = clk_get(NULL, "dpll5_ck");
-	clk_set_rate(dpll5_clk, DPLL5_FREQ_FOR_USBHOST);
-	clk_prepare_enable(dpll5_clk);
-
-	/* Program dpll5_m2_clk divider for no division */
-	dpll5_m2_clk = clk_get(NULL, "dpll5_m2_ck");
-	clk_prepare_enable(dpll5_m2_clk);
-	clk_set_rate(dpll5_m2_clk, DPLL5_FREQ_FOR_USBHOST);
-
-	clk_disable_unprepare(dpll5_m2_clk);
-	clk_disable_unprepare(dpll5_clk);
-	return;
-}
diff --git a/arch/arm/mach-omap2/dpll3xxx.c b/arch/arm/mach-omap2/dpll3xxx.c
deleted file mode 100644
index 9a80f593ed15..000000000000
--- a/arch/arm/mach-omap2/dpll3xxx.c
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- * OMAP3/4 - specific DPLL control functions
- *
- * Copyright (C) 2009-2010 Texas Instruments, Inc.
- * Copyright (C) 2009-2010 Nokia Corporation
- *
- * Written by Paul Walmsley
- * Testing and integration fixes by Jouni Högander
- *
- * 36xx support added by Vishwanath BS, Richard Woodruff, and Nishanth
- * Menon
- *
- * Parts of this code are based on code written by
- * Richard Woodruff, Tony Lindgren, Tuukka Tikkanen, Karthik Dasu
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/errno.h>
-#include <linux/delay.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/bitops.h>
-#include <linux/clkdev.h>
-
-#include "clockdomain.h"
-#include "clock.h"
-
-/* CM_AUTOIDLE_PLL*.AUTO_* bit values */
-#define DPLL_AUTOIDLE_DISABLE			0x0
-#define DPLL_AUTOIDLE_LOW_POWER_STOP		0x1
-
-#define MAX_DPLL_WAIT_TRIES		1000000
-
-/* Private functions */
-
-/* _omap3_dpll_write_clken - write clken_bits arg to a DPLL's enable bits */
-static void _omap3_dpll_write_clken(struct clk_hw_omap *clk, u8 clken_bits)
-{
-	const struct dpll_data *dd;
-	u32 v;
-
-	dd = clk->dpll_data;
-
-	v = omap2_clk_readl(clk, dd->control_reg);
-	v &= ~dd->enable_mask;
-	v |= clken_bits << __ffs(dd->enable_mask);
-	omap2_clk_writel(v, clk, dd->control_reg);
-}
-
-/* _omap3_wait_dpll_status: wait for a DPLL to enter a specific state */
-static int _omap3_wait_dpll_status(struct clk_hw_omap *clk, u8 state)
-{
-	const struct dpll_data *dd;
-	int i = 0;
-	int ret = -EINVAL;
-	const char *clk_name;
-
-	dd = clk->dpll_data;
-	clk_name = __clk_get_name(clk->hw.clk);
-
-	state <<= __ffs(dd->idlest_mask);
-
-	while (((omap2_clk_readl(clk, dd->idlest_reg) & dd->idlest_mask)
-		!= state) && i < MAX_DPLL_WAIT_TRIES) {
-		i++;
-		udelay(1);
-	}
-
-	if (i == MAX_DPLL_WAIT_TRIES) {
-		printk(KERN_ERR "clock: %s failed transition to '%s'\n",
-		       clk_name, (state) ? "locked" : "bypassed");
-	} else {
-		pr_debug("clock: %s transition to '%s' in %d loops\n",
-			 clk_name, (state) ? "locked" : "bypassed", i);
-
-		ret = 0;
-	}
-
-	return ret;
-}
-
-/* From 3430 TRM ES2 4.7.6.2 */
-static u16 _omap3_dpll_compute_freqsel(struct clk_hw_omap *clk, u8 n)
-{
-	unsigned long fint;
-	u16 f = 0;
-
-	fint = __clk_get_rate(clk->dpll_data->clk_ref) / n;
-
-	pr_debug("clock: fint is %lu\n", fint);
-
-	if (fint >= 750000 && fint <= 1000000)
-		f = 0x3;
-	else if (fint > 1000000 && fint <= 1250000)
-		f = 0x4;
-	else if (fint > 1250000 && fint <= 1500000)
-		f = 0x5;
-	else if (fint > 1500000 && fint <= 1750000)
-		f = 0x6;
-	else if (fint > 1750000 && fint <= 2100000)
-		f = 0x7;
-	else if (fint > 7500000 && fint <= 10000000)
-		f = 0xB;
-	else if (fint > 10000000 && fint <= 12500000)
-		f = 0xC;
-	else if (fint > 12500000 && fint <= 15000000)
-		f = 0xD;
-	else if (fint > 15000000 && fint <= 17500000)
-		f = 0xE;
-	else if (fint > 17500000 && fint <= 21000000)
-		f = 0xF;
-	else
-		pr_debug("clock: unknown freqsel setting for %d\n", n);
-
-	return f;
-}
-
-/*
- * _omap3_noncore_dpll_lock - instruct a DPLL to lock and wait for readiness
- * @clk: pointer to a DPLL struct clk
- *
- * Instructs a non-CORE DPLL to lock.  Waits for the DPLL to report
- * readiness before returning.  Will save and restore the DPLL's
- * autoidle state across the enable, per the CDP code.  If the DPLL
- * locked successfully, return 0; if the DPLL did not lock in the time
- * allotted, or DPLL3 was passed in, return -EINVAL.
- */
-static int _omap3_noncore_dpll_lock(struct clk_hw_omap *clk)
-{
-	const struct dpll_data *dd;
-	u8 ai;
-	u8 state = 1;
-	int r = 0;
-
-	pr_debug("clock: locking DPLL %s\n", __clk_get_name(clk->hw.clk));
-
-	dd = clk->dpll_data;
-	state <<= __ffs(dd->idlest_mask);
-
-	/* Check if already locked */
-	if ((omap2_clk_readl(clk, dd->idlest_reg) & dd->idlest_mask) == state)
-		goto done;
-
-	ai = omap3_dpll_autoidle_read(clk);
-
-	if (ai)
-		omap3_dpll_deny_idle(clk);
-
-	_omap3_dpll_write_clken(clk, DPLL_LOCKED);
-
-	r = _omap3_wait_dpll_status(clk, 1);
-
-	if (ai)
-		omap3_dpll_allow_idle(clk);
-
-done:
-	return r;
-}
-
-/*
- * _omap3_noncore_dpll_bypass - instruct a DPLL to bypass and wait for readiness
- * @clk: pointer to a DPLL struct clk
- *
- * Instructs a non-CORE DPLL to enter low-power bypass mode.  In
- * bypass mode, the DPLL's rate is set equal to its parent clock's
- * rate.  Waits for the DPLL to report readiness before returning.
- * Will save and restore the DPLL's autoidle state across the enable,
- * per the CDP code.  If the DPLL entered bypass mode successfully,
- * return 0; if the DPLL did not enter bypass in the time allotted, or
- * DPLL3 was passed in, or the DPLL does not support low-power bypass,
- * return -EINVAL.
- */
-static int _omap3_noncore_dpll_bypass(struct clk_hw_omap *clk)
-{
-	int r;
-	u8 ai;
-
-	if (!(clk->dpll_data->modes & (1 << DPLL_LOW_POWER_BYPASS)))
-		return -EINVAL;
-
-	pr_debug("clock: configuring DPLL %s for low-power bypass\n",
-		 __clk_get_name(clk->hw.clk));
-
-	ai = omap3_dpll_autoidle_read(clk);
-
-	_omap3_dpll_write_clken(clk, DPLL_LOW_POWER_BYPASS);
-
-	r = _omap3_wait_dpll_status(clk, 0);
-
-	if (ai)
-		omap3_dpll_allow_idle(clk);
-
-	return r;
-}
-
-/*
- * _omap3_noncore_dpll_stop - instruct a DPLL to stop
- * @clk: pointer to a DPLL struct clk
- *
- * Instructs a non-CORE DPLL to enter low-power stop. Will save and
- * restore the DPLL's autoidle state across the stop, per the CDP
- * code.  If DPLL3 was passed in, or the DPLL does not support
- * low-power stop, return -EINVAL; otherwise, return 0.
- */
-static int _omap3_noncore_dpll_stop(struct clk_hw_omap *clk)
-{
-	u8 ai;
-
-	if (!(clk->dpll_data->modes & (1 << DPLL_LOW_POWER_STOP)))
-		return -EINVAL;
-
-	pr_debug("clock: stopping DPLL %s\n", __clk_get_name(clk->hw.clk));
-
-	ai = omap3_dpll_autoidle_read(clk);
-
-	_omap3_dpll_write_clken(clk, DPLL_LOW_POWER_STOP);
-
-	if (ai)
-		omap3_dpll_allow_idle(clk);
-
-	return 0;
-}
-
-/**
- * _lookup_dco - Lookup DCO used by j-type DPLL
- * @clk: pointer to a DPLL struct clk
- * @dco: digital control oscillator selector
- * @m: DPLL multiplier to set
- * @n: DPLL divider to set
- *
- * See 36xx TRM section 3.5.3.3.3.2 "Type B DPLL (Low-Jitter)"
- *
- * XXX This code is not needed for 3430/AM35xx; can it be optimized
- * out in non-multi-OMAP builds for those chips?
- */
-static void _lookup_dco(struct clk_hw_omap *clk, u8 *dco, u16 m, u8 n)
-{
-	unsigned long fint, clkinp; /* watch out for overflow */
-
-	clkinp = __clk_get_rate(__clk_get_parent(clk->hw.clk));
-	fint = (clkinp / n) * m;
-
-	if (fint < 1000000000)
-		*dco = 2;
-	else
-		*dco = 4;
-}
-
-/**
- * _lookup_sddiv - Calculate sigma delta divider for j-type DPLL
- * @clk: pointer to a DPLL struct clk
- * @sd_div: target sigma-delta divider
- * @m: DPLL multiplier to set
- * @n: DPLL divider to set
- *
- * See 36xx TRM section 3.5.3.3.3.2 "Type B DPLL (Low-Jitter)"
- *
- * XXX This code is not needed for 3430/AM35xx; can it be optimized
- * out in non-multi-OMAP builds for those chips?
- */
-static void _lookup_sddiv(struct clk_hw_omap *clk, u8 *sd_div, u16 m, u8 n)
-{
-	unsigned long clkinp, sd; /* watch out for overflow */
-	int mod1, mod2;
-
-	clkinp = __clk_get_rate(__clk_get_parent(clk->hw.clk));
-
-	/*
-	 * target sigma-delta to near 250MHz
-	 * sd = ceil[(m/(n+1)) * (clkinp_MHz / 250)]
-	 */
-	clkinp /= 100000; /* shift from MHz to 10*Hz for 38.4 and 19.2 */
-	mod1 = (clkinp * m) % (250 * n);
-	sd = (clkinp * m) / (250 * n);
-	mod2 = sd % 10;
-	sd /= 10;
-
-	if (mod1 || mod2)
-		sd++;
-	*sd_div = sd;
-}
-
-/*
- * _omap3_noncore_dpll_program - set non-core DPLL M,N values directly
- * @clk:	struct clk * of DPLL to set
- * @freqsel:	FREQSEL value to set
- *
- * Program the DPLL with the last M, N values calculated, and wait for
- * the DPLL to lock. Returns -EINVAL upon error, or 0 upon success.
- */
-static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel)
-{
-	struct dpll_data *dd = clk->dpll_data;
-	u8 dco, sd_div;
-	u32 v;
-
-	/* 3430 ES2 TRM: 4.7.6.9 DPLL Programming Sequence */
-	_omap3_noncore_dpll_bypass(clk);
-
-	/*
-	 * Set jitter correction. Jitter correction applicable for OMAP343X
-	 * only since freqsel field is no longer present on other devices.
-	 */
-	if (ti_clk_get_features()->flags & TI_CLK_DPLL_HAS_FREQSEL) {
-		v = omap2_clk_readl(clk, dd->control_reg);
-		v &= ~dd->freqsel_mask;
-		v |= freqsel << __ffs(dd->freqsel_mask);
-		omap2_clk_writel(v, clk, dd->control_reg);
-	}
-
-	/* Set DPLL multiplier, divider */
-	v = omap2_clk_readl(clk, dd->mult_div1_reg);
-
-	/* Handle Duty Cycle Correction */
-	if (dd->dcc_mask) {
-		if (dd->last_rounded_rate >= dd->dcc_rate)
-			v |= dd->dcc_mask; /* Enable DCC */
-		else
-			v &= ~dd->dcc_mask; /* Disable DCC */
-	}
-
-	v &= ~(dd->mult_mask | dd->div1_mask);
-	v |= dd->last_rounded_m << __ffs(dd->mult_mask);
-	v |= (dd->last_rounded_n - 1) << __ffs(dd->div1_mask);
-
-	/* Configure dco and sd_div for dplls that have these fields */
-	if (dd->dco_mask) {
-		_lookup_dco(clk, &dco, dd->last_rounded_m, dd->last_rounded_n);
-		v &= ~(dd->dco_mask);
-		v |= dco << __ffs(dd->dco_mask);
-	}
-	if (dd->sddiv_mask) {
-		_lookup_sddiv(clk, &sd_div, dd->last_rounded_m,
-			      dd->last_rounded_n);
-		v &= ~(dd->sddiv_mask);
-		v |= sd_div << __ffs(dd->sddiv_mask);
-	}
-
-	omap2_clk_writel(v, clk, dd->mult_div1_reg);
-
-	/* Set 4X multiplier and low-power mode */
-	if (dd->m4xen_mask || dd->lpmode_mask) {
-		v = omap2_clk_readl(clk, dd->control_reg);
-
-		if (dd->m4xen_mask) {
-			if (dd->last_rounded_m4xen)
-				v |= dd->m4xen_mask;
-			else
-				v &= ~dd->m4xen_mask;
-		}
-
-		if (dd->lpmode_mask) {
-			if (dd->last_rounded_lpmode)
-				v |= dd->lpmode_mask;
-			else
-				v &= ~dd->lpmode_mask;
-		}
-
-		omap2_clk_writel(v, clk, dd->control_reg);
-	}
-
-	/* We let the clock framework set the other output dividers later */
-
-	/* REVISIT: Set ramp-up delay? */
-
-	_omap3_noncore_dpll_lock(clk);
-
-	return 0;
-}
-
-/* Public functions */
-
-/**
- * omap3_dpll_recalc - recalculate DPLL rate
- * @clk: DPLL struct clk
- *
- * Recalculate and propagate the DPLL rate.
- */
-unsigned long omap3_dpll_recalc(struct clk_hw *hw, unsigned long parent_rate)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-
-	return omap2_get_dpll_rate(clk);
-}
-
-/* Non-CORE DPLL (e.g., DPLLs that do not control SDRC) clock functions */
-
-/**
- * omap3_noncore_dpll_enable - instruct a DPLL to enter bypass or lock mode
- * @clk: pointer to a DPLL struct clk
- *
- * Instructs a non-CORE DPLL to enable, e.g., to enter bypass or lock.
- * The choice of modes depends on the DPLL's programmed rate: if it is
- * the same as the DPLL's parent clock, it will enter bypass;
- * otherwise, it will enter lock.  This code will wait for the DPLL to
- * indicate readiness before returning, unless the DPLL takes too long
- * to enter the target state.  Intended to be used as the struct clk's
- * enable function.  If DPLL3 was passed in, or the DPLL does not
- * support low-power stop, or if the DPLL took too long to enter
- * bypass or lock, return -EINVAL; otherwise, return 0.
- */
-int omap3_noncore_dpll_enable(struct clk_hw *hw)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	int r;
-	struct dpll_data *dd;
-	struct clk_hw *parent;
-
-	dd = clk->dpll_data;
-	if (!dd)
-		return -EINVAL;
-
-	if (clk->clkdm) {
-		r = clkdm_clk_enable(clk->clkdm, hw->clk);
-		if (r) {
-			WARN(1,
-			     "%s: could not enable %s's clockdomain %s: %d\n",
-			     __func__, __clk_get_name(hw->clk),
-			     clk->clkdm->name, r);
-			return r;
-		}
-	}
-
-	parent = __clk_get_hw(__clk_get_parent(hw->clk));
-
-	if (__clk_get_rate(hw->clk) == __clk_get_rate(dd->clk_bypass)) {
-		WARN_ON(parent != __clk_get_hw(dd->clk_bypass));
-		r = _omap3_noncore_dpll_bypass(clk);
-	} else {
-		WARN_ON(parent != __clk_get_hw(dd->clk_ref));
-		r = _omap3_noncore_dpll_lock(clk);
-	}
-
-	return r;
-}
-
-/**
- * omap3_noncore_dpll_disable - instruct a DPLL to enter low-power stop
- * @clk: pointer to a DPLL struct clk
- *
- * Instructs a non-CORE DPLL to enter low-power stop.  This function is
- * intended for use in struct clkops.  No return value.
- */
-void omap3_noncore_dpll_disable(struct clk_hw *hw)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-
-	_omap3_noncore_dpll_stop(clk);
-	if (clk->clkdm)
-		clkdm_clk_disable(clk->clkdm, hw->clk);
-}
-
-
-/* Non-CORE DPLL rate set code */
-
-/**
- * omap3_noncore_dpll_determine_rate - determine rate for a DPLL
- * @hw: pointer to the clock to determine rate for
- * @rate: target rate for the DPLL
- * @best_parent_rate: pointer for returning best parent rate
- * @best_parent_clk: pointer for returning best parent clock
- *
- * Determines which DPLL mode to use for reaching a desired target rate.
- * Checks whether the DPLL shall be in bypass or locked mode, and if
- * locked, calculates the M,N values for the DPLL via round-rate.
- * Returns a positive clock rate with success, negative error value
- * in failure.
- */
-long omap3_noncore_dpll_determine_rate(struct clk_hw *hw, unsigned long rate,
-				       unsigned long min_rate,
-				       unsigned long max_rate,
-				       unsigned long *best_parent_rate,
-				       struct clk_hw **best_parent_clk)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	struct dpll_data *dd;
-
-	if (!hw || !rate)
-		return -EINVAL;
-
-	dd = clk->dpll_data;
-	if (!dd)
-		return -EINVAL;
-
-	if (__clk_get_rate(dd->clk_bypass) == rate &&
-	    (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) {
-		*best_parent_clk = __clk_get_hw(dd->clk_bypass);
-	} else {
-		rate = omap2_dpll_round_rate(hw, rate, best_parent_rate);
-		*best_parent_clk = __clk_get_hw(dd->clk_ref);
-	}
-
-	*best_parent_rate = rate;
-
-	return rate;
-}
-
-/**
- * omap3_noncore_dpll_set_parent - set parent for a DPLL clock
- * @hw: pointer to the clock to set parent for
- * @index: parent index to select
- *
- * Sets parent for a DPLL clock. This sets the DPLL into bypass or
- * locked mode. Returns 0 with success, negative error value otherwise.
- */
-int omap3_noncore_dpll_set_parent(struct clk_hw *hw, u8 index)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	int ret;
-
-	if (!hw)
-		return -EINVAL;
-
-	if (index)
-		ret = _omap3_noncore_dpll_bypass(clk);
-	else
-		ret = _omap3_noncore_dpll_lock(clk);
-
-	return ret;
-}
-
-/**
- * omap3_noncore_dpll_set_rate - set rate for a DPLL clock
- * @hw: pointer to the clock to set parent for
- * @rate: target rate for the clock
- * @parent_rate: rate of the parent clock
- *
- * Sets rate for a DPLL clock. First checks if the clock parent is
- * reference clock (in bypass mode, the rate of the clock can't be
- * changed) and proceeds with the rate change operation. Returns 0
- * with success, negative error value otherwise.
- */
-int omap3_noncore_dpll_set_rate(struct clk_hw *hw, unsigned long rate,
-				unsigned long parent_rate)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	struct dpll_data *dd;
-	u16 freqsel = 0;
-	int ret;
-
-	if (!hw || !rate)
-		return -EINVAL;
-
-	dd = clk->dpll_data;
-	if (!dd)
-		return -EINVAL;
-
-	if (__clk_get_hw(__clk_get_parent(hw->clk)) !=
-	    __clk_get_hw(dd->clk_ref))
-		return -EINVAL;
-
-	if (dd->last_rounded_rate == 0)
-		return -EINVAL;
-
-	/* Freqsel is available only on OMAP343X devices */
-	if (ti_clk_get_features()->flags & TI_CLK_DPLL_HAS_FREQSEL) {
-		freqsel = _omap3_dpll_compute_freqsel(clk, dd->last_rounded_n);
-		WARN_ON(!freqsel);
-	}
-
-	pr_debug("%s: %s: set rate: locking rate to %lu.\n", __func__,
-		 __clk_get_name(hw->clk), rate);
-
-	ret = omap3_noncore_dpll_program(clk, freqsel);
-
-	return ret;
-}
-
-/**
- * omap3_noncore_dpll_set_rate_and_parent - set rate and parent for a DPLL clock
- * @hw: pointer to the clock to set rate and parent for
- * @rate: target rate for the DPLL
- * @parent_rate: clock rate of the DPLL parent
- * @index: new parent index for the DPLL, 0 - reference, 1 - bypass
- *
- * Sets rate and parent for a DPLL clock. If new parent is the bypass
- * clock, only selects the parent. Otherwise proceeds with a rate
- * change, as this will effectively also change the parent as the
- * DPLL is put into locked mode. Returns 0 with success, negative error
- * value otherwise.
- */
-int omap3_noncore_dpll_set_rate_and_parent(struct clk_hw *hw,
-					   unsigned long rate,
-					   unsigned long parent_rate,
-					   u8 index)
-{
-	int ret;
-
-	if (!hw || !rate)
-		return -EINVAL;
-
-	/*
-	 * clk-ref at index[0], in which case we only need to set rate,
-	 * the parent will be changed automatically with the lock sequence.
-	 * With clk-bypass case we only need to change parent.
-	 */
-	if (index)
-		ret = omap3_noncore_dpll_set_parent(hw, index);
-	else
-		ret = omap3_noncore_dpll_set_rate(hw, rate, parent_rate);
-
-	return ret;
-}
-
-/* DPLL autoidle read/set code */
-
-/**
- * omap3_dpll_autoidle_read - read a DPLL's autoidle bits
- * @clk: struct clk * of the DPLL to read
- *
- * Return the DPLL's autoidle bits, shifted down to bit 0.  Returns
- * -EINVAL if passed a null pointer or if the struct clk does not
- * appear to refer to a DPLL.
- */
-u32 omap3_dpll_autoidle_read(struct clk_hw_omap *clk)
-{
-	const struct dpll_data *dd;
-	u32 v;
-
-	if (!clk || !clk->dpll_data)
-		return -EINVAL;
-
-	dd = clk->dpll_data;
-
-	if (!dd->autoidle_reg)
-		return -EINVAL;
-
-	v = omap2_clk_readl(clk, dd->autoidle_reg);
-	v &= dd->autoidle_mask;
-	v >>= __ffs(dd->autoidle_mask);
-
-	return v;
-}
-
-/**
- * omap3_dpll_allow_idle - enable DPLL autoidle bits
- * @clk: struct clk * of the DPLL to operate on
- *
- * Enable DPLL automatic idle control.  This automatic idle mode
- * switching takes effect only when the DPLL is locked, at least on
- * OMAP3430.  The DPLL will enter low-power stop when its downstream
- * clocks are gated.  No return value.
- */
-void omap3_dpll_allow_idle(struct clk_hw_omap *clk)
-{
-	const struct dpll_data *dd;
-	u32 v;
-
-	if (!clk || !clk->dpll_data)
-		return;
-
-	dd = clk->dpll_data;
-
-	if (!dd->autoidle_reg)
-		return;
-
-	/*
-	 * REVISIT: CORE DPLL can optionally enter low-power bypass
-	 * by writing 0x5 instead of 0x1.  Add some mechanism to
-	 * optionally enter this mode.
-	 */
-	v = omap2_clk_readl(clk, dd->autoidle_reg);
-	v &= ~dd->autoidle_mask;
-	v |= DPLL_AUTOIDLE_LOW_POWER_STOP << __ffs(dd->autoidle_mask);
-	omap2_clk_writel(v, clk, dd->autoidle_reg);
-
-}
-
-/**
- * omap3_dpll_deny_idle - prevent DPLL from automatically idling
- * @clk: struct clk * of the DPLL to operate on
- *
- * Disable DPLL automatic idle control.  No return value.
- */
-void omap3_dpll_deny_idle(struct clk_hw_omap *clk)
-{
-	const struct dpll_data *dd;
-	u32 v;
-
-	if (!clk || !clk->dpll_data)
-		return;
-
-	dd = clk->dpll_data;
-
-	if (!dd->autoidle_reg)
-		return;
-
-	v = omap2_clk_readl(clk, dd->autoidle_reg);
-	v &= ~dd->autoidle_mask;
-	v |= DPLL_AUTOIDLE_DISABLE << __ffs(dd->autoidle_mask);
-	omap2_clk_writel(v, clk, dd->autoidle_reg);
-
-}
-
-/* Clock control for DPLL outputs */
-
-/* Find the parent DPLL for the given clkoutx2 clock */
-static struct clk_hw_omap *omap3_find_clkoutx2_dpll(struct clk_hw *hw)
-{
-	struct clk_hw_omap *pclk = NULL;
-	struct clk *parent;
-
-	/* Walk up the parents of clk, looking for a DPLL */
-	do {
-		do {
-			parent = __clk_get_parent(hw->clk);
-			hw = __clk_get_hw(parent);
-		} while (hw && (__clk_get_flags(hw->clk) & CLK_IS_BASIC));
-		if (!hw)
-			break;
-		pclk = to_clk_hw_omap(hw);
-	} while (pclk && !pclk->dpll_data);
-
-	/* clk does not have a DPLL as a parent?  error in the clock data */
-	if (!pclk) {
-		WARN_ON(1);
-		return NULL;
-	}
-
-	return pclk;
-}
-
-/**
- * omap3_clkoutx2_recalc - recalculate DPLL X2 output virtual clock rate
- * @clk: DPLL output struct clk
- *
- * Using parent clock DPLL data, look up DPLL state.  If locked, set our
- * rate to the dpll_clk * 2; otherwise, just use dpll_clk.
- */
-unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw,
-				    unsigned long parent_rate)
-{
-	const struct dpll_data *dd;
-	unsigned long rate;
-	u32 v;
-	struct clk_hw_omap *pclk = NULL;
-
-	if (!parent_rate)
-		return 0;
-
-	pclk = omap3_find_clkoutx2_dpll(hw);
-
-	if (!pclk)
-		return 0;
-
-	dd = pclk->dpll_data;
-
-	WARN_ON(!dd->enable_mask);
-
-	v = omap2_clk_readl(pclk, dd->control_reg) & dd->enable_mask;
-	v >>= __ffs(dd->enable_mask);
-	if ((v != OMAP3XXX_EN_DPLL_LOCKED) || (dd->flags & DPLL_J_TYPE))
-		rate = parent_rate;
-	else
-		rate = parent_rate * 2;
-	return rate;
-}
-
-int omap3_clkoutx2_set_rate(struct clk_hw *hw, unsigned long rate,
-					unsigned long parent_rate)
-{
-	return 0;
-}
-
-long omap3_clkoutx2_round_rate(struct clk_hw *hw, unsigned long rate,
-		unsigned long *prate)
-{
-	const struct dpll_data *dd;
-	u32 v;
-	struct clk_hw_omap *pclk = NULL;
-
-	if (!*prate)
-		return 0;
-
-	pclk = omap3_find_clkoutx2_dpll(hw);
-
-	if (!pclk)
-		return 0;
-
-	dd = pclk->dpll_data;
-
-	/* TYPE J does not have a clkoutx2 */
-	if (dd->flags & DPLL_J_TYPE) {
-		*prate = __clk_round_rate(__clk_get_parent(pclk->hw.clk), rate);
-		return *prate;
-	}
-
-	WARN_ON(!dd->enable_mask);
-
-	v = omap2_clk_readl(pclk, dd->control_reg) & dd->enable_mask;
-	v >>= __ffs(dd->enable_mask);
-
-	/* If in bypass, the rate is fixed to the bypass rate*/
-	if (v != OMAP3XXX_EN_DPLL_LOCKED)
-		return *prate;
-
-	if (__clk_get_flags(hw->clk) & CLK_SET_RATE_PARENT) {
-		unsigned long best_parent;
-
-		best_parent = (rate / 2);
-		*prate = __clk_round_rate(__clk_get_parent(hw->clk),
-				best_parent);
-	}
-
-	return *prate * 2;
-}
-
-/* OMAP3/4 non-CORE DPLL clkops */
-const struct clk_hw_omap_ops clkhwops_omap3_dpll = {
-	.allow_idle	= omap3_dpll_allow_idle,
-	.deny_idle	= omap3_dpll_deny_idle,
-};
diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile
index 23cd72638970..05a0294aba10 100644
--- a/drivers/clk/ti/Makefile
+++ b/drivers/clk/ti/Makefile
@@ -2,16 +2,18 @@ obj-y					+= clk.o autoidle.o clockdomain.o
 clk-common				= dpll.o composite.o divider.o gate.o \
 					  fixed-factor.o mux.o apll.o \
 					  clkt_dpll.o clkt_iclk.o
-obj-$(CONFIG_SOC_AM33XX)		+= $(clk-common) clk-33xx.o
+obj-$(CONFIG_SOC_AM33XX)		+= $(clk-common) clk-33xx.o dpll3xxx.o
 obj-$(CONFIG_SOC_TI81XX)		+= $(clk-common) fapll.o clk-816x.o
 obj-$(CONFIG_ARCH_OMAP2)		+= $(clk-common) interface.o clk-2xxx.o
 obj-$(CONFIG_ARCH_OMAP3)		+= $(clk-common) interface.o \
-					   clk-3xxx.o
-obj-$(CONFIG_ARCH_OMAP4)		+= $(clk-common) clk-44xx.o dpll44xx.o
-obj-$(CONFIG_SOC_OMAP5)			+= $(clk-common) clk-54xx.o dpll44xx.o
+					   clk-3xxx.o dpll3xxx.o
+obj-$(CONFIG_ARCH_OMAP4)		+= $(clk-common) clk-44xx.o \
+					   dpll3xxx.o dpll44xx.o
+obj-$(CONFIG_SOC_OMAP5)			+= $(clk-common) clk-54xx.o \
+					   dpll3xxx.o dpll44xx.o
 obj-$(CONFIG_SOC_DRA7XX)		+= $(clk-common) clk-7xx.o \
-					   clk-dra7-atl.o dpll44xx.o
-obj-$(CONFIG_SOC_AM43XX)		+= $(clk-common) clk-43xx.o
+					   clk-dra7-atl.o dpll3xxx.o dpll44xx.o
+obj-$(CONFIG_SOC_AM43XX)		+= $(clk-common) dpll3xxx.o clk-43xx.o
 
 ifdef CONFIG_ATAGS
 obj-$(CONFIG_ARCH_OMAP3)                += clk-3xxx-legacy.o
diff --git a/drivers/clk/ti/clk-3xxx.c b/drivers/clk/ti/clk-3xxx.c
index bb3b88359daf..5489ad8c07d4 100644
--- a/drivers/clk/ti/clk-3xxx.c
+++ b/drivers/clk/ti/clk-3xxx.c
@@ -21,6 +21,13 @@
 
 #include "clock.h"
 
+/*
+ * DPLL5_FREQ_FOR_USBHOST: USBHOST and USBTLL are the only clocks
+ * that are sourced by DPLL5, and both of these require this clock
+ * to be at 120 MHz for proper operation.
+ */
+#define DPLL5_FREQ_FOR_USBHOST		120000000
+
 static struct ti_dt_clk omap3xxx_clks[] = {
 	DT_CLK(NULL, "apb_pclk", "dummy_apb_pclk"),
 	DT_CLK(NULL, "omap_32k_fck", "omap_32k_fck"),
@@ -325,6 +332,30 @@ enum {
 	OMAP3_SOC_OMAP3630,
 };
 
+/**
+ * omap3_clk_lock_dpll5 - locks DPLL5
+ *
+ * Locks DPLL5 to a pre-defined frequency. This is required for proper
+ * operation of USB.
+ */
+void __init omap3_clk_lock_dpll5(void)
+{
+	struct clk *dpll5_clk;
+	struct clk *dpll5_m2_clk;
+
+	dpll5_clk = clk_get(NULL, "dpll5_ck");
+	clk_set_rate(dpll5_clk, DPLL5_FREQ_FOR_USBHOST);
+	clk_prepare_enable(dpll5_clk);
+
+	/* Program dpll5_m2_clk divider for no division */
+	dpll5_m2_clk = clk_get(NULL, "dpll5_m2_ck");
+	clk_prepare_enable(dpll5_m2_clk);
+	clk_set_rate(dpll5_m2_clk, DPLL5_FREQ_FOR_USBHOST);
+
+	clk_disable_unprepare(dpll5_m2_clk);
+	clk_disable_unprepare(dpll5_clk);
+}
+
 static int __init omap3xxx_dt_clk_init(int soc_type)
 {
 	if (soc_type == OMAP3_SOC_AM35XX || soc_type == OMAP3_SOC_OMAP3630 ||
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 4b26af8a273d..688d9e47b2c8 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -173,11 +173,38 @@ void omap2_init_clk_hw_omap_clocks(struct clk *clk);
 int of_ti_clk_autoidle_setup(struct device_node *node);
 void omap2_clk_enable_init_clocks(const char **clk_names, u8 num_clocks);
 
+extern const struct clk_hw_omap_ops clkhwops_omap3_dpll;
 extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx;
 extern const struct clk_hw_omap_ops clkhwops_iclk;
 extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
 
 u8 omap2_init_dpll_parent(struct clk_hw *hw);
+int omap3_noncore_dpll_enable(struct clk_hw *hw);
+void omap3_noncore_dpll_disable(struct clk_hw *hw);
+int omap3_noncore_dpll_set_parent(struct clk_hw *hw, u8 index);
+int omap3_noncore_dpll_set_rate(struct clk_hw *hw, unsigned long rate,
+				unsigned long parent_rate);
+int omap3_noncore_dpll_set_rate_and_parent(struct clk_hw *hw,
+					   unsigned long rate,
+					   unsigned long parent_rate,
+					   u8 index);
+long omap3_noncore_dpll_determine_rate(struct clk_hw *hw,
+				       unsigned long rate,
+				       unsigned long min_rate,
+				       unsigned long max_rate,
+				       unsigned long *best_parent_rate,
+				       struct clk_hw **best_parent_clk);
+long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
+			   unsigned long *parent_rate);
+unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw,
+				    unsigned long parent_rate);
+
+unsigned long omap3_dpll_recalc(struct clk_hw *hw, unsigned long parent_rate);
+int omap3_dpll4_set_rate(struct clk_hw *clk, unsigned long rate,
+			 unsigned long parent_rate);
+int omap3_dpll4_set_rate_and_parent(struct clk_hw *hw, unsigned long rate,
+				    unsigned long parent_rate, u8 index);
+void omap3_clk_lock_dpll5(void);
 
 unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw,
 					 unsigned long parent_rate);
diff --git a/drivers/clk/ti/dpll3xxx.c b/drivers/clk/ti/dpll3xxx.c
new file mode 100644
index 000000000000..22d77a331287
--- /dev/null
+++ b/drivers/clk/ti/dpll3xxx.c
@@ -0,0 +1,825 @@
+/*
+ * OMAP3/4 - specific DPLL control functions
+ *
+ * Copyright (C) 2009-2010 Texas Instruments, Inc.
+ * Copyright (C) 2009-2010 Nokia Corporation
+ *
+ * Written by Paul Walmsley
+ * Testing and integration fixes by Jouni Högander
+ *
+ * 36xx support added by Vishwanath BS, Richard Woodruff, and Nishanth
+ * Menon
+ *
+ * Parts of this code are based on code written by
+ * Richard Woodruff, Tony Lindgren, Tuukka Tikkanen, Karthik Dasu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/delay.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/bitops.h>
+#include <linux/clkdev.h>
+#include <linux/clk/ti.h>
+
+#include "clock.h"
+
+/* CM_AUTOIDLE_PLL*.AUTO_* bit values */
+#define DPLL_AUTOIDLE_DISABLE			0x0
+#define DPLL_AUTOIDLE_LOW_POWER_STOP		0x1
+
+#define MAX_DPLL_WAIT_TRIES		1000000
+
+#define OMAP3XXX_EN_DPLL_LOCKED		0x7
+
+/* Forward declarations */
+static u32 omap3_dpll_autoidle_read(struct clk_hw_omap *clk);
+static void omap3_dpll_deny_idle(struct clk_hw_omap *clk);
+static void omap3_dpll_allow_idle(struct clk_hw_omap *clk);
+
+/* Private functions */
+
+/* _omap3_dpll_write_clken - write clken_bits arg to a DPLL's enable bits */
+static void _omap3_dpll_write_clken(struct clk_hw_omap *clk, u8 clken_bits)
+{
+	const struct dpll_data *dd;
+	u32 v;
+
+	dd = clk->dpll_data;
+
+	v = ti_clk_ll_ops->clk_readl(dd->control_reg);
+	v &= ~dd->enable_mask;
+	v |= clken_bits << __ffs(dd->enable_mask);
+	ti_clk_ll_ops->clk_writel(v, dd->control_reg);
+}
+
+/* _omap3_wait_dpll_status: wait for a DPLL to enter a specific state */
+static int _omap3_wait_dpll_status(struct clk_hw_omap *clk, u8 state)
+{
+	const struct dpll_data *dd;
+	int i = 0;
+	int ret = -EINVAL;
+	const char *clk_name;
+
+	dd = clk->dpll_data;
+	clk_name = __clk_get_name(clk->hw.clk);
+
+	state <<= __ffs(dd->idlest_mask);
+
+	while (((ti_clk_ll_ops->clk_readl(dd->idlest_reg) & dd->idlest_mask)
+		!= state) && i < MAX_DPLL_WAIT_TRIES) {
+		i++;
+		udelay(1);
+	}
+
+	if (i == MAX_DPLL_WAIT_TRIES) {
+		pr_err("clock: %s failed transition to '%s'\n",
+		       clk_name, (state) ? "locked" : "bypassed");
+	} else {
+		pr_debug("clock: %s transition to '%s' in %d loops\n",
+			 clk_name, (state) ? "locked" : "bypassed", i);
+
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/* From 3430 TRM ES2 4.7.6.2 */
+static u16 _omap3_dpll_compute_freqsel(struct clk_hw_omap *clk, u8 n)
+{
+	unsigned long fint;
+	u16 f = 0;
+
+	fint = __clk_get_rate(clk->dpll_data->clk_ref) / n;
+
+	pr_debug("clock: fint is %lu\n", fint);
+
+	if (fint >= 750000 && fint <= 1000000)
+		f = 0x3;
+	else if (fint > 1000000 && fint <= 1250000)
+		f = 0x4;
+	else if (fint > 1250000 && fint <= 1500000)
+		f = 0x5;
+	else if (fint > 1500000 && fint <= 1750000)
+		f = 0x6;
+	else if (fint > 1750000 && fint <= 2100000)
+		f = 0x7;
+	else if (fint > 7500000 && fint <= 10000000)
+		f = 0xB;
+	else if (fint > 10000000 && fint <= 12500000)
+		f = 0xC;
+	else if (fint > 12500000 && fint <= 15000000)
+		f = 0xD;
+	else if (fint > 15000000 && fint <= 17500000)
+		f = 0xE;
+	else if (fint > 17500000 && fint <= 21000000)
+		f = 0xF;
+	else
+		pr_debug("clock: unknown freqsel setting for %d\n", n);
+
+	return f;
+}
+
+/*
+ * _omap3_noncore_dpll_lock - instruct a DPLL to lock and wait for readiness
+ * @clk: pointer to a DPLL struct clk
+ *
+ * Instructs a non-CORE DPLL to lock.  Waits for the DPLL to report
+ * readiness before returning.  Will save and restore the DPLL's
+ * autoidle state across the enable, per the CDP code.  If the DPLL
+ * locked successfully, return 0; if the DPLL did not lock in the time
+ * allotted, or DPLL3 was passed in, return -EINVAL.
+ */
+static int _omap3_noncore_dpll_lock(struct clk_hw_omap *clk)
+{
+	const struct dpll_data *dd;
+	u8 ai;
+	u8 state = 1;
+	int r = 0;
+
+	pr_debug("clock: locking DPLL %s\n", __clk_get_name(clk->hw.clk));
+
+	dd = clk->dpll_data;
+	state <<= __ffs(dd->idlest_mask);
+
+	/* Check if already locked */
+	if ((ti_clk_ll_ops->clk_readl(dd->idlest_reg) & dd->idlest_mask) ==
+	    state)
+		goto done;
+
+	ai = omap3_dpll_autoidle_read(clk);
+
+	if (ai)
+		omap3_dpll_deny_idle(clk);
+
+	_omap3_dpll_write_clken(clk, DPLL_LOCKED);
+
+	r = _omap3_wait_dpll_status(clk, 1);
+
+	if (ai)
+		omap3_dpll_allow_idle(clk);
+
+done:
+	return r;
+}
+
+/*
+ * _omap3_noncore_dpll_bypass - instruct a DPLL to bypass and wait for readiness
+ * @clk: pointer to a DPLL struct clk
+ *
+ * Instructs a non-CORE DPLL to enter low-power bypass mode.  In
+ * bypass mode, the DPLL's rate is set equal to its parent clock's
+ * rate.  Waits for the DPLL to report readiness before returning.
+ * Will save and restore the DPLL's autoidle state across the enable,
+ * per the CDP code.  If the DPLL entered bypass mode successfully,
+ * return 0; if the DPLL did not enter bypass in the time allotted, or
+ * DPLL3 was passed in, or the DPLL does not support low-power bypass,
+ * return -EINVAL.
+ */
+static int _omap3_noncore_dpll_bypass(struct clk_hw_omap *clk)
+{
+	int r;
+	u8 ai;
+
+	if (!(clk->dpll_data->modes & (1 << DPLL_LOW_POWER_BYPASS)))
+		return -EINVAL;
+
+	pr_debug("clock: configuring DPLL %s for low-power bypass\n",
+		 __clk_get_name(clk->hw.clk));
+
+	ai = omap3_dpll_autoidle_read(clk);
+
+	_omap3_dpll_write_clken(clk, DPLL_LOW_POWER_BYPASS);
+
+	r = _omap3_wait_dpll_status(clk, 0);
+
+	if (ai)
+		omap3_dpll_allow_idle(clk);
+
+	return r;
+}
+
+/*
+ * _omap3_noncore_dpll_stop - instruct a DPLL to stop
+ * @clk: pointer to a DPLL struct clk
+ *
+ * Instructs a non-CORE DPLL to enter low-power stop. Will save and
+ * restore the DPLL's autoidle state across the stop, per the CDP
+ * code.  If DPLL3 was passed in, or the DPLL does not support
+ * low-power stop, return -EINVAL; otherwise, return 0.
+ */
+static int _omap3_noncore_dpll_stop(struct clk_hw_omap *clk)
+{
+	u8 ai;
+
+	if (!(clk->dpll_data->modes & (1 << DPLL_LOW_POWER_STOP)))
+		return -EINVAL;
+
+	pr_debug("clock: stopping DPLL %s\n", __clk_get_name(clk->hw.clk));
+
+	ai = omap3_dpll_autoidle_read(clk);
+
+	_omap3_dpll_write_clken(clk, DPLL_LOW_POWER_STOP);
+
+	if (ai)
+		omap3_dpll_allow_idle(clk);
+
+	return 0;
+}
+
+/**
+ * _lookup_dco - Lookup DCO used by j-type DPLL
+ * @clk: pointer to a DPLL struct clk
+ * @dco: digital control oscillator selector
+ * @m: DPLL multiplier to set
+ * @n: DPLL divider to set
+ *
+ * See 36xx TRM section 3.5.3.3.3.2 "Type B DPLL (Low-Jitter)"
+ *
+ * XXX This code is not needed for 3430/AM35xx; can it be optimized
+ * out in non-multi-OMAP builds for those chips?
+ */
+static void _lookup_dco(struct clk_hw_omap *clk, u8 *dco, u16 m, u8 n)
+{
+	unsigned long fint, clkinp; /* watch out for overflow */
+
+	clkinp = __clk_get_rate(__clk_get_parent(clk->hw.clk));
+	fint = (clkinp / n) * m;
+
+	if (fint < 1000000000)
+		*dco = 2;
+	else
+		*dco = 4;
+}
+
+/**
+ * _lookup_sddiv - Calculate sigma delta divider for j-type DPLL
+ * @clk: pointer to a DPLL struct clk
+ * @sd_div: target sigma-delta divider
+ * @m: DPLL multiplier to set
+ * @n: DPLL divider to set
+ *
+ * See 36xx TRM section 3.5.3.3.3.2 "Type B DPLL (Low-Jitter)"
+ *
+ * XXX This code is not needed for 3430/AM35xx; can it be optimized
+ * out in non-multi-OMAP builds for those chips?
+ */
+static void _lookup_sddiv(struct clk_hw_omap *clk, u8 *sd_div, u16 m, u8 n)
+{
+	unsigned long clkinp, sd; /* watch out for overflow */
+	int mod1, mod2;
+
+	clkinp = __clk_get_rate(__clk_get_parent(clk->hw.clk));
+
+	/*
+	 * target sigma-delta to near 250MHz
+	 * sd = ceil[(m/(n+1)) * (clkinp_MHz / 250)]
+	 */
+	clkinp /= 100000; /* shift from MHz to 10*Hz for 38.4 and 19.2 */
+	mod1 = (clkinp * m) % (250 * n);
+	sd = (clkinp * m) / (250 * n);
+	mod2 = sd % 10;
+	sd /= 10;
+
+	if (mod1 || mod2)
+		sd++;
+	*sd_div = sd;
+}
+
+/*
+ * _omap3_noncore_dpll_program - set non-core DPLL M,N values directly
+ * @clk:	struct clk * of DPLL to set
+ * @freqsel:	FREQSEL value to set
+ *
+ * Program the DPLL with the last M, N values calculated, and wait for
+ * the DPLL to lock. Returns -EINVAL upon error, or 0 upon success.
+ */
+static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel)
+{
+	struct dpll_data *dd = clk->dpll_data;
+	u8 dco, sd_div;
+	u32 v;
+
+	/* 3430 ES2 TRM: 4.7.6.9 DPLL Programming Sequence */
+	_omap3_noncore_dpll_bypass(clk);
+
+	/*
+	 * Set jitter correction. Jitter correction applicable for OMAP343X
+	 * only since freqsel field is no longer present on other devices.
+	 */
+	if (ti_clk_get_features()->flags & TI_CLK_DPLL_HAS_FREQSEL) {
+		v = ti_clk_ll_ops->clk_readl(dd->control_reg);
+		v &= ~dd->freqsel_mask;
+		v |= freqsel << __ffs(dd->freqsel_mask);
+		ti_clk_ll_ops->clk_writel(v, dd->control_reg);
+	}
+
+	/* Set DPLL multiplier, divider */
+	v = ti_clk_ll_ops->clk_readl(dd->mult_div1_reg);
+
+	/* Handle Duty Cycle Correction */
+	if (dd->dcc_mask) {
+		if (dd->last_rounded_rate >= dd->dcc_rate)
+			v |= dd->dcc_mask; /* Enable DCC */
+		else
+			v &= ~dd->dcc_mask; /* Disable DCC */
+	}
+
+	v &= ~(dd->mult_mask | dd->div1_mask);
+	v |= dd->last_rounded_m << __ffs(dd->mult_mask);
+	v |= (dd->last_rounded_n - 1) << __ffs(dd->div1_mask);
+
+	/* Configure dco and sd_div for dplls that have these fields */
+	if (dd->dco_mask) {
+		_lookup_dco(clk, &dco, dd->last_rounded_m, dd->last_rounded_n);
+		v &= ~(dd->dco_mask);
+		v |= dco << __ffs(dd->dco_mask);
+	}
+	if (dd->sddiv_mask) {
+		_lookup_sddiv(clk, &sd_div, dd->last_rounded_m,
+			      dd->last_rounded_n);
+		v &= ~(dd->sddiv_mask);
+		v |= sd_div << __ffs(dd->sddiv_mask);
+	}
+
+	ti_clk_ll_ops->clk_writel(v, dd->mult_div1_reg);
+
+	/* Set 4X multiplier and low-power mode */
+	if (dd->m4xen_mask || dd->lpmode_mask) {
+		v = ti_clk_ll_ops->clk_readl(dd->control_reg);
+
+		if (dd->m4xen_mask) {
+			if (dd->last_rounded_m4xen)
+				v |= dd->m4xen_mask;
+			else
+				v &= ~dd->m4xen_mask;
+		}
+
+		if (dd->lpmode_mask) {
+			if (dd->last_rounded_lpmode)
+				v |= dd->lpmode_mask;
+			else
+				v &= ~dd->lpmode_mask;
+		}
+
+		ti_clk_ll_ops->clk_writel(v, dd->control_reg);
+	}
+
+	/* We let the clock framework set the other output dividers later */
+
+	/* REVISIT: Set ramp-up delay? */
+
+	_omap3_noncore_dpll_lock(clk);
+
+	return 0;
+}
+
+/* Public functions */
+
+/**
+ * omap3_dpll_recalc - recalculate DPLL rate
+ * @clk: DPLL struct clk
+ *
+ * Recalculate and propagate the DPLL rate.
+ */
+unsigned long omap3_dpll_recalc(struct clk_hw *hw, unsigned long parent_rate)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+
+	return omap2_get_dpll_rate(clk);
+}
+
+/* Non-CORE DPLL (e.g., DPLLs that do not control SDRC) clock functions */
+
+/**
+ * omap3_noncore_dpll_enable - instruct a DPLL to enter bypass or lock mode
+ * @clk: pointer to a DPLL struct clk
+ *
+ * Instructs a non-CORE DPLL to enable, e.g., to enter bypass or lock.
+ * The choice of modes depends on the DPLL's programmed rate: if it is
+ * the same as the DPLL's parent clock, it will enter bypass;
+ * otherwise, it will enter lock.  This code will wait for the DPLL to
+ * indicate readiness before returning, unless the DPLL takes too long
+ * to enter the target state.  Intended to be used as the struct clk's
+ * enable function.  If DPLL3 was passed in, or the DPLL does not
+ * support low-power stop, or if the DPLL took too long to enter
+ * bypass or lock, return -EINVAL; otherwise, return 0.
+ */
+int omap3_noncore_dpll_enable(struct clk_hw *hw)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	int r;
+	struct dpll_data *dd;
+	struct clk_hw *parent;
+
+	dd = clk->dpll_data;
+	if (!dd)
+		return -EINVAL;
+
+	if (clk->clkdm) {
+		r = ti_clk_ll_ops->clkdm_clk_enable(clk->clkdm, hw->clk);
+		if (r) {
+			WARN(1,
+			     "%s: could not enable %s's clockdomain %s: %d\n",
+			     __func__, __clk_get_name(hw->clk),
+			     clk->clkdm_name, r);
+			return r;
+		}
+	}
+
+	parent = __clk_get_hw(__clk_get_parent(hw->clk));
+
+	if (__clk_get_rate(hw->clk) == __clk_get_rate(dd->clk_bypass)) {
+		WARN_ON(parent != __clk_get_hw(dd->clk_bypass));
+		r = _omap3_noncore_dpll_bypass(clk);
+	} else {
+		WARN_ON(parent != __clk_get_hw(dd->clk_ref));
+		r = _omap3_noncore_dpll_lock(clk);
+	}
+
+	return r;
+}
+
+/**
+ * omap3_noncore_dpll_disable - instruct a DPLL to enter low-power stop
+ * @clk: pointer to a DPLL struct clk
+ *
+ * Instructs a non-CORE DPLL to enter low-power stop.  This function is
+ * intended for use in struct clkops.  No return value.
+ */
+void omap3_noncore_dpll_disable(struct clk_hw *hw)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+
+	_omap3_noncore_dpll_stop(clk);
+	if (clk->clkdm)
+		ti_clk_ll_ops->clkdm_clk_disable(clk->clkdm, hw->clk);
+}
+
+/* Non-CORE DPLL rate set code */
+
+/**
+ * omap3_noncore_dpll_determine_rate - determine rate for a DPLL
+ * @hw: pointer to the clock to determine rate for
+ * @rate: target rate for the DPLL
+ * @best_parent_rate: pointer for returning best parent rate
+ * @best_parent_clk: pointer for returning best parent clock
+ *
+ * Determines which DPLL mode to use for reaching a desired target rate.
+ * Checks whether the DPLL shall be in bypass or locked mode, and if
+ * locked, calculates the M,N values for the DPLL via round-rate.
+ * Returns a positive clock rate with success, negative error value
+ * in failure.
+ */
+long omap3_noncore_dpll_determine_rate(struct clk_hw *hw, unsigned long rate,
+				       unsigned long min_rate,
+				       unsigned long max_rate,
+				       unsigned long *best_parent_rate,
+				       struct clk_hw **best_parent_clk)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	struct dpll_data *dd;
+
+	if (!hw || !rate)
+		return -EINVAL;
+
+	dd = clk->dpll_data;
+	if (!dd)
+		return -EINVAL;
+
+	if (__clk_get_rate(dd->clk_bypass) == rate &&
+	    (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) {
+		*best_parent_clk = __clk_get_hw(dd->clk_bypass);
+	} else {
+		rate = omap2_dpll_round_rate(hw, rate, best_parent_rate);
+		*best_parent_clk = __clk_get_hw(dd->clk_ref);
+	}
+
+	*best_parent_rate = rate;
+
+	return rate;
+}
+
+/**
+ * omap3_noncore_dpll_set_parent - set parent for a DPLL clock
+ * @hw: pointer to the clock to set parent for
+ * @index: parent index to select
+ *
+ * Sets parent for a DPLL clock. This sets the DPLL into bypass or
+ * locked mode. Returns 0 with success, negative error value otherwise.
+ */
+int omap3_noncore_dpll_set_parent(struct clk_hw *hw, u8 index)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	int ret;
+
+	if (!hw)
+		return -EINVAL;
+
+	if (index)
+		ret = _omap3_noncore_dpll_bypass(clk);
+	else
+		ret = _omap3_noncore_dpll_lock(clk);
+
+	return ret;
+}
+
+/**
+ * omap3_noncore_dpll_set_rate - set rate for a DPLL clock
+ * @hw: pointer to the clock to set parent for
+ * @rate: target rate for the clock
+ * @parent_rate: rate of the parent clock
+ *
+ * Sets rate for a DPLL clock. First checks if the clock parent is
+ * reference clock (in bypass mode, the rate of the clock can't be
+ * changed) and proceeds with the rate change operation. Returns 0
+ * with success, negative error value otherwise.
+ */
+int omap3_noncore_dpll_set_rate(struct clk_hw *hw, unsigned long rate,
+				unsigned long parent_rate)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	struct dpll_data *dd;
+	u16 freqsel = 0;
+	int ret;
+
+	if (!hw || !rate)
+		return -EINVAL;
+
+	dd = clk->dpll_data;
+	if (!dd)
+		return -EINVAL;
+
+	if (__clk_get_hw(__clk_get_parent(hw->clk)) !=
+	    __clk_get_hw(dd->clk_ref))
+		return -EINVAL;
+
+	if (dd->last_rounded_rate == 0)
+		return -EINVAL;
+
+	/* Freqsel is available only on OMAP343X devices */
+	if (ti_clk_get_features()->flags & TI_CLK_DPLL_HAS_FREQSEL) {
+		freqsel = _omap3_dpll_compute_freqsel(clk, dd->last_rounded_n);
+		WARN_ON(!freqsel);
+	}
+
+	pr_debug("%s: %s: set rate: locking rate to %lu.\n", __func__,
+		 __clk_get_name(hw->clk), rate);
+
+	ret = omap3_noncore_dpll_program(clk, freqsel);
+
+	return ret;
+}
+
+/**
+ * omap3_noncore_dpll_set_rate_and_parent - set rate and parent for a DPLL clock
+ * @hw: pointer to the clock to set rate and parent for
+ * @rate: target rate for the DPLL
+ * @parent_rate: clock rate of the DPLL parent
+ * @index: new parent index for the DPLL, 0 - reference, 1 - bypass
+ *
+ * Sets rate and parent for a DPLL clock. If new parent is the bypass
+ * clock, only selects the parent. Otherwise proceeds with a rate
+ * change, as this will effectively also change the parent as the
+ * DPLL is put into locked mode. Returns 0 with success, negative error
+ * value otherwise.
+ */
+int omap3_noncore_dpll_set_rate_and_parent(struct clk_hw *hw,
+					   unsigned long rate,
+					   unsigned long parent_rate,
+					   u8 index)
+{
+	int ret;
+
+	if (!hw || !rate)
+		return -EINVAL;
+
+	/*
+	 * clk-ref at index[0], in which case we only need to set rate,
+	 * the parent will be changed automatically with the lock sequence.
+	 * With clk-bypass case we only need to change parent.
+	 */
+	if (index)
+		ret = omap3_noncore_dpll_set_parent(hw, index);
+	else
+		ret = omap3_noncore_dpll_set_rate(hw, rate, parent_rate);
+
+	return ret;
+}
+
+/* DPLL autoidle read/set code */
+
+/**
+ * omap3_dpll_autoidle_read - read a DPLL's autoidle bits
+ * @clk: struct clk * of the DPLL to read
+ *
+ * Return the DPLL's autoidle bits, shifted down to bit 0.  Returns
+ * -EINVAL if passed a null pointer or if the struct clk does not
+ * appear to refer to a DPLL.
+ */
+static u32 omap3_dpll_autoidle_read(struct clk_hw_omap *clk)
+{
+	const struct dpll_data *dd;
+	u32 v;
+
+	if (!clk || !clk->dpll_data)
+		return -EINVAL;
+
+	dd = clk->dpll_data;
+
+	if (!dd->autoidle_reg)
+		return -EINVAL;
+
+	v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg);
+	v &= dd->autoidle_mask;
+	v >>= __ffs(dd->autoidle_mask);
+
+	return v;
+}
+
+/**
+ * omap3_dpll_allow_idle - enable DPLL autoidle bits
+ * @clk: struct clk * of the DPLL to operate on
+ *
+ * Enable DPLL automatic idle control.  This automatic idle mode
+ * switching takes effect only when the DPLL is locked, at least on
+ * OMAP3430.  The DPLL will enter low-power stop when its downstream
+ * clocks are gated.  No return value.
+ */
+static void omap3_dpll_allow_idle(struct clk_hw_omap *clk)
+{
+	const struct dpll_data *dd;
+	u32 v;
+
+	if (!clk || !clk->dpll_data)
+		return;
+
+	dd = clk->dpll_data;
+
+	if (!dd->autoidle_reg)
+		return;
+
+	/*
+	 * REVISIT: CORE DPLL can optionally enter low-power bypass
+	 * by writing 0x5 instead of 0x1.  Add some mechanism to
+	 * optionally enter this mode.
+	 */
+	v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg);
+	v &= ~dd->autoidle_mask;
+	v |= DPLL_AUTOIDLE_LOW_POWER_STOP << __ffs(dd->autoidle_mask);
+	ti_clk_ll_ops->clk_writel(v, dd->autoidle_reg);
+}
+
+/**
+ * omap3_dpll_deny_idle - prevent DPLL from automatically idling
+ * @clk: struct clk * of the DPLL to operate on
+ *
+ * Disable DPLL automatic idle control.  No return value.
+ */
+static void omap3_dpll_deny_idle(struct clk_hw_omap *clk)
+{
+	const struct dpll_data *dd;
+	u32 v;
+
+	if (!clk || !clk->dpll_data)
+		return;
+
+	dd = clk->dpll_data;
+
+	if (!dd->autoidle_reg)
+		return;
+
+	v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg);
+	v &= ~dd->autoidle_mask;
+	v |= DPLL_AUTOIDLE_DISABLE << __ffs(dd->autoidle_mask);
+	ti_clk_ll_ops->clk_writel(v, dd->autoidle_reg);
+}
+
+/* Clock control for DPLL outputs */
+
+/* Find the parent DPLL for the given clkoutx2 clock */
+static struct clk_hw_omap *omap3_find_clkoutx2_dpll(struct clk_hw *hw)
+{
+	struct clk_hw_omap *pclk = NULL;
+	struct clk *parent;
+
+	/* Walk up the parents of clk, looking for a DPLL */
+	do {
+		do {
+			parent = __clk_get_parent(hw->clk);
+			hw = __clk_get_hw(parent);
+		} while (hw && (__clk_get_flags(hw->clk) & CLK_IS_BASIC));
+		if (!hw)
+			break;
+		pclk = to_clk_hw_omap(hw);
+	} while (pclk && !pclk->dpll_data);
+
+	/* clk does not have a DPLL as a parent?  error in the clock data */
+	if (!pclk) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	return pclk;
+}
+
+/**
+ * omap3_clkoutx2_recalc - recalculate DPLL X2 output virtual clock rate
+ * @clk: DPLL output struct clk
+ *
+ * Using parent clock DPLL data, look up DPLL state.  If locked, set our
+ * rate to the dpll_clk * 2; otherwise, just use dpll_clk.
+ */
+unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw,
+				    unsigned long parent_rate)
+{
+	const struct dpll_data *dd;
+	unsigned long rate;
+	u32 v;
+	struct clk_hw_omap *pclk = NULL;
+
+	if (!parent_rate)
+		return 0;
+
+	pclk = omap3_find_clkoutx2_dpll(hw);
+
+	if (!pclk)
+		return 0;
+
+	dd = pclk->dpll_data;
+
+	WARN_ON(!dd->enable_mask);
+
+	v = ti_clk_ll_ops->clk_readl(dd->control_reg) & dd->enable_mask;
+	v >>= __ffs(dd->enable_mask);
+	if ((v != OMAP3XXX_EN_DPLL_LOCKED) || (dd->flags & DPLL_J_TYPE))
+		rate = parent_rate;
+	else
+		rate = parent_rate * 2;
+	return rate;
+}
+
+/* OMAP3/4 non-CORE DPLL clkops */
+const struct clk_hw_omap_ops clkhwops_omap3_dpll = {
+	.allow_idle	= omap3_dpll_allow_idle,
+	.deny_idle	= omap3_dpll_deny_idle,
+};
+
+/**
+ * omap3_dpll4_set_rate - set rate for omap3 per-dpll
+ * @hw: clock to change
+ * @rate: target rate for clock
+ * @parent_rate: rate of the parent clock
+ *
+ * Check if the current SoC supports the per-dpll reprogram operation
+ * or not, and then do the rate change if supported. Returns -EINVAL
+ * if not supported, 0 for success, and potential error codes from the
+ * clock rate change.
+ */
+int omap3_dpll4_set_rate(struct clk_hw *hw, unsigned long rate,
+			 unsigned long parent_rate)
+{
+	/*
+	 * According to the 12-5 CDP code from TI, "Limitation 2.5"
+	 * on 3430ES1 prevents us from changing DPLL multipliers or dividers
+	 * on DPLL4.
+	 */
+	if (ti_clk_get_features()->flags & TI_CLK_DPLL4_DENY_REPROGRAM) {
+		pr_err("clock: DPLL4 cannot change rate due to silicon 'Limitation 2.5' on 3430ES1.\n");
+		return -EINVAL;
+	}
+
+	return omap3_noncore_dpll_set_rate(hw, rate, parent_rate);
+}
+
+/**
+ * omap3_dpll4_set_rate_and_parent - set rate and parent for omap3 per-dpll
+ * @hw: clock to change
+ * @rate: target rate for clock
+ * @parent_rate: rate of the parent clock
+ * @index: parent index, 0 - reference clock, 1 - bypass clock
+ *
+ * Check if the current SoC support the per-dpll reprogram operation
+ * or not, and then do the rate + parent change if supported. Returns
+ * -EINVAL if not supported, 0 for success, and potential error codes
+ * from the clock rate change.
+ */
+int omap3_dpll4_set_rate_and_parent(struct clk_hw *hw, unsigned long rate,
+				    unsigned long parent_rate, u8 index)
+{
+	if (ti_clk_get_features()->flags & TI_CLK_DPLL4_DENY_REPROGRAM) {
+		pr_err("clock: DPLL4 cannot change rate due to silicon 'Limitation 2.5' on 3430ES1.\n");
+		return -EINVAL;
+	}
+
+	return omap3_noncore_dpll_set_rate_and_parent(hw, rate, parent_rate,
+						      index);
+}
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 25eea896627a..f8e50271ec97 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -271,41 +271,13 @@ extern const struct clk_ops ti_clk_mux_ops;
 
 #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw)
 
-int omap3_noncore_dpll_enable(struct clk_hw *hw);
-void omap3_noncore_dpll_disable(struct clk_hw *hw);
-int omap3_noncore_dpll_set_parent(struct clk_hw *hw, u8 index);
-int omap3_noncore_dpll_set_rate(struct clk_hw *hw, unsigned long rate,
-				unsigned long parent_rate);
-int omap3_noncore_dpll_set_rate_and_parent(struct clk_hw *hw,
-					   unsigned long rate,
-					   unsigned long parent_rate,
-					   u8 index);
-long omap3_noncore_dpll_determine_rate(struct clk_hw *hw,
-				       unsigned long rate,
-				       unsigned long min_rate,
-				       unsigned long max_rate,
-				       unsigned long *best_parent_rate,
-				       struct clk_hw **best_parent_clk);
-unsigned long omap3_dpll_recalc(struct clk_hw *hw, unsigned long parent_rate);
-long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
-			   unsigned long *parent_rate);
 void omap2_init_clk_clkdm(struct clk_hw *clk);
-unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw,
-				    unsigned long parent_rate);
-int omap3_clkoutx2_set_rate(struct clk_hw *hw, unsigned long rate,
-					unsigned long parent_rate);
-long omap3_clkoutx2_round_rate(struct clk_hw *hw, unsigned long rate,
-		unsigned long *prate);
 int omap2_clkops_enable_clkdm(struct clk_hw *hw);
 void omap2_clkops_disable_clkdm(struct clk_hw *hw);
 int omap2_clk_disable_autoidle_all(void);
 int omap2_clk_enable_autoidle_all(void);
 int omap2_clk_allow_idle(struct clk *clk);
 int omap2_clk_deny_idle(struct clk *clk);
-int omap3_dpll4_set_rate(struct clk_hw *clk, unsigned long rate,
-			 unsigned long parent_rate);
-int omap3_dpll4_set_rate_and_parent(struct clk_hw *hw, unsigned long rate,
-				    unsigned long parent_rate, u8 index);
 int omap2_dflt_clk_enable(struct clk_hw *hw);
 void omap2_dflt_clk_disable(struct clk_hw *hw);
 int omap2_dflt_clk_is_enabled(struct clk_hw *hw);
@@ -317,7 +289,6 @@ void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
 void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk,
 				void __iomem **idlest_reg,
 				u8 *idlest_bit, u8 *idlest_val);
-void omap3_clk_lock_dpll5(void);
 unsigned long omap2_dpllcore_recalc(struct clk_hw *hw,
 				    unsigned long parent_rate);
 int omap2_reprogram_dpllcore(struct clk_hw *clk, unsigned long rate,
@@ -365,7 +336,6 @@ const struct ti_clk_features *ti_clk_get_features(void);
 
 extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll;
 extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait;
-extern const struct clk_hw_omap_ops clkhwops_omap3_dpll;
 extern const struct clk_hw_omap_ops clkhwops_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
-- 
cgit v1.2.3-70-g09d2


From 046b7c31668311942a2e431e7983d8ab9874d845 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 15:13:50 +0200
Subject: ARM: OMAP2+: clock: remove clkdm_control static boolean from code

clkdm_control is used to determine, whether clocks should trigger a
clockdomain transition when they are enabled/disabled. Keep this
functionality intact, but replace this with a clk_features flag
which can be initialized during boot if needed.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c | 34 ++++++++++------------------------
 arch/arm/mach-omap2/clock.h |  2 --
 include/linux/clk/ti.h      |  1 +
 3 files changed, 11 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index 7a5713df54b3..6c17adf40e6f 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -61,14 +61,6 @@ u16 cpu_mask;
 #define OMAP3PLUS_DPLL_FINT_MIN		32000
 #define OMAP3PLUS_DPLL_FINT_MAX		52000000
 
-/*
- * clkdm_control: if true, then when a clock is enabled in the
- * hardware, its clockdomain will first be enabled; and when a clock
- * is disabled in the hardware, its clockdomain will be disabled
- * afterwards.
- */
-static bool clkdm_control = true;
-
 struct clk_iomap {
 	struct regmap *regmap;
 	void __iomem *mem;
@@ -287,19 +279,6 @@ void omap2_init_clk_clkdm(struct clk_hw *hw)
 	}
 }
 
-/**
- * omap2_clk_disable_clkdm_control - disable clkdm control on clk enable/disable
- *
- * Prevent the OMAP clock code from calling into the clockdomain code
- * when a hardware clock in that clockdomain is enabled or disabled.
- * Intended to be called at init time from omap*_clk_init().  No
- * return value.
- */
-void __init omap2_clk_disable_clkdm_control(void)
-{
-	clkdm_control = false;
-}
-
 /**
  * omap2_clk_dflt_find_companion - find companion clock to @clk
  * @clk: struct clk * to find the companion clock of
@@ -384,6 +363,12 @@ int omap2_dflt_clk_enable(struct clk_hw *hw)
 	struct clk_hw_omap *clk;
 	u32 v;
 	int ret = 0;
+	bool clkdm_control;
+
+	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL)
+		clkdm_control = false;
+	else
+		clkdm_control = true;
 
 	clk = to_clk_hw_omap(hw);
 
@@ -457,7 +442,8 @@ void omap2_dflt_clk_disable(struct clk_hw *hw)
 	omap2_clk_writel(v, clk, clk->enable_reg);
 	/* No OCP barrier needed here since it is a disable operation */
 
-	if (clkdm_control && clk->clkdm)
+	if (!(ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) &&
+	    clk->clkdm)
 		clkdm_clk_disable(clk->clkdm, hw->clk);
 }
 
@@ -490,7 +476,7 @@ int omap2_clkops_enable_clkdm(struct clk_hw *hw)
 		pr_err("%s: %s: should use dflt_clk_enable ?!\n", __func__,
 		       __clk_get_name(hw->clk));
 
-	if (!clkdm_control) {
+	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) {
 		pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n",
 		       __func__, __clk_get_name(hw->clk));
 		return 0;
@@ -528,7 +514,7 @@ void omap2_clkops_disable_clkdm(struct clk_hw *hw)
 		pr_err("%s: %s: should use dflt_clk_disable ?!\n", __func__,
 		       __clk_get_name(hw->clk));
 
-	if (!clkdm_control) {
+	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) {
 		pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n",
 		       __func__, __clk_get_name(hw->clk));
 		return;
diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h
index d60691d5626a..948065497472 100644
--- a/arch/arm/mach-omap2/clock.h
+++ b/arch/arm/mach-omap2/clock.h
@@ -180,8 +180,6 @@ struct clksel {
 #define OMAP4XXX_EN_DPLL_FRBYPASS		0x6
 #define OMAP4XXX_EN_DPLL_LOCKED			0x7
 
-void __init omap2_clk_disable_clkdm_control(void);
-
 void omap2_clk_print_new_rates(const char *hfclkin_ck_name,
 			       const char *core_ck_name,
 			       const char *mpu_ck_name);
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index f8e50271ec97..fbb65e401d13 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -330,6 +330,7 @@ struct ti_clk_features {
 
 #define TI_CLK_DPLL_HAS_FREQSEL			BIT(0)
 #define TI_CLK_DPLL4_DENY_REPROGRAM		BIT(1)
+#define TI_CLK_DISABLE_CLKDM_CONTROL		BIT(2)
 
 void ti_clk_setup_features(struct ti_clk_features *features);
 const struct ti_clk_features *ti_clk_get_features(void);
-- 
cgit v1.2.3-70-g09d2


From 9f37e90efaf0772b8f98bc347b9db77a3f0c27eb Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 15:28:53 +0200
Subject: clk: ti: dflt: move support for default gate clock to clock driver

With the legacy support gone, OMAP2+ default gate clock can be moved
under clock driver. Create a new file for the purpose, and clean-up
the header exports a bit as some clock APIs are no longer needed
outside clock driver itself.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c | 273 --------------------------------------
 drivers/clk/ti/Makefile     |   2 +-
 drivers/clk/ti/clkt_dflt.c  | 316 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/clk/ti/clock.h      |   5 +
 include/linux/clk/ti.h      |   4 -
 5 files changed, 322 insertions(+), 278 deletions(-)
 create mode 100644 drivers/clk/ti/clkt_dflt.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index 6c17adf40e6f..38a336b4c42b 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -40,12 +40,6 @@
 #include "cm-regbits-34xx.h"
 #include "common.h"
 
-/*
- * MAX_MODULE_ENABLE_WAIT: maximum of number of microseconds to wait
- * for a module to indicate that it is no longer in idle
- */
-#define MAX_MODULE_ENABLE_WAIT		100000
-
 u16 cpu_mask;
 
 /* DPLL valid Fint frequency band limits - from 34xx TRM Section 4.7.6.2 */
@@ -176,77 +170,6 @@ void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem)
 
 /* Private functions */
 
-
-/**
- * _wait_idlest_generic - wait for a module to leave the idle state
- * @clk: module clock to wait for (needed for register offsets)
- * @reg: virtual address of module IDLEST register
- * @mask: value to mask against to determine if the module is active
- * @idlest: idle state indicator (0 or 1) for the clock
- * @name: name of the clock (for printk)
- *
- * Wait for a module to leave idle, where its idle-status register is
- * not inside the CM module.  Returns 1 if the module left idle
- * promptly, or 0 if the module did not leave idle before the timeout
- * elapsed.  XXX Deprecated - should be moved into drivers for the
- * individual IP block that the IDLEST register exists in.
- */
-static int _wait_idlest_generic(struct clk_hw_omap *clk, void __iomem *reg,
-				u32 mask, u8 idlest, const char *name)
-{
-	int i = 0, ena = 0;
-
-	ena = (idlest) ? 0 : mask;
-
-	omap_test_timeout(((omap2_clk_readl(clk, reg) & mask) == ena),
-			  MAX_MODULE_ENABLE_WAIT, i);
-
-	if (i < MAX_MODULE_ENABLE_WAIT)
-		pr_debug("omap clock: module associated with clock %s ready after %d loops\n",
-			 name, i);
-	else
-		pr_err("omap clock: module associated with clock %s didn't enable in %d tries\n",
-		       name, MAX_MODULE_ENABLE_WAIT);
-
-	return (i < MAX_MODULE_ENABLE_WAIT) ? 1 : 0;
-};
-
-/**
- * _omap2_module_wait_ready - wait for an OMAP module to leave IDLE
- * @clk: struct clk * belonging to the module
- *
- * If the necessary clocks for the OMAP hardware IP block that
- * corresponds to clock @clk are enabled, then wait for the module to
- * indicate readiness (i.e., to leave IDLE).  This code does not
- * belong in the clock code and will be moved in the medium term to
- * module-dependent code.  No return value.
- */
-static void _omap2_module_wait_ready(struct clk_hw_omap *clk)
-{
-	void __iomem *companion_reg, *idlest_reg;
-	u8 other_bit, idlest_bit, idlest_val, idlest_reg_id;
-	s16 prcm_mod;
-	int r;
-
-	/* Not all modules have multiple clocks that their IDLEST depends on */
-	if (clk->ops->find_companion) {
-		clk->ops->find_companion(clk, &companion_reg, &other_bit);
-		if (!(omap2_clk_readl(clk, companion_reg) & (1 << other_bit)))
-			return;
-	}
-
-	clk->ops->find_idlest(clk, &idlest_reg, &idlest_bit, &idlest_val);
-	r = cm_split_idlest_reg(idlest_reg, &prcm_mod, &idlest_reg_id);
-	if (r) {
-		/* IDLEST register not in the CM module */
-		_wait_idlest_generic(clk, idlest_reg, (1 << idlest_bit),
-				     idlest_val, __clk_get_name(clk->hw.clk));
-	} else {
-		omap_cm_wait_module_ready(0, prcm_mod, idlest_reg_id,
-					  idlest_bit);
-	};
-}
-
 /* Public functions */
 
 /**
@@ -279,174 +202,6 @@ void omap2_init_clk_clkdm(struct clk_hw *hw)
 	}
 }
 
-/**
- * omap2_clk_dflt_find_companion - find companion clock to @clk
- * @clk: struct clk * to find the companion clock of
- * @other_reg: void __iomem ** to return the companion clock CM_*CLKEN va in
- * @other_bit: u8 ** to return the companion clock bit shift in
- *
- * Note: We don't need special code here for INVERT_ENABLE for the
- * time being since INVERT_ENABLE only applies to clocks enabled by
- * CM_CLKEN_PLL
- *
- * Convert CM_ICLKEN* <-> CM_FCLKEN*.  This conversion assumes it's
- * just a matter of XORing the bits.
- *
- * Some clocks don't have companion clocks.  For example, modules with
- * only an interface clock (such as MAILBOXES) don't have a companion
- * clock.  Right now, this code relies on the hardware exporting a bit
- * in the correct companion register that indicates that the
- * nonexistent 'companion clock' is active.  Future patches will
- * associate this type of code with per-module data structures to
- * avoid this issue, and remove the casts.  No return value.
- */
-void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
-			void __iomem **other_reg, u8 *other_bit)
-{
-	u32 r;
-
-	/*
-	 * Convert CM_ICLKEN* <-> CM_FCLKEN*.  This conversion assumes
-	 * it's just a matter of XORing the bits.
-	 */
-	r = ((__force u32)clk->enable_reg ^ (CM_FCLKEN ^ CM_ICLKEN));
-
-	*other_reg = (__force void __iomem *)r;
-	*other_bit = clk->enable_bit;
-}
-
-/**
- * omap2_clk_dflt_find_idlest - find CM_IDLEST reg va, bit shift for @clk
- * @clk: struct clk * to find IDLEST info for
- * @idlest_reg: void __iomem ** to return the CM_IDLEST va in
- * @idlest_bit: u8 * to return the CM_IDLEST bit shift in
- * @idlest_val: u8 * to return the idle status indicator
- *
- * Return the CM_IDLEST register address and bit shift corresponding
- * to the module that "owns" this clock.  This default code assumes
- * that the CM_IDLEST bit shift is the CM_*CLKEN bit shift, and that
- * the IDLEST register address ID corresponds to the CM_*CLKEN
- * register address ID (e.g., that CM_FCLKEN2 corresponds to
- * CM_IDLEST2).  This is not true for all modules.  No return value.
- */
-void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk,
-		void __iomem **idlest_reg, u8 *idlest_bit, u8 *idlest_val)
-{
-	u32 r;
-
-	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
-	*idlest_reg = (__force void __iomem *)r;
-	*idlest_bit = clk->enable_bit;
-
-	/*
-	 * 24xx uses 0 to indicate not ready, and 1 to indicate ready.
-	 * 34xx reverses this, just to keep us on our toes
-	 * AM35xx uses both, depending on the module.
-	 */
-	*idlest_val = ti_clk_get_features()->cm_idlest_val;
-}
-
-/**
- * omap2_dflt_clk_enable - enable a clock in the hardware
- * @hw: struct clk_hw * of the clock to enable
- *
- * Enable the clock @hw in the hardware.  We first call into the OMAP
- * clockdomain code to "enable" the corresponding clockdomain if this
- * is the first enabled user of the clockdomain.  Then program the
- * hardware to enable the clock.  Then wait for the IP block that uses
- * this clock to leave idle (if applicable).  Returns the error value
- * from clkdm_clk_enable() if it terminated with an error, or -EINVAL
- * if @hw has a null clock enable_reg, or zero upon success.
- */
-int omap2_dflt_clk_enable(struct clk_hw *hw)
-{
-	struct clk_hw_omap *clk;
-	u32 v;
-	int ret = 0;
-	bool clkdm_control;
-
-	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL)
-		clkdm_control = false;
-	else
-		clkdm_control = true;
-
-	clk = to_clk_hw_omap(hw);
-
-	if (clkdm_control && clk->clkdm) {
-		ret = clkdm_clk_enable(clk->clkdm, hw->clk);
-		if (ret) {
-			WARN(1, "%s: could not enable %s's clockdomain %s: %d\n",
-			     __func__, __clk_get_name(hw->clk),
-			     clk->clkdm->name, ret);
-			return ret;
-		}
-	}
-
-	if (unlikely(clk->enable_reg == NULL)) {
-		pr_err("%s: %s missing enable_reg\n", __func__,
-		       __clk_get_name(hw->clk));
-		ret = -EINVAL;
-		goto err;
-	}
-
-	/* FIXME should not have INVERT_ENABLE bit here */
-	v = omap2_clk_readl(clk, clk->enable_reg);
-	if (clk->flags & INVERT_ENABLE)
-		v &= ~(1 << clk->enable_bit);
-	else
-		v |= (1 << clk->enable_bit);
-	omap2_clk_writel(v, clk, clk->enable_reg);
-	v = omap2_clk_readl(clk, clk->enable_reg); /* OCP barrier */
-
-	if (clk->ops && clk->ops->find_idlest)
-		_omap2_module_wait_ready(clk);
-
-	return 0;
-
-err:
-	if (clkdm_control && clk->clkdm)
-		clkdm_clk_disable(clk->clkdm, hw->clk);
-	return ret;
-}
-
-/**
- * omap2_dflt_clk_disable - disable a clock in the hardware
- * @hw: struct clk_hw * of the clock to disable
- *
- * Disable the clock @hw in the hardware, and call into the OMAP
- * clockdomain code to "disable" the corresponding clockdomain if all
- * clocks/hwmods in that clockdomain are now disabled.  No return
- * value.
- */
-void omap2_dflt_clk_disable(struct clk_hw *hw)
-{
-	struct clk_hw_omap *clk;
-	u32 v;
-
-	clk = to_clk_hw_omap(hw);
-	if (!clk->enable_reg) {
-		/*
-		 * 'independent' here refers to a clock which is not
-		 * controlled by its parent.
-		 */
-		pr_err("%s: independent clock %s has no enable_reg\n",
-		       __func__, __clk_get_name(hw->clk));
-		return;
-	}
-
-	v = omap2_clk_readl(clk, clk->enable_reg);
-	if (clk->flags & INVERT_ENABLE)
-		v |= (1 << clk->enable_bit);
-	else
-		v &= ~(1 << clk->enable_bit);
-	omap2_clk_writel(v, clk, clk->enable_reg);
-	/* No OCP barrier needed here since it is a disable operation */
-
-	if (!(ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) &&
-	    clk->clkdm)
-		clkdm_clk_disable(clk->clkdm, hw->clk);
-}
-
 /**
  * omap2_clkops_enable_clkdm - increment usecount on clkdm of @hw
  * @hw: struct clk_hw * of the clock being enabled
@@ -523,29 +278,6 @@ void omap2_clkops_disable_clkdm(struct clk_hw *hw)
 	clkdm_clk_disable(clk->clkdm, hw->clk);
 }
 
-/**
- * omap2_dflt_clk_is_enabled - is clock enabled in the hardware?
- * @hw: struct clk_hw * to check
- *
- * Return 1 if the clock represented by @hw is enabled in the
- * hardware, or 0 otherwise.  Intended for use in the struct
- * clk_ops.is_enabled function pointer.
- */
-int omap2_dflt_clk_is_enabled(struct clk_hw *hw)
-{
-	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
-	u32 v;
-
-	v = omap2_clk_readl(clk, clk->enable_reg);
-
-	if (clk->flags & INVERT_ENABLE)
-		v ^= BIT(clk->enable_bit);
-
-	v &= BIT(clk->enable_bit);
-
-	return v ? 1 : 0;
-}
-
 static int __initdata mpurate;
 
 /*
@@ -566,11 +298,6 @@ static int __init omap_clk_setup(char *str)
 }
 __setup("mpurate=", omap_clk_setup);
 
-const struct clk_hw_omap_ops clkhwops_wait = {
-	.find_idlest	= omap2_clk_dflt_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
-
 /**
  * omap2_clk_print_new_rates - print summary of current clock tree rates
  * @hfclkin_ck_name: clk name for the off-chip HF oscillator
diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile
index 05a0294aba10..9b93e6904359 100644
--- a/drivers/clk/ti/Makefile
+++ b/drivers/clk/ti/Makefile
@@ -1,7 +1,7 @@
 obj-y					+= clk.o autoidle.o clockdomain.o
 clk-common				= dpll.o composite.o divider.o gate.o \
 					  fixed-factor.o mux.o apll.o \
-					  clkt_dpll.o clkt_iclk.o
+					  clkt_dpll.o clkt_iclk.o clkt_dflt.o
 obj-$(CONFIG_SOC_AM33XX)		+= $(clk-common) clk-33xx.o dpll3xxx.o
 obj-$(CONFIG_SOC_TI81XX)		+= $(clk-common) fapll.o clk-816x.o
 obj-$(CONFIG_ARCH_OMAP2)		+= $(clk-common) interface.o clk-2xxx.o
diff --git a/drivers/clk/ti/clkt_dflt.c b/drivers/clk/ti/clkt_dflt.c
new file mode 100644
index 000000000000..a176b8ac8dd0
--- /dev/null
+++ b/drivers/clk/ti/clkt_dflt.c
@@ -0,0 +1,316 @@
+/*
+ * Default clock type
+ *
+ * Copyright (C) 2005-2008, 2015 Texas Instruments, Inc.
+ * Copyright (C) 2004-2010 Nokia Corporation
+ *
+ * Contacts:
+ * Richard Woodruff <r-woodruff2@ti.com>
+ * Paul Walmsley
+ * Tero Kristo <t-kristo@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/clk-provider.h>
+#include <linux/io.h>
+#include <linux/clk/ti.h>
+#include <linux/delay.h>
+
+#include "clock.h"
+
+/*
+ * MAX_MODULE_ENABLE_WAIT: maximum of number of microseconds to wait
+ * for a module to indicate that it is no longer in idle
+ */
+#define MAX_MODULE_ENABLE_WAIT		100000
+
+/*
+ * CM module register offsets, used for calculating the companion
+ * register addresses.
+ */
+#define CM_FCLKEN			0x0000
+#define CM_ICLKEN			0x0010
+
+/**
+ * _wait_idlest_generic - wait for a module to leave the idle state
+ * @clk: module clock to wait for (needed for register offsets)
+ * @reg: virtual address of module IDLEST register
+ * @mask: value to mask against to determine if the module is active
+ * @idlest: idle state indicator (0 or 1) for the clock
+ * @name: name of the clock (for printk)
+ *
+ * Wait for a module to leave idle, where its idle-status register is
+ * not inside the CM module.  Returns 1 if the module left idle
+ * promptly, or 0 if the module did not leave idle before the timeout
+ * elapsed.  XXX Deprecated - should be moved into drivers for the
+ * individual IP block that the IDLEST register exists in.
+ */
+static int _wait_idlest_generic(struct clk_hw_omap *clk, void __iomem *reg,
+				u32 mask, u8 idlest, const char *name)
+{
+	int i = 0, ena = 0;
+
+	ena = (idlest) ? 0 : mask;
+
+	/* Wait until module enters enabled state */
+	for (i = 0; i < MAX_MODULE_ENABLE_WAIT; i++) {
+		if ((ti_clk_ll_ops->clk_readl(reg) & mask) == ena)
+			break;
+		udelay(1);
+	}
+
+	if (i < MAX_MODULE_ENABLE_WAIT)
+		pr_debug("omap clock: module associated with clock %s ready after %d loops\n",
+			 name, i);
+	else
+		pr_err("omap clock: module associated with clock %s didn't enable in %d tries\n",
+		       name, MAX_MODULE_ENABLE_WAIT);
+
+	return (i < MAX_MODULE_ENABLE_WAIT) ? 1 : 0;
+}
+
+/**
+ * _omap2_module_wait_ready - wait for an OMAP module to leave IDLE
+ * @clk: struct clk * belonging to the module
+ *
+ * If the necessary clocks for the OMAP hardware IP block that
+ * corresponds to clock @clk are enabled, then wait for the module to
+ * indicate readiness (i.e., to leave IDLE).  This code does not
+ * belong in the clock code and will be moved in the medium term to
+ * module-dependent code.  No return value.
+ */
+static void _omap2_module_wait_ready(struct clk_hw_omap *clk)
+{
+	void __iomem *companion_reg, *idlest_reg;
+	u8 other_bit, idlest_bit, idlest_val, idlest_reg_id;
+	s16 prcm_mod;
+	int r;
+
+	/* Not all modules have multiple clocks that their IDLEST depends on */
+	if (clk->ops->find_companion) {
+		clk->ops->find_companion(clk, &companion_reg, &other_bit);
+		if (!(ti_clk_ll_ops->clk_readl(companion_reg) &
+		      (1 << other_bit)))
+			return;
+	}
+
+	clk->ops->find_idlest(clk, &idlest_reg, &idlest_bit, &idlest_val);
+	r = ti_clk_ll_ops->cm_split_idlest_reg(idlest_reg, &prcm_mod,
+					       &idlest_reg_id);
+	if (r) {
+		/* IDLEST register not in the CM module */
+		_wait_idlest_generic(clk, idlest_reg, (1 << idlest_bit),
+				     idlest_val, __clk_get_name(clk->hw.clk));
+	} else {
+		ti_clk_ll_ops->cm_wait_module_ready(0, prcm_mod, idlest_reg_id,
+						    idlest_bit);
+	}
+}
+
+/**
+ * omap2_clk_dflt_find_companion - find companion clock to @clk
+ * @clk: struct clk * to find the companion clock of
+ * @other_reg: void __iomem ** to return the companion clock CM_*CLKEN va in
+ * @other_bit: u8 ** to return the companion clock bit shift in
+ *
+ * Note: We don't need special code here for INVERT_ENABLE for the
+ * time being since INVERT_ENABLE only applies to clocks enabled by
+ * CM_CLKEN_PLL
+ *
+ * Convert CM_ICLKEN* <-> CM_FCLKEN*.  This conversion assumes it's
+ * just a matter of XORing the bits.
+ *
+ * Some clocks don't have companion clocks.  For example, modules with
+ * only an interface clock (such as MAILBOXES) don't have a companion
+ * clock.  Right now, this code relies on the hardware exporting a bit
+ * in the correct companion register that indicates that the
+ * nonexistent 'companion clock' is active.  Future patches will
+ * associate this type of code with per-module data structures to
+ * avoid this issue, and remove the casts.  No return value.
+ */
+void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
+				   void __iomem **other_reg, u8 *other_bit)
+{
+	u32 r;
+
+	/*
+	 * Convert CM_ICLKEN* <-> CM_FCLKEN*.  This conversion assumes
+	 * it's just a matter of XORing the bits.
+	 */
+	r = ((__force u32)clk->enable_reg ^ (CM_FCLKEN ^ CM_ICLKEN));
+
+	*other_reg = (__force void __iomem *)r;
+	*other_bit = clk->enable_bit;
+}
+
+/**
+ * omap2_clk_dflt_find_idlest - find CM_IDLEST reg va, bit shift for @clk
+ * @clk: struct clk * to find IDLEST info for
+ * @idlest_reg: void __iomem ** to return the CM_IDLEST va in
+ * @idlest_bit: u8 * to return the CM_IDLEST bit shift in
+ * @idlest_val: u8 * to return the idle status indicator
+ *
+ * Return the CM_IDLEST register address and bit shift corresponding
+ * to the module that "owns" this clock.  This default code assumes
+ * that the CM_IDLEST bit shift is the CM_*CLKEN bit shift, and that
+ * the IDLEST register address ID corresponds to the CM_*CLKEN
+ * register address ID (e.g., that CM_FCLKEN2 corresponds to
+ * CM_IDLEST2).  This is not true for all modules.  No return value.
+ */
+void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk,
+				void __iomem **idlest_reg, u8 *idlest_bit,
+				u8 *idlest_val)
+{
+	u32 r;
+
+	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
+	*idlest_reg = (__force void __iomem *)r;
+	*idlest_bit = clk->enable_bit;
+
+	/*
+	 * 24xx uses 0 to indicate not ready, and 1 to indicate ready.
+	 * 34xx reverses this, just to keep us on our toes
+	 * AM35xx uses both, depending on the module.
+	 */
+	*idlest_val = ti_clk_get_features()->cm_idlest_val;
+}
+
+/**
+ * omap2_dflt_clk_enable - enable a clock in the hardware
+ * @hw: struct clk_hw * of the clock to enable
+ *
+ * Enable the clock @hw in the hardware.  We first call into the OMAP
+ * clockdomain code to "enable" the corresponding clockdomain if this
+ * is the first enabled user of the clockdomain.  Then program the
+ * hardware to enable the clock.  Then wait for the IP block that uses
+ * this clock to leave idle (if applicable).  Returns the error value
+ * from clkdm_clk_enable() if it terminated with an error, or -EINVAL
+ * if @hw has a null clock enable_reg, or zero upon success.
+ */
+int omap2_dflt_clk_enable(struct clk_hw *hw)
+{
+	struct clk_hw_omap *clk;
+	u32 v;
+	int ret = 0;
+	bool clkdm_control;
+
+	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL)
+		clkdm_control = false;
+	else
+		clkdm_control = true;
+
+	clk = to_clk_hw_omap(hw);
+
+	if (clkdm_control && clk->clkdm) {
+		ret = ti_clk_ll_ops->clkdm_clk_enable(clk->clkdm, hw->clk);
+		if (ret) {
+			WARN(1,
+			     "%s: could not enable %s's clockdomain %s: %d\n",
+			     __func__, __clk_get_name(hw->clk),
+			     clk->clkdm_name, ret);
+			return ret;
+		}
+	}
+
+	if (unlikely(!clk->enable_reg)) {
+		pr_err("%s: %s missing enable_reg\n", __func__,
+		       __clk_get_name(hw->clk));
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* FIXME should not have INVERT_ENABLE bit here */
+	v = ti_clk_ll_ops->clk_readl(clk->enable_reg);
+	if (clk->flags & INVERT_ENABLE)
+		v &= ~(1 << clk->enable_bit);
+	else
+		v |= (1 << clk->enable_bit);
+	ti_clk_ll_ops->clk_writel(v, clk->enable_reg);
+	v = ti_clk_ll_ops->clk_readl(clk->enable_reg); /* OCP barrier */
+
+	if (clk->ops && clk->ops->find_idlest)
+		_omap2_module_wait_ready(clk);
+
+	return 0;
+
+err:
+	if (clkdm_control && clk->clkdm)
+		ti_clk_ll_ops->clkdm_clk_disable(clk->clkdm, hw->clk);
+	return ret;
+}
+
+/**
+ * omap2_dflt_clk_disable - disable a clock in the hardware
+ * @hw: struct clk_hw * of the clock to disable
+ *
+ * Disable the clock @hw in the hardware, and call into the OMAP
+ * clockdomain code to "disable" the corresponding clockdomain if all
+ * clocks/hwmods in that clockdomain are now disabled.  No return
+ * value.
+ */
+void omap2_dflt_clk_disable(struct clk_hw *hw)
+{
+	struct clk_hw_omap *clk;
+	u32 v;
+
+	clk = to_clk_hw_omap(hw);
+	if (!clk->enable_reg) {
+		/*
+		 * 'independent' here refers to a clock which is not
+		 * controlled by its parent.
+		 */
+		pr_err("%s: independent clock %s has no enable_reg\n",
+		       __func__, __clk_get_name(hw->clk));
+		return;
+	}
+
+	v = ti_clk_ll_ops->clk_readl(clk->enable_reg);
+	if (clk->flags & INVERT_ENABLE)
+		v |= (1 << clk->enable_bit);
+	else
+		v &= ~(1 << clk->enable_bit);
+	ti_clk_ll_ops->clk_writel(v, clk->enable_reg);
+	/* No OCP barrier needed here since it is a disable operation */
+
+	if (!(ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) &&
+	    clk->clkdm)
+		ti_clk_ll_ops->clkdm_clk_disable(clk->clkdm, hw->clk);
+}
+
+/**
+ * omap2_dflt_clk_is_enabled - is clock enabled in the hardware?
+ * @hw: struct clk_hw * to check
+ *
+ * Return 1 if the clock represented by @hw is enabled in the
+ * hardware, or 0 otherwise.  Intended for use in the struct
+ * clk_ops.is_enabled function pointer.
+ */
+int omap2_dflt_clk_is_enabled(struct clk_hw *hw)
+{
+	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
+	u32 v;
+
+	v = ti_clk_ll_ops->clk_readl(clk->enable_reg);
+
+	if (clk->flags & INVERT_ENABLE)
+		v ^= BIT(clk->enable_bit);
+
+	v &= BIT(clk->enable_bit);
+
+	return v ? 1 : 0;
+}
+
+const struct clk_hw_omap_ops clkhwops_wait = {
+	.find_idlest	= omap2_clk_dflt_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 688d9e47b2c8..f21538364588 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -175,9 +175,14 @@ void omap2_clk_enable_init_clocks(const char **clk_names, u8 num_clocks);
 
 extern const struct clk_hw_omap_ops clkhwops_omap3_dpll;
 extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx;
+extern const struct clk_hw_omap_ops clkhwops_wait;
 extern const struct clk_hw_omap_ops clkhwops_iclk;
 extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
 
+int omap2_dflt_clk_enable(struct clk_hw *hw);
+void omap2_dflt_clk_disable(struct clk_hw *hw);
+int omap2_dflt_clk_is_enabled(struct clk_hw *hw);
+
 u8 omap2_init_dpll_parent(struct clk_hw *hw);
 int omap3_noncore_dpll_enable(struct clk_hw *hw);
 void omap3_noncore_dpll_disable(struct clk_hw *hw);
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index fbb65e401d13..81a913edffa7 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -278,9 +278,6 @@ int omap2_clk_disable_autoidle_all(void);
 int omap2_clk_enable_autoidle_all(void);
 int omap2_clk_allow_idle(struct clk *clk);
 int omap2_clk_deny_idle(struct clk *clk);
-int omap2_dflt_clk_enable(struct clk_hw *hw);
-void omap2_dflt_clk_disable(struct clk_hw *hw);
-int omap2_dflt_clk_is_enabled(struct clk_hw *hw);
 void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk);
 void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk);
 void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
@@ -337,7 +334,6 @@ const struct ti_clk_features *ti_clk_get_features(void);
 
 extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll;
 extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait;
-extern const struct clk_hw_omap_ops clkhwops_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait;
-- 
cgit v1.2.3-70-g09d2


From d5a04dddf51e234dc89f21e4e4b91e853cf49ff2 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 16:08:42 +0200
Subject: clk: ti: omap2430: move clock support code under clock driver

With the legacy clock support gone, this is no longer needed under
platform code-base. Thus, move this under the TI clock driver, and
remove the exported API from the public header.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/Makefile    |  1 -
 arch/arm/mach-omap2/clock2430.c | 57 -----------------------------------------
 drivers/clk/ti/clkt_iclk.c      | 35 +++++++++++++++++++++++++
 drivers/clk/ti/clock.h          |  1 +
 include/linux/clk/ti.h          |  1 -
 5 files changed, 36 insertions(+), 59 deletions(-)
 delete mode 100644 arch/arm/mach-omap2/clock2430.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index 070526563698..695d58f81ff3 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -185,7 +185,6 @@ obj-$(CONFIG_ARCH_OMAP2)		+= $(clock-common)
 obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_dpllcore.o
 obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_virt_prcm_set.o
 obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_dpll.o
-obj-$(CONFIG_SOC_OMAP2430)		+= clock2430.o
 obj-$(CONFIG_ARCH_OMAP3)		+= $(clock-common)
 obj-$(CONFIG_ARCH_OMAP3)		+= clock34xx.o clkt34xx_dpll3m2.o
 obj-$(CONFIG_ARCH_OMAP3)		+= clock3517.o
diff --git a/arch/arm/mach-omap2/clock2430.c b/arch/arm/mach-omap2/clock2430.c
deleted file mode 100644
index cef0c8d1de52..000000000000
--- a/arch/arm/mach-omap2/clock2430.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * clock2430.c - OMAP2430-specific clock integration code
- *
- * Copyright (C) 2005-2008 Texas Instruments, Inc.
- * Copyright (C) 2004-2010 Nokia Corporation
- *
- * Contacts:
- * Richard Woodruff <r-woodruff2@ti.com>
- * Paul Walmsley
- *
- * Based on earlier work by Tuukka Tikkanen, Tony Lindgren,
- * Gordon McNutt and RidgeRun, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#undef DEBUG
-
-#include <linux/kernel.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-
-#include "soc.h"
-#include "iomap.h"
-#include "clock.h"
-#include "clock2xxx.h"
-#include "cm2xxx.h"
-#include "cm-regbits-24xx.h"
-
-/**
- * omap2430_clk_i2chs_find_idlest - return CM_IDLEST info for 2430 I2CHS
- * @clk: struct clk * being enabled
- * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
- * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
- * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
- *
- * OMAP2430 I2CHS CM_IDLEST bits are in CM_IDLEST1_CORE, but the
- * CM_*CLKEN bits are in CM_{I,F}CLKEN2_CORE.  This custom function
- * passes back the correct CM_IDLEST register address for I2CHS
- * modules.  No return value.
- */
-static void omap2430_clk_i2chs_find_idlest(struct clk_hw_omap *clk,
-					   void __iomem **idlest_reg,
-					   u8 *idlest_bit,
-					   u8 *idlest_val)
-{
-	*idlest_reg = OMAP2430_CM_REGADDR(CORE_MOD, CM_IDLEST);
-	*idlest_bit = clk->enable_bit;
-	*idlest_val = OMAP24XX_CM_IDLEST_VAL;
-}
-
-/* 2430 I2CHS has non-standard IDLEST register */
-const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait = {
-	.find_idlest	= omap2430_clk_i2chs_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
diff --git a/drivers/clk/ti/clkt_iclk.c b/drivers/clk/ti/clkt_iclk.c
index a03919df00ef..38c36908cf88 100644
--- a/drivers/clk/ti/clkt_iclk.c
+++ b/drivers/clk/ti/clkt_iclk.c
@@ -18,8 +18,12 @@
 #include "clock.h"
 
 /* Register offsets */
+#define OMAP24XX_CM_FCLKEN2		0x04
 #define CM_AUTOIDLE			0x30
 #define CM_ICLKEN			0x10
+#define CM_IDLEST			0x20
+
+#define OMAP24XX_CM_IDLEST_VAL		0
 
 /* Private functions */
 
@@ -51,6 +55,31 @@ void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk)
 	ti_clk_ll_ops->clk_writel(v, r);
 }
 
+/**
+ * omap2430_clk_i2chs_find_idlest - return CM_IDLEST info for 2430 I2CHS
+ * @clk: struct clk * being enabled
+ * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
+ * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
+ * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
+ *
+ * OMAP2430 I2CHS CM_IDLEST bits are in CM_IDLEST1_CORE, but the
+ * CM_*CLKEN bits are in CM_{I,F}CLKEN2_CORE.  This custom function
+ * passes back the correct CM_IDLEST register address for I2CHS
+ * modules.  No return value.
+ */
+static void omap2430_clk_i2chs_find_idlest(struct clk_hw_omap *clk,
+					   void __iomem **idlest_reg,
+					   u8 *idlest_bit,
+					   u8 *idlest_val)
+{
+	u32 r;
+
+	r = ((__force u32)clk->enable_reg ^ (OMAP24XX_CM_FCLKEN2 ^ CM_IDLEST));
+	*idlest_reg = (__force void __iomem *)r;
+	*idlest_bit = clk->enable_bit;
+	*idlest_val = OMAP24XX_CM_IDLEST_VAL;
+}
+
 /* Public data */
 
 const struct clk_hw_omap_ops clkhwops_iclk = {
@@ -64,3 +93,9 @@ const struct clk_hw_omap_ops clkhwops_iclk_wait = {
 	.find_idlest	= omap2_clk_dflt_find_idlest,
 	.find_companion	= omap2_clk_dflt_find_companion,
 };
+
+/* 2430 I2CHS has non-standard IDLEST register */
+const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait = {
+	.find_idlest	= omap2430_clk_i2chs_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index f21538364588..3652c267cf81 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -178,6 +178,7 @@ extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx;
 extern const struct clk_hw_omap_ops clkhwops_wait;
 extern const struct clk_hw_omap_ops clkhwops_iclk;
 extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
+extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait;
 
 int omap2_dflt_clk_enable(struct clk_hw *hw);
 void omap2_dflt_clk_disable(struct clk_hw *hw);
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 81a913edffa7..440ace33ea35 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -333,7 +333,6 @@ void ti_clk_setup_features(struct ti_clk_features *features);
 const struct ti_clk_features *ti_clk_get_features(void);
 
 extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll;
-extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait;
-- 
cgit v1.2.3-70-g09d2


From bd86cfdcbd827216fd682d62ffba2667bbe6fbc3 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 16:22:50 +0200
Subject: clk: ti: clkdm: move clkdm gate clock support code to clock driver

With the legacy clock data gone, this is no longer needed under platform,
so move it under the clock driver itself. Remove the exported clock driver
APIs as well, as these are not needed outside clock driver anymore.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c  | 76 --------------------------------------------
 arch/arm/mach-omap2/clock.h  |  3 --
 drivers/clk/ti/clock.h       |  3 ++
 drivers/clk/ti/clockdomain.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/clk/ti.h       |  2 --
 5 files changed, 79 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index 38a336b4c42b..99875dba803a 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -202,82 +202,6 @@ void omap2_init_clk_clkdm(struct clk_hw *hw)
 	}
 }
 
-/**
- * omap2_clkops_enable_clkdm - increment usecount on clkdm of @hw
- * @hw: struct clk_hw * of the clock being enabled
- *
- * Increment the usecount of the clockdomain of the clock pointed to
- * by @hw; if the usecount is 1, the clockdomain will be "enabled."
- * Only needed for clocks that don't use omap2_dflt_clk_enable() as
- * their enable function pointer.  Passes along the return value of
- * clkdm_clk_enable(), -EINVAL if @hw is not associated with a
- * clockdomain, or 0 if clock framework-based clockdomain control is
- * not implemented.
- */
-int omap2_clkops_enable_clkdm(struct clk_hw *hw)
-{
-	struct clk_hw_omap *clk;
-	int ret = 0;
-
-	clk = to_clk_hw_omap(hw);
-
-	if (unlikely(!clk->clkdm)) {
-		pr_err("%s: %s: no clkdm set ?!\n", __func__,
-		       __clk_get_name(hw->clk));
-		return -EINVAL;
-	}
-
-	if (unlikely(clk->enable_reg))
-		pr_err("%s: %s: should use dflt_clk_enable ?!\n", __func__,
-		       __clk_get_name(hw->clk));
-
-	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) {
-		pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n",
-		       __func__, __clk_get_name(hw->clk));
-		return 0;
-	}
-
-	ret = clkdm_clk_enable(clk->clkdm, hw->clk);
-	WARN(ret, "%s: could not enable %s's clockdomain %s: %d\n",
-	     __func__, __clk_get_name(hw->clk), clk->clkdm->name, ret);
-
-	return ret;
-}
-
-/**
- * omap2_clkops_disable_clkdm - decrement usecount on clkdm of @hw
- * @hw: struct clk_hw * of the clock being disabled
- *
- * Decrement the usecount of the clockdomain of the clock pointed to
- * by @hw; if the usecount is 0, the clockdomain will be "disabled."
- * Only needed for clocks that don't use omap2_dflt_clk_disable() as their
- * disable function pointer.  No return value.
- */
-void omap2_clkops_disable_clkdm(struct clk_hw *hw)
-{
-	struct clk_hw_omap *clk;
-
-	clk = to_clk_hw_omap(hw);
-
-	if (unlikely(!clk->clkdm)) {
-		pr_err("%s: %s: no clkdm set ?!\n", __func__,
-		       __clk_get_name(hw->clk));
-		return;
-	}
-
-	if (unlikely(clk->enable_reg))
-		pr_err("%s: %s: should use dflt_clk_disable ?!\n", __func__,
-		       __clk_get_name(hw->clk));
-
-	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) {
-		pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n",
-		       __func__, __clk_get_name(hw->clk));
-		return;
-	}
-
-	clkdm_clk_disable(clk->clkdm, hw->clk);
-}
-
 static int __initdata mpurate;
 
 /*
diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h
index 948065497472..a7e951129ffb 100644
--- a/arch/arm/mach-omap2/clock.h
+++ b/arch/arm/mach-omap2/clock.h
@@ -202,9 +202,6 @@ extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
 extern const struct clk_hw_omap_ops clkhwops_apll54;
 extern const struct clk_hw_omap_ops clkhwops_apll96;
 
-extern int omap2_clkops_enable_clkdm(struct clk_hw *hw);
-extern void omap2_clkops_disable_clkdm(struct clk_hw *hw);
-
 struct regmap;
 
 int __init omap2_clk_provider_init(struct device_node *np, int index,
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 3652c267cf81..83476d12d561 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -180,6 +180,9 @@ extern const struct clk_hw_omap_ops clkhwops_iclk;
 extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait;
 
+int omap2_clkops_enable_clkdm(struct clk_hw *hw);
+void omap2_clkops_disable_clkdm(struct clk_hw *hw);
+
 int omap2_dflt_clk_enable(struct clk_hw *hw);
 void omap2_dflt_clk_disable(struct clk_hw *hw);
 int omap2_dflt_clk_is_enabled(struct clk_hw *hw);
diff --git a/drivers/clk/ti/clockdomain.c b/drivers/clk/ti/clockdomain.c
index 35fe1085480c..61ef87b1a688 100644
--- a/drivers/clk/ti/clockdomain.c
+++ b/drivers/clk/ti/clockdomain.c
@@ -24,6 +24,82 @@
 #undef pr_fmt
 #define pr_fmt(fmt) "%s: " fmt, __func__
 
+/**
+ * omap2_clkops_enable_clkdm - increment usecount on clkdm of @hw
+ * @hw: struct clk_hw * of the clock being enabled
+ *
+ * Increment the usecount of the clockdomain of the clock pointed to
+ * by @hw; if the usecount is 1, the clockdomain will be "enabled."
+ * Only needed for clocks that don't use omap2_dflt_clk_enable() as
+ * their enable function pointer.  Passes along the return value of
+ * clkdm_clk_enable(), -EINVAL if @hw is not associated with a
+ * clockdomain, or 0 if clock framework-based clockdomain control is
+ * not implemented.
+ */
+int omap2_clkops_enable_clkdm(struct clk_hw *hw)
+{
+	struct clk_hw_omap *clk;
+	int ret = 0;
+
+	clk = to_clk_hw_omap(hw);
+
+	if (unlikely(!clk->clkdm)) {
+		pr_err("%s: %s: no clkdm set ?!\n", __func__,
+		       __clk_get_name(hw->clk));
+		return -EINVAL;
+	}
+
+	if (unlikely(clk->enable_reg))
+		pr_err("%s: %s: should use dflt_clk_enable ?!\n", __func__,
+		       __clk_get_name(hw->clk));
+
+	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) {
+		pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n",
+		       __func__, __clk_get_name(hw->clk));
+		return 0;
+	}
+
+	ret = ti_clk_ll_ops->clkdm_clk_enable(clk->clkdm, hw->clk);
+	WARN(ret, "%s: could not enable %s's clockdomain %s: %d\n",
+	     __func__, __clk_get_name(hw->clk), clk->clkdm_name, ret);
+
+	return ret;
+}
+
+/**
+ * omap2_clkops_disable_clkdm - decrement usecount on clkdm of @hw
+ * @hw: struct clk_hw * of the clock being disabled
+ *
+ * Decrement the usecount of the clockdomain of the clock pointed to
+ * by @hw; if the usecount is 0, the clockdomain will be "disabled."
+ * Only needed for clocks that don't use omap2_dflt_clk_disable() as their
+ * disable function pointer.  No return value.
+ */
+void omap2_clkops_disable_clkdm(struct clk_hw *hw)
+{
+	struct clk_hw_omap *clk;
+
+	clk = to_clk_hw_omap(hw);
+
+	if (unlikely(!clk->clkdm)) {
+		pr_err("%s: %s: no clkdm set ?!\n", __func__,
+		       __clk_get_name(hw->clk));
+		return;
+	}
+
+	if (unlikely(clk->enable_reg))
+		pr_err("%s: %s: should use dflt_clk_disable ?!\n", __func__,
+		       __clk_get_name(hw->clk));
+
+	if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) {
+		pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n",
+		       __func__, __clk_get_name(hw->clk));
+		return;
+	}
+
+	ti_clk_ll_ops->clkdm_clk_disable(clk->clkdm, hw->clk);
+}
+
 static void __init of_ti_clockdomain_setup(struct device_node *node)
 {
 	struct clk *clk;
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 440ace33ea35..27828422c9c5 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -272,8 +272,6 @@ extern const struct clk_ops ti_clk_mux_ops;
 #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw)
 
 void omap2_init_clk_clkdm(struct clk_hw *clk);
-int omap2_clkops_enable_clkdm(struct clk_hw *hw);
-void omap2_clkops_disable_clkdm(struct clk_hw *hw);
 int omap2_clk_disable_autoidle_all(void);
 int omap2_clk_enable_autoidle_all(void);
 int omap2_clk_allow_idle(struct clk *clk);
-- 
cgit v1.2.3-70-g09d2


From f2671d5c6cb4abe4636014cd66fd0eeb8190b2ca Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 17:28:12 +0200
Subject: clk: ti: omap34xx: move omap34xx clock type support code to clock
 driver

With the legacy clock data gone, this is no longer needed under platform,
so move it under the clock driver itself. Remove unnecessary declarations
from the TI clock header also.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/Makefile    |   2 +-
 arch/arm/mach-omap2/clock34xx.c | 138 ----------------------------------------
 drivers/clk/ti/clk-3xxx.c       | 118 ++++++++++++++++++++++++++++++++++
 drivers/clk/ti/clock.h          |   4 ++
 include/linux/clk/ti.h          |   4 --
 5 files changed, 123 insertions(+), 143 deletions(-)
 delete mode 100644 arch/arm/mach-omap2/clock34xx.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index 695d58f81ff3..22d2e48dcff5 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -186,7 +186,7 @@ obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_dpllcore.o
 obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_virt_prcm_set.o
 obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_dpll.o
 obj-$(CONFIG_ARCH_OMAP3)		+= $(clock-common)
-obj-$(CONFIG_ARCH_OMAP3)		+= clock34xx.o clkt34xx_dpll3m2.o
+obj-$(CONFIG_ARCH_OMAP3)		+= clkt34xx_dpll3m2.o
 obj-$(CONFIG_ARCH_OMAP3)		+= clock3517.o
 obj-$(CONFIG_ARCH_OMAP4)		+= $(clock-common)
 obj-$(CONFIG_SOC_AM33XX)		+= $(clock-common)
diff --git a/arch/arm/mach-omap2/clock34xx.c b/arch/arm/mach-omap2/clock34xx.c
deleted file mode 100644
index 4596468e50ab..000000000000
--- a/arch/arm/mach-omap2/clock34xx.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * OMAP3-specific clock framework functions
- *
- * Copyright (C) 2007-2008 Texas Instruments, Inc.
- * Copyright (C) 2007-2011 Nokia Corporation
- *
- * Paul Walmsley
- * Jouni Högander
- *
- * Parts of this code are based on code written by
- * Richard Woodruff, Tony Lindgren, Tuukka Tikkanen, Karthik Dasu,
- * Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#undef DEBUG
-
-#include <linux/kernel.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-
-#include "clock.h"
-#include "clock34xx.h"
-#include "cm3xxx.h"
-#include "cm-regbits-34xx.h"
-
-/**
- * omap3430es2_clk_ssi_find_idlest - return CM_IDLEST info for SSI
- * @clk: struct clk * being enabled
- * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
- * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
- * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
- *
- * The OMAP3430ES2 SSI target CM_IDLEST bit is at a different shift
- * from the CM_{I,F}CLKEN bit.  Pass back the correct info via
- * @idlest_reg and @idlest_bit.  No return value.
- */
-static void omap3430es2_clk_ssi_find_idlest(struct clk_hw_omap *clk,
-					    void __iomem **idlest_reg,
-					    u8 *idlest_bit,
-					    u8 *idlest_val)
-{
-	u32 r;
-
-	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
-	*idlest_reg = (__force void __iomem *)r;
-	*idlest_bit = OMAP3430ES2_ST_SSI_IDLE_SHIFT;
-	*idlest_val = OMAP34XX_CM_IDLEST_VAL;
-}
-const struct clk_hw_omap_ops clkhwops_omap3430es2_ssi_wait = {
-	.find_idlest	= omap3430es2_clk_ssi_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
-
-const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait = {
-	.allow_idle	= omap2_clkt_iclk_allow_idle,
-	.deny_idle	= omap2_clkt_iclk_deny_idle,
-	.find_idlest	= omap3430es2_clk_ssi_find_idlest,
-	.find_companion = omap2_clk_dflt_find_companion,
-};
-
-/**
- * omap3430es2_clk_dss_usbhost_find_idlest - CM_IDLEST info for DSS, USBHOST
- * @clk: struct clk * being enabled
- * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
- * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
- * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
- *
- * Some OMAP modules on OMAP3 ES2+ chips have both initiator and
- * target IDLEST bits.  For our purposes, we are concerned with the
- * target IDLEST bits, which exist at a different bit position than
- * the *CLKEN bit position for these modules (DSS and USBHOST) (The
- * default find_idlest code assumes that they are at the same
- * position.)  No return value.
- */
-static void omap3430es2_clk_dss_usbhost_find_idlest(struct clk_hw_omap *clk,
-						    void __iomem **idlest_reg,
-						    u8 *idlest_bit,
-						    u8 *idlest_val)
-{
-	u32 r;
-
-	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
-	*idlest_reg = (__force void __iomem *)r;
-	/* USBHOST_IDLE has same shift */
-	*idlest_bit = OMAP3430ES2_ST_DSS_IDLE_SHIFT;
-	*idlest_val = OMAP34XX_CM_IDLEST_VAL;
-}
-
-const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait = {
-	.find_idlest	= omap3430es2_clk_dss_usbhost_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
-
-const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait = {
-	.allow_idle	= omap2_clkt_iclk_allow_idle,
-	.deny_idle	= omap2_clkt_iclk_deny_idle,
-	.find_idlest	= omap3430es2_clk_dss_usbhost_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
-
-/**
- * omap3430es2_clk_hsotgusb_find_idlest - return CM_IDLEST info for HSOTGUSB
- * @clk: struct clk * being enabled
- * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
- * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
- * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
- *
- * The OMAP3430ES2 HSOTGUSB target CM_IDLEST bit is at a different
- * shift from the CM_{I,F}CLKEN bit.  Pass back the correct info via
- * @idlest_reg and @idlest_bit.  No return value.
- */
-static void omap3430es2_clk_hsotgusb_find_idlest(struct clk_hw_omap *clk,
-						 void __iomem **idlest_reg,
-						 u8 *idlest_bit,
-						 u8 *idlest_val)
-{
-	u32 r;
-
-	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
-	*idlest_reg = (__force void __iomem *)r;
-	*idlest_bit = OMAP3430ES2_ST_HSOTGUSB_IDLE_SHIFT;
-	*idlest_val = OMAP34XX_CM_IDLEST_VAL;
-}
-
-const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait = {
-	.allow_idle	= omap2_clkt_iclk_allow_idle,
-	.deny_idle	= omap2_clkt_iclk_deny_idle,
-	.find_idlest	= omap3430es2_clk_hsotgusb_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
-
-const struct clk_hw_omap_ops clkhwops_omap3430es2_hsotgusb_wait = {
-	.find_idlest	= omap3430es2_clk_hsotgusb_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
diff --git a/drivers/clk/ti/clk-3xxx.c b/drivers/clk/ti/clk-3xxx.c
index 5489ad8c07d4..58879f0b7949 100644
--- a/drivers/clk/ti/clk-3xxx.c
+++ b/drivers/clk/ti/clk-3xxx.c
@@ -28,6 +28,124 @@
  */
 #define DPLL5_FREQ_FOR_USBHOST		120000000
 
+#define OMAP3430ES2_ST_DSS_IDLE_SHIFT			1
+#define OMAP3430ES2_ST_HSOTGUSB_IDLE_SHIFT		5
+#define OMAP3430ES2_ST_SSI_IDLE_SHIFT			8
+
+#define OMAP34XX_CM_IDLEST_VAL				1
+
+/**
+ * omap3430es2_clk_ssi_find_idlest - return CM_IDLEST info for SSI
+ * @clk: struct clk * being enabled
+ * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
+ * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
+ * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
+ *
+ * The OMAP3430ES2 SSI target CM_IDLEST bit is at a different shift
+ * from the CM_{I,F}CLKEN bit.  Pass back the correct info via
+ * @idlest_reg and @idlest_bit.  No return value.
+ */
+static void omap3430es2_clk_ssi_find_idlest(struct clk_hw_omap *clk,
+					    void __iomem **idlest_reg,
+					    u8 *idlest_bit,
+					    u8 *idlest_val)
+{
+	u32 r;
+
+	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
+	*idlest_reg = (__force void __iomem *)r;
+	*idlest_bit = OMAP3430ES2_ST_SSI_IDLE_SHIFT;
+	*idlest_val = OMAP34XX_CM_IDLEST_VAL;
+}
+
+const struct clk_hw_omap_ops clkhwops_omap3430es2_ssi_wait = {
+	.find_idlest	= omap3430es2_clk_ssi_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
+
+const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait = {
+	.allow_idle	= omap2_clkt_iclk_allow_idle,
+	.deny_idle	= omap2_clkt_iclk_deny_idle,
+	.find_idlest	= omap3430es2_clk_ssi_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
+
+/**
+ * omap3430es2_clk_dss_usbhost_find_idlest - CM_IDLEST info for DSS, USBHOST
+ * @clk: struct clk * being enabled
+ * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
+ * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
+ * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
+ *
+ * Some OMAP modules on OMAP3 ES2+ chips have both initiator and
+ * target IDLEST bits.  For our purposes, we are concerned with the
+ * target IDLEST bits, which exist at a different bit position than
+ * the *CLKEN bit position for these modules (DSS and USBHOST) (The
+ * default find_idlest code assumes that they are at the same
+ * position.)  No return value.
+ */
+static void omap3430es2_clk_dss_usbhost_find_idlest(struct clk_hw_omap *clk,
+						    void __iomem **idlest_reg,
+						    u8 *idlest_bit,
+						    u8 *idlest_val)
+{
+	u32 r;
+
+	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
+	*idlest_reg = (__force void __iomem *)r;
+	/* USBHOST_IDLE has same shift */
+	*idlest_bit = OMAP3430ES2_ST_DSS_IDLE_SHIFT;
+	*idlest_val = OMAP34XX_CM_IDLEST_VAL;
+}
+
+const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait = {
+	.find_idlest	= omap3430es2_clk_dss_usbhost_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
+
+const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait = {
+	.allow_idle	= omap2_clkt_iclk_allow_idle,
+	.deny_idle	= omap2_clkt_iclk_deny_idle,
+	.find_idlest	= omap3430es2_clk_dss_usbhost_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
+
+/**
+ * omap3430es2_clk_hsotgusb_find_idlest - return CM_IDLEST info for HSOTGUSB
+ * @clk: struct clk * being enabled
+ * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
+ * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
+ * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
+ *
+ * The OMAP3430ES2 HSOTGUSB target CM_IDLEST bit is at a different
+ * shift from the CM_{I,F}CLKEN bit.  Pass back the correct info via
+ * @idlest_reg and @idlest_bit.  No return value.
+ */
+static void omap3430es2_clk_hsotgusb_find_idlest(struct clk_hw_omap *clk,
+						 void __iomem **idlest_reg,
+						 u8 *idlest_bit,
+						 u8 *idlest_val)
+{
+	u32 r;
+
+	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
+	*idlest_reg = (__force void __iomem *)r;
+	*idlest_bit = OMAP3430ES2_ST_HSOTGUSB_IDLE_SHIFT;
+	*idlest_val = OMAP34XX_CM_IDLEST_VAL;
+}
+
+const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait = {
+	.allow_idle	= omap2_clkt_iclk_allow_idle,
+	.deny_idle	= omap2_clkt_iclk_deny_idle,
+	.find_idlest	= omap3430es2_clk_hsotgusb_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
+
+const struct clk_hw_omap_ops clkhwops_omap3430es2_hsotgusb_wait = {
+	.find_idlest	= omap3430es2_clk_hsotgusb_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
+
 static struct ti_dt_clk omap3xxx_clks[] = {
 	DT_CLK(NULL, "apb_pclk", "dummy_apb_pclk"),
 	DT_CLK(NULL, "omap_32k_fck", "omap_32k_fck"),
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 83476d12d561..c6fbd153b6d4 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -179,6 +179,10 @@ extern const struct clk_hw_omap_ops clkhwops_wait;
 extern const struct clk_hw_omap_ops clkhwops_iclk;
 extern const struct clk_hw_omap_ops clkhwops_iclk_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait;
+extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
+extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait;
+extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait;
+extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait;
 
 int omap2_clkops_enable_clkdm(struct clk_hw *hw);
 void omap2_clkops_disable_clkdm(struct clk_hw *hw);
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 27828422c9c5..cd5b3eadc317 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -331,12 +331,8 @@ void ti_clk_setup_features(struct ti_clk_features *features);
 const struct ti_clk_features *ti_clk_get_features(void);
 
 extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll;
-extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait;
-extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait;
-extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait;
-extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait;
 
 #ifdef CONFIG_ATAGS
 int omap3430_clk_legacy_init(void);
-- 
cgit v1.2.3-70-g09d2


From c9a58b0a848e4b88d2dd4690ef19bae8696649eb Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 3 Mar 2015 21:19:25 +0200
Subject: clk: ti: am3517: move remaining am3517 clock support code to clock
 driver

With legacy clock support gone, this is no longer needed under platform,
so move it under the clock driver itself. Make some exports be driver
internal definitions at the same time.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/Makefile    |   1 -
 arch/arm/mach-omap2/clock3517.c | 118 ----------------------------------------
 arch/arm/mach-omap2/clock3517.h |  14 -----
 drivers/clk/ti/clk-3xxx.c       |  94 ++++++++++++++++++++++++++++++++
 drivers/clk/ti/clock.h          |   2 +
 include/linux/clk/ti.h          |   2 -
 6 files changed, 96 insertions(+), 135 deletions(-)
 delete mode 100644 arch/arm/mach-omap2/clock3517.c
 delete mode 100644 arch/arm/mach-omap2/clock3517.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index 22d2e48dcff5..d424920a5e1c 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -187,7 +187,6 @@ obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_virt_prcm_set.o
 obj-$(CONFIG_ARCH_OMAP2)		+= clkt2xxx_dpll.o
 obj-$(CONFIG_ARCH_OMAP3)		+= $(clock-common)
 obj-$(CONFIG_ARCH_OMAP3)		+= clkt34xx_dpll3m2.o
-obj-$(CONFIG_ARCH_OMAP3)		+= clock3517.o
 obj-$(CONFIG_ARCH_OMAP4)		+= $(clock-common)
 obj-$(CONFIG_SOC_AM33XX)		+= $(clock-common)
 obj-$(CONFIG_SOC_OMAP5)			+= $(clock-common)
diff --git a/arch/arm/mach-omap2/clock3517.c b/arch/arm/mach-omap2/clock3517.c
deleted file mode 100644
index 4d79ae2c0241..000000000000
--- a/arch/arm/mach-omap2/clock3517.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * OMAP3517/3505-specific clock framework functions
- *
- * Copyright (C) 2010 Texas Instruments, Inc.
- * Copyright (C) 2011 Nokia Corporation
- *
- * Ranjith Lohithakshan
- * Paul Walmsley
- *
- * Parts of this code are based on code written by
- * Richard Woodruff, Tony Lindgren, Tuukka Tikkanen, Karthik Dasu,
- * Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#undef DEBUG
-
-#include <linux/kernel.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-
-#include "clock.h"
-#include "clock3517.h"
-#include "cm3xxx.h"
-#include "cm-regbits-34xx.h"
-
-/*
- * In AM35xx IPSS, the {ICK,FCK} enable bits for modules are exported
- * in the same register at a bit offset of 0x8. The EN_ACK for ICK is
- * at an offset of 4 from ICK enable bit.
- */
-#define AM35XX_IPSS_ICK_MASK			0xF
-#define AM35XX_IPSS_ICK_EN_ACK_OFFSET 		0x4
-#define AM35XX_IPSS_ICK_FCK_OFFSET		0x8
-#define AM35XX_IPSS_CLK_IDLEST_VAL		0
-
-/**
- * am35xx_clk_find_idlest - return clock ACK info for AM35XX IPSS
- * @clk: struct clk * being enabled
- * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
- * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
- * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
- *
- * The interface clocks on AM35xx IPSS reflects the clock idle status
- * in the enable register itsel at a bit offset of 4 from the enable
- * bit. A value of 1 indicates that clock is enabled.
- */
-static void am35xx_clk_find_idlest(struct clk_hw_omap *clk,
-					    void __iomem **idlest_reg,
-					    u8 *idlest_bit,
-					    u8 *idlest_val)
-{
-	*idlest_reg = (__force void __iomem *)(clk->enable_reg);
-	*idlest_bit = clk->enable_bit + AM35XX_IPSS_ICK_EN_ACK_OFFSET;
-	*idlest_val = AM35XX_IPSS_CLK_IDLEST_VAL;
-}
-
-/**
- * am35xx_clk_find_companion - find companion clock to @clk
- * @clk: struct clk * to find the companion clock of
- * @other_reg: void __iomem ** to return the companion clock CM_*CLKEN va in
- * @other_bit: u8 ** to return the companion clock bit shift in
- *
- * Some clocks don't have companion clocks.  For example, modules with
- * only an interface clock (such as HECC) don't have a companion
- * clock.  Right now, this code relies on the hardware exporting a bit
- * in the correct companion register that indicates that the
- * nonexistent 'companion clock' is active.  Future patches will
- * associate this type of code with per-module data structures to
- * avoid this issue, and remove the casts.  No return value.
- */
-static void am35xx_clk_find_companion(struct clk_hw_omap *clk,
-				      void __iomem **other_reg,
-				      u8 *other_bit)
-{
-	*other_reg = (__force void __iomem *)(clk->enable_reg);
-	if (clk->enable_bit & AM35XX_IPSS_ICK_MASK)
-		*other_bit = clk->enable_bit + AM35XX_IPSS_ICK_FCK_OFFSET;
-	else
-		*other_bit = clk->enable_bit - AM35XX_IPSS_ICK_FCK_OFFSET;
-}
-const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait = {
-	.find_idlest	= am35xx_clk_find_idlest,
-	.find_companion	= am35xx_clk_find_companion,
-};
-
-/**
- * am35xx_clk_ipss_find_idlest - return CM_IDLEST info for IPSS
- * @clk: struct clk * being enabled
- * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
- * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
- * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
- *
- * The IPSS target CM_IDLEST bit is at a different shift from the
- * CM_{I,F}CLKEN bit.  Pass back the correct info via @idlest_reg
- * and @idlest_bit.  No return value.
- */
-static void am35xx_clk_ipss_find_idlest(struct clk_hw_omap *clk,
-					    void __iomem **idlest_reg,
-					    u8 *idlest_bit,
-					    u8 *idlest_val)
-{
-	u32 r;
-
-	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
-	*idlest_reg = (__force void __iomem *)r;
-	*idlest_bit = AM35XX_ST_IPSS_SHIFT;
-	*idlest_val = OMAP34XX_CM_IDLEST_VAL;
-}
-
-const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait = {
-	.allow_idle	= omap2_clkt_iclk_allow_idle,
-	.deny_idle	= omap2_clkt_iclk_deny_idle,
-	.find_idlest	= am35xx_clk_ipss_find_idlest,
-	.find_companion	= omap2_clk_dflt_find_companion,
-};
diff --git a/arch/arm/mach-omap2/clock3517.h b/arch/arm/mach-omap2/clock3517.h
deleted file mode 100644
index ca5e5a64c2e2..000000000000
--- a/arch/arm/mach-omap2/clock3517.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * OMAP3517/3505 clock function prototypes and macros
- *
- * Copyright (C) 2010 Texas Instruments, Inc.
- * Copyright (C) 2010 Nokia Corporation
- */
-
-#ifndef __ARCH_ARM_MACH_OMAP2_CLOCK3517_H
-#define __ARCH_ARM_MACH_OMAP2_CLOCK3517_H
-
-extern const struct clkops clkops_am35xx_ipss_module_wait;
-extern const struct clkops clkops_am35xx_ipss_wait;
-
-#endif
diff --git a/drivers/clk/ti/clk-3xxx.c b/drivers/clk/ti/clk-3xxx.c
index 58879f0b7949..6e33332b6b34 100644
--- a/drivers/clk/ti/clk-3xxx.c
+++ b/drivers/clk/ti/clk-3xxx.c
@@ -34,6 +34,18 @@
 
 #define OMAP34XX_CM_IDLEST_VAL				1
 
+/*
+ * In AM35xx IPSS, the {ICK,FCK} enable bits for modules are exported
+ * in the same register at a bit offset of 0x8. The EN_ACK for ICK is
+ * at an offset of 4 from ICK enable bit.
+ */
+#define AM35XX_IPSS_ICK_MASK			0xF
+#define AM35XX_IPSS_ICK_EN_ACK_OFFSET		0x4
+#define AM35XX_IPSS_ICK_FCK_OFFSET		0x8
+#define AM35XX_IPSS_CLK_IDLEST_VAL		0
+
+#define AM35XX_ST_IPSS_SHIFT			5
+
 /**
  * omap3430es2_clk_ssi_find_idlest - return CM_IDLEST info for SSI
  * @clk: struct clk * being enabled
@@ -146,6 +158,88 @@ const struct clk_hw_omap_ops clkhwops_omap3430es2_hsotgusb_wait = {
 	.find_companion	= omap2_clk_dflt_find_companion,
 };
 
+/**
+ * am35xx_clk_find_idlest - return clock ACK info for AM35XX IPSS
+ * @clk: struct clk * being enabled
+ * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
+ * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
+ * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
+ *
+ * The interface clocks on AM35xx IPSS reflects the clock idle status
+ * in the enable register itsel at a bit offset of 4 from the enable
+ * bit. A value of 1 indicates that clock is enabled.
+ */
+static void am35xx_clk_find_idlest(struct clk_hw_omap *clk,
+				   void __iomem **idlest_reg,
+				   u8 *idlest_bit,
+				   u8 *idlest_val)
+{
+	*idlest_reg = (__force void __iomem *)(clk->enable_reg);
+	*idlest_bit = clk->enable_bit + AM35XX_IPSS_ICK_EN_ACK_OFFSET;
+	*idlest_val = AM35XX_IPSS_CLK_IDLEST_VAL;
+}
+
+/**
+ * am35xx_clk_find_companion - find companion clock to @clk
+ * @clk: struct clk * to find the companion clock of
+ * @other_reg: void __iomem ** to return the companion clock CM_*CLKEN va in
+ * @other_bit: u8 ** to return the companion clock bit shift in
+ *
+ * Some clocks don't have companion clocks.  For example, modules with
+ * only an interface clock (such as HECC) don't have a companion
+ * clock.  Right now, this code relies on the hardware exporting a bit
+ * in the correct companion register that indicates that the
+ * nonexistent 'companion clock' is active.  Future patches will
+ * associate this type of code with per-module data structures to
+ * avoid this issue, and remove the casts.  No return value.
+ */
+static void am35xx_clk_find_companion(struct clk_hw_omap *clk,
+				      void __iomem **other_reg,
+				      u8 *other_bit)
+{
+	*other_reg = (__force void __iomem *)(clk->enable_reg);
+	if (clk->enable_bit & AM35XX_IPSS_ICK_MASK)
+		*other_bit = clk->enable_bit + AM35XX_IPSS_ICK_FCK_OFFSET;
+	else
+	*other_bit = clk->enable_bit - AM35XX_IPSS_ICK_FCK_OFFSET;
+}
+
+const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait = {
+	.find_idlest	= am35xx_clk_find_idlest,
+	.find_companion	= am35xx_clk_find_companion,
+};
+
+/**
+ * am35xx_clk_ipss_find_idlest - return CM_IDLEST info for IPSS
+ * @clk: struct clk * being enabled
+ * @idlest_reg: void __iomem ** to store CM_IDLEST reg address into
+ * @idlest_bit: pointer to a u8 to store the CM_IDLEST bit shift into
+ * @idlest_val: pointer to a u8 to store the CM_IDLEST indicator
+ *
+ * The IPSS target CM_IDLEST bit is at a different shift from the
+ * CM_{I,F}CLKEN bit.  Pass back the correct info via @idlest_reg
+ * and @idlest_bit.  No return value.
+ */
+static void am35xx_clk_ipss_find_idlest(struct clk_hw_omap *clk,
+					void __iomem **idlest_reg,
+					u8 *idlest_bit,
+					u8 *idlest_val)
+{
+	u32 r;
+
+	r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20);
+	*idlest_reg = (__force void __iomem *)r;
+	*idlest_bit = AM35XX_ST_IPSS_SHIFT;
+	*idlest_val = OMAP34XX_CM_IDLEST_VAL;
+}
+
+const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait = {
+	.allow_idle	= omap2_clkt_iclk_allow_idle,
+	.deny_idle	= omap2_clkt_iclk_deny_idle,
+	.find_idlest	= am35xx_clk_ipss_find_idlest,
+	.find_companion	= omap2_clk_dflt_find_companion,
+};
+
 static struct ti_dt_clk omap3xxx_clks[] = {
 	DT_CLK(NULL, "apb_pclk", "dummy_apb_pclk"),
 	DT_CLK(NULL, "omap_32k_fck", "omap_32k_fck"),
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index c6fbd153b6d4..0ca5a36da999 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -183,6 +183,8 @@ extern const struct clk_hw_omap_ops clkhwops_omap3430es2_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait;
 extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait;
+extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
+extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait;
 
 int omap2_clkops_enable_clkdm(struct clk_hw *hw);
 void omap2_clkops_disable_clkdm(struct clk_hw *hw);
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index cd5b3eadc317..15f3c971ccab 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -331,8 +331,6 @@ void ti_clk_setup_features(struct ti_clk_features *features);
 const struct ti_clk_features *ti_clk_get_features(void);
 
 extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll;
-extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
-extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait;
 
 #ifdef CONFIG_ATAGS
 int omap3430_clk_legacy_init(void);
-- 
cgit v1.2.3-70-g09d2


From a3314e9cf69c1d4052017e559ea69a042ccd83e2 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Wed, 4 Mar 2015 21:02:05 +0200
Subject: clk: ti: move some public definitions to private header

Several exported TI clock driver features are no longer needed outside
the clock driver itself, thus move all of these to the driver private
header file. Also, update some of the driver files to actually include
this header.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 drivers/clk/ti/apll.c     |  2 ++
 drivers/clk/ti/autoidle.c |  2 ++
 drivers/clk/ti/clk-43xx.c |  2 ++
 drivers/clk/ti/clk-44xx.c |  2 ++
 drivers/clk/ti/clk-54xx.c |  2 ++
 drivers/clk/ti/clk-7xx.c  |  3 ++-
 drivers/clk/ti/clock.h    | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/clk/ti.h    | 45 ---------------------------------------------
 8 files changed, 59 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/ti/apll.c b/drivers/clk/ti/apll.c
index 49baf3831546..594b759f02ee 100644
--- a/drivers/clk/ti/apll.c
+++ b/drivers/clk/ti/apll.c
@@ -27,6 +27,8 @@
 #include <linux/clk/ti.h>
 #include <linux/delay.h>
 
+#include "clock.h"
+
 #define APLL_FORCE_LOCK 0x1
 #define APLL_AUTO_IDLE	0x2
 #define MAX_APLL_WAIT_TRIES		1000000
diff --git a/drivers/clk/ti/autoidle.c b/drivers/clk/ti/autoidle.c
index 3dbcc3681058..94f0dcd94181 100644
--- a/drivers/clk/ti/autoidle.c
+++ b/drivers/clk/ti/autoidle.c
@@ -22,6 +22,8 @@
 #include <linux/of_address.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 struct clk_ti_autoidle {
 	void __iomem		*reg;
 	u8			shift;
diff --git a/drivers/clk/ti/clk-43xx.c b/drivers/clk/ti/clk-43xx.c
index 3795fce8a830..894316738459 100644
--- a/drivers/clk/ti/clk-43xx.c
+++ b/drivers/clk/ti/clk-43xx.c
@@ -19,6 +19,8 @@
 #include <linux/clk-provider.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 static struct ti_dt_clk am43xx_clks[] = {
 	DT_CLK(NULL, "clk_32768_ck", "clk_32768_ck"),
 	DT_CLK(NULL, "clk_rc32k_ck", "clk_rc32k_ck"),
diff --git a/drivers/clk/ti/clk-44xx.c b/drivers/clk/ti/clk-44xx.c
index 581db7711f51..7a8b51b35f9f 100644
--- a/drivers/clk/ti/clk-44xx.c
+++ b/drivers/clk/ti/clk-44xx.c
@@ -16,6 +16,8 @@
 #include <linux/clkdev.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 /*
  * OMAP4 ABE DPLL default frequency. In OMAP4460 TRM version V, section
  * "3.6.3.2.3 CM1_ABE Clock Generator" states that the "DPLL_ABE_X2_CLK
diff --git a/drivers/clk/ti/clk-54xx.c b/drivers/clk/ti/clk-54xx.c
index 96c69a335975..59ce2fa2c104 100644
--- a/drivers/clk/ti/clk-54xx.c
+++ b/drivers/clk/ti/clk-54xx.c
@@ -17,6 +17,8 @@
 #include <linux/io.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 #define OMAP5_DPLL_ABE_DEFFREQ				98304000
 
 /*
diff --git a/drivers/clk/ti/clk-7xx.c b/drivers/clk/ti/clk-7xx.c
index 5d2217ae4478..8b827219d454 100644
--- a/drivers/clk/ti/clk-7xx.c
+++ b/drivers/clk/ti/clk-7xx.c
@@ -16,11 +16,12 @@
 #include <linux/clkdev.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 #define DRA7_DPLL_ABE_DEFFREQ				180633600
 #define DRA7_DPLL_GMAC_DEFFREQ				1000000000
 #define DRA7_DPLL_USB_DEFFREQ				960000000
 
-
 static struct ti_dt_clk dra7xx_clks[] = {
 	DT_CLK(NULL, "atl_clkin0_ck", "atl_clkin0_ck"),
 	DT_CLK(NULL, "atl_clkin1_ck", "atl_clkin1_ck"),
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 0ca5a36da999..3c43125b9cc9 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -154,6 +154,35 @@ struct ti_clk_dpll {
 	u8 recal_st_bit;
 };
 
+/* Composite clock component types */
+enum {
+	CLK_COMPONENT_TYPE_GATE = 0,
+	CLK_COMPONENT_TYPE_DIVIDER,
+	CLK_COMPONENT_TYPE_MUX,
+	CLK_COMPONENT_TYPE_MAX,
+};
+
+/**
+ * struct ti_dt_clk - OMAP DT clock alias declarations
+ * @lk: clock lookup definition
+ * @node_name: clock DT node to map to
+ */
+struct ti_dt_clk {
+	struct clk_lookup		lk;
+	char				*node_name;
+};
+
+#define DT_CLK(dev, con, name)		\
+	{				\
+		.lk = {			\
+			.dev_id = dev,	\
+			.con_id = con,	\
+		},			\
+		.node_name = name,	\
+	}
+
+typedef void (*ti_of_clk_init_cb_t)(struct clk_hw *, struct device_node *);
+
 struct clk *ti_clk_register_gate(struct ti_clk *setup);
 struct clk *ti_clk_register_interface(struct ti_clk *setup);
 struct clk *ti_clk_register_mux(struct ti_clk *setup);
@@ -169,6 +198,12 @@ void ti_clk_patch_legacy_clks(struct ti_clk **patch);
 struct clk *ti_clk_register_clk(struct ti_clk *setup);
 int ti_clk_register_legacy_clks(struct ti_clk_alias *clks);
 
+void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index);
+void ti_dt_clocks_register(struct ti_dt_clk *oclks);
+int ti_clk_retry_init(struct device_node *node, struct clk_hw *hw,
+		      ti_of_clk_init_cb_t func);
+int ti_clk_add_component(struct device_node *node, struct clk_hw *hw, int type);
+
 void omap2_init_clk_hw_omap_clocks(struct clk *clk);
 int of_ti_clk_autoidle_setup(struct device_node *node);
 void omap2_clk_enable_init_clocks(const char **clk_names, u8 num_clocks);
@@ -186,12 +221,24 @@ extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait;
 extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait;
 
+extern const struct clk_ops ti_clk_divider_ops;
+extern const struct clk_ops ti_clk_mux_ops;
+
 int omap2_clkops_enable_clkdm(struct clk_hw *hw);
 void omap2_clkops_disable_clkdm(struct clk_hw *hw);
 
 int omap2_dflt_clk_enable(struct clk_hw *hw);
 void omap2_dflt_clk_disable(struct clk_hw *hw);
 int omap2_dflt_clk_is_enabled(struct clk_hw *hw);
+void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
+				   void __iomem **other_reg,
+				   u8 *other_bit);
+void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk,
+				void __iomem **idlest_reg,
+				u8 *idlest_bit, u8 *idlest_val);
+
+void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk);
+void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk);
 
 u8 omap2_init_dpll_parent(struct clk_hw *hw);
 int omap3_noncore_dpll_enable(struct clk_hw *hw);
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 15f3c971ccab..5eccdf5c0e84 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -188,33 +188,6 @@ struct clk_hw_omap {
 /* DPLL Type and DCO Selection Flags */
 #define DPLL_J_TYPE		0x1
 
-/* Composite clock component types */
-enum {
-	CLK_COMPONENT_TYPE_GATE = 0,
-	CLK_COMPONENT_TYPE_DIVIDER,
-	CLK_COMPONENT_TYPE_MUX,
-	CLK_COMPONENT_TYPE_MAX,
-};
-
-/**
- * struct ti_dt_clk - OMAP DT clock alias declarations
- * @lk: clock lookup definition
- * @node_name: clock DT node to map to
- */
-struct ti_dt_clk {
-	struct clk_lookup		lk;
-	char				*node_name;
-};
-
-#define DT_CLK(dev, con, name)		\
-	{				\
-		.lk = {			\
-			.dev_id = dev,	\
-			.con_id = con,	\
-		},			\
-		.node_name = name,	\
-	}
-
 /* Static memmap indices */
 enum {
 	TI_CLKM_CM = 0,
@@ -225,8 +198,6 @@ enum {
 	CLK_MAX_MEMMAPS
 };
 
-typedef void (*ti_of_clk_init_cb_t)(struct clk_hw *, struct device_node *);
-
 /**
  * struct clk_omap_reg - OMAP register declaration
  * @offset: offset from the master IP module base address
@@ -266,9 +237,6 @@ struct ti_clk_ll_ops {
 
 extern struct ti_clk_ll_ops *ti_clk_ll_ops;
 
-extern const struct clk_ops ti_clk_divider_ops;
-extern const struct clk_ops ti_clk_mux_ops;
-
 #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw)
 
 void omap2_init_clk_clkdm(struct clk_hw *clk);
@@ -276,14 +244,6 @@ int omap2_clk_disable_autoidle_all(void);
 int omap2_clk_enable_autoidle_all(void);
 int omap2_clk_allow_idle(struct clk *clk);
 int omap2_clk_deny_idle(struct clk *clk);
-void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk);
-void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk);
-void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk,
-				   void __iomem **other_reg,
-				   u8 *other_bit);
-void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk,
-				void __iomem **idlest_reg,
-				u8 *idlest_bit, u8 *idlest_val);
 unsigned long omap2_dpllcore_recalc(struct clk_hw *hw,
 				    unsigned long parent_rate);
 int omap2_reprogram_dpllcore(struct clk_hw *clk, unsigned long rate,
@@ -292,14 +252,9 @@ void omap2xxx_clkt_dpllcore_init(struct clk_hw *hw);
 void omap2xxx_clkt_vps_init(void);
 unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk);
 
-void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index);
-void ti_dt_clocks_register(struct ti_dt_clk *oclks);
 void ti_dt_clk_init_provider(struct device_node *np, int index);
 void ti_dt_clk_init_retry_clks(void);
 void ti_dt_clockdomains_setup(void);
-int ti_clk_retry_init(struct device_node *node, struct clk_hw *hw,
-		      ti_of_clk_init_cb_t func);
-int ti_clk_add_component(struct device_node *node, struct clk_hw *hw, int type);
 
 int omap3430_dt_clk_init(void);
 int omap3630_dt_clk_init(void);
-- 
cgit v1.2.3-70-g09d2


From e9e63088e4f93cf4ed7999294c09905b7dcb4d32 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Mon, 27 Apr 2015 21:55:42 +0300
Subject: clk: ti: remove exported ll_ops struct, instead add an API for
 registration

We should avoid exporting data from drivers, instead use an API for
registering the clock low level operations.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c  | 17 +++++++++++++----
 arch/arm/mach-omap2/clock.h  |  1 +
 arch/arm/mach-omap2/io.c     |  2 ++
 drivers/clk/ti/clk.c         | 21 +++++++++++++++++++++
 drivers/clk/ti/clock.h       |  2 ++
 drivers/clk/ti/clockdomain.c |  2 ++
 include/linux/clk/ti.h       |  3 +--
 7 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index 99875dba803a..40a88c2e4016 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -112,6 +112,19 @@ static struct ti_clk_ll_ops omap_clk_ll_ops = {
 	.cm_split_idlest_reg = cm_split_idlest_reg,
 };
 
+/**
+ * omap2_clk_setup_ll_ops - setup clock driver low-level ops
+ *
+ * Sets up clock driver low-level platform ops. These are needed
+ * for register accesses and various other misc platform operations.
+ * Returns 0 on success, -EBUSY if low level ops have been registered
+ * already.
+ */
+int __init omap2_clk_setup_ll_ops(void)
+{
+	return ti_clk_setup_ll_ops(&omap_clk_ll_ops);
+}
+
 /**
  * omap2_clk_provider_init - initialize a clock provider
  * @match_table: DT device table to match for devices to init
@@ -130,8 +143,6 @@ int __init omap2_clk_provider_init(struct device_node *np, int index,
 {
 	struct clk_iomap *io;
 
-	ti_clk_ll_ops = &omap_clk_ll_ops;
-
 	io = kzalloc(sizeof(*io), GFP_KERNEL);
 
 	io->regmap = syscon;
@@ -155,8 +166,6 @@ void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem)
 {
 	struct clk_iomap *io;
 
-	ti_clk_ll_ops = &omap_clk_ll_ops;
-
 	io = memblock_virt_alloc(sizeof(*io), 0);
 
 	io->mem = mem;
diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h
index 1986ab216b1a..a7051d6a05e9 100644
--- a/arch/arm/mach-omap2/clock.h
+++ b/arch/arm/mach-omap2/clock.h
@@ -83,6 +83,7 @@ struct regmap;
 int __init omap2_clk_provider_init(struct device_node *np, int index,
 				   struct regmap *syscon, void __iomem *mem);
 void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem);
+int __init omap2_clk_setup_ll_ops(void);
 
 void __init ti_clk_init_features(void);
 #endif
diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c
index 74678565cd97..a253aafbb9a2 100644
--- a/arch/arm/mach-omap2/io.c
+++ b/arch/arm/mach-omap2/io.c
@@ -722,6 +722,8 @@ int __init omap_clk_init(void)
 
 	ti_clk_init_features();
 
+	omap2_clk_setup_ll_ops();
+
 	if (of_have_populated_dt()) {
 		ret = omap_control_init();
 		if (ret)
diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c
index 5baea03cfc92..58b83e0af90f 100644
--- a/drivers/clk/ti/clk.c
+++ b/drivers/clk/ti/clk.c
@@ -32,6 +32,27 @@ static struct device_node *clocks_node_ptr[CLK_MAX_MEMMAPS];
 
 struct ti_clk_features ti_clk_features;
 
+/**
+ * ti_clk_setup_ll_ops - setup low level clock operations
+ * @ops: low level clock ops descriptor
+ *
+ * Sets up low level clock operations for TI clock driver. This is used
+ * to provide various callbacks for the clock driver towards platform
+ * specific code. Returns 0 on success, -EBUSY if ll_ops have been
+ * registered already.
+ */
+int ti_clk_setup_ll_ops(struct ti_clk_ll_ops *ops)
+{
+	if (ti_clk_ll_ops) {
+		pr_err("Attempt to register ll_ops multiple times.\n");
+		return -EBUSY;
+	}
+
+	ti_clk_ll_ops = ops;
+
+	return 0;
+}
+
 /**
  * ti_dt_clocks_register - register DT alias clocks during boot
  * @oclks: list of clocks to register
diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h
index 3c43125b9cc9..d4d232fd89bc 100644
--- a/drivers/clk/ti/clock.h
+++ b/drivers/clk/ti/clock.h
@@ -280,4 +280,6 @@ long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw,
 					unsigned long *best_parent_rate,
 					struct clk_hw **best_parent_clk);
 
+extern struct ti_clk_ll_ops *ti_clk_ll_ops;
+
 #endif
diff --git a/drivers/clk/ti/clockdomain.c b/drivers/clk/ti/clockdomain.c
index 61ef87b1a688..80a7b6944d10 100644
--- a/drivers/clk/ti/clockdomain.c
+++ b/drivers/clk/ti/clockdomain.c
@@ -21,6 +21,8 @@
 #include <linux/of_address.h>
 #include <linux/clk/ti.h>
 
+#include "clock.h"
+
 #undef pr_fmt
 #define pr_fmt(fmt) "%s: " fmt, __func__
 
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 5eccdf5c0e84..5b644313e38a 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -235,8 +235,6 @@ struct ti_clk_ll_ops {
 				       u8 *idlest_reg_id);
 };
 
-extern struct ti_clk_ll_ops *ti_clk_ll_ops;
-
 #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw)
 
 void omap2_init_clk_clkdm(struct clk_hw *clk);
@@ -255,6 +253,7 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk);
 void ti_dt_clk_init_provider(struct device_node *np, int index);
 void ti_dt_clk_init_retry_clks(void);
 void ti_dt_clockdomains_setup(void);
+int ti_clk_setup_ll_ops(struct ti_clk_ll_ops *ops);
 
 int omap3430_dt_clk_init(void);
 int omap3630_dt_clk_init(void);
-- 
cgit v1.2.3-70-g09d2


From 989feafb84118a840ff21250a1e5f516f43e3dbb Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Mon, 27 Apr 2015 22:23:06 +0300
Subject: clk: ti: move low-level access and init code under clock driver

With most of the clock code under clock driver already, the low-level
register access code, and the init code for the same, is no longer
needed outside the clock driver. Thus, these can be moved under clock
driver also.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 arch/arm/mach-omap2/clock.c | 84 ---------------------------------------------
 arch/arm/mach-omap2/clock.h |  5 ---
 drivers/clk/ti/clk.c        | 75 ++++++++++++++++++++++++++++++++++++++--
 include/linux/clk/ti.h      |  7 +++-
 4 files changed, 78 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index 79cec5fbbe74..4340ba6524d1 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -23,9 +23,7 @@
 #include <linux/clk-provider.h>
 #include <linux/io.h>
 #include <linux/bitops.h>
-#include <linux/regmap.h>
 #include <linux/of_address.h>
-#include <linux/bootmem.h>
 #include <asm/cpu.h>
 
 #include <trace/events/power.h>
@@ -55,41 +53,7 @@ u16 cpu_mask;
 #define OMAP3PLUS_DPLL_FINT_MIN		32000
 #define OMAP3PLUS_DPLL_FINT_MAX		52000000
 
-struct clk_iomap {
-	struct regmap *regmap;
-	void __iomem *mem;
-};
-
-static struct clk_iomap *clk_memmaps[CLK_MAX_MEMMAPS];
-
-static void clk_memmap_writel(u32 val, void __iomem *reg)
-{
-	struct clk_omap_reg *r = (struct clk_omap_reg *)&reg;
-	struct clk_iomap *io = clk_memmaps[r->index];
-
-	if (io->regmap)
-		regmap_write(io->regmap, r->offset, val);
-	else
-		writel_relaxed(val, io->mem + r->offset);
-}
-
-static u32 clk_memmap_readl(void __iomem *reg)
-{
-	u32 val;
-	struct clk_omap_reg *r = (struct clk_omap_reg *)&reg;
-	struct clk_iomap *io = clk_memmaps[r->index];
-
-	if (io->regmap)
-		regmap_read(io->regmap, r->offset, &val);
-	else
-		val = readl_relaxed(io->mem + r->offset);
-
-	return val;
-}
-
 static struct ti_clk_ll_ops omap_clk_ll_ops = {
-	.clk_readl = clk_memmap_readl,
-	.clk_writel = clk_memmap_writel,
 	.clkdm_clk_enable = clkdm_clk_enable,
 	.clkdm_clk_disable = clkdm_clk_disable,
 	.cm_wait_module_ready = omap_cm_wait_module_ready,
@@ -109,54 +73,6 @@ int __init omap2_clk_setup_ll_ops(void)
 	return ti_clk_setup_ll_ops(&omap_clk_ll_ops);
 }
 
-/**
- * omap2_clk_provider_init - initialize a clock provider
- * @match_table: DT device table to match for devices to init
- * @np: device node pointer for the this clock provider
- * @index: index for the clock provider
- + @syscon: syscon regmap pointer
- * @mem: iomem pointer for the clock provider memory area, only used if
- *	 syscon is not provided
- *
- * Initializes a clock provider module (CM/PRM etc.), registering
- * the memory mapping at specified index and initializing the
- * low level driver infrastructure. Returns 0 in success.
- */
-int __init omap2_clk_provider_init(struct device_node *np, int index,
-				   struct regmap *syscon, void __iomem *mem)
-{
-	struct clk_iomap *io;
-
-	io = kzalloc(sizeof(*io), GFP_KERNEL);
-
-	io->regmap = syscon;
-	io->mem = mem;
-
-	clk_memmaps[index] = io;
-
-	ti_dt_clk_init_provider(np, index);
-
-	return 0;
-}
-
-/**
- * omap2_clk_legacy_provider_init - initialize a legacy clock provider
- * @index: index for the clock provider
- * @mem: iomem pointer for the clock provider memory area
- *
- * Initializes a legacy clock provider memory mapping.
- */
-void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem)
-{
-	struct clk_iomap *io;
-
-	io = memblock_virt_alloc(sizeof(*io), 0);
-
-	io->mem = mem;
-
-	clk_memmaps[index] = io;
-}
-
 /*
  * OMAP2+ specific clock functions
  */
diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h
index f3dc04cd5538..67da640ba1c7 100644
--- a/arch/arm/mach-omap2/clock.h
+++ b/arch/arm/mach-omap2/clock.h
@@ -75,11 +75,6 @@ extern const struct clkops clkops_omap2_dflt;
 
 extern struct clk_functions omap2_clk_functions;
 
-struct regmap;
-
-int __init omap2_clk_provider_init(struct device_node *np, int index,
-				   struct regmap *syscon, void __iomem *mem);
-void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem);
 int __init omap2_clk_setup_ll_ops(void);
 
 void __init ti_clk_init_features(void);
diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c
index 58b83e0af90f..07584e00677e 100644
--- a/drivers/clk/ti/clk.c
+++ b/drivers/clk/ti/clk.c
@@ -21,6 +21,8 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/list.h>
+#include <linux/regmap.h>
+#include <linux/bootmem.h>
 
 #include "clock.h"
 
@@ -32,6 +34,38 @@ static struct device_node *clocks_node_ptr[CLK_MAX_MEMMAPS];
 
 struct ti_clk_features ti_clk_features;
 
+struct clk_iomap {
+	struct regmap *regmap;
+	void __iomem *mem;
+};
+
+static struct clk_iomap *clk_memmaps[CLK_MAX_MEMMAPS];
+
+static void clk_memmap_writel(u32 val, void __iomem *reg)
+{
+	struct clk_omap_reg *r = (struct clk_omap_reg *)&reg;
+	struct clk_iomap *io = clk_memmaps[r->index];
+
+	if (io->regmap)
+		regmap_write(io->regmap, r->offset, val);
+	else
+		writel_relaxed(val, io->mem + r->offset);
+}
+
+static u32 clk_memmap_readl(void __iomem *reg)
+{
+	u32 val;
+	struct clk_omap_reg *r = (struct clk_omap_reg *)&reg;
+	struct clk_iomap *io = clk_memmaps[r->index];
+
+	if (io->regmap)
+		regmap_read(io->regmap, r->offset, &val);
+	else
+		val = readl_relaxed(io->mem + r->offset);
+
+	return val;
+}
+
 /**
  * ti_clk_setup_ll_ops - setup low level clock operations
  * @ops: low level clock ops descriptor
@@ -49,6 +83,8 @@ int ti_clk_setup_ll_ops(struct ti_clk_ll_ops *ops)
 	}
 
 	ti_clk_ll_ops = ops;
+	ops->clk_readl = clk_memmap_readl;
+	ops->clk_writel = clk_memmap_writel;
 
 	return 0;
 }
@@ -161,28 +197,61 @@ void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index)
 }
 
 /**
- * ti_dt_clk_init_provider - init master clock provider
+ * omap2_clk_provider_init - init master clock provider
  * @parent: master node
  * @index: internal index for clk_reg_ops
+ * @syscon: syscon regmap pointer for accessing clock registers
+ * @mem: iomem pointer for the clock provider memory area, only used if
+ *       syscon is not provided
  *
  * Initializes a master clock IP block. This basically sets up the
  * mapping from clocks node to the memory map index. All the clocks
  * are then initialized through the common of_clk_init call, and the
  * clocks will access their memory maps based on the node layout.
+ * Returns 0 in success.
  */
-void ti_dt_clk_init_provider(struct device_node *parent, int index)
+int __init omap2_clk_provider_init(struct device_node *parent, int index,
+				   struct regmap *syscon, void __iomem *mem)
 {
 	struct device_node *clocks;
+	struct clk_iomap *io;
 
 	/* get clocks for this parent */
 	clocks = of_get_child_by_name(parent, "clocks");
 	if (!clocks) {
 		pr_err("%s missing 'clocks' child node.\n", parent->name);
-		return;
+		return -EINVAL;
 	}
 
 	/* add clocks node info */
 	clocks_node_ptr[index] = clocks;
+
+	io = kzalloc(sizeof(*io), GFP_KERNEL);
+
+	io->regmap = syscon;
+	io->mem = mem;
+
+	clk_memmaps[index] = io;
+
+	return 0;
+}
+
+/**
+ * omap2_clk_legacy_provider_init - initialize a legacy clock provider
+ * @index: index for the clock provider
+ * @mem: iomem pointer for the clock provider memory area
+ *
+ * Initializes a legacy clock provider memory mapping.
+ */
+void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem)
+{
+	struct clk_iomap *io;
+
+	io = memblock_virt_alloc(sizeof(*io), 0);
+
+	io->mem = mem;
+
+	clk_memmaps[index] = io;
 }
 
 /**
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 5b644313e38a..9299222d680d 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -250,11 +250,16 @@ void omap2xxx_clkt_dpllcore_init(struct clk_hw *hw);
 void omap2xxx_clkt_vps_init(void);
 unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk);
 
-void ti_dt_clk_init_provider(struct device_node *np, int index);
 void ti_dt_clk_init_retry_clks(void);
 void ti_dt_clockdomains_setup(void);
 int ti_clk_setup_ll_ops(struct ti_clk_ll_ops *ops);
 
+struct regmap;
+
+int omap2_clk_provider_init(struct device_node *parent, int index,
+			    struct regmap *syscon, void __iomem *mem);
+void omap2_clk_legacy_provider_init(int index, void __iomem *mem);
+
 int omap3430_dt_clk_init(void);
 int omap3630_dt_clk_init(void);
 int am35xx_dt_clk_init(void);
-- 
cgit v1.2.3-70-g09d2


From 1e25aa9641e8f3fa39cd5e46b4afcafd7f12a44b Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 1 Jun 2015 16:36:27 -0700
Subject: hid-sensor: Fix suspend/resume delay

By default all the sensors are runtime suspended state (lowest power
state). During Linux suspend process, all the run time suspended
devices are resumed and then suspended. This caused all sensors to
power up and introduced delay in suspend time, when we introduced
runtime PM for HID sensors. The opposite process happens during resume
process.

To fix this, we do powerup process of the sensors only when the request
is issued from user (raw or tiggerred). In this way when runtime,
resume calls for powerup it will simply return as this will not match
user requested state.

Note this is a regression fix as the increase in suspend / resume
times can be substantial (report of 8 seconds on Len's laptop!)

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Tested-by: Len Brown <len.brown@intel.com>
Cc: <Stable@vger.kernel.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/common/hid-sensors/hid-sensor-trigger.c | 11 ++++++++++-
 include/linux/hid-sensor-hub.h                      |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
index 610fc98f88ef..595511022795 100644
--- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
+++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
@@ -36,6 +36,8 @@ static int _hid_sensor_power_state(struct hid_sensor_common *st, bool state)
 	s32 poll_value = 0;
 
 	if (state) {
+		if (!atomic_read(&st->user_requested_state))
+			return 0;
 		if (sensor_hub_device_open(st->hsdev))
 			return -EIO;
 
@@ -52,8 +54,12 @@ static int _hid_sensor_power_state(struct hid_sensor_common *st, bool state)
 
 		poll_value = hid_sensor_read_poll_value(st);
 	} else {
-		if (!atomic_dec_and_test(&st->data_ready))
+		int val;
+
+		val = atomic_dec_if_positive(&st->data_ready);
+		if (val < 0)
 			return 0;
+
 		sensor_hub_device_close(st->hsdev);
 		state_val = hid_sensor_get_usage_index(st->hsdev,
 			st->power_state.report_id,
@@ -92,9 +98,11 @@ EXPORT_SYMBOL(hid_sensor_power_state);
 
 int hid_sensor_power_state(struct hid_sensor_common *st, bool state)
 {
+
 #ifdef CONFIG_PM
 	int ret;
 
+	atomic_set(&st->user_requested_state, state);
 	if (state)
 		ret = pm_runtime_get_sync(&st->pdev->dev);
 	else {
@@ -109,6 +117,7 @@ int hid_sensor_power_state(struct hid_sensor_common *st, bool state)
 
  	return 0;
 #else
+	atomic_set(&st->user_requested_state, state);
 	return _hid_sensor_power_state(st, state);
 #endif
 }
diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h
index 0408421d885f..cd224dfd94d8 100644
--- a/include/linux/hid-sensor-hub.h
+++ b/include/linux/hid-sensor-hub.h
@@ -230,6 +230,7 @@ struct hid_sensor_common {
 	struct platform_device *pdev;
 	unsigned usage_id;
 	atomic_t data_ready;
+	atomic_t user_requested_state;
 	struct iio_trigger *trigger;
 	struct hid_sensor_hub_attribute_info poll;
 	struct hid_sensor_hub_attribute_info report_state;
-- 
cgit v1.2.3-70-g09d2


From fcc577dd55db193926537e0e4de98492d665446b Mon Sep 17 00:00:00 2001
From: Cristina Opriceana <cristina.opriceana@gmail.com>
Date: Tue, 23 Jun 2015 16:34:19 +0300
Subject: iio: Fix parameters in iio_triggered_buffer_setup

This patch renames the top half handler and the bottom half handler
of iio_triggered_buffer_setup() in accordance with their usage.
The bottom half has been renamed to reflect the fact that it is a
thread based call, compliant with iio_alloc_pollfunc().
The names of the parameters were swapped, thus creating confusion.

Signed-off-by: Cristina Opriceana <cristina.opriceana@gmail.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/industrialio-triggered-buffer.c | 12 ++++++------
 include/linux/iio/triggered_buffer.h        |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-triggered-buffer.c b/drivers/iio/industrialio-triggered-buffer.c
index 15a5341b5e7b..4b2858ba1fd6 100644
--- a/drivers/iio/industrialio-triggered-buffer.c
+++ b/drivers/iio/industrialio-triggered-buffer.c
@@ -24,8 +24,8 @@ static const struct iio_buffer_setup_ops iio_triggered_buffer_setup_ops = {
 /**
  * iio_triggered_buffer_setup() - Setup triggered buffer and pollfunc
  * @indio_dev:		IIO device structure
- * @pollfunc_bh:	Function which will be used as pollfunc bottom half
- * @pollfunc_th:	Function which will be used as pollfunc top half
+ * @h:			Function which will be used as pollfunc top half
+ * @thread:		Function which will be used as pollfunc bottom half
  * @setup_ops:		Buffer setup functions to use for this device.
  *			If NULL the default setup functions for triggered
  *			buffers will be used.
@@ -42,8 +42,8 @@ static const struct iio_buffer_setup_ops iio_triggered_buffer_setup_ops = {
  * iio_triggered_buffer_cleanup().
  */
 int iio_triggered_buffer_setup(struct iio_dev *indio_dev,
-	irqreturn_t (*pollfunc_bh)(int irq, void *p),
-	irqreturn_t (*pollfunc_th)(int irq, void *p),
+	irqreturn_t (*h)(int irq, void *p),
+	irqreturn_t (*thread)(int irq, void *p),
 	const struct iio_buffer_setup_ops *setup_ops)
 {
 	struct iio_buffer *buffer;
@@ -57,8 +57,8 @@ int iio_triggered_buffer_setup(struct iio_dev *indio_dev,
 
 	iio_device_attach_buffer(indio_dev, buffer);
 
-	indio_dev->pollfunc = iio_alloc_pollfunc(pollfunc_bh,
-						 pollfunc_th,
+	indio_dev->pollfunc = iio_alloc_pollfunc(h,
+						 thread,
 						 IRQF_ONESHOT,
 						 indio_dev,
 						 "%s_consumer%d",
diff --git a/include/linux/iio/triggered_buffer.h b/include/linux/iio/triggered_buffer.h
index c378ebec605e..f72f70d5a97b 100644
--- a/include/linux/iio/triggered_buffer.h
+++ b/include/linux/iio/triggered_buffer.h
@@ -7,8 +7,8 @@ struct iio_dev;
 struct iio_buffer_setup_ops;
 
 int iio_triggered_buffer_setup(struct iio_dev *indio_dev,
-	irqreturn_t (*pollfunc_bh)(int irq, void *p),
-	irqreturn_t (*pollfunc_th)(int irq, void *p),
+	irqreturn_t (*h)(int irq, void *p),
+	irqreturn_t (*thread)(int irq, void *p),
 	const struct iio_buffer_setup_ops *setup_ops);
 void iio_triggered_buffer_cleanup(struct iio_dev *indio_dev);
 
-- 
cgit v1.2.3-70-g09d2


From 0fd972a7d91d6e15393c449492a04d94c0b89351 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 1 May 2015 20:13:42 -0400
Subject: module: relocate module_init from init.h to module.h

Modular users will always be users of init functionality, but
users of init functionality are not necessarily always modules.

Hence any functionality like module_init and module_exit would
be more at home in the module.h file.  And module.h should
explicitly include init.h to make the dependency clear.

We've already done all the legwork needed to ensure that this
move does not cause any build regressions due to implicit
header file include assumptions about where module_init lives.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 include/linux/init.h   | 78 ----------------------------------------------
 include/linux/module.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 7c68c36d3fd8..b449f378f995 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -282,68 +282,8 @@ void __init parse_early_param(void);
 void __init parse_early_options(char *cmdline);
 #endif /* __ASSEMBLY__ */
 
-/**
- * module_init() - driver initialization entry point
- * @x: function to be run at kernel boot time or module insertion
- * 
- * module_init() will either be called during do_initcalls() (if
- * builtin) or at module insertion time (if a module).  There can only
- * be one per module.
- */
-#define module_init(x)	__initcall(x);
-
-/**
- * module_exit() - driver exit entry point
- * @x: function to be run when driver is removed
- * 
- * module_exit() will wrap the driver clean-up code
- * with cleanup_module() when used with rmmod when
- * the driver is a module.  If the driver is statically
- * compiled into the kernel, module_exit() has no effect.
- * There can only be one per module.
- */
-#define module_exit(x)	__exitcall(x);
-
 #else /* MODULE */
 
-/*
- * In most cases loadable modules do not need custom
- * initcall levels. There are still some valid cases where
- * a driver may be needed early if built in, and does not
- * matter when built as a loadable module. Like bus
- * snooping debug drivers.
- */
-#define early_initcall(fn)		module_init(fn)
-#define core_initcall(fn)		module_init(fn)
-#define core_initcall_sync(fn)		module_init(fn)
-#define postcore_initcall(fn)		module_init(fn)
-#define postcore_initcall_sync(fn)	module_init(fn)
-#define arch_initcall(fn)		module_init(fn)
-#define subsys_initcall(fn)		module_init(fn)
-#define subsys_initcall_sync(fn)	module_init(fn)
-#define fs_initcall(fn)			module_init(fn)
-#define fs_initcall_sync(fn)		module_init(fn)
-#define rootfs_initcall(fn)		module_init(fn)
-#define device_initcall(fn)		module_init(fn)
-#define device_initcall_sync(fn)	module_init(fn)
-#define late_initcall(fn)		module_init(fn)
-#define late_initcall_sync(fn)		module_init(fn)
-
-#define console_initcall(fn)		module_init(fn)
-#define security_initcall(fn)		module_init(fn)
-
-/* Each module must use one module_init(). */
-#define module_init(initfn)					\
-	static inline initcall_t __inittest(void)		\
-	{ return initfn; }					\
-	int init_module(void) __attribute__((alias(#initfn)));
-
-/* This is only required if you want to be unloadable. */
-#define module_exit(exitfn)					\
-	static inline exitcall_t __exittest(void)		\
-	{ return exitfn; }					\
-	void cleanup_module(void) __attribute__((alias(#exitfn)));
-
 #define __setup_param(str, unique_id, fn)	/* nothing */
 #define __setup(str, func) 			/* nothing */
 #endif
@@ -351,24 +291,6 @@ void __init parse_early_options(char *cmdline);
 /* Data marked not to be saved by software suspend */
 #define __nosavedata __section(.data..nosave)
 
-/* This means "can be init if no module support, otherwise module load
-   may call it." */
-#ifdef CONFIG_MODULES
-#define __init_or_module
-#define __initdata_or_module
-#define __initconst_or_module
-#define __INIT_OR_MODULE	.text
-#define __INITDATA_OR_MODULE	.data
-#define __INITRODATA_OR_MODULE	.section ".rodata","a",%progbits
-#else
-#define __init_or_module __init
-#define __initdata_or_module __initdata
-#define __initconst_or_module __initconst
-#define __INIT_OR_MODULE __INIT
-#define __INITDATA_OR_MODULE __INITDATA
-#define __INITRODATA_OR_MODULE __INITRODATA
-#endif /*CONFIG_MODULES*/
-
 #ifdef MODULE
 #define __exit_p(x) x
 #else
diff --git a/include/linux/module.h b/include/linux/module.h
index d67b1932cc59..3a19c79918e0 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -11,6 +11,7 @@
 #include <linux/compiler.h>
 #include <linux/cache.h>
 #include <linux/kmod.h>
+#include <linux/init.h>
 #include <linux/elf.h>
 #include <linux/stringify.h>
 #include <linux/kobject.h>
@@ -71,6 +72,89 @@ extern struct module_attribute module_uevent;
 extern int init_module(void);
 extern void cleanup_module(void);
 
+#ifndef MODULE
+/**
+ * module_init() - driver initialization entry point
+ * @x: function to be run at kernel boot time or module insertion
+ *
+ * module_init() will either be called during do_initcalls() (if
+ * builtin) or at module insertion time (if a module).  There can only
+ * be one per module.
+ */
+#define module_init(x)	__initcall(x);
+
+/**
+ * module_exit() - driver exit entry point
+ * @x: function to be run when driver is removed
+ *
+ * module_exit() will wrap the driver clean-up code
+ * with cleanup_module() when used with rmmod when
+ * the driver is a module.  If the driver is statically
+ * compiled into the kernel, module_exit() has no effect.
+ * There can only be one per module.
+ */
+#define module_exit(x)	__exitcall(x);
+
+#else /* MODULE */
+
+/*
+ * In most cases loadable modules do not need custom
+ * initcall levels. There are still some valid cases where
+ * a driver may be needed early if built in, and does not
+ * matter when built as a loadable module. Like bus
+ * snooping debug drivers.
+ */
+#define early_initcall(fn)		module_init(fn)
+#define core_initcall(fn)		module_init(fn)
+#define core_initcall_sync(fn)		module_init(fn)
+#define postcore_initcall(fn)		module_init(fn)
+#define postcore_initcall_sync(fn)	module_init(fn)
+#define arch_initcall(fn)		module_init(fn)
+#define subsys_initcall(fn)		module_init(fn)
+#define subsys_initcall_sync(fn)	module_init(fn)
+#define fs_initcall(fn)			module_init(fn)
+#define fs_initcall_sync(fn)		module_init(fn)
+#define rootfs_initcall(fn)		module_init(fn)
+#define device_initcall(fn)		module_init(fn)
+#define device_initcall_sync(fn)	module_init(fn)
+#define late_initcall(fn)		module_init(fn)
+#define late_initcall_sync(fn)		module_init(fn)
+
+#define console_initcall(fn)		module_init(fn)
+#define security_initcall(fn)		module_init(fn)
+
+/* Each module must use one module_init(). */
+#define module_init(initfn)					\
+	static inline initcall_t __inittest(void)		\
+	{ return initfn; }					\
+	int init_module(void) __attribute__((alias(#initfn)));
+
+/* This is only required if you want to be unloadable. */
+#define module_exit(exitfn)					\
+	static inline exitcall_t __exittest(void)		\
+	{ return exitfn; }					\
+	void cleanup_module(void) __attribute__((alias(#exitfn)));
+
+#endif
+
+/* This means "can be init if no module support, otherwise module load
+   may call it." */
+#ifdef CONFIG_MODULES
+#define __init_or_module
+#define __initdata_or_module
+#define __initconst_or_module
+#define __INIT_OR_MODULE	.text
+#define __INITDATA_OR_MODULE	.data
+#define __INITRODATA_OR_MODULE	.section ".rodata","a",%progbits
+#else
+#define __init_or_module __init
+#define __initdata_or_module __initdata
+#define __initconst_or_module __initconst
+#define __INIT_OR_MODULE __INIT
+#define __INITDATA_OR_MODULE __INITDATA
+#define __INITRODATA_OR_MODULE __INITRODATA
+#endif /*CONFIG_MODULES*/
+
 /* Archs provide a method of finding the correct exception table. */
 struct exception_table_entry;
 
-- 
cgit v1.2.3-70-g09d2


From b17d1bf16cc72a374a48d748940f700009d40ff4 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Wed, 11 Feb 2015 11:52:37 +0100
Subject: gpio: make flags mandatory for gpiod_get functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that all[1] users of the gpiod_get functions are converted to make
use of the up to now optional flags parameter, make it mandatory which
allows to remove some cpp magic.

[1] all but etraxfs-uart which is broken anyhow and I'm allowed to
    ignore it by Jesper Nilsson :-)

Acked-by: Alexandre Courbot <acourbot@nvidia.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/gpio/devres.c         | 18 +++++-----
 drivers/gpio/gpiolib.c        | 16 ++++-----
 include/linux/gpio/consumer.h | 82 ++++++++++++-------------------------------
 3 files changed, 40 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/devres.c b/drivers/gpio/devres.c
index 07ba82317ece..903fcf4d04a0 100644
--- a/drivers/gpio/devres.c
+++ b/drivers/gpio/devres.c
@@ -59,13 +59,13 @@ static int devm_gpiod_match_array(struct device *dev, void *res, void *data)
  * automatically disposed on driver detach. See gpiod_get() for detailed
  * information about behavior and return values.
  */
-struct gpio_desc *__must_check __devm_gpiod_get(struct device *dev,
+struct gpio_desc *__must_check devm_gpiod_get(struct device *dev,
 					      const char *con_id,
 					      enum gpiod_flags flags)
 {
 	return devm_gpiod_get_index(dev, con_id, 0, flags);
 }
-EXPORT_SYMBOL(__devm_gpiod_get);
+EXPORT_SYMBOL(devm_gpiod_get);
 
 /**
  * devm_gpiod_get_optional - Resource-managed gpiod_get_optional()
@@ -77,13 +77,13 @@ EXPORT_SYMBOL(__devm_gpiod_get);
  * are automatically disposed on driver detach. See gpiod_get_optional() for
  * detailed information about behavior and return values.
  */
-struct gpio_desc *__must_check __devm_gpiod_get_optional(struct device *dev,
+struct gpio_desc *__must_check devm_gpiod_get_optional(struct device *dev,
 						       const char *con_id,
 						       enum gpiod_flags flags)
 {
 	return devm_gpiod_get_index_optional(dev, con_id, 0, flags);
 }
-EXPORT_SYMBOL(__devm_gpiod_get_optional);
+EXPORT_SYMBOL(devm_gpiod_get_optional);
 
 /**
  * devm_gpiod_get_index - Resource-managed gpiod_get_index()
@@ -96,7 +96,7 @@ EXPORT_SYMBOL(__devm_gpiod_get_optional);
  * automatically disposed on driver detach. See gpiod_get_index() for detailed
  * information about behavior and return values.
  */
-struct gpio_desc *__must_check __devm_gpiod_get_index(struct device *dev,
+struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev,
 						    const char *con_id,
 						    unsigned int idx,
 						    enum gpiod_flags flags)
@@ -120,7 +120,7 @@ struct gpio_desc *__must_check __devm_gpiod_get_index(struct device *dev,
 
 	return desc;
 }
-EXPORT_SYMBOL(__devm_gpiod_get_index);
+EXPORT_SYMBOL(devm_gpiod_get_index);
 
 /**
  * devm_get_gpiod_from_child - get a GPIO descriptor from a device's child node
@@ -182,10 +182,10 @@ EXPORT_SYMBOL(devm_get_gpiod_from_child);
  * gpiod_get_index_optional() for detailed information about behavior and
  * return values.
  */
-struct gpio_desc *__must_check __devm_gpiod_get_index_optional(struct device *dev,
+struct gpio_desc *__must_check devm_gpiod_get_index_optional(struct device *dev,
 							     const char *con_id,
 							     unsigned int index,
-							 enum gpiod_flags flags)
+							     enum gpiod_flags flags)
 {
 	struct gpio_desc *desc;
 
@@ -197,7 +197,7 @@ struct gpio_desc *__must_check __devm_gpiod_get_index_optional(struct device *de
 
 	return desc;
 }
-EXPORT_SYMBOL(__devm_gpiod_get_index_optional);
+EXPORT_SYMBOL(devm_gpiod_get_index_optional);
 
 /**
  * devm_gpiod_get_array - Resource-managed gpiod_get_array()
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index bf4bd1d120c3..4b2f98168225 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1894,12 +1894,12 @@ EXPORT_SYMBOL_GPL(gpiod_count);
  * dev, -ENOENT if no GPIO has been assigned to the requested function, or
  * another IS_ERR() code if an error occurred while trying to acquire the GPIO.
  */
-struct gpio_desc *__must_check __gpiod_get(struct device *dev, const char *con_id,
+struct gpio_desc *__must_check gpiod_get(struct device *dev, const char *con_id,
 					 enum gpiod_flags flags)
 {
 	return gpiod_get_index(dev, con_id, 0, flags);
 }
-EXPORT_SYMBOL_GPL(__gpiod_get);
+EXPORT_SYMBOL_GPL(gpiod_get);
 
 /**
  * gpiod_get_optional - obtain an optional GPIO for a given GPIO function
@@ -1911,13 +1911,13 @@ EXPORT_SYMBOL_GPL(__gpiod_get);
  * the requested function it will return NULL. This is convenient for drivers
  * that need to handle optional GPIOs.
  */
-struct gpio_desc *__must_check __gpiod_get_optional(struct device *dev,
+struct gpio_desc *__must_check gpiod_get_optional(struct device *dev,
 						  const char *con_id,
 						  enum gpiod_flags flags)
 {
 	return gpiod_get_index_optional(dev, con_id, 0, flags);
 }
-EXPORT_SYMBOL_GPL(__gpiod_get_optional);
+EXPORT_SYMBOL_GPL(gpiod_get_optional);
 
 
 /**
@@ -1974,7 +1974,7 @@ static int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id,
  * requested function and/or index, or another IS_ERR() code if an error
  * occurred while trying to acquire the GPIO.
  */
-struct gpio_desc *__must_check __gpiod_get_index(struct device *dev,
+struct gpio_desc *__must_check gpiod_get_index(struct device *dev,
 					       const char *con_id,
 					       unsigned int idx,
 					       enum gpiod_flags flags)
@@ -2023,7 +2023,7 @@ struct gpio_desc *__must_check __gpiod_get_index(struct device *dev,
 
 	return desc;
 }
-EXPORT_SYMBOL_GPL(__gpiod_get_index);
+EXPORT_SYMBOL_GPL(gpiod_get_index);
 
 /**
  * fwnode_get_named_gpiod - obtain a GPIO from firmware node
@@ -2092,7 +2092,7 @@ EXPORT_SYMBOL_GPL(fwnode_get_named_gpiod);
  * specified index was assigned to the requested function it will return NULL.
  * This is convenient for drivers that need to handle optional GPIOs.
  */
-struct gpio_desc *__must_check __gpiod_get_index_optional(struct device *dev,
+struct gpio_desc *__must_check gpiod_get_index_optional(struct device *dev,
 							const char *con_id,
 							unsigned int index,
 							enum gpiod_flags flags)
@@ -2107,7 +2107,7 @@ struct gpio_desc *__must_check __gpiod_get_index_optional(struct device *dev,
 
 	return desc;
 }
-EXPORT_SYMBOL_GPL(__gpiod_get_index_optional);
+EXPORT_SYMBOL_GPL(gpiod_get_index_optional);
 
 /**
  * gpiod_hog - Hog the specified GPIO desc given the provided flags
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index adac255aee86..14cac67c2012 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -47,17 +47,17 @@ enum gpiod_flags {
 int gpiod_count(struct device *dev, const char *con_id);
 
 /* Acquire and dispose GPIOs */
-struct gpio_desc *__must_check __gpiod_get(struct device *dev,
+struct gpio_desc *__must_check gpiod_get(struct device *dev,
 					 const char *con_id,
 					 enum gpiod_flags flags);
-struct gpio_desc *__must_check __gpiod_get_index(struct device *dev,
+struct gpio_desc *__must_check gpiod_get_index(struct device *dev,
 					       const char *con_id,
 					       unsigned int idx,
 					       enum gpiod_flags flags);
-struct gpio_desc *__must_check __gpiod_get_optional(struct device *dev,
+struct gpio_desc *__must_check gpiod_get_optional(struct device *dev,
 						  const char *con_id,
 						  enum gpiod_flags flags);
-struct gpio_desc *__must_check __gpiod_get_index_optional(struct device *dev,
+struct gpio_desc *__must_check gpiod_get_index_optional(struct device *dev,
 							const char *con_id,
 							unsigned int index,
 							enum gpiod_flags flags);
@@ -70,18 +70,18 @@ struct gpio_descs *__must_check gpiod_get_array_optional(struct device *dev,
 void gpiod_put(struct gpio_desc *desc);
 void gpiod_put_array(struct gpio_descs *descs);
 
-struct gpio_desc *__must_check __devm_gpiod_get(struct device *dev,
+struct gpio_desc *__must_check devm_gpiod_get(struct device *dev,
 					      const char *con_id,
 					      enum gpiod_flags flags);
-struct gpio_desc *__must_check __devm_gpiod_get_index(struct device *dev,
+struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev,
 						    const char *con_id,
 						    unsigned int idx,
 						    enum gpiod_flags flags);
-struct gpio_desc *__must_check __devm_gpiod_get_optional(struct device *dev,
+struct gpio_desc *__must_check devm_gpiod_get_optional(struct device *dev,
 						       const char *con_id,
 						       enum gpiod_flags flags);
 struct gpio_desc *__must_check
-__devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
+devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
 			      unsigned int index, enum gpiod_flags flags);
 struct gpio_descs *__must_check devm_gpiod_get_array(struct device *dev,
 						     const char *con_id,
@@ -146,31 +146,31 @@ static inline int gpiod_count(struct device *dev, const char *con_id)
 	return 0;
 }
 
-static inline struct gpio_desc *__must_check __gpiod_get(struct device *dev,
-						const char *con_id,
-						enum gpiod_flags flags)
+static inline struct gpio_desc *__must_check gpiod_get(struct device *dev,
+						       const char *con_id,
+						       enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 static inline struct gpio_desc *__must_check
-__gpiod_get_index(struct device *dev,
-		  const char *con_id,
-		  unsigned int idx,
-		  enum gpiod_flags flags)
+gpiod_get_index(struct device *dev,
+		const char *con_id,
+		unsigned int idx,
+		enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
 static inline struct gpio_desc *__must_check
-__gpiod_get_optional(struct device *dev, const char *con_id,
-		     enum gpiod_flags flags)
+gpiod_get_optional(struct device *dev, const char *con_id,
+		   enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
 static inline struct gpio_desc *__must_check
-__gpiod_get_index_optional(struct device *dev, const char *con_id,
-			   unsigned int index, enum gpiod_flags flags)
+gpiod_get_index_optional(struct device *dev, const char *con_id,
+			 unsigned int index, enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -206,7 +206,7 @@ static inline void gpiod_put_array(struct gpio_descs *descs)
 }
 
 static inline struct gpio_desc *__must_check
-__devm_gpiod_get(struct device *dev,
+devm_gpiod_get(struct device *dev,
 		 const char *con_id,
 		 enum gpiod_flags flags)
 {
@@ -214,7 +214,7 @@ __devm_gpiod_get(struct device *dev,
 }
 static inline
 struct gpio_desc *__must_check
-__devm_gpiod_get_index(struct device *dev,
+devm_gpiod_get_index(struct device *dev,
 		       const char *con_id,
 		       unsigned int idx,
 		       enum gpiod_flags flags)
@@ -223,14 +223,14 @@ __devm_gpiod_get_index(struct device *dev,
 }
 
 static inline struct gpio_desc *__must_check
-__devm_gpiod_get_optional(struct device *dev, const char *con_id,
+devm_gpiod_get_optional(struct device *dev, const char *con_id,
 			  enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
 static inline struct gpio_desc *__must_check
-__devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
+devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
 				unsigned int index, enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
@@ -424,42 +424,6 @@ static inline struct gpio_desc *devm_get_gpiod_from_child(
 
 #endif /* CONFIG_GPIOLIB */
 
-/*
- * Vararg-hacks! This is done to transition the kernel to always pass
- * the options flags argument to the below functions. During a transition
- * phase these vararg macros make both old-and-newstyle code compile,
- * but when all calls to the elder API are removed, these should go away
- * and the __gpiod_get() etc functions above be renamed just gpiod_get()
- * etc.
- */
-#define __gpiod_get(dev, con_id, flags, ...) __gpiod_get(dev, con_id, flags)
-#define gpiod_get(varargs...) __gpiod_get(varargs, GPIOD_ASIS)
-#define __gpiod_get_index(dev, con_id, index, flags, ...)		\
-	__gpiod_get_index(dev, con_id, index, flags)
-#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, GPIOD_ASIS)
-#define __gpiod_get_optional(dev, con_id, flags, ...)			\
-	__gpiod_get_optional(dev, con_id, flags)
-#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, GPIOD_ASIS)
-#define __gpiod_get_index_optional(dev, con_id, index, flags, ...)	\
-	__gpiod_get_index_optional(dev, con_id, index, flags)
-#define gpiod_get_index_optional(varargs...)				\
-	__gpiod_get_index_optional(varargs, GPIOD_ASIS)
-#define __devm_gpiod_get(dev, con_id, flags, ...)			\
-	__devm_gpiod_get(dev, con_id, flags)
-#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, GPIOD_ASIS)
-#define __devm_gpiod_get_index(dev, con_id, index, flags, ...)		\
-	__devm_gpiod_get_index(dev, con_id, index, flags)
-#define devm_gpiod_get_index(varargs...)				\
-	__devm_gpiod_get_index(varargs, GPIOD_ASIS)
-#define __devm_gpiod_get_optional(dev, con_id, flags, ...)		\
-	__devm_gpiod_get_optional(dev, con_id, flags)
-#define devm_gpiod_get_optional(varargs...)				\
-	__devm_gpiod_get_optional(varargs, GPIOD_ASIS)
-#define __devm_gpiod_get_index_optional(dev, con_id, index, flags, ...)	\
-	__devm_gpiod_get_index_optional(dev, con_id, index, flags)
-#define devm_gpiod_get_index_optional(varargs...)			\
-	__devm_gpiod_get_index_optional(varargs, GPIOD_ASIS)
-
 #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS)
 
 int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
-- 
cgit v1.2.3-70-g09d2


From 16a624a9c81814cc2f1353eff2e502430c3fa79a Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Wed, 24 Jun 2015 08:17:02 +0200
Subject: soc: mediatek: Add infracfg misc driver support

This adds support for some miscellaneous bits of the infracfg controller.
The mtk_infracfg_set/clear_bus_protection functions are necessary for
the scpsys power domain driver to handle the bus protection bits which
are contained in the infacfg register space.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Reviewed-by: Daniel Kurtz <djkurtz@chromium.org>
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 drivers/soc/mediatek/Kconfig          |  9 ++++
 drivers/soc/mediatek/Makefile         |  1 +
 drivers/soc/mediatek/mtk-infracfg.c   | 91 +++++++++++++++++++++++++++++++++++
 include/linux/soc/mediatek/infracfg.h | 26 ++++++++++
 4 files changed, 127 insertions(+)
 create mode 100644 drivers/soc/mediatek/mtk-infracfg.c
 create mode 100644 include/linux/soc/mediatek/infracfg.h

(limited to 'include/linux')

diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig
index 3c1850332a90..e609a6f5e2eb 100644
--- a/drivers/soc/mediatek/Kconfig
+++ b/drivers/soc/mediatek/Kconfig
@@ -1,6 +1,15 @@
 #
 # MediaTek SoC drivers
 #
+config MTK_INFRACFG
+	bool "MediaTek INFRACFG Support"
+	depends on ARCH_MEDIATEK || COMPILE_TEST
+	select REGMAP
+	help
+	  Say yes here to add support for the MediaTek INFRACFG controller. The
+	  INFRACFG controller contains various infrastructure registers not
+	  directly associated to any device.
+
 config MTK_PMIC_WRAP
 	tristate "MediaTek PMIC Wrapper Support"
 	depends on ARCH_MEDIATEK
diff --git a/drivers/soc/mediatek/Makefile b/drivers/soc/mediatek/Makefile
index ecaf4defd7f6..3fa940fb4eab 100644
--- a/drivers/soc/mediatek/Makefile
+++ b/drivers/soc/mediatek/Makefile
@@ -1 +1,2 @@
+obj-$(CONFIG_MTK_INFRACFG) += mtk-infracfg.o
 obj-$(CONFIG_MTK_PMIC_WRAP) += mtk-pmic-wrap.o
diff --git a/drivers/soc/mediatek/mtk-infracfg.c b/drivers/soc/mediatek/mtk-infracfg.c
new file mode 100644
index 000000000000..dba3055a9493
--- /dev/null
+++ b/drivers/soc/mediatek/mtk-infracfg.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2015 Pengutronix, Sascha Hauer <kernel@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/export.h>
+#include <linux/jiffies.h>
+#include <linux/regmap.h>
+#include <linux/soc/mediatek/infracfg.h>
+#include <asm/processor.h>
+
+#define INFRA_TOPAXI_PROTECTEN		0x0220
+#define INFRA_TOPAXI_PROTECTSTA1	0x0228
+
+/**
+ * mtk_infracfg_set_bus_protection - enable bus protection
+ * @regmap: The infracfg regmap
+ * @mask: The mask containing the protection bits to be enabled.
+ *
+ * This function enables the bus protection bits for disabled power
+ * domains so that the system does not hang when some unit accesses the
+ * bus while in power down.
+ */
+int mtk_infracfg_set_bus_protection(struct regmap *infracfg, u32 mask)
+{
+	unsigned long expired;
+	u32 val;
+	int ret;
+
+	regmap_update_bits(infracfg, INFRA_TOPAXI_PROTECTEN, mask, mask);
+
+	expired = jiffies + HZ;
+
+	while (1) {
+		ret = regmap_read(infracfg, INFRA_TOPAXI_PROTECTSTA1, &val);
+		if (ret)
+			return ret;
+
+		if ((val & mask) == mask)
+			break;
+
+		cpu_relax();
+		if (time_after(jiffies, expired))
+			return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * mtk_infracfg_clear_bus_protection - disable bus protection
+ * @regmap: The infracfg regmap
+ * @mask: The mask containing the protection bits to be disabled.
+ *
+ * This function disables the bus protection bits previously enabled with
+ * mtk_infracfg_set_bus_protection.
+ */
+int mtk_infracfg_clear_bus_protection(struct regmap *infracfg, u32 mask)
+{
+	unsigned long expired;
+	int ret;
+
+	regmap_update_bits(infracfg, INFRA_TOPAXI_PROTECTEN, mask, 0);
+
+	expired = jiffies + HZ;
+
+	while (1) {
+		u32 val;
+
+		ret = regmap_read(infracfg, INFRA_TOPAXI_PROTECTSTA1, &val);
+		if (ret)
+			return ret;
+
+		if (!(val & mask))
+			break;
+
+		cpu_relax();
+		if (time_after(jiffies, expired))
+			return -EIO;
+	}
+
+	return 0;
+}
diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h
new file mode 100644
index 000000000000..a5714e93fb34
--- /dev/null
+++ b/include/linux/soc/mediatek/infracfg.h
@@ -0,0 +1,26 @@
+#ifndef __SOC_MEDIATEK_INFRACFG_H
+#define __SOC_MEDIATEK_INFRACFG_H
+
+#define MT8173_TOP_AXI_PROT_EN_MCI_M2		BIT(0)
+#define MT8173_TOP_AXI_PROT_EN_MM_M0		BIT(1)
+#define MT8173_TOP_AXI_PROT_EN_MM_M1		BIT(2)
+#define MT8173_TOP_AXI_PROT_EN_MMAPB_S		BIT(6)
+#define MT8173_TOP_AXI_PROT_EN_L2C_M2		BIT(9)
+#define MT8173_TOP_AXI_PROT_EN_L2SS_SMI		BIT(11)
+#define MT8173_TOP_AXI_PROT_EN_L2SS_ADD		BIT(12)
+#define MT8173_TOP_AXI_PROT_EN_CCI_M2		BIT(13)
+#define MT8173_TOP_AXI_PROT_EN_MFG_S		BIT(14)
+#define MT8173_TOP_AXI_PROT_EN_PERI_M0		BIT(15)
+#define MT8173_TOP_AXI_PROT_EN_PERI_M1		BIT(16)
+#define MT8173_TOP_AXI_PROT_EN_DEBUGSYS		BIT(17)
+#define MT8173_TOP_AXI_PROT_EN_CQ_DMA		BIT(18)
+#define MT8173_TOP_AXI_PROT_EN_GCPU		BIT(19)
+#define MT8173_TOP_AXI_PROT_EN_IOMMU		BIT(20)
+#define MT8173_TOP_AXI_PROT_EN_MFG_M0		BIT(21)
+#define MT8173_TOP_AXI_PROT_EN_MFG_M1		BIT(22)
+#define MT8173_TOP_AXI_PROT_EN_MFG_SNOOP_OUT	BIT(23)
+
+int mtk_infracfg_set_bus_protection(struct regmap *infracfg, u32 mask);
+int mtk_infracfg_clear_bus_protection(struct regmap *infracfg, u32 mask);
+
+#endif /* __SOC_MEDIATEK_INFRACFG_H */
-- 
cgit v1.2.3-70-g09d2


From d1ec4c34c7a9f328e43ea87522119258194f28f8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 13 May 2015 10:41:58 -0700
Subject: rcu: Drop RCU_USER_QS in favor of NO_HZ_FULL

The RCU_USER_QS Kconfig parameter is now just a synonym for NO_HZ_FULL,
so this commit eliminates RCU_USER_QS, replacing all uses with NO_HZ_FULL.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/rcupdate.h | 4 ++--
 init/Kconfig             | 9 ---------
 kernel/rcu/tree.c        | 8 ++++----
 kernel/time/Kconfig      | 2 --
 4 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 4cf5f51b4c9c..237f7b8d38ba 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -309,7 +309,7 @@ static inline void rcu_sysrq_end(void)
 }
 #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */
 
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
 void rcu_user_enter(void);
 void rcu_user_exit(void);
 #else
@@ -317,7 +317,7 @@ static inline void rcu_user_enter(void) { }
 static inline void rcu_user_exit(void) { }
 static inline void rcu_user_hooks_switch(struct task_struct *prev,
 					 struct task_struct *next) { }
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
 
 #ifdef CONFIG_RCU_NOCB_CPU
 void rcu_init_nohz(void);
diff --git a/init/Kconfig b/init/Kconfig
index af09b4fb43d2..fdeff4ab5995 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -538,15 +538,6 @@ config RCU_STALL_COMMON
 config CONTEXT_TRACKING
        bool
 
-config RCU_USER_QS
-	bool
-	help
-	  This option sets hooks on kernel / userspace boundaries and
-	  puts RCU in extended quiescent state when the CPU runs in
-	  userspace. It means that when a CPU runs in userspace, it is
-	  excluded from the global RCU state machine and thus doesn't
-	  try to keep the timer tick on for RCU.
-
 config CONTEXT_TRACKING_FORCE
 	bool "Force context tracking"
 	depends on CONTEXT_TRACKING
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 65137bc28b2b..8b5dd8ba9495 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -701,7 +701,7 @@ void rcu_idle_enter(void)
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
 /**
  * rcu_user_enter - inform RCU that we are resuming userspace.
  *
@@ -714,7 +714,7 @@ void rcu_user_enter(void)
 {
 	rcu_eqs_enter(1);
 }
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
 
 /**
  * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -828,7 +828,7 @@ void rcu_idle_exit(void)
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);
 
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
 /**
  * rcu_user_exit - inform RCU that we are exiting userspace.
  *
@@ -839,7 +839,7 @@ void rcu_user_exit(void)
 {
 	rcu_eqs_exit(1);
 }
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
 
 /**
  * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 579ce1b929af..4008d9f95dd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -92,12 +92,10 @@ config NO_HZ_FULL
 	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
 	# We need at least one periodic CPU for timekeeping
 	depends on SMP
-	# RCU_USER_QS dependency
 	depends on HAVE_CONTEXT_TRACKING
 	# VIRT_CPU_ACCOUNTING_GEN dependency
 	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
 	select NO_HZ_COMMON
-	select RCU_USER_QS
 	select RCU_NOCB_CPU
 	select VIRT_CPU_ACCOUNTING_GEN
 	select IRQ_WORK
-- 
cgit v1.2.3-70-g09d2


From 80eeb1f0f757c790b020d9f425bb0e824973d49c Mon Sep 17 00:00:00 2001
From: Sergej Sawazki <ce3a@gmx.de>
Date: Sun, 28 Jun 2015 16:24:55 +0200
Subject: clk: add gpio controlled clock multiplexer

Add a common clock driver for basic gpio controlled clock multiplexers.
This driver can be used for devices like 5V41068A or 831721I from IDT
or for discrete multiplexer circuits. The 'select' pin selects one of
two parent clocks.

Cc: Jyri Sarha <jsarha@ti.com>
Signed-off-by: Sergej Sawazki <ce3a@gmx.de>
[sboyd@codeaurora.org: Fix error paths to free memory and do it
in the correct order]
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 .../devicetree/bindings/clock/gpio-mux-clock.txt   |  19 ++
 drivers/clk/clk-gpio-gate.c                        | 242 +++++++++++++++------
 include/linux/clk-provider.h                       |  17 ++
 3 files changed, 214 insertions(+), 64 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/clock/gpio-mux-clock.txt

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/clock/gpio-mux-clock.txt b/Documentation/devicetree/bindings/clock/gpio-mux-clock.txt
new file mode 100644
index 000000000000..2be1e038ca62
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/gpio-mux-clock.txt
@@ -0,0 +1,19 @@
+Binding for simple gpio clock multiplexer.
+
+This binding uses the common clock binding[1].
+
+[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
+
+Required properties:
+- compatible : shall be "gpio-mux-clock".
+- clocks: list of two references to parent clocks.
+- #clock-cells : from common clock binding; shall be set to 0.
+- select-gpios : GPIO reference for selecting the parent clock.
+
+Example:
+	clock {
+		compatible = "gpio-mux-clock";
+		clocks = <&parentclk1>, <&parentclk2>;
+		#clock-cells = <0>;
+		select-gpios = <&gpio 1 GPIO_ACTIVE_HIGH>;
+	};
diff --git a/drivers/clk/clk-gpio-gate.c b/drivers/clk/clk-gpio-gate.c
index ef942daa955a..c0d202c24a97 100644
--- a/drivers/clk/clk-gpio-gate.c
+++ b/drivers/clk/clk-gpio-gate.c
@@ -1,12 +1,15 @@
 /*
  * Copyright (C) 2013 - 2014 Texas Instruments Incorporated - http://www.ti.com
- * Author: Jyri Sarha <jsarha@ti.com>
+ *
+ * Authors:
+ *    Jyri Sarha <jsarha@ti.com>
+ *    Sergej Sawazki <ce3a@gmx.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- * Gpio gated clock implementation
+ * Gpio controlled clock implementation
  */
 
 #include <linux/clk-provider.h>
@@ -61,24 +64,55 @@ const struct clk_ops clk_gpio_gate_ops = {
 EXPORT_SYMBOL_GPL(clk_gpio_gate_ops);
 
 /**
- * clk_register_gpio - register a gpip clock with the clock framework
- * @dev: device that is registering this clock
- * @name: name of this clock
- * @parent_name: name of this clock's parent
- * @gpio: gpio number to gate this clock
- * @active_low: true if gpio should be set to 0 to enable clock
- * @flags: clock flags
+ * DOC: basic clock multiplexer which can be controlled with a gpio output
+ * Traits of this clock:
+ * prepare - clk_prepare only ensures that parents are prepared
+ * rate - rate is only affected by parent switching.  No clk_set_rate support
+ * parent - parent is adjustable through clk_set_parent
  */
-struct clk *clk_register_gpio_gate(struct device *dev, const char *name,
-		const char *parent_name, unsigned gpio, bool active_low,
-		unsigned long flags)
+
+static u8 clk_gpio_mux_get_parent(struct clk_hw *hw)
 {
-	struct clk_gpio *clk_gpio = NULL;
-	struct clk *clk = ERR_PTR(-EINVAL);
-	struct clk_init_data init = { NULL };
+	struct clk_gpio *clk = to_clk_gpio(hw);
+
+	return gpiod_get_value(clk->gpiod);
+}
+
+static int clk_gpio_mux_set_parent(struct clk_hw *hw, u8 index)
+{
+	struct clk_gpio *clk = to_clk_gpio(hw);
+
+	gpiod_set_value(clk->gpiod, index);
+
+	return 0;
+}
+
+const struct clk_ops clk_gpio_mux_ops = {
+	.get_parent = clk_gpio_mux_get_parent,
+	.set_parent = clk_gpio_mux_set_parent,
+	.determine_rate = __clk_mux_determine_rate,
+};
+EXPORT_SYMBOL_GPL(clk_gpio_mux_ops);
+
+static struct clk *clk_register_gpio(struct device *dev, const char *name,
+		const char **parent_names, u8 num_parents, unsigned gpio,
+		bool active_low, unsigned long flags,
+		const struct clk_ops *clk_gpio_ops)
+{
+	struct clk_gpio *clk_gpio;
+	struct clk *clk;
+	struct clk_init_data init = {};
 	unsigned long gpio_flags;
 	int err;
 
+	if (dev)
+		clk_gpio = devm_kzalloc(dev, sizeof(*clk_gpio),	GFP_KERNEL);
+	else
+		clk_gpio = kzalloc(sizeof(*clk_gpio), GFP_KERNEL);
+
+	if (!clk_gpio)
+		return ERR_PTR(-ENOMEM);
+
 	if (active_low)
 		gpio_flags = GPIOF_ACTIVE_LOW | GPIOF_OUT_INIT_HIGH;
 	else
@@ -88,70 +122,108 @@ struct clk *clk_register_gpio_gate(struct device *dev, const char *name,
 		err = devm_gpio_request_one(dev, gpio, gpio_flags, name);
 	else
 		err = gpio_request_one(gpio, gpio_flags, name);
-
 	if (err) {
 		if (err != -EPROBE_DEFER)
 			pr_err("%s: %s: Error requesting clock control gpio %u\n",
 					__func__, name, gpio);
-		return ERR_PTR(err);
-	}
-
-	if (dev)
-		clk_gpio = devm_kzalloc(dev, sizeof(struct clk_gpio),
-					GFP_KERNEL);
-	else
-		clk_gpio = kzalloc(sizeof(struct clk_gpio), GFP_KERNEL);
+		if (!dev)
+			kfree(clk_gpio);
 
-	if (!clk_gpio) {
-		clk = ERR_PTR(-ENOMEM);
-		goto clk_register_gpio_gate_err;
+		return ERR_PTR(err);
 	}
 
 	init.name = name;
-	init.ops = &clk_gpio_gate_ops;
+	init.ops = clk_gpio_ops;
 	init.flags = flags | CLK_IS_BASIC;
-	init.parent_names = (parent_name ? &parent_name : NULL);
-	init.num_parents = (parent_name ? 1 : 0);
+	init.parent_names = parent_names;
+	init.num_parents = num_parents;
 
 	clk_gpio->gpiod = gpio_to_desc(gpio);
 	clk_gpio->hw.init = &init;
 
-	clk = clk_register(dev, &clk_gpio->hw);
+	if (dev)
+		clk = devm_clk_register(dev, &clk_gpio->hw);
+	else
+		clk = clk_register(NULL, &clk_gpio->hw);
 
 	if (!IS_ERR(clk))
 		return clk;
 
-	if (!dev)
+	if (!dev) {
+		gpiod_put(clk_gpio->gpiod);
 		kfree(clk_gpio);
-
-clk_register_gpio_gate_err:
-	if (!dev)
-		gpio_free(gpio);
+	}
 
 	return clk;
 }
+
+/**
+ * clk_register_gpio_gate - register a gpio clock gate with the clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_name: name of this clock's parent
+ * @gpio: gpio number to gate this clock
+ * @active_low: true if gpio should be set to 0 to enable clock
+ * @flags: clock flags
+ */
+struct clk *clk_register_gpio_gate(struct device *dev, const char *name,
+		const char *parent_name, unsigned gpio, bool active_low,
+		unsigned long flags)
+{
+	return clk_register_gpio(dev, name,
+			(parent_name ? &parent_name : NULL),
+			(parent_name ? 1 : 0), gpio, active_low, flags,
+			&clk_gpio_gate_ops);
+}
 EXPORT_SYMBOL_GPL(clk_register_gpio_gate);
 
+/**
+ * clk_register_gpio_mux - register a gpio clock mux with the clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_names: names of this clock's parents
+ * @num_parents: number of parents listed in @parent_names
+ * @gpio: gpio number to gate this clock
+ * @active_low: true if gpio should be set to 0 to enable clock
+ * @flags: clock flags
+ */
+struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
+		const char **parent_names, u8 num_parents, unsigned gpio,
+		bool active_low, unsigned long flags)
+{
+	if (num_parents != 2) {
+		pr_err("mux-clock %s must have 2 parents\n", name);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return clk_register_gpio(dev, name, parent_names, num_parents,
+			gpio, active_low, flags, &clk_gpio_mux_ops);
+}
+EXPORT_SYMBOL_GPL(clk_register_gpio_mux);
+
 #ifdef CONFIG_OF
 /**
- * The clk_register_gpio_gate has to be delayed, because the EPROBE_DEFER
+ * clk_register_get() has to be delayed, because -EPROBE_DEFER
  * can not be handled properly at of_clk_init() call time.
  */
 
-struct clk_gpio_gate_delayed_register_data {
+struct clk_gpio_delayed_register_data {
+	const char *gpio_name;
 	struct device_node *node;
 	struct mutex lock;
 	struct clk *clk;
+	struct clk *(*clk_register_get)(const char *name,
+			const char **parent_names, u8 num_parents,
+			unsigned gpio, bool active_low);
 };
 
-static struct clk *of_clk_gpio_gate_delayed_register_get(
-		struct of_phandle_args *clkspec,
-		void *_data)
+static struct clk *of_clk_gpio_delayed_register_get(
+		struct of_phandle_args *clkspec, void *_data)
 {
-	struct clk_gpio_gate_delayed_register_data *data = _data;
+	struct clk_gpio_delayed_register_data *data = _data;
 	struct clk *clk;
-	const char *clk_name = data->node->name;
-	const char *parent_name;
+	const char **parent_names;
+	int i, num_parents;
 	int gpio;
 	enum of_gpio_flags of_flags;
 
@@ -162,47 +234,89 @@ static struct clk *of_clk_gpio_gate_delayed_register_get(
 		return data->clk;
 	}
 
-	gpio = of_get_named_gpio_flags(data->node, "enable-gpios", 0,
-						&of_flags);
+	gpio = of_get_named_gpio_flags(data->node, data->gpio_name, 0,
+			&of_flags);
 	if (gpio < 0) {
 		mutex_unlock(&data->lock);
-		if (gpio != -EPROBE_DEFER)
-			pr_err("%s: %s: Can't get 'enable-gpios' DT property\n",
-			       __func__, clk_name);
+		if (gpio == -EPROBE_DEFER)
+			pr_debug("%s: %s: GPIOs not yet available, retry later\n",
+					data->node->name, __func__);
+		else
+			pr_err("%s: %s: Can't get '%s' DT property\n",
+					data->node->name, __func__,
+					data->gpio_name);
 		return ERR_PTR(gpio);
 	}
 
-	parent_name = of_clk_get_parent_name(data->node, 0);
+	num_parents = of_clk_get_parent_count(data->node);
 
-	clk = clk_register_gpio_gate(NULL, clk_name, parent_name, gpio,
-					of_flags & OF_GPIO_ACTIVE_LOW, 0);
-	if (IS_ERR(clk)) {
-		mutex_unlock(&data->lock);
-		return clk;
-	}
+	parent_names = kcalloc(num_parents, sizeof(char *), GFP_KERNEL);
+	if (!parent_names)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < num_parents; i++)
+		parent_names[i] = of_clk_get_parent_name(data->node, i);
+
+	clk = data->clk_register_get(data->node->name, parent_names,
+			num_parents, gpio, of_flags & OF_GPIO_ACTIVE_LOW);
+	if (IS_ERR(clk))
+		goto out;
 
 	data->clk = clk;
+out:
 	mutex_unlock(&data->lock);
+	kfree(parent_names);
 
 	return clk;
 }
 
-/**
- * of_gpio_gate_clk_setup() - Setup function for gpio controlled clock
- */
-static void __init of_gpio_gate_clk_setup(struct device_node *node)
+static struct clk *of_clk_gpio_gate_delayed_register_get(const char *name,
+		const char **parent_names, u8 num_parents,
+		unsigned gpio, bool active_low)
+{
+	return clk_register_gpio_gate(NULL, name, parent_names[0],
+			gpio, active_low, 0);
+}
+
+static struct clk *of_clk_gpio_mux_delayed_register_get(const char *name,
+		const char **parent_names, u8 num_parents, unsigned gpio,
+		bool active_low)
+{
+	return clk_register_gpio_mux(NULL, name, parent_names, num_parents,
+			gpio, active_low, 0);
+}
+
+static void __init of_gpio_clk_setup(struct device_node *node,
+		const char *gpio_name,
+		struct clk *(*clk_register_get)(const char *name,
+				const char **parent_names, u8 num_parents,
+				unsigned gpio, bool active_low))
 {
-	struct clk_gpio_gate_delayed_register_data *data;
+	struct clk_gpio_delayed_register_data *data;
 
-	data = kzalloc(sizeof(struct clk_gpio_gate_delayed_register_data),
-		       GFP_KERNEL);
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return;
 
 	data->node = node;
+	data->gpio_name = gpio_name;
+	data->clk_register_get = clk_register_get;
 	mutex_init(&data->lock);
 
-	of_clk_add_provider(node, of_clk_gpio_gate_delayed_register_get, data);
+	of_clk_add_provider(node, of_clk_gpio_delayed_register_get, data);
+}
+
+static void __init of_gpio_gate_clk_setup(struct device_node *node)
+{
+	of_gpio_clk_setup(node, "enable-gpios",
+		of_clk_gpio_gate_delayed_register_get);
 }
 CLK_OF_DECLARE(gpio_gate_clk, "gpio-gate-clock", of_gpio_gate_clk_setup);
+
+void __init of_gpio_mux_clk_setup(struct device_node *node)
+{
+	of_gpio_clk_setup(node, "select-gpios",
+		of_clk_gpio_mux_delayed_register_get);
+}
+CLK_OF_DECLARE(gpio_mux_clk, "gpio-mux-clock", of_gpio_mux_clk_setup);
 #endif
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 78842f46f152..823d7f70878e 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -549,6 +549,23 @@ struct clk *clk_register_gpio_gate(struct device *dev, const char *name,
 
 void of_gpio_clk_gate_setup(struct device_node *node);
 
+/**
+ * struct clk_gpio_mux - gpio controlled clock multiplexer
+ *
+ * @hw:		see struct clk_gpio
+ * @gpiod:	gpio descriptor to select the parent of this clock multiplexer
+ *
+ * Clock with a gpio control for selecting the parent clock.
+ * Implements .get_parent, .set_parent and .determine_rate
+ */
+
+extern const struct clk_ops clk_gpio_mux_ops;
+struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
+		const char **parent_names, u8 num_parents, unsigned gpio,
+		bool active_low, unsigned long flags);
+
+void of_gpio_mux_clk_setup(struct device_node *node);
+
 /**
  * clk_register - allocate a new clock, register it and return an opaque cookie
  * @dev: device that is registering this clock
-- 
cgit v1.2.3-70-g09d2


From f9281648ecd5081803bb2da84b9ccb0cf48436cd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:21 -0700
Subject: context_tracking: Add ct_state() and CT_WARN_ON()

This will let us sprinkle sanity checks around the kernel
without making too much of a mess.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/5da41fb2ceb29eac671f427c67040401ba2a1fa0.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/context_tracking.h       | 15 +++++++++++++++
 include/linux/context_tracking_state.h |  1 +
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index b96bd299966f..008fc67d0d96 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -49,13 +49,28 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	}
 }
 
+
+/**
+ * ct_state() - return the current context tracking state if known
+ *
+ * Returns the current cpu's context tracking state if context tracking
+ * is enabled.  If context tracking is disabled, returns
+ * CONTEXT_DISABLED.  This should be used primarily for debugging.
+ */
+static inline enum ctx_state ct_state(void)
+{
+	return context_tracking_is_enabled() ?
+		this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
+}
 #else
 static inline void user_enter(void) { }
 static inline void user_exit(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
+static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
 #endif /* !CONFIG_CONTEXT_TRACKING */
 
+#define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond))
 
 #ifdef CONFIG_CONTEXT_TRACKING_FORCE
 extern void context_tracking_init(void);
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 678ecdf90cf6..ee956c528fab 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -14,6 +14,7 @@ struct context_tracking {
 	bool active;
 	int recursion;
 	enum ctx_state {
+		CONTEXT_DISABLED = -1,	/* returned by ct_state() if unknown */
 		CONTEXT_KERNEL = 0,
 		CONTEXT_USER,
 		CONTEXT_GUEST,
-- 
cgit v1.2.3-70-g09d2


From eca2ebc7e007c9e2b8f5ecfcfc74b53fbe68e42b Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Mon, 22 Jun 2015 13:00:36 +0000
Subject: spi: expose spi_master and spi_device statistics via sysfs

per spi-master statistics accessible as:
  /sys/class/spi_master/spi*/statistics/*

per spi-device statistics accessible via:
  /sys/class/spi_master/spi*/spi*.*/statistics/*

The following statistics are exposed as separate "files" inside
these directories:
* messages              number of spi_messages
* transfers             number of spi_transfers
* bytes                 number of bytes transferred
* bytes_rx              number of bytes transmitted
* bytes_tx              number of bytes received
* errors                number of errors encounterd
* timedout              number of messages that have timed out
* spi_async             number of spi_messages submitted using spi_async
* spi_sync              number of spi_messages submitted using spi_sync
* spi_sync_immediate    number of spi_messages submitted using spi_sync,
                        that are handled immediately without a context switch
                        to the spi_pump worker-thread

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 168 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/spi/spi.h |  64 ++++++++++++++++++
 2 files changed, 229 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index cf8b91b23a76..07476ca083a0 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -67,11 +67,141 @@ modalias_show(struct device *dev, struct device_attribute *a, char *buf)
 }
 static DEVICE_ATTR_RO(modalias);
 
+#define SPI_STATISTICS_ATTRS(field, file)				\
+static ssize_t spi_master_##field##_show(struct device *dev,		\
+					 struct device_attribute *attr,	\
+					 char *buf)			\
+{									\
+	struct spi_master *master = container_of(dev,			\
+						 struct spi_master, dev); \
+	return spi_statistics_##field##_show(&master->statistics, buf);	\
+}									\
+static struct device_attribute dev_attr_spi_master_##field = {		\
+	.attr = { .name = file, .mode = S_IRUGO },			\
+	.show = spi_master_##field##_show,				\
+};									\
+static ssize_t spi_device_##field##_show(struct device *dev,		\
+					 struct device_attribute *attr,	\
+					char *buf)			\
+{									\
+	struct spi_device *spi = container_of(dev,			\
+					      struct spi_device, dev);	\
+	return spi_statistics_##field##_show(&spi->statistics, buf);	\
+}									\
+static struct device_attribute dev_attr_spi_device_##field = {		\
+	.attr = { .name = file, .mode = S_IRUGO },			\
+	.show = spi_device_##field##_show,				\
+}
+
+#define SPI_STATISTICS_SHOW_NAME(name, file, field, format_string)	\
+static ssize_t spi_statistics_##name##_show(struct spi_statistics *stat, \
+					    char *buf)			\
+{									\
+	unsigned long flags;						\
+	ssize_t len;							\
+	spin_lock_irqsave(&stat->lock, flags);				\
+	len = sprintf(buf, format_string, stat->field);			\
+	spin_unlock_irqrestore(&stat->lock, flags);			\
+	return len;							\
+}									\
+SPI_STATISTICS_ATTRS(name, file)
+
+#define SPI_STATISTICS_SHOW(field, format_string)			\
+	SPI_STATISTICS_SHOW_NAME(field, __stringify(field),		\
+				 field, format_string)
+
+SPI_STATISTICS_SHOW(messages, "%lu");
+SPI_STATISTICS_SHOW(transfers, "%lu");
+SPI_STATISTICS_SHOW(errors, "%lu");
+SPI_STATISTICS_SHOW(timedout, "%lu");
+
+SPI_STATISTICS_SHOW(spi_sync, "%lu");
+SPI_STATISTICS_SHOW(spi_sync_immediate, "%lu");
+SPI_STATISTICS_SHOW(spi_async, "%lu");
+
+SPI_STATISTICS_SHOW(bytes, "%llu");
+SPI_STATISTICS_SHOW(bytes_rx, "%llu");
+SPI_STATISTICS_SHOW(bytes_tx, "%llu");
+
 static struct attribute *spi_dev_attrs[] = {
 	&dev_attr_modalias.attr,
 	NULL,
 };
-ATTRIBUTE_GROUPS(spi_dev);
+
+static const struct attribute_group spi_dev_group = {
+	.attrs  = spi_dev_attrs,
+};
+
+static struct attribute *spi_device_statistics_attrs[] = {
+	&dev_attr_spi_device_messages.attr,
+	&dev_attr_spi_device_transfers.attr,
+	&dev_attr_spi_device_errors.attr,
+	&dev_attr_spi_device_timedout.attr,
+	&dev_attr_spi_device_spi_sync.attr,
+	&dev_attr_spi_device_spi_sync_immediate.attr,
+	&dev_attr_spi_device_spi_async.attr,
+	&dev_attr_spi_device_bytes.attr,
+	&dev_attr_spi_device_bytes_rx.attr,
+	&dev_attr_spi_device_bytes_tx.attr,
+	NULL,
+};
+
+static const struct attribute_group spi_device_statistics_group = {
+	.name  = "statistics",
+	.attrs  = spi_device_statistics_attrs,
+};
+
+static const struct attribute_group *spi_dev_groups[] = {
+	&spi_dev_group,
+	&spi_device_statistics_group,
+	NULL,
+};
+
+static struct attribute *spi_master_statistics_attrs[] = {
+	&dev_attr_spi_master_messages.attr,
+	&dev_attr_spi_master_transfers.attr,
+	&dev_attr_spi_master_errors.attr,
+	&dev_attr_spi_master_timedout.attr,
+	&dev_attr_spi_master_spi_sync.attr,
+	&dev_attr_spi_master_spi_sync_immediate.attr,
+	&dev_attr_spi_master_spi_async.attr,
+	&dev_attr_spi_master_bytes.attr,
+	&dev_attr_spi_master_bytes_rx.attr,
+	&dev_attr_spi_master_bytes_tx.attr,
+	NULL,
+};
+
+static const struct attribute_group spi_master_statistics_group = {
+	.name  = "statistics",
+	.attrs  = spi_master_statistics_attrs,
+};
+
+static const struct attribute_group *spi_master_groups[] = {
+	&spi_master_statistics_group,
+	NULL,
+};
+
+void spi_statistics_add_transfer_stats(struct spi_statistics *stats,
+				       struct spi_transfer *xfer,
+				       struct spi_master *master)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&stats->lock, flags);
+
+	stats->transfers++;
+
+	stats->bytes += xfer->len;
+	if ((xfer->tx_buf) &&
+	    (xfer->tx_buf != master->dummy_tx))
+		stats->bytes_tx += xfer->len;
+	if ((xfer->rx_buf) &&
+	    (xfer->rx_buf != master->dummy_rx))
+		stats->bytes_rx += xfer->len;
+
+	spin_unlock_irqrestore(&stats->lock, flags);
+}
+EXPORT_SYMBOL_GPL(spi_statistics_add_transfer_stats);
 
 /* modalias support makes "modprobe $MODALIAS" new-style hotplug work,
  * and the sysfs version makes coldplug work too.
@@ -249,6 +379,9 @@ struct spi_device *spi_alloc_device(struct spi_master *master)
 	spi->dev.bus = &spi_bus_type;
 	spi->dev.release = spidev_release;
 	spi->cs_gpio = -ENOENT;
+
+	spin_lock_init(&spi->statistics.lock);
+
 	device_initialize(&spi->dev);
 	return spi;
 }
@@ -689,17 +822,29 @@ static int spi_transfer_one_message(struct spi_master *master,
 	bool keep_cs = false;
 	int ret = 0;
 	unsigned long ms = 1;
+	struct spi_statistics *statm = &master->statistics;
+	struct spi_statistics *stats = &msg->spi->statistics;
 
 	spi_set_cs(msg->spi, true);
 
+	SPI_STATISTICS_INCREMENT_FIELD(statm, messages);
+	SPI_STATISTICS_INCREMENT_FIELD(stats, messages);
+
 	list_for_each_entry(xfer, &msg->transfers, transfer_list) {
 		trace_spi_transfer_start(msg, xfer);
 
+		spi_statistics_add_transfer_stats(statm, xfer, master);
+		spi_statistics_add_transfer_stats(stats, xfer, master);
+
 		if (xfer->tx_buf || xfer->rx_buf) {
 			reinit_completion(&master->xfer_completion);
 
 			ret = master->transfer_one(master, msg->spi, xfer);
 			if (ret < 0) {
+				SPI_STATISTICS_INCREMENT_FIELD(statm,
+							       errors);
+				SPI_STATISTICS_INCREMENT_FIELD(stats,
+							       errors);
 				dev_err(&msg->spi->dev,
 					"SPI transfer failed: %d\n", ret);
 				goto out;
@@ -715,6 +860,10 @@ static int spi_transfer_one_message(struct spi_master *master,
 			}
 
 			if (ms == 0) {
+				SPI_STATISTICS_INCREMENT_FIELD(statm,
+							       timedout);
+				SPI_STATISTICS_INCREMENT_FIELD(stats,
+							       timedout);
 				dev_err(&msg->spi->dev,
 					"SPI transfer timed out\n");
 				msg->status = -ETIMEDOUT;
@@ -1416,10 +1565,10 @@ static struct class spi_master_class = {
 	.name		= "spi_master",
 	.owner		= THIS_MODULE,
 	.dev_release	= spi_master_release,
+	.dev_groups	= spi_master_groups,
 };
 
 
-
 /**
  * spi_alloc_master - allocate SPI master controller
  * @dev: the controller, possibly using the platform_bus
@@ -1585,6 +1734,8 @@ int spi_register_master(struct spi_master *master)
 			goto done;
 		}
 	}
+	/* add statistics */
+	spin_lock_init(&master->statistics.lock);
 
 	mutex_lock(&board_lock);
 	list_add_tail(&master->list, &spi_master_list);
@@ -1939,6 +2090,9 @@ static int __spi_async(struct spi_device *spi, struct spi_message *message)
 
 	message->spi = spi;
 
+	SPI_STATISTICS_INCREMENT_FIELD(&master->statistics, spi_async);
+	SPI_STATISTICS_INCREMENT_FIELD(&spi->statistics, spi_async);
+
 	trace_spi_message_submit(message);
 
 	return master->transfer(spi, message);
@@ -2075,6 +2229,9 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message,
 	message->context = &done;
 	message->spi = spi;
 
+	SPI_STATISTICS_INCREMENT_FIELD(&master->statistics, spi_sync);
+	SPI_STATISTICS_INCREMENT_FIELD(&spi->statistics, spi_sync);
+
 	if (!bus_locked)
 		mutex_lock(&master->bus_lock_mutex);
 
@@ -2102,8 +2259,13 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message,
 		/* Push out the messages in the calling context if we
 		 * can.
 		 */
-		if (master->transfer == spi_queued_transfer)
+		if (master->transfer == spi_queued_transfer) {
+			SPI_STATISTICS_INCREMENT_FIELD(&master->statistics,
+						       spi_sync_immediate);
+			SPI_STATISTICS_INCREMENT_FIELD(&spi->statistics,
+						       spi_sync_immediate);
 			__spi_pump_messages(master, false);
+		}
 
 		wait_for_completion(&done);
 		status = message->status;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index d673072346f2..269e8afd3e2a 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -23,6 +23,8 @@
 #include <linux/scatterlist.h>
 
 struct dma_chan;
+struct spi_master;
+struct spi_transfer;
 
 /*
  * INTERFACES between SPI master-side drivers and SPI infrastructure.
@@ -30,6 +32,59 @@ struct dma_chan;
  */
 extern struct bus_type spi_bus_type;
 
+/**
+ * struct spi_statistics - statistics for spi transfers
+ * @clock:         lock protecting this structure
+ *
+ * @messages:      number of spi-messages handled
+ * @transfers:     number of spi_transfers handled
+ * @errors:        number of errors during spi_transfer
+ * @timedout:      number of timeouts during spi_transfer
+ *
+ * @spi_sync:      number of times spi_sync is used
+ * @spi_sync_immediate:
+ *                 number of times spi_sync is executed immediately
+ *                 in calling context without queuing and scheduling
+ * @spi_async:     number of times spi_async is used
+ *
+ * @bytes:         number of bytes transferred to/from device
+ * @bytes_tx:      number of bytes sent to device
+ * @bytes_rx:      number of bytes received from device
+ *
+ */
+struct spi_statistics {
+	spinlock_t		lock; /* lock for the whole structure */
+
+	unsigned long		messages;
+	unsigned long		transfers;
+	unsigned long		errors;
+	unsigned long		timedout;
+
+	unsigned long		spi_sync;
+	unsigned long		spi_sync_immediate;
+	unsigned long		spi_async;
+
+	unsigned long long	bytes;
+	unsigned long long	bytes_rx;
+	unsigned long long	bytes_tx;
+
+};
+
+void spi_statistics_add_transfer_stats(struct spi_statistics *stats,
+				       struct spi_transfer *xfer,
+				       struct spi_master *master);
+
+#define SPI_STATISTICS_ADD_TO_FIELD(stats, field, count)	\
+	do {							\
+		unsigned long flags;				\
+		spin_lock_irqsave(&(stats)->lock, flags);	\
+		(stats)->field += count;			\
+		spin_unlock_irqrestore(&(stats)->lock, flags);	\
+	} while (0)
+
+#define SPI_STATISTICS_INCREMENT_FIELD(stats, field)	\
+	SPI_STATISTICS_ADD_TO_FIELD(stats, field, 1)
+
 /**
  * struct spi_device - Master side proxy for an SPI slave device
  * @dev: Driver model representation of the device.
@@ -60,6 +115,8 @@ extern struct bus_type spi_bus_type;
  * @cs_gpio: gpio number of the chipselect line (optional, -ENOENT when
  *	when not using a GPIO line)
  *
+ * @statistics: statistics for the spi_device
+ *
  * A @spi_device is used to interchange data between an SPI slave
  * (usually a discrete chip) and CPU memory.
  *
@@ -98,6 +155,9 @@ struct spi_device {
 	char			modalias[SPI_NAME_SIZE];
 	int			cs_gpio;	/* chip select gpio */
 
+	/* the statistics */
+	struct spi_statistics	statistics;
+
 	/*
 	 * likely need more hooks for more protocol options affecting how
 	 * the controller talks to each chip, like:
@@ -296,6 +356,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
  * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS
  *	number. Any individual value may be -ENOENT for CS lines that
  *	are not GPIOs (driven by the SPI controller itself).
+ * @statistics: statistics for the spi_master
  * @dma_tx: DMA transmit channel
  * @dma_rx: DMA receive channel
  * @dummy_rx: dummy receive buffer for full-duplex devices
@@ -452,6 +513,9 @@ struct spi_master {
 	/* gpio chip select */
 	int			*cs_gpios;
 
+	/* statistics */
+	struct spi_statistics	statistics;
+
 	/* DMA channels for use with core dmaengine helpers */
 	struct dma_chan		*dma_tx;
 	struct dma_chan		*dma_rx;
-- 
cgit v1.2.3-70-g09d2


From 5f867db63473f32cce1b868e281ebd42a41f8fad Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Fri, 26 Jun 2015 19:43:58 -0500
Subject: mtd: nand: Fix NAND_USE_BOUNCE_BUFFER flag conflict

Commit 66507c7bc8895f0da6b ("mtd: nand: Add support to use nand_base
poi databuf as bounce buffer") added a flag NAND_USE_BOUNCE_BUFFER
using the same bit value as the existing NAND_BUSWIDTH_AUTO.

Cc: Kamal Dasu <kdasu.kdev@gmail.com>
Fixes: 66507c7bc8895f0da6b ("mtd: nand: Add support to use nand_base
	poi databuf as bounce buffer")
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 include/linux/mtd/nand.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index f25e2bdd188c..272f42952f34 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -177,11 +177,6 @@ typedef enum {
 #define NAND_OWN_BUFFERS	0x00020000
 /* Chip may not exist, so silence any errors in scan */
 #define NAND_SCAN_SILENT_NODEV	0x00040000
-/*
- * This option could be defined by controller drivers to protect against
- * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers
- */
-#define NAND_USE_BOUNCE_BUFFER	0x00080000
 /*
  * Autodetect nand buswidth with readid/onfi.
  * This suppose the driver will configure the hardware in 8 bits mode
@@ -189,6 +184,11 @@ typedef enum {
  * before calling nand_scan_tail.
  */
 #define NAND_BUSWIDTH_AUTO      0x00080000
+/*
+ * This option could be defined by controller drivers to protect against
+ * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers
+ */
+#define NAND_USE_BOUNCE_BUFFER	0x00100000
 
 /* Options set by nand scan */
 /* Nand scan has allocated controller struct */
-- 
cgit v1.2.3-70-g09d2


From 4c62dbbce902cf2afa88cac89ec67c828160f431 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Fri, 26 Jun 2015 11:27:41 +0300
Subject: ACPI: Remove FSF mailing addresses

There is no need to carry potentially outdated Free Software Foundation
mailing address in file headers since the COPYING file includes it.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/ac.c                   | 4 ----
 drivers/acpi/acpi_ipmi.c            | 4 ----
 drivers/acpi/acpi_memhotplug.c      | 5 -----
 drivers/acpi/acpi_pad.c             | 4 ----
 drivers/acpi/acpi_video.c           | 4 ----
 drivers/acpi/apei/apei-base.c       | 4 ----
 drivers/acpi/apei/einj.c            | 4 ----
 drivers/acpi/apei/erst-dbg.c        | 4 ----
 drivers/acpi/apei/erst.c            | 4 ----
 drivers/acpi/apei/ghes.c            | 4 ----
 drivers/acpi/apei/hest.c            | 4 ----
 drivers/acpi/battery.c              | 4 ----
 drivers/acpi/blacklist.c            | 4 ----
 drivers/acpi/bus.c                  | 4 ----
 drivers/acpi/button.c               | 4 ----
 drivers/acpi/cm_sbs.c               | 4 ----
 drivers/acpi/container.c            | 4 ----
 drivers/acpi/device_pm.c            | 4 ----
 drivers/acpi/dock.c                 | 4 ----
 drivers/acpi/ec.c                   | 4 ----
 drivers/acpi/fan.c                  | 4 ----
 drivers/acpi/hed.c                  | 4 ----
 drivers/acpi/internal.h             | 3 ---
 drivers/acpi/numa.c                 | 4 ----
 drivers/acpi/osl.c                  | 4 ----
 drivers/acpi/pci_irq.c              | 4 ----
 drivers/acpi/pci_link.c             | 4 ----
 drivers/acpi/pci_root.c             | 4 ----
 drivers/acpi/pci_slot.c             | 4 ----
 drivers/acpi/power.c                | 4 ----
 drivers/acpi/processor_driver.c     | 4 ----
 drivers/acpi/processor_idle.c       | 4 ----
 drivers/acpi/processor_perflib.c    | 4 ----
 drivers/acpi/processor_thermal.c    | 4 ----
 drivers/acpi/processor_throttling.c | 4 ----
 drivers/acpi/resource.c             | 4 ----
 drivers/acpi/sbs.c                  | 4 ----
 drivers/acpi/tables.c               | 4 ----
 drivers/acpi/thermal.c              | 4 ----
 drivers/acpi/utils.c                | 4 ----
 include/acpi/acpi_bus.h             | 4 ----
 include/acpi/acpi_drivers.h         | 4 ----
 include/linux/acpi.h                | 4 ----
 43 files changed, 172 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c
index 9b5354a2cd08..f71b756b05c4 100644
--- a/drivers/acpi/ac.c
+++ b/drivers/acpi/ac.c
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/acpi_ipmi.c b/drivers/acpi/acpi_ipmi.c
index ac0f52f6df2b..f77956c3fd45 100644
--- a/drivers/acpi/acpi_ipmi.c
+++ b/drivers/acpi/acpi_ipmi.c
@@ -17,10 +17,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index ee28f4d15625..6b0d3ef7309c 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -16,11 +16,6 @@
  * NON INFRINGEMENT.  See the GNU General Public License for more
  * details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- *
  * ACPI based HotPlug driver that supports Memory Hotplug
  * This driver fields notifications from firmware for memory add
  * and remove operations and alerts the VM of the affected memory
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 00b39802d7ec..ae307ff36acb 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -12,10 +12,6 @@
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c
index 8c2fe2f2f9fd..5778e8e4313a 100644
--- a/drivers/acpi/acpi_video.c
+++ b/drivers/acpi/acpi_video.c
@@ -17,10 +17,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c
index a85ac07f3da3..a2c8d7adb6eb 100644
--- a/drivers/acpi/apei/apei-base.c
+++ b/drivers/acpi/apei/apei-base.c
@@ -24,10 +24,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index a095d4f858da..0431883653be 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -18,10 +18,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c
index 04ab5c9d3ced..6330f557a2c8 100644
--- a/drivers/acpi/apei/erst-dbg.c
+++ b/drivers/acpi/apei/erst-dbg.c
@@ -17,10 +17,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 3670bbab57a3..6682c5daf742 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -18,10 +18,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 2bfd53cbfe80..23981ac1c6c2 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -23,10 +23,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 06e9b411a0a2..20b3fcf4007c 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -21,10 +21,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index b3628cc01a53..b719ab3090bb 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -18,10 +18,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c
index 278dc4be992a..96809cd99ace 100644
--- a/drivers/acpi/blacklist.c
+++ b/drivers/acpi/blacklist.c
@@ -20,10 +20,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 513e7230e3d0..c8356eb79911 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -15,10 +15,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c
index 6d5d1832a588..5c3b0918d5fd 100644
--- a/drivers/acpi/button.c
+++ b/drivers/acpi/button.c
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/cm_sbs.c b/drivers/acpi/cm_sbs.c
index 6c9ee68e46fb..d0918d421f90 100644
--- a/drivers/acpi/cm_sbs.c
+++ b/drivers/acpi/cm_sbs.c
@@ -11,10 +11,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/container.c b/drivers/acpi/container.c
index c8ead9f97375..12c240903c18 100644
--- a/drivers/acpi/container.c
+++ b/drivers/acpi/container.c
@@ -20,10 +20,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 #include <linux/acpi.h>
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 717afcdb5f4a..d06cd59b5906 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -15,10 +15,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c
index a688aa243f6c..e8e128dede29 100644
--- a/drivers/acpi/dock.c
+++ b/drivers/acpi/dock.c
@@ -17,10 +17,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 9d4761d2f6b7..990446629935 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -22,10 +22,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c
index bea0bbaafa97..e297a480e135 100644
--- a/drivers/acpi/fan.c
+++ b/drivers/acpi/fan.c
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/hed.c b/drivers/acpi/hed.c
index a322710b5ba4..5c67a6d8f803 100644
--- a/drivers/acpi/hed.c
+++ b/drivers/acpi/hed.c
@@ -15,10 +15,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 4683a96932b9..8c71cb8335c0 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -13,9 +13,6 @@
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
 #ifndef _ACPI_INTERNAL_H_
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index acaa3b4ea504..72b6e9ef0ae9 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -15,10 +15,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
  */
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index c262e4acd68d..5e1f1bc5421e 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -19,10 +19,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
  */
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 304eccb0ae5c..25fff35df82c 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -19,10 +19,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index cfd7581cc19f..2f5f84ced85f 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -17,10 +17,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
  * TBD: 
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 1b5569c092c6..393706a5261b 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/pci_slot.c b/drivers/acpi/pci_slot.c
index 139d9e479370..7188e53b6b7c 100644
--- a/drivers/acpi/pci_slot.c
+++ b/drivers/acpi/pci_slot.c
@@ -20,10 +20,6 @@
  *  WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c
index 93eac53b5110..45b47f2c9f03 100644
--- a/drivers/acpi/power.c
+++ b/drivers/acpi/power.c
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index d9f71581b79b..3af8dc30f129 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -21,10 +21,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index d540f42c9232..175c86bee3a9 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -21,10 +21,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index cfc8aba72f86..53cfe8ba9799 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -20,10 +20,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c
index e003663b2f8e..1fed84a092c2 100644
--- a/drivers/acpi/processor_thermal.c
+++ b/drivers/acpi/processor_thermal.c
@@ -19,10 +19,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index 84243c32e29c..f170d746336d 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -19,10 +19,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index 10561ce16ed1..64ea0d10b788 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -15,10 +15,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c
index 01504c819e8f..cb3dedb1beae 100644
--- a/drivers/acpi/sbs.c
+++ b/drivers/acpi/sbs.c
@@ -17,10 +17,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 2e19189da0ee..17a6fa01a338 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -15,10 +15,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
  */
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 6d4e44ea74ac..fc28b9f5aa84 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
  *  This driver fully implements the ACPI thermal policy as described in the
diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 67c548ad3764..475c9079bf85 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 83061cac719b..5ba8fb64f664 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index ea6428b7dacb..29c691265b49 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -16,10 +16,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c471dfc93b71..1c116ee53b1e 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -15,10 +15,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
-- 
cgit v1.2.3-70-g09d2


From f089d4d20fcbcf16a62ef3b4b57f41ecf59a5d83 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Date: Tue, 7 Jul 2015 15:28:12 +0100
Subject: mfd: wm5110: Add registers for custom write sequence triggers

This register will be needed as part of some additional support for the
headphone path on wm5110, so this patch adds the register and sets up
its regmap config.

Signed-off-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/mfd/wm5110-tables.c           |  2 ++
 include/linux/mfd/arizona/registers.h | 37 +++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/wm5110-tables.c b/drivers/mfd/wm5110-tables.c
index 12cad94b4035..62a4aa13cb98 100644
--- a/drivers/mfd/wm5110-tables.c
+++ b/drivers/mfd/wm5110-tables.c
@@ -676,6 +676,7 @@ static const struct reg_default wm5110_reg_default[] = {
 	{ 0x00000032, 0x0100 },    /* R50    - PWM Drive 3 */
 	{ 0x00000040, 0x0000 },    /* R64    - Wake control */
 	{ 0x00000041, 0x0000 },    /* R65    - Sequence control */
+	{ 0x00000042, 0x0000 },    /* R66    - Spare Triggers */
 	{ 0x00000061, 0x01FF },    /* R97    - Sample Rate Sequence Select 1 */
 	{ 0x00000062, 0x01FF },    /* R98    - Sample Rate Sequence Select 2 */
 	{ 0x00000063, 0x01FF },    /* R99    - Sample Rate Sequence Select 3 */
@@ -1716,6 +1717,7 @@ static bool wm5110_readable_register(struct device *dev, unsigned int reg)
 	case ARIZONA_PWM_DRIVE_3:
 	case ARIZONA_WAKE_CONTROL:
 	case ARIZONA_SEQUENCE_CONTROL:
+	case ARIZONA_SPARE_TRIGGERS:
 	case ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_1:
 	case ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_2:
 	case ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_3:
diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h
index 3499d36e6067..11affb3c2768 100644
--- a/include/linux/mfd/arizona/registers.h
+++ b/include/linux/mfd/arizona/registers.h
@@ -39,6 +39,7 @@
 #define ARIZONA_PWM_DRIVE_3                      0x32
 #define ARIZONA_WAKE_CONTROL                     0x40
 #define ARIZONA_SEQUENCE_CONTROL                 0x41
+#define ARIZONA_SPARE_TRIGGERS                   0x42
 #define ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_1    0x61
 #define ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_2    0x62
 #define ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_3    0x63
@@ -1430,6 +1431,42 @@
 #define ARIZONA_WSEQ_ENA_JD2_RISE_SHIFT               0  /* WSEQ_ENA_JD2_RISE */
 #define ARIZONA_WSEQ_ENA_JD2_RISE_WIDTH               1  /* WSEQ_ENA_JD2_RISE */
 
+/*
+ * R66 (0x42) - Spare Triggers
+ */
+#define ARIZONA_WS_TRG8                          0x0080  /* WS_TRG8 */
+#define ARIZONA_WS_TRG8_MASK                     0x0080  /* WS_TRG8 */
+#define ARIZONA_WS_TRG8_SHIFT                         7  /* WS_TRG8 */
+#define ARIZONA_WS_TRG8_WIDTH                         1  /* WS_TRG8 */
+#define ARIZONA_WS_TRG7                          0x0040  /* WS_TRG7 */
+#define ARIZONA_WS_TRG7_MASK                     0x0040  /* WS_TRG7 */
+#define ARIZONA_WS_TRG7_SHIFT                         6  /* WS_TRG7 */
+#define ARIZONA_WS_TRG7_WIDTH                         1  /* WS_TRG7 */
+#define ARIZONA_WS_TRG6                          0x0020  /* WS_TRG6 */
+#define ARIZONA_WS_TRG6_MASK                     0x0020  /* WS_TRG6 */
+#define ARIZONA_WS_TRG6_SHIFT                         5  /* WS_TRG6 */
+#define ARIZONA_WS_TRG6_WIDTH                         1  /* WS_TRG6 */
+#define ARIZONA_WS_TRG5                          0x0010  /* WS_TRG5 */
+#define ARIZONA_WS_TRG5_MASK                     0x0010  /* WS_TRG5 */
+#define ARIZONA_WS_TRG5_SHIFT                         4  /* WS_TRG5 */
+#define ARIZONA_WS_TRG5_WIDTH                         1  /* WS_TRG5 */
+#define ARIZONA_WS_TRG4                          0x0008  /* WS_TRG4 */
+#define ARIZONA_WS_TRG4_MASK                     0x0008  /* WS_TRG4 */
+#define ARIZONA_WS_TRG4_SHIFT                         3  /* WS_TRG4 */
+#define ARIZONA_WS_TRG4_WIDTH                         1  /* WS_TRG4 */
+#define ARIZONA_WS_TRG3                          0x0004  /* WS_TRG3 */
+#define ARIZONA_WS_TRG3_MASK                     0x0004  /* WS_TRG3 */
+#define ARIZONA_WS_TRG3_SHIFT                         2  /* WS_TRG3 */
+#define ARIZONA_WS_TRG3_WIDTH                         1  /* WS_TRG3 */
+#define ARIZONA_WS_TRG2                          0x0002  /* WS_TRG2 */
+#define ARIZONA_WS_TRG2_MASK                     0x0002  /* WS_TRG2 */
+#define ARIZONA_WS_TRG2_SHIFT                         1  /* WS_TRG2 */
+#define ARIZONA_WS_TRG2_WIDTH                         1  /* WS_TRG2 */
+#define ARIZONA_WS_TRG1                          0x0001  /* WS_TRG1 */
+#define ARIZONA_WS_TRG1_MASK                     0x0001  /* WS_TRG1 */
+#define ARIZONA_WS_TRG1_SHIFT                         0  /* WS_TRG1 */
+#define ARIZONA_WS_TRG1_WIDTH                         1  /* WS_TRG1 */
+
 /*
  * R97 (0x61) - Sample Rate Sequence Select 1
  */
-- 
cgit v1.2.3-70-g09d2


From 2d53809594afaf2ae66a90a3142c1b702fd3bcea Mon Sep 17 00:00:00 2001
From: Dirk Behme <dirk.behme@de.bosch.com>
Date: Mon, 6 Jul 2015 15:57:44 -0700
Subject: Input: zforce_ts - convert to use the gpiod interface

Use the new GPIO descriptor interface to handle the zForce GPIOs.
This simplifies the code and allows transparently handle GPIO polarity, as
specified in device tree data.

Also switch to using gpio_{set|get}_value_cansleep() since none of the
callers is in atomic context and cansleep variant allows more GPIO
controllers to be used with the touchscreen.

Signed-off-by: Dirk Behme <dirk.behme@de.bosch.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/zforce_ts.c   | 58 +++++++++++++++++----------------
 include/linux/platform_data/zforce_ts.h |  3 --
 2 files changed, 30 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/zforce_ts.c b/drivers/input/touchscreen/zforce_ts.c
index f58a196521a9..c4cffcfb03d3 100644
--- a/drivers/input/touchscreen/zforce_ts.c
+++ b/drivers/input/touchscreen/zforce_ts.c
@@ -24,14 +24,13 @@
 #include <linux/interrupt.h>
 #include <linux/i2c.h>
 #include <linux/delay.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/device.h>
 #include <linux/sysfs.h>
 #include <linux/input/mt.h>
 #include <linux/platform_data/zforce_ts.h>
 #include <linux/regulator/consumer.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 
 #define WAIT_TIMEOUT		msecs_to_jiffies(1000)
 
@@ -120,6 +119,9 @@ struct zforce_ts {
 
 	struct regulator	*reg_vdd;
 
+	struct gpio_desc	*gpio_int;
+	struct gpio_desc	*gpio_rst;
+
 	bool			suspending;
 	bool			suspended;
 	bool			boot_complete;
@@ -161,6 +163,16 @@ static int zforce_command(struct zforce_ts *ts, u8 cmd)
 	return 0;
 }
 
+static void zforce_reset_assert(struct zforce_ts *ts)
+{
+	gpiod_set_value_cansleep(ts->gpio_rst, 1);
+}
+
+static void zforce_reset_deassert(struct zforce_ts *ts)
+{
+	gpiod_set_value_cansleep(ts->gpio_rst, 0);
+}
+
 static int zforce_send_wait(struct zforce_ts *ts, const char *buf, int len)
 {
 	struct i2c_client *client = ts->client;
@@ -479,7 +491,6 @@ static irqreturn_t zforce_irq_thread(int irq, void *dev_id)
 {
 	struct zforce_ts *ts = dev_id;
 	struct i2c_client *client = ts->client;
-	const struct zforce_ts_platdata *pdata = ts->pdata;
 	int ret;
 	u8 payload_buffer[FRAME_MAXSIZE];
 	u8 *payload;
@@ -499,7 +510,7 @@ static irqreturn_t zforce_irq_thread(int irq, void *dev_id)
 	if (!ts->suspending && device_may_wakeup(&client->dev))
 		pm_stay_awake(&client->dev);
 
-	while (!gpio_get_value(pdata->gpio_int)) {
+	while (!gpiod_get_value_cansleep(ts->gpio_int)) {
 		ret = zforce_read_packet(ts, payload_buffer);
 		if (ret < 0) {
 			dev_err(&client->dev,
@@ -690,7 +701,7 @@ static void zforce_reset(void *data)
 {
 	struct zforce_ts *ts = data;
 
-	gpio_set_value(ts->pdata->gpio_rst, 0);
+	zforce_reset_assert(ts);
 
 	udelay(10);
 
@@ -712,18 +723,6 @@ static struct zforce_ts_platdata *zforce_parse_dt(struct device *dev)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	pdata->gpio_int = of_get_gpio(np, 0);
-	if (!gpio_is_valid(pdata->gpio_int)) {
-		dev_err(dev, "failed to get interrupt gpio\n");
-		return ERR_PTR(-EINVAL);
-	}
-
-	pdata->gpio_rst = of_get_gpio(np, 1);
-	if (!gpio_is_valid(pdata->gpio_rst)) {
-		dev_err(dev, "failed to get reset gpio\n");
-		return ERR_PTR(-EINVAL);
-	}
-
 	if (of_property_read_u32(np, "x-size", &pdata->x_max)) {
 		dev_err(dev, "failed to get x-size property\n");
 		return ERR_PTR(-EINVAL);
@@ -755,19 +754,22 @@ static int zforce_probe(struct i2c_client *client,
 	if (!ts)
 		return -ENOMEM;
 
-	ret = devm_gpio_request_one(&client->dev, pdata->gpio_int, GPIOF_IN,
-				    "zforce_ts_int");
-	if (ret) {
-		dev_err(&client->dev, "request of gpio %d failed, %d\n",
-			pdata->gpio_int, ret);
+	/* INT GPIO */
+	ts->gpio_int = devm_gpiod_get_index(&client->dev, NULL, 0, GPIOD_IN);
+	if (IS_ERR(ts->gpio_int)) {
+		ret = PTR_ERR(ts->gpio_int);
+		dev_err(&client->dev,
+			"failed to request interrupt GPIO: %d\n", ret);
 		return ret;
 	}
 
-	ret = devm_gpio_request_one(&client->dev, pdata->gpio_rst,
-				    GPIOF_OUT_INIT_LOW, "zforce_ts_rst");
-	if (ret) {
-		dev_err(&client->dev, "request of gpio %d failed, %d\n",
-			pdata->gpio_rst, ret);
+	/* RST GPIO */
+	ts->gpio_rst = devm_gpiod_get_index(&client->dev, NULL, 1,
+					    GPIOD_OUT_HIGH);
+	if (IS_ERR(ts->gpio_rst)) {
+		ret = PTR_ERR(ts->gpio_rst);
+		dev_err(&client->dev,
+			"failed to request reset GPIO: %d\n", ret);
 		return ret;
 	}
 
@@ -863,7 +865,7 @@ static int zforce_probe(struct i2c_client *client,
 	i2c_set_clientdata(client, ts);
 
 	/* let the controller boot */
-	gpio_set_value(pdata->gpio_rst, 1);
+	zforce_reset_deassert(ts);
 
 	ts->command_waiting = NOTIFICATION_BOOTCOMPLETE;
 	if (wait_for_completion_timeout(&ts->command_done, WAIT_TIMEOUT) == 0)
diff --git a/include/linux/platform_data/zforce_ts.h b/include/linux/platform_data/zforce_ts.h
index 0472ab2f6ede..7bdece8ef33e 100644
--- a/include/linux/platform_data/zforce_ts.h
+++ b/include/linux/platform_data/zforce_ts.h
@@ -16,9 +16,6 @@
 #define _LINUX_INPUT_ZFORCE_TS_H
 
 struct zforce_ts_platdata {
-	int gpio_int;
-	int gpio_rst;
-
 	unsigned int x_max;
 	unsigned int y_max;
 };
-- 
cgit v1.2.3-70-g09d2


From 7876f930d0e78addc6bbdbba0d6c196a0788d545 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2015 16:39:49 -0400
Subject: blkcg: implement all_blkcgs list

Add all_blkcgs list goes through blkcg->all_blkcgs_node and is
protected by blkcg_pol_mutex.  This will be used to fix
blkcg_policy_data allocation bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 17 ++++++++++++-----
 include/linux/blk-cgroup.h |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 05b893de516b..42ff436ffaf4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -46,6 +46,8 @@ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
+static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
+
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
 {
@@ -817,6 +819,10 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 
+	mutex_lock(&blkcg_pol_mutex);
+	list_del(&blkcg->all_blkcgs_node);
+	mutex_unlock(&blkcg_pol_mutex);
+
 	if (blkcg != &blkcg_root) {
 		int i;
 
@@ -833,6 +839,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	struct cgroup_subsys_state *ret;
 	int i;
 
+	mutex_lock(&blkcg_pol_mutex);
+
 	if (!parent_css) {
 		blkcg = &blkcg_root;
 		goto done;
@@ -844,8 +852,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		goto free_blkcg;
 	}
 
-	mutex_lock(&blkcg_pol_mutex);
-
 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -862,7 +868,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		BUG_ON(blkcg->pd[i]);
 		cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
 		if (!cpd) {
-			mutex_unlock(&blkcg_pol_mutex);
 			ret = ERR_PTR(-ENOMEM);
 			goto free_pd_blkcg;
 		}
@@ -871,7 +876,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		pol->cpd_init_fn(blkcg);
 	}
 
-	mutex_unlock(&blkcg_pol_mutex);
 done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
@@ -879,14 +883,17 @@ done:
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&blkcg->cgwb_list);
 #endif
+	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
+
+	mutex_unlock(&blkcg_pol_mutex);
 	return &blkcg->css;
 
 free_pd_blkcg:
 	for (i--; i >= 0; i--)
 		kfree(blkcg->pd[i]);
-
 free_blkcg:
 	kfree(blkcg);
+	mutex_unlock(&blkcg_pol_mutex);
 	return ret;
 }
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 58cfab80dd70..cf3e7bc22ef3 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -47,6 +47,7 @@ struct blkcg {
 
 	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
 
+	struct list_head		all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head		cgwb_list;
 #endif
-- 
cgit v1.2.3-70-g09d2


From 06b285bd11257bccc5a1b85a835507e33656aff2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2015 16:39:50 -0400
Subject: blkcg: fix blkcg_policy_data allocation bug

e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg
data") updated per-blkcg policy data to be dynamically allocated.
When a policy is registered, its policy data aren't created.  Instead,
when the policy is activated on a queue, the policy data are allocated
if there are blkg's (blkcg_gq's) which are attached to a given blkcg.
This is buggy.  Consider the following scenario.

1. A blkcg is created.  No blkg's attached yet.

2. The policy is registered.  No policy data is allocated.

3. The policy is activated on a queue.  As the above blkcg doesn't
   have any blkg's, it won't allocate the matching blkcg_policy_data.

4. An IO is issued from the blkcg and blkg is created and the blkcg
   still doesn't have the matching policy data allocated.

With cfq-iosched, this leads to an oops.

It also doesn't free policy data on policy unregistration assuming
that freeing of all policy data on blkcg destruction should take care
of it; however, this also is incorrect.

1. A blkcg has policy data.

2. The policy gets unregistered but the policy data remains.

3. Another policy gets registered on the same slot.

4. Later, the new policy tries to allocate policy data on the previous
   blkcg but the slot is already occupied and gets skipped.  The
   policy ends up operating on the policy data of the previous policy.

There's no reason to manage blkcg_policy_data lazily.  The reason we
do lazy allocation of blkg's is that the number of all possible blkg's
is the product of cgroups and block devices which can reach a
surprising level.  blkcg_policy_data is contrained by the number of
cgroups and shouldn't be a problem.

This patch makes blkcg_policy_data to be allocated for all existing
blkcg's on policy registration and freed on unregistration and removes
blkcg_policy_data handling from policy [de]activation paths.  This
makes that blkcg_policy_data are created and removed with the policy
they belong to and fixes the above described problems.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg data")
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 78 +++++++++++++++++++++++++---------------------
 include/linux/blk-cgroup.h | 10 ++----
 2 files changed, 44 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 42ff436ffaf4..9da02c021ebe 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1048,10 +1048,8 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
-	LIST_HEAD(cpds);
 	struct blkcg_gq *blkg;
 	struct blkg_policy_data *pd, *nd;
-	struct blkcg_policy_data *cpd, *cnd;
 	int cnt = 0, ret;
 
 	if (blkcg_policy_enabled(q, pol))
@@ -1064,10 +1062,7 @@ int blkcg_activate_policy(struct request_queue *q,
 		cnt++;
 	spin_unlock_irq(q->queue_lock);
 
-	/*
-	 * Allocate per-blkg and per-blkcg policy data
-	 * for all existing blkgs.
-	 */
+	/* allocate per-blkg policy data for all existing blkgs */
 	while (cnt--) {
 		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
 		if (!pd) {
@@ -1075,15 +1070,6 @@ int blkcg_activate_policy(struct request_queue *q,
 			goto out_free;
 		}
 		list_add_tail(&pd->alloc_node, &pds);
-
-		if (!pol->cpd_size)
-			continue;
-		cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
-		if (!cpd) {
-			ret = -ENOMEM;
-			goto out_free;
-		}
-		list_add_tail(&cpd->alloc_node, &cpds);
 	}
 
 	/*
@@ -1093,32 +1079,17 @@ int blkcg_activate_policy(struct request_queue *q,
 	spin_lock_irq(q->queue_lock);
 
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		if (WARN_ON(list_empty(&pds)) ||
-		    WARN_ON(pol->cpd_size && list_empty(&cpds))) {
+		if (WARN_ON(list_empty(&pds))) {
 			/* umm... this shouldn't happen, just abort */
 			ret = -ENOMEM;
 			goto out_unlock;
 		}
-		cpd = list_first_entry(&cpds, struct blkcg_policy_data,
-				       alloc_node);
-		list_del_init(&cpd->alloc_node);
 		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
 		list_del_init(&pd->alloc_node);
 
 		/* grab blkcg lock too while installing @pd on @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
-		if (!pol->cpd_size)
-			goto no_cpd;
-		if (!blkg->blkcg->pd[pol->plid]) {
-			/* Per-policy per-blkcg data */
-			blkg->blkcg->pd[pol->plid] = cpd;
-			cpd->plid = pol->plid;
-			pol->cpd_init_fn(blkg->blkcg);
-		} else { /* must free it as it has already been extracted */
-			kfree(cpd);
-		}
-no_cpd:
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
@@ -1135,8 +1106,6 @@ out_free:
 	blk_queue_bypass_end(q);
 	list_for_each_entry_safe(pd, nd, &pds, alloc_node)
 		kfree(pd);
-	list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
-		kfree(cpd);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1191,6 +1160,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
  */
 int blkcg_policy_register(struct blkcg_policy *pol)
 {
+	struct blkcg *blkcg;
 	int i, ret;
 
 	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
@@ -1207,9 +1177,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	if (i >= BLKCG_MAX_POLS)
 		goto err_unlock;
 
-	/* register and update blkgs */
+	/* register @pol */
 	pol->plid = i;
-	blkcg_policy[i] = pol;
+	blkcg_policy[pol->plid] = pol;
+
+	/* allocate and install cpd's */
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			struct blkcg_policy_data *cpd;
+
+			cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+			if (!cpd) {
+				mutex_unlock(&blkcg_pol_mutex);
+				goto err_free_cpds;
+			}
+
+			blkcg->pd[pol->plid] = cpd;
+			cpd->plid = pol->plid;
+			pol->cpd_init_fn(blkcg);
+		}
+	}
+
 	mutex_unlock(&blkcg_pol_mutex);
 
 	/* everything is in place, add intf files for the new policy */
@@ -1219,6 +1207,14 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	mutex_unlock(&blkcg_pol_register_mutex);
 	return 0;
 
+err_free_cpds:
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			kfree(blkcg->pd[pol->plid]);
+			blkcg->pd[pol->plid] = NULL;
+		}
+	}
+	blkcg_policy[pol->plid] = NULL;
 err_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
 	mutex_unlock(&blkcg_pol_register_mutex);
@@ -1234,6 +1230,8 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
  */
 void blkcg_policy_unregister(struct blkcg_policy *pol)
 {
+	struct blkcg *blkcg;
+
 	mutex_lock(&blkcg_pol_register_mutex);
 
 	if (WARN_ON(blkcg_policy[pol->plid] != pol))
@@ -1243,9 +1241,17 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 	if (pol->cftypes)
 		cgroup_rm_cftypes(pol->cftypes);
 
-	/* unregister and update blkgs */
+	/* remove cpds and unregister */
 	mutex_lock(&blkcg_pol_mutex);
+
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			kfree(blkcg->pd[pol->plid]);
+			blkcg->pd[pol->plid] = NULL;
+		}
+	}
 	blkcg_policy[pol->plid] = NULL;
+
 	mutex_unlock(&blkcg_pol_mutex);
 out_unlock:
 	mutex_unlock(&blkcg_pol_register_mutex);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index cf3e7bc22ef3..1b62d768c7df 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -89,18 +89,12 @@ struct blkg_policy_data {
  * Policies that need to keep per-blkcg data which is independent
  * from any request_queue associated to it must specify its size
  * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it. blkcg core allocates
- * policy-specific per-blkcg structures lazily the first time
- * they are actually needed, so it handles them together with
- * blkgs. cpd_init() is invoked to let each policy handle
- * per-blkcg data.
+ * embed a blkcg_policy_data in it.  cpd_init() is invoked to let
+ * each policy handle per-blkcg data.
  */
 struct blkcg_policy_data {
 	/* the policy id this per-policy data belongs to */
 	int				plid;
-
-	/* used during policy activation */
-	struct list_head		alloc_node;
 };
 
 /* association between a blk cgroup and a request queue */
-- 
cgit v1.2.3-70-g09d2


From 4a0e3e989d66bb7204b163d9cfaa7fa96d0f2023 Mon Sep 17 00:00:00 2001
From: Enrico Mioso <mrkiko.rs@gmail.com>
Date: Wed, 8 Jul 2015 13:05:57 +0200
Subject: cdc_ncm: Add support for moving NDP to end of NCM frame

NCM specs are not actually mandating a specific position in the frame for
the NDP (Network Datagram Pointer). However, some Huawei devices will
ignore our aggregates if it is not placed after the datagrams it points
to. Add support for doing just this, in a per-device configurable way.
While at it, update NCM subdrivers, disabling this functionality in all of
them, except in huawei_cdc_ncm where it is enabled instead.
We aren't making any distinction between different Huawei NCM devices,
based on what the vendor driver does. Standard NCM devices are left
unaffected: if they are compliant, they should be always usable, still
stay on the safe side.

This change has been tested and working with a Huawei E3131 device (which
works regardless of NDP position), a Huawei E3531 (also working both
ways) and an E3372 (which mandates NDP to be after indexed datagrams).

V1->V2:
- corrected wrong NDP acronym definition
- fixed possible NULL pointer dereference
- patch cleanup
V2->V3:
- Properly account for the NDP size when writing new packets to SKB

Signed-off-by: Enrico Mioso <mrkiko.rs@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_mbim.c       |  2 +-
 drivers/net/usb/cdc_ncm.c        | 61 ++++++++++++++++++++++++++++++++++++----
 drivers/net/usb/huawei_cdc_ncm.c |  7 +++--
 include/linux/usb/cdc_ncm.h      |  7 ++++-
 4 files changed, 67 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c
index e4b7a47a825c..efc18e05af0a 100644
--- a/drivers/net/usb/cdc_mbim.c
+++ b/drivers/net/usb/cdc_mbim.c
@@ -158,7 +158,7 @@ static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf)
 	if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting))
 		goto err;
 
-	ret = cdc_ncm_bind_common(dev, intf, data_altsetting);
+	ret = cdc_ncm_bind_common(dev, intf, data_altsetting, 0);
 	if (ret)
 		goto err;
 
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 8067b8fbb0ee..1991e4a24657 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -684,10 +684,12 @@ static void cdc_ncm_free(struct cdc_ncm_ctx *ctx)
 		ctx->tx_curr_skb = NULL;
 	}
 
+	kfree(ctx->delayed_ndp16);
+
 	kfree(ctx);
 }
 
-int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting)
+int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags)
 {
 	const struct usb_cdc_union_desc *union_desc = NULL;
 	struct cdc_ncm_ctx *ctx;
@@ -855,6 +857,17 @@ advance:
 	/* finish setting up the device specific data */
 	cdc_ncm_setup(dev);
 
+	/* Device-specific flags */
+	ctx->drvflags = drvflags;
+
+	/* Allocate the delayed NDP if needed. */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) {
+		ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL);
+		if (!ctx->delayed_ndp16)
+			goto error2;
+		dev_info(&intf->dev, "NDP will be placed at end of frame for this device.");
+	}
+
 	/* override ethtool_ops */
 	dev->net->ethtool_ops = &cdc_ncm_ethtool_ops;
 
@@ -954,8 +967,11 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
 	if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM)
 		return -ENODEV;
 
-	/* The NCM data altsetting is fixed */
-	ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM);
+	/* The NCM data altsetting is fixed, so we hard-coded it.
+	 * Additionally, generic NCM devices are assumed to accept arbitrarily
+	 * placed NDP.
+	 */
+	ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0);
 
 	/*
 	 * We should get an event when network connection is "connected" or
@@ -986,6 +1002,14 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 	struct usb_cdc_ncm_nth16 *nth16 = (void *)skb->data;
 	size_t ndpoffset = le16_to_cpu(nth16->wNdpIndex);
 
+	/* If NDP should be moved to the end of the NCM package, we can't follow the
+	* NTH16 header as we would normally do. NDP isn't written to the SKB yet, and
+	* the wNdpIndex field in the header is actually not consistent with reality. It will be later.
+	*/
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)
+		if (ctx->delayed_ndp16->dwSignature == sign)
+			return ctx->delayed_ndp16;
+
 	/* follow the chain of NDPs, looking for a match */
 	while (ndpoffset) {
 		ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb->data + ndpoffset);
@@ -995,7 +1019,8 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 	}
 
 	/* align new NDP */
-	cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_max);
+	if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END))
+		cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_max);
 
 	/* verify that there is room for the NDP and the datagram (reserve) */
 	if ((ctx->tx_max - skb->len - reserve) < ctx->max_ndp_size)
@@ -1008,7 +1033,11 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 		nth16->wNdpIndex = cpu_to_le16(skb->len);
 
 	/* push a new empty NDP */
-	ndp16 = (struct usb_cdc_ncm_ndp16 *)memset(skb_put(skb, ctx->max_ndp_size), 0, ctx->max_ndp_size);
+	if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END))
+		ndp16 = (struct usb_cdc_ncm_ndp16 *)memset(skb_put(skb, ctx->max_ndp_size), 0, ctx->max_ndp_size);
+	else
+		ndp16 = ctx->delayed_ndp16;
+
 	ndp16->dwSignature = sign;
 	ndp16->wLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_ndp16) + sizeof(struct usb_cdc_ncm_dpe16));
 	return ndp16;
@@ -1023,6 +1052,15 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 	struct sk_buff *skb_out;
 	u16 n = 0, index, ndplen;
 	u8 ready2send = 0;
+	u32 delayed_ndp_size;
+
+	/* When our NDP gets written in cdc_ncm_ndp(), then skb_out->len gets updated
+	 * accordingly. Otherwise, we should check here.
+	 */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)
+		delayed_ndp_size = ctx->max_ndp_size;
+	else
+		delayed_ndp_size = 0;
 
 	/* if there is a remaining skb, it gets priority */
 	if (skb != NULL) {
@@ -1077,7 +1115,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 		cdc_ncm_align_tail(skb_out,  ctx->tx_modulus, ctx->tx_remainder, ctx->tx_max);
 
 		/* check if we had enough room left for both NDP and frame */
-		if (!ndp16 || skb_out->len + skb->len > ctx->tx_max) {
+		if (!ndp16 || skb_out->len + skb->len + delayed_ndp_size > ctx->tx_max) {
 			if (n == 0) {
 				/* won't fit, MTU problem? */
 				dev_kfree_skb_any(skb);
@@ -1150,6 +1188,17 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 		/* variables will be reset at next call */
 	}
 
+	/* If requested, put NDP at end of frame. */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) {
+		nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data;
+		cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_max);
+		nth16->wNdpIndex = cpu_to_le16(skb_out->len);
+		memcpy(skb_put(skb_out, ctx->max_ndp_size), ctx->delayed_ndp16, ctx->max_ndp_size);
+
+		/* Zero out delayed NDP - signature checking will naturally fail. */
+		ndp16 = memset(ctx->delayed_ndp16, 0, ctx->max_ndp_size);
+	}
+
 	/* If collected data size is less or equal ctx->min_tx_pkt
 	 * bytes, we send buffers as it is. If we get more data, it
 	 * would be more efficient for USB HS mobile device with DMA
diff --git a/drivers/net/usb/huawei_cdc_ncm.c b/drivers/net/usb/huawei_cdc_ncm.c
index 735f7dadb9a0..2680a65cd5e4 100644
--- a/drivers/net/usb/huawei_cdc_ncm.c
+++ b/drivers/net/usb/huawei_cdc_ncm.c
@@ -73,11 +73,14 @@ static int huawei_cdc_ncm_bind(struct usbnet *usbnet_dev,
 	struct usb_driver *subdriver = ERR_PTR(-ENODEV);
 	int ret = -ENODEV;
 	struct huawei_cdc_ncm_state *drvstate = (void *)&usbnet_dev->data;
+	int drvflags = 0;
 
 	/* altsetting should always be 1 for NCM devices - so we hard-coded
-	 * it here
+	 * it here. Some huawei devices will need the NDP part of the NCM package to
+	 * be at the end of the frame.
 	 */
-	ret = cdc_ncm_bind_common(usbnet_dev, intf, 1);
+	drvflags |= CDC_NCM_FLAG_NDP_TO_END;
+	ret = cdc_ncm_bind_common(usbnet_dev, intf, 1, drvflags);
 	if (ret)
 		goto err;
 
diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h
index 7c9b484735c5..1f6526c76ee8 100644
--- a/include/linux/usb/cdc_ncm.h
+++ b/include/linux/usb/cdc_ncm.h
@@ -80,6 +80,9 @@
 #define CDC_NCM_TIMER_INTERVAL_MIN		5UL
 #define CDC_NCM_TIMER_INTERVAL_MAX		(U32_MAX / NSEC_PER_USEC)
 
+/* Driver flags */
+#define CDC_NCM_FLAG_NDP_TO_END	0x02		/* NDP is placed at end of frame */
+
 #define cdc_ncm_comm_intf_is_mbim(x)  ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \
 				       (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE)
 #define cdc_ncm_data_intf_is_mbim(x)  ((x)->desc.bInterfaceProtocol == USB_CDC_MBIM_PROTO_NTB)
@@ -103,9 +106,11 @@ struct cdc_ncm_ctx {
 
 	spinlock_t mtx;
 	atomic_t stop;
+	int drvflags;
 
 	u32 timer_interval;
 	u32 max_ndp_size;
+	struct usb_cdc_ncm_ndp16 *delayed_ndp16;
 
 	u32 tx_timer_pending;
 	u32 tx_curr_frame_num;
@@ -133,7 +138,7 @@ struct cdc_ncm_ctx {
 };
 
 u8 cdc_ncm_select_altsetting(struct usb_interface *intf);
-int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting);
+int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags);
 void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf);
 struct sk_buff *cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign);
 int cdc_ncm_rx_verify_nth16(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in);
-- 
cgit v1.2.3-70-g09d2


From 8b58a39846568dcd7d0c98b2fadc25018e59dedf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 8 Jul 2015 23:32:12 +0200
Subject: ipv6: use flag instead of u16 for hop in inet6_skb_parm

Hop was always either 0 or sizeof(struct ipv6hdr).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 2 +-
 net/ipv6/af_inet6.c  | 4 ++--
 net/ipv6/datagram.c  | 8 ++++----
 net/ipv6/exthdrs.c   | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 82806c60aa42..1319a6bb6b82 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -94,7 +94,6 @@ static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb)
 struct inet6_skb_parm {
 	int			iif;
 	__be16			ra;
-	__u16			hop;
 	__u16			dst0;
 	__u16			srcrt;
 	__u16			dst1;
@@ -111,6 +110,7 @@ struct inet6_skb_parm {
 #define IP6SKB_REROUTED		4
 #define IP6SKB_ROUTERALERT	8
 #define IP6SKB_FRAGMENTED      16
+#define IP6SKB_HOPBYHOP        32
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7de52b65173f..39e670a91596 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -679,8 +679,8 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
 	const struct ipv6_pinfo *np = inet6_sk(sk);
 
 	if (np->rxopt.all) {
-		if ((opt->hop && (np->rxopt.bits.hopopts ||
-				  np->rxopt.bits.ohopopts)) ||
+		if (((opt->flags & IP6SKB_HOPBYHOP) &&
+		     (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
 		    (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) &&
 		     np->rxopt.bits.rxflow) ||
 		    (opt->srcrt && (np->rxopt.bits.srcrt ||
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 62d908e64eeb..50115522e80f 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -558,8 +558,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
 	}
 
 	/* HbH is allowed only once */
-	if (np->rxopt.bits.hopopts && opt->hop) {
-		u8 *ptr = nh + opt->hop;
+	if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+		u8 *ptr = nh + sizeof(struct ipv6hdr);
 		put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
 	}
 
@@ -620,8 +620,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
 		int hlim = ipv6_hdr(skb)->hop_limit;
 		put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
 	}
-	if (np->rxopt.bits.ohopopts && opt->hop) {
-		u8 *ptr = nh + opt->hop;
+	if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+		u8 *ptr = nh + sizeof(struct ipv6hdr);
 		put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
 	}
 	if (np->rxopt.bits.odstopts && opt->dst0) {
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index a7bbbe45570b..ce203b0402be 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -632,7 +632,7 @@ int ipv6_parse_hopopts(struct sk_buff *skb)
 		return -1;
 	}
 
-	opt->hop = sizeof(struct ipv6hdr);
+	opt->flags |= IP6SKB_HOPBYHOP;
 	if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
 		skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;
 		opt = IP6CB(skb);
-- 
cgit v1.2.3-70-g09d2


From fd4b7286ccc469bf5dde22db6b8fcc455c3c4a66 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 16 Jun 2015 08:52:39 +0000
Subject: regmap: add regmap_write_bits()

regmap_write_bits() is similar to regmap_update_bits(),
but regmap_write_bits() write data to register even though
it is same value.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 23 +++++++++++++++++++++++
 include/linux/regmap.h       |  9 +++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 69ec411ce722..d93bb9a8ab98 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -2374,6 +2374,29 @@ int regmap_update_bits(struct regmap *map, unsigned int reg,
 }
 EXPORT_SYMBOL_GPL(regmap_update_bits);
 
+/**
+ * regmap_write_bits: Perform a read/modify/write cycle on the register map
+ *
+ * @map: Register map to update
+ * @reg: Register to update
+ * @mask: Bitmask to change
+ * @val: New value for bitmask
+ *
+ * Returns zero for success, a negative number on error.
+ */
+int regmap_write_bits(struct regmap *map, unsigned int reg,
+		      unsigned int mask, unsigned int val)
+{
+	int ret;
+
+	map->lock(map->lock_arg);
+	ret = _regmap_update_bits(map, reg, mask, val, NULL, true);
+	map->unlock(map->lock_arg);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_write_bits);
+
 /**
  * regmap_update_bits_async: Perform a read/modify/write cycle on the register
  *                           map asynchronously
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 59c55ea0f0b5..e4b9ad4f05ef 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -424,6 +424,8 @@ int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val,
 		     size_t val_count);
 int regmap_update_bits(struct regmap *map, unsigned int reg,
 		       unsigned int mask, unsigned int val);
+int regmap_write_bits(struct regmap *map, unsigned int reg,
+		       unsigned int mask, unsigned int val);
 int regmap_update_bits_async(struct regmap *map, unsigned int reg,
 			     unsigned int mask, unsigned int val);
 int regmap_update_bits_check(struct regmap *map, unsigned int reg,
@@ -645,6 +647,13 @@ static inline int regmap_update_bits(struct regmap *map, unsigned int reg,
 	return -EINVAL;
 }
 
+static inline int regmap_write_bits(struct regmap *map, unsigned int reg,
+				     unsigned int mask, unsigned int val)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
 static inline int regmap_update_bits_async(struct regmap *map,
 					   unsigned int reg,
 					   unsigned int mask, unsigned int val)
-- 
cgit v1.2.3-70-g09d2


From e874e6c7edc43436f73cf84157d9221f8b807c36 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 16 Jun 2015 08:52:55 +0000
Subject: regmap: add regmap_fields_force_write()

regmap_fields_force_write() is similar to regmap_fields_write(),
but regmap_fields_force_write() write data to register even though
it is same value.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 12 ++++++++++++
 include/linux/regmap.h       |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index d93bb9a8ab98..dd63bcbbf8a5 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1624,6 +1624,18 @@ int regmap_fields_write(struct regmap_field *field, unsigned int id,
 }
 EXPORT_SYMBOL_GPL(regmap_fields_write);
 
+int regmap_fields_force_write(struct regmap_field *field, unsigned int id,
+			unsigned int val)
+{
+	if (id >= field->id_size)
+		return -EINVAL;
+
+	return regmap_write_bits(field->regmap,
+				  field->reg + (field->id_offset * id),
+				  field->mask, val << field->shift);
+}
+EXPORT_SYMBOL_GPL(regmap_fields_force_write);
+
 /**
  * regmap_fields_update_bits():	Perform a read/modify/write cycle
  *                              on the register field
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index e4b9ad4f05ef..519c96231a91 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -505,6 +505,8 @@ int regmap_field_update_bits(struct regmap_field *field,
 
 int regmap_fields_write(struct regmap_field *field, unsigned int id,
 			unsigned int val);
+int regmap_fields_force_write(struct regmap_field *field, unsigned int id,
+			unsigned int val);
 int regmap_fields_read(struct regmap_field *field, unsigned int id,
 		       unsigned int *val);
 int regmap_fields_update_bits(struct regmap_field *field,  unsigned int id,
-- 
cgit v1.2.3-70-g09d2


From 5544eb9b81940647b8fad1f251b37cbe2819ce44 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 7 Jul 2015 15:41:58 +0200
Subject: KVM: count number of assigned devices

If there are no assigned devices, the guest PAT are not providing
any useful information and can be overridden to writeback; VMX
always does this because it has the "IPAT" bit in its extended
page table entries, but SVM does not have anything similar.
Hook into VFIO and legacy device assignment so that they
provide this information to KVM.

Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Tested-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/iommu.c            |  2 ++
 arch/x86/kvm/x86.c              | 18 ++++++++++++++++++
 include/linux/kvm_host.h        | 18 ++++++++++++++++++
 virt/kvm/vfio.c                 |  5 +++++
 5 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2a7f5d782c33..49ec9038ec14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -604,6 +604,8 @@ struct kvm_arch {
 	bool iommu_noncoherent;
 #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
 	atomic_t noncoherent_dma_count;
+#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
+	atomic_t assigned_device_count;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 7dbced309ddb..5c520ebf6343 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -200,6 +200,7 @@ int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
 			goto out_unmap;
 	}
 
+	kvm_arch_start_assignment(kvm);
 	pci_set_dev_assigned(pdev);
 
 	dev_info(&pdev->dev, "kvm assign device\n");
@@ -224,6 +225,7 @@ int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
 	iommu_detach_device(domain, &pdev->dev);
 
 	pci_clear_dev_assigned(pdev);
+	kvm_arch_end_assignment(kvm);
 
 	dev_info(&pdev->dev, "kvm deassign device\n");
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6bd19c7abc65..0024968b342d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8213,6 +8213,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
 			kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
+void kvm_arch_start_assignment(struct kvm *kvm)
+{
+	atomic_inc(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
+
+void kvm_arch_end_assignment(struct kvm *kvm)
+{
+	atomic_dec(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+
+bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+	return atomic_read(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+
 void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
 {
 	atomic_inc(&kvm->arch.noncoherent_dma_count);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9564fd78c547..05e99b8ef465 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -734,6 +734,24 @@ static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 	return false;
 }
 #endif
+#ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE
+void kvm_arch_start_assignment(struct kvm *kvm);
+void kvm_arch_end_assignment(struct kvm *kvm);
+bool kvm_arch_has_assigned_device(struct kvm *kvm);
+#else
+static inline void kvm_arch_start_assignment(struct kvm *kvm)
+{
+}
+
+static inline void kvm_arch_end_assignment(struct kvm *kvm)
+{
+}
+
+static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+	return false;
+}
+#endif
 
 static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 {
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 620e37f741b8..1dd087da6f31 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -155,6 +155,8 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 		list_add_tail(&kvg->node, &kv->group_list);
 		kvg->vfio_group = vfio_group;
 
+		kvm_arch_start_assignment(dev->kvm);
+
 		mutex_unlock(&kv->lock);
 
 		kvm_vfio_update_coherency(dev);
@@ -190,6 +192,8 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 			break;
 		}
 
+		kvm_arch_end_assignment(dev->kvm);
+
 		mutex_unlock(&kv->lock);
 
 		kvm_vfio_group_put_external_user(vfio_group);
@@ -239,6 +243,7 @@ static void kvm_vfio_destroy(struct kvm_device *dev)
 		kvm_vfio_group_put_external_user(kvg->vfio_group);
 		list_del(&kvg->node);
 		kfree(kvg);
+		kvm_arch_end_assignment(dev->kvm);
 	}
 
 	kvm_vfio_update_coherency(dev);
-- 
cgit v1.2.3-70-g09d2


From 90f8572b0f021fdd1baa68e00a8c30482ee9e5f4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 29 Jun 2015 14:42:03 -0500
Subject: vfs: Commit to never having exectuables on proc and sysfs.

Today proc and sysfs do not contain any executable files.  Several
applications today mount proc or sysfs without noexec and nosuid and
then depend on there being no exectuables files on proc or sysfs.
Having any executable files show on proc or sysfs would cause
a user space visible regression, and most likely security problems.

Therefore commit to never allowing executables on proc and sysfs by
adding a new flag to mark them as filesystems without executables and
enforce that flag.

Test the flag where MNT_NOEXEC is tested today, so that the only user
visible effect will be that exectuables will be treated as if the
execute bit is cleared.

The filesystems proc and sysfs do not currently incoporate any
executable files so this does not result in any user visible effects.

This makes it unnecessary to vet changes to proc and sysfs tightly for
adding exectuable files or changes to chattr that would modify
existing files, as no matter what the individual file say they will
not be treated as exectuable files by the vfs.

Not having to vet changes to closely is important as without this we
are only one proc_create call (or another goof up in the
implementation of notify_change) from having problematic executables
on proc.  Those mistakes are all too easy to make and would create
a situation where there are security issues or the assumptions of
some program having to be broken (and cause userspace regressions).

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c           | 10 ++++++++--
 fs/open.c           |  2 +-
 fs/proc/root.c      |  2 ++
 fs/sysfs/mount.c    |  4 ++++
 include/linux/fs.h  |  3 +++
 kernel/sys.c        |  3 +--
 mm/mmap.c           |  4 ++--
 mm/nommu.c          |  2 +-
 security/security.c |  2 +-
 9 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 1977c2a553ac..b06623a9347f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -98,6 +98,12 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
 	module_put(fmt->module);
 }
 
+bool path_noexec(const struct path *path)
+{
+	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
+	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
+}
+
 #ifdef CONFIG_USELIB
 /*
  * Note that a shared library must be both readable and executable due to
@@ -132,7 +138,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 		goto exit;
 
 	error = -EACCES;
-	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+	if (path_noexec(&file->f_path))
 		goto exit;
 
 	fsnotify_open(file);
@@ -777,7 +783,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
 	if (!S_ISREG(file_inode(file)->i_mode))
 		goto exit;
 
-	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+	if (path_noexec(&file->f_path))
 		goto exit;
 
 	err = deny_write_access(file);
diff --git a/fs/open.c b/fs/open.c
index e33dab287fa0..b6f1e96a7c0b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -377,7 +377,7 @@ retry:
 		 * with the "noexec" flag.
 		 */
 		res = -EACCES;
-		if (path.mnt->mnt_flags & MNT_NOEXEC)
+		if (path_noexec(&path))
 			goto out_path_release;
 	}
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 68feb0f70e63..361ab4ee42fc 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		}
 
 		sb->s_flags |= MS_ACTIVE;
+		/* User space would break if executables appear on proc */
+		sb->s_iflags |= SB_I_NOEXEC;
 	}
 
 	return dget(sb->s_root);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 1c6ac6fcee9f..f3db82071cfb 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -40,6 +40,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 				SYSFS_MAGIC, &new_sb, ns);
 	if (IS_ERR(root) || !new_sb)
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
+	else if (new_sb)
+		/* Userspace would break if executables appear on sysfs */
+		root->d_sb->s_iflags |= SB_I_NOEXEC;
+
 	return root;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0653e560c26..42912f8d286e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1244,6 +1244,7 @@ struct mm_struct;
 
 /* sb->s_iflags */
 #define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
+#define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
 
 /* Possible states of 'frozen' field */
 enum {
@@ -3030,4 +3031,6 @@ static inline bool dir_relax(struct inode *inode)
 	return !IS_DEADDIR(inode);
 }
 
+extern bool path_noexec(const struct path *path);
+
 #endif /* _LINUX_FS_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 259fda25eb6b..fa2f2f671a5c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 	 * overall picture.
 	 */
 	err = -EACCES;
-	if (!S_ISREG(inode->i_mode)	||
-	    exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+	if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
 		goto exit;
 
 	err = inode_permission(inode, MAY_EXEC);
diff --git a/mm/mmap.c b/mm/mmap.c
index aa632ade2be7..f126923ce683 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1268,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	 *  mounted, in which case we dont add PROT_EXEC.)
 	 */
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
+		if (!(file && path_noexec(&file->f_path)))
 			prot |= PROT_EXEC;
 
 	if (!(flags & MAP_FIXED))
@@ -1337,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
-			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+			if (path_noexec(&file->f_path)) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
diff --git a/mm/nommu.c b/mm/nommu.c
index 58ea3643b9e9..ce17abf087ff 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1035,7 +1035,7 @@ static int validate_mmap_request(struct file *file,
 
 		/* handle executable mappings and implied executable
 		 * mappings */
-		if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+		if (path_noexec(&file->f_path)) {
 			if (prot & PROT_EXEC)
 				return -EPERM;
 		} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
diff --git a/security/security.c b/security/security.c
index 595fffab48b0..062f3c997fdc 100644
--- a/security/security.c
+++ b/security/security.c
@@ -776,7 +776,7 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
 	 * ditto if it's not on noexec mount, except that on !MMU we need
 	 * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case
 	 */
-	if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
+	if (!path_noexec(&file->f_path)) {
 #ifndef CONFIG_MMU
 		if (file->f_op->mmap_capabilities) {
 			unsigned caps = file->f_op->mmap_capabilities(file);
-- 
cgit v1.2.3-70-g09d2


From 5127e31a6ce04bd41a020c0ba28a1c0915ab6da1 Mon Sep 17 00:00:00 2001
From: "Suzuki K. Poulose" <suzuki.poulose@arm.com>
Date: Fri, 10 Jul 2015 16:26:38 +0100
Subject: regulator: Add missing dummy definition for regulator_list_voltage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes a build break when CONFIG_REGULATOR is not selected.

e.g, on linux-next - 07102015:

drivers/clk/tegra/clk-dfll.c: In function ‘find_lut_index_for_rate’:
drivers/clk/tegra/clk-dfll.c:691:3: error: implicit declaration of function ‘regulator_list_voltage’ [-Werror=implicit-function-declaration]
    if (regulator_list_voltage(td->vdd_reg, td->i2c_lut[i]) == uv)
    ^
   CC      drivers/clocksource/mmio.o
   CC      fs/proc/softirqs.o
cc1: some warnings being treated as errors
make[3]: *** [drivers/clk/tegra/clk-dfll.o] Error 1
make[2]: *** [drivers/clk/tegra] Error 2
make[1]: *** [drivers/clk] Error 2
make[1]: *** Waiting for unfinished jobs....

This should be pushed to 4.2 as we have the issue in 4.2-rc1, just that
nobody uses it without the REGULATOR(yet).

Signed-off-by: Suzuki K. Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/consumer.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index f8a689ed62a5..2ba4a40919c8 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -550,6 +550,12 @@ static inline int regulator_count_voltages(struct regulator *regulator)
 {
 	return 0;
 }
+
+static inline int regulator_list_voltage(struct regulator *regulator, unsigned selector)
+{
+	return -EINVAL;
+}
+
 #endif
 
 static inline int regulator_set_voltage_tol(struct regulator *regulator,
-- 
cgit v1.2.3-70-g09d2


From 634ec36cc0ab9d8dda0f2c101fa28d2e2a61b9eb Mon Sep 17 00:00:00 2001
From: David Thomson <david.thomson@alliedtelesis.co.nz>
Date: Fri, 10 Jul 2015 13:56:54 +1200
Subject: net: phy: Pass mdix ethtool setting through to phy driver

Pass the mdix setting from ethtool down to the phy driver, to allow
driver specific implementations of manually setting the polarity.

Signed-off-by: David Thomson <david.thomson@alliedtelesis.co.nz>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 2 ++
 include/linux/phy.h   | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index b2197b506acb..47693a9ebd3a 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -353,6 +353,8 @@ int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd)
 
 	phydev->duplex = cmd->duplex;
 
+	phydev->mdix = cmd->eth_tp_mdix_ctrl;
+
 	/* Restart the PHY */
 	phy_start_aneg(phydev);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index a26c3f84b8dd..e5fb1d415961 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -424,6 +424,8 @@ struct phy_device {
 
 	struct net_device *attached_dev;
 
+	u8 mdix;
+
 	void (*adjust_link)(struct net_device *dev);
 };
 #define to_phy_device(d) container_of(d, struct phy_device, dev)
-- 
cgit v1.2.3-70-g09d2


From 0dcdbc97557fd8c297c4e38e9f66e304a64bae9d Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 4 Jun 2015 12:13:28 +0800
Subject: genirq: Remove the irq argument from note_interrupt()

Only required for the slow path. Retrieve it from irq descriptor if
necessary.

[ tglx: Split out from combo patch. Left [try_]misrouted_irq()
  	untouched as there is no win in the slow path ]

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Link: http://lkml.kernel.org/r/1433391238-19471-19-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h   | 3 +--
 kernel/irq/chip.c     | 2 +-
 kernel/irq/handle.c   | 2 +-
 kernel/irq/spurious.c | 6 ++++--
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 92188b0225bb..429ac266c7c6 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -487,8 +487,7 @@ extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data,
 #endif
 
 /* Handling of unhandled and spurious interrupts: */
-extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
-			   irqreturn_t action_ret);
+extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret);
 
 
 /* Enable/disable irq debugging output: */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 310d65885440..76f199dc6a5e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -328,7 +328,7 @@ void handle_nested_irq(unsigned int irq)
 
 	action_ret = action->thread_fn(action->irq, action->dev_id);
 	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret);
+		note_interrupt(desc, action_ret);
 
 	raw_spin_lock_irq(&desc->lock);
 	irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 4d37b96343e9..b6eeea8a80c5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -176,7 +176,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 	add_interrupt_randomness(irq, flags);
 
 	if (!noirqdebug)
-		note_interrupt(irq, desc, retval);
+		note_interrupt(desc, retval);
 	return retval;
 }
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 5378c529c1dc..32144175458d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -270,9 +270,10 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
 
 #define SPURIOUS_DEFERRED	0x80000000
 
-void note_interrupt(unsigned int irq, struct irq_desc *desc,
-		    irqreturn_t action_ret)
+void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 {
+	unsigned int irq;
+
 	if (desc->istate & IRQS_POLL_INPROGRESS ||
 	    irq_settings_is_polled(desc))
 		return;
@@ -396,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		desc->last_unhandled = jiffies;
 	}
 
+	irq = irq_desc_get_irq(desc);
 	if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
 		int ok = misrouted_irq(irq);
 		if (action_ret == IRQ_NONE)
-- 
cgit v1.2.3-70-g09d2


From 4200e831e4a8fd09fa4e78de2e571ab270c12d06 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Mon, 6 Jul 2015 15:18:24 -0700
Subject: Input: of_touchscreen - switch to using device properties

Let's switch form OF to device properties so that common parsing code could
work not only on device tree but also on ACPI-based platforms.

Reviewed-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/Kconfig          |  4 +--
 drivers/input/touchscreen/Makefile         |  2 +-
 drivers/input/touchscreen/edt-ft5x06.c     |  2 +-
 drivers/input/touchscreen/of_touchscreen.c | 56 ++++++++++++++++--------------
 drivers/input/touchscreen/tsc2005.c        |  2 +-
 include/linux/input/touchscreen.h          | 11 ++----
 6 files changed, 37 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index aa2b5f21b89b..27035ecbd4f5 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -11,9 +11,9 @@ menuconfig INPUT_TOUCHSCREEN
 
 if INPUT_TOUCHSCREEN
 
-config OF_TOUCHSCREEN
+config TOUCHSCREEN_PROPERTIES
 	def_tristate INPUT
-	depends on INPUT && OF
+	depends on INPUT
 
 config TOUCHSCREEN_88PM860X
 	tristate "Marvell 88PM860x touchscreen"
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index fa3d33bac7fc..c85aae23e7f8 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -6,7 +6,7 @@
 
 wm97xx-ts-y := wm97xx-core.o
 
-obj-$(CONFIG_OF_TOUCHSCREEN)		+= of_touchscreen.o
+obj-$(CONFIG_TOUCHSCREEN_PROPERTIES)	+= of_touchscreen.o
 obj-$(CONFIG_TOUCHSCREEN_88PM860X)	+= 88pm860x-ts.o
 obj-$(CONFIG_TOUCHSCREEN_AD7877)	+= ad7877.o
 obj-$(CONFIG_TOUCHSCREEN_AD7879)	+= ad7879.o
diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c
index 394b1de9a2a3..8f8f3199be39 100644
--- a/drivers/input/touchscreen/edt-ft5x06.c
+++ b/drivers/input/touchscreen/edt-ft5x06.c
@@ -1041,7 +1041,7 @@ static int edt_ft5x06_ts_probe(struct i2c_client *client,
 			     0, tsdata->num_y * 64 - 1, 0, 0);
 
 	if (!pdata)
-		touchscreen_parse_of_params(input, true);
+		touchscreen_parse_properties(input, true);
 
 	error = input_mt_init_slots(input, MAX_SUPPORT_POINTS, INPUT_MT_DIRECT);
 	if (error) {
diff --git a/drivers/input/touchscreen/of_touchscreen.c b/drivers/input/touchscreen/of_touchscreen.c
index 50bc0f219547..bb6f2fe14667 100644
--- a/drivers/input/touchscreen/of_touchscreen.c
+++ b/drivers/input/touchscreen/of_touchscreen.c
@@ -9,12 +9,12 @@
  *
  */
 
-#include <linux/of.h>
+#include <linux/property.h>
 #include <linux/input.h>
 #include <linux/input/mt.h>
 #include <linux/input/touchscreen.h>
 
-static bool touchscreen_get_prop_u32(struct device_node *np,
+static bool touchscreen_get_prop_u32(struct device *dev,
 				     const char *property,
 				     unsigned int default_value,
 				     unsigned int *value)
@@ -22,7 +22,7 @@ static bool touchscreen_get_prop_u32(struct device_node *np,
 	u32 val;
 	int error;
 
-	error = of_property_read_u32(np, property, &val);
+	error = device_property_read_u32(dev, property, &val);
 	if (error) {
 		*value = default_value;
 		return false;
@@ -51,54 +51,58 @@ static void touchscreen_set_params(struct input_dev *dev,
 }
 
 /**
- * touchscreen_parse_of_params - parse common touchscreen DT properties
- * @dev: device that should be parsed
+ * touchscreen_parse_properties - parse common touchscreen DT properties
+ * @input: input device that should be parsed
+ * @multitouch: specifies whether parsed properties should be applied to
+ *	single-touch or multi-touch axes
  *
  * This function parses common DT properties for touchscreens and setups the
- * input device accordingly. The function keeps previously setuped default
+ * input device accordingly. The function keeps previously set up default
  * values if no value is specified via DT.
  */
-void touchscreen_parse_of_params(struct input_dev *dev, bool multitouch)
+void touchscreen_parse_properties(struct input_dev *input, bool multitouch)
 {
-	struct device_node *np = dev->dev.parent->of_node;
+	struct device *dev = input->dev.parent;
 	unsigned int axis;
 	unsigned int maximum, fuzz;
 	bool data_present;
 
-	input_alloc_absinfo(dev);
-	if (!dev->absinfo)
+	input_alloc_absinfo(input);
+	if (!input->absinfo)
 		return;
 
 	axis = multitouch ? ABS_MT_POSITION_X : ABS_X;
-	data_present = touchscreen_get_prop_u32(np, "touchscreen-size-x",
-						input_abs_get_max(dev,
+	data_present = touchscreen_get_prop_u32(dev, "touchscreen-size-x",
+						input_abs_get_max(input,
 								  axis) + 1,
 						&maximum) |
-		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-x",
-						input_abs_get_fuzz(dev, axis),
+		       touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x",
+						input_abs_get_fuzz(input, axis),
 						&fuzz);
 	if (data_present)
-		touchscreen_set_params(dev, axis, maximum - 1, fuzz);
+		touchscreen_set_params(input, axis, maximum - 1, fuzz);
 
 	axis = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
-	data_present = touchscreen_get_prop_u32(np, "touchscreen-size-y",
-						input_abs_get_max(dev,
+	data_present = touchscreen_get_prop_u32(dev, "touchscreen-size-y",
+						input_abs_get_max(input,
 								  axis) + 1,
 						&maximum) |
-		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-y",
-						input_abs_get_fuzz(dev, axis),
+		       touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y",
+						input_abs_get_fuzz(input, axis),
 						&fuzz);
 	if (data_present)
-		touchscreen_set_params(dev, axis, maximum - 1, fuzz);
+		touchscreen_set_params(input, axis, maximum - 1, fuzz);
 
 	axis = multitouch ? ABS_MT_PRESSURE : ABS_PRESSURE;
-	data_present = touchscreen_get_prop_u32(np, "touchscreen-max-pressure",
-						input_abs_get_max(dev, axis),
+	data_present = touchscreen_get_prop_u32(dev,
+						"touchscreen-max-pressure",
+						input_abs_get_max(input, axis),
 						&maximum) |
-		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-pressure",
-						input_abs_get_fuzz(dev, axis),
+		       touchscreen_get_prop_u32(dev,
+						"touchscreen-fuzz-pressure",
+						input_abs_get_fuzz(input, axis),
 						&fuzz);
 	if (data_present)
-		touchscreen_set_params(dev, axis, maximum, fuzz);
+		touchscreen_set_params(input, axis, maximum, fuzz);
 }
-EXPORT_SYMBOL(touchscreen_parse_of_params);
+EXPORT_SYMBOL(touchscreen_parse_properties);
diff --git a/drivers/input/touchscreen/tsc2005.c b/drivers/input/touchscreen/tsc2005.c
index d8c025b0f88c..aaf947525cd9 100644
--- a/drivers/input/touchscreen/tsc2005.c
+++ b/drivers/input/touchscreen/tsc2005.c
@@ -709,7 +709,7 @@ static int tsc2005_probe(struct spi_device *spi)
 	input_set_abs_params(input_dev, ABS_PRESSURE, 0, max_p, fudge_p, 0);
 
 	if (np)
-		touchscreen_parse_of_params(input_dev, false);
+		touchscreen_parse_properties(input_dev, false);
 
 	input_dev->open = tsc2005_open;
 	input_dev->close = tsc2005_close;
diff --git a/include/linux/input/touchscreen.h b/include/linux/input/touchscreen.h
index eecc9ea6cd58..c91e1376132b 100644
--- a/include/linux/input/touchscreen.h
+++ b/include/linux/input/touchscreen.h
@@ -9,15 +9,8 @@
 #ifndef _TOUCHSCREEN_H
 #define _TOUCHSCREEN_H
 
-#include <linux/input.h>
+struct input_dev;
 
-#ifdef CONFIG_OF
-void touchscreen_parse_of_params(struct input_dev *dev, bool multitouch);
-#else
-static inline void touchscreen_parse_of_params(struct input_dev *dev,
-					       bool multitouch)
-{
-}
-#endif
+void touchscreen_parse_properties(struct input_dev *dev, bool multitouch);
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 28a74c050060c17b1edaee2d60470a33be476941 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Mon, 6 Jul 2015 11:48:47 -0700
Subject: Input: pixcir_i2c_ts - move platform data

Let's move driver's platform data definitions from include/linux/input/
into include/linux/platform_data/ so that it stays with the rest of
platform data definitions.

Acked-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/pixcir_i2c_ts.c   |  2 +-
 include/linux/input/pixcir_ts.h             | 64 -----------------------------
 include/linux/platform_data/pixcir_i2c_ts.h | 64 +++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 65 deletions(-)
 delete mode 100644 include/linux/input/pixcir_ts.h
 create mode 100644 include/linux/platform_data/pixcir_i2c_ts.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/pixcir_i2c_ts.c b/drivers/input/touchscreen/pixcir_i2c_ts.c
index 2c2107147319..f7d90997a786 100644
--- a/drivers/input/touchscreen/pixcir_i2c_ts.c
+++ b/drivers/input/touchscreen/pixcir_i2c_ts.c
@@ -24,11 +24,11 @@
 #include <linux/i2c.h>
 #include <linux/input.h>
 #include <linux/input/mt.h>
-#include <linux/input/pixcir_ts.h>
 #include <linux/gpio.h>
 #include <linux/of.h>
 #include <linux/of_gpio.h>
 #include <linux/of_device.h>
+#include <linux/platform_data/pixcir_i2c_ts.h>
 
 #define PIXCIR_MAX_SLOTS       5 /* Max fingers supported by driver */
 
diff --git a/include/linux/input/pixcir_ts.h b/include/linux/input/pixcir_ts.h
deleted file mode 100644
index 7bae83b7c396..000000000000
--- a/include/linux/input/pixcir_ts.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef	_PIXCIR_I2C_TS_H
-#define	_PIXCIR_I2C_TS_H
-
-/*
- * Register map
- */
-#define PIXCIR_REG_POWER_MODE	51
-#define PIXCIR_REG_INT_MODE	52
-
-/*
- * Power modes:
- * active: max scan speed
- * idle: lower scan speed with automatic transition to active on touch
- * halt: datasheet says sleep but this is more like halt as the chip
- *       clocks are cut and it can only be brought out of this mode
- *	 using the RESET pin.
- */
-enum pixcir_power_mode {
-	PIXCIR_POWER_ACTIVE,
-	PIXCIR_POWER_IDLE,
-	PIXCIR_POWER_HALT,
-};
-
-#define PIXCIR_POWER_MODE_MASK	0x03
-#define PIXCIR_POWER_ALLOW_IDLE (1UL << 2)
-
-/*
- * Interrupt modes:
- * periodical: interrupt is asserted periodicaly
- * diff coordinates: interrupt is asserted when coordinates change
- * level on touch: interrupt level asserted during touch
- * pulse on touch: interrupt pulse asserted druing touch
- *
- */
-enum pixcir_int_mode {
-	PIXCIR_INT_PERIODICAL,
-	PIXCIR_INT_DIFF_COORD,
-	PIXCIR_INT_LEVEL_TOUCH,
-	PIXCIR_INT_PULSE_TOUCH,
-};
-
-#define PIXCIR_INT_MODE_MASK	0x03
-#define PIXCIR_INT_ENABLE	(1UL << 3)
-#define PIXCIR_INT_POL_HIGH	(1UL << 2)
-
-/**
- * struct pixcir_irc_chip_data - chip related data
- * @max_fingers:	Max number of fingers reported simultaneously by h/w
- * @has_hw_ids:		Hardware supports finger tracking IDs
- *
- */
-struct pixcir_i2c_chip_data {
-	u8 max_fingers;
-	bool has_hw_ids;
-};
-
-struct pixcir_ts_platform_data {
-	int x_max;
-	int y_max;
-	int gpio_attb;		/* GPIO connected to ATTB line */
-	struct pixcir_i2c_chip_data chip;
-};
-
-#endif
diff --git a/include/linux/platform_data/pixcir_i2c_ts.h b/include/linux/platform_data/pixcir_i2c_ts.h
new file mode 100644
index 000000000000..7bae83b7c396
--- /dev/null
+++ b/include/linux/platform_data/pixcir_i2c_ts.h
@@ -0,0 +1,64 @@
+#ifndef	_PIXCIR_I2C_TS_H
+#define	_PIXCIR_I2C_TS_H
+
+/*
+ * Register map
+ */
+#define PIXCIR_REG_POWER_MODE	51
+#define PIXCIR_REG_INT_MODE	52
+
+/*
+ * Power modes:
+ * active: max scan speed
+ * idle: lower scan speed with automatic transition to active on touch
+ * halt: datasheet says sleep but this is more like halt as the chip
+ *       clocks are cut and it can only be brought out of this mode
+ *	 using the RESET pin.
+ */
+enum pixcir_power_mode {
+	PIXCIR_POWER_ACTIVE,
+	PIXCIR_POWER_IDLE,
+	PIXCIR_POWER_HALT,
+};
+
+#define PIXCIR_POWER_MODE_MASK	0x03
+#define PIXCIR_POWER_ALLOW_IDLE (1UL << 2)
+
+/*
+ * Interrupt modes:
+ * periodical: interrupt is asserted periodicaly
+ * diff coordinates: interrupt is asserted when coordinates change
+ * level on touch: interrupt level asserted during touch
+ * pulse on touch: interrupt pulse asserted druing touch
+ *
+ */
+enum pixcir_int_mode {
+	PIXCIR_INT_PERIODICAL,
+	PIXCIR_INT_DIFF_COORD,
+	PIXCIR_INT_LEVEL_TOUCH,
+	PIXCIR_INT_PULSE_TOUCH,
+};
+
+#define PIXCIR_INT_MODE_MASK	0x03
+#define PIXCIR_INT_ENABLE	(1UL << 3)
+#define PIXCIR_INT_POL_HIGH	(1UL << 2)
+
+/**
+ * struct pixcir_irc_chip_data - chip related data
+ * @max_fingers:	Max number of fingers reported simultaneously by h/w
+ * @has_hw_ids:		Hardware supports finger tracking IDs
+ *
+ */
+struct pixcir_i2c_chip_data {
+	u8 max_fingers;
+	bool has_hw_ids;
+};
+
+struct pixcir_ts_platform_data {
+	int x_max;
+	int y_max;
+	int gpio_attb;		/* GPIO connected to ATTB line */
+	struct pixcir_i2c_chip_data chip;
+};
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From cb4a5f068096c0cea954f363e70020aabb3555f4 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Mon, 6 Jul 2015 11:56:21 -0700
Subject: Input: pixcir_i2c_ts - switch the device over to gpiod

This allows uniform parsing on legacy, DT and ACPI systems.

Acked-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/pixcir_i2c_ts.c   | 26 +++++++++-----------------
 include/linux/platform_data/pixcir_i2c_ts.h |  1 -
 2 files changed, 9 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/pixcir_i2c_ts.c b/drivers/input/touchscreen/pixcir_i2c_ts.c
index f7d90997a786..19732b573227 100644
--- a/drivers/input/touchscreen/pixcir_i2c_ts.c
+++ b/drivers/input/touchscreen/pixcir_i2c_ts.c
@@ -25,8 +25,8 @@
 #include <linux/input.h>
 #include <linux/input/mt.h>
 #include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 #include <linux/of_device.h>
 #include <linux/platform_data/pixcir_i2c_ts.h>
 
@@ -35,6 +35,7 @@
 struct pixcir_i2c_ts_data {
 	struct i2c_client *client;
 	struct input_dev *input;
+	struct gpio_desc *gpio_attb;
 	const struct pixcir_ts_platform_data *pdata;
 	bool running;
 	int max_fingers;	/* Max fingers supported in this instance */
@@ -161,7 +162,6 @@ static void pixcir_ts_report(struct pixcir_i2c_ts_data *ts,
 static irqreturn_t pixcir_ts_isr(int irq, void *dev_id)
 {
 	struct pixcir_i2c_ts_data *tsdata = dev_id;
-	const struct pixcir_ts_platform_data *pdata = tsdata->pdata;
 	struct pixcir_report_data report;
 
 	while (tsdata->running) {
@@ -171,7 +171,7 @@ static irqreturn_t pixcir_ts_isr(int irq, void *dev_id)
 		/* report it */
 		pixcir_ts_report(tsdata, &report);
 
-		if (gpio_get_value(pdata->gpio_attb)) {
+		if (gpiod_get_value(tsdata->gpio_attb)) {
 			if (report.num_touches) {
 				/*
 				 * Last report with no finger up?
@@ -427,9 +427,6 @@ static struct pixcir_ts_platform_data *pixcir_parse_dt(struct device *dev)
 
 	pdata->chip = *(const struct pixcir_i2c_chip_data *)match->data;
 
-	pdata->gpio_attb = of_get_named_gpio(np, "attb-gpio", 0);
-	/* gpio_attb validity is checked in probe */
-
 	if (of_property_read_u32(np, "touchscreen-size-x", &pdata->x_max)) {
 		dev_err(dev, "Failed to get touchscreen-size-x property\n");
 		return ERR_PTR(-EINVAL);
@@ -442,8 +439,8 @@ static struct pixcir_ts_platform_data *pixcir_parse_dt(struct device *dev)
 	}
 	pdata->y_max -= 1;
 
-	dev_dbg(dev, "%s: x %d, y %d, gpio %d\n", __func__,
-		pdata->x_max + 1, pdata->y_max + 1, pdata->gpio_attb);
+	dev_dbg(dev, "%s: x %d, y %d\n", __func__,
+		pdata->x_max + 1, pdata->y_max + 1);
 
 	return pdata;
 }
@@ -476,11 +473,6 @@ static int pixcir_i2c_ts_probe(struct i2c_client *client,
 		return -EINVAL;
 	}
 
-	if (!gpio_is_valid(pdata->gpio_attb)) {
-		dev_err(dev, "Invalid gpio_attb in pdata\n");
-		return -EINVAL;
-	}
-
 	if (!pdata->chip.max_fingers) {
 		dev_err(dev, "Invalid max_fingers in pdata\n");
 		return -EINVAL;
@@ -530,10 +522,10 @@ static int pixcir_i2c_ts_probe(struct i2c_client *client,
 
 	input_set_drvdata(input, tsdata);
 
-	error = devm_gpio_request_one(dev, pdata->gpio_attb,
-				      GPIOF_DIR_IN, "pixcir_i2c_attb");
-	if (error) {
-		dev_err(dev, "Failed to request ATTB gpio\n");
+	tsdata->gpio_attb = devm_gpiod_get(dev, "attb", GPIOD_IN);
+	if (IS_ERR(tsdata->gpio_attb)) {
+		error = PTR_ERR(tsdata->gpio_attb);
+		dev_err(dev, "Failed to request ATTB gpio: %d\n", error);
 		return error;
 	}
 
diff --git a/include/linux/platform_data/pixcir_i2c_ts.h b/include/linux/platform_data/pixcir_i2c_ts.h
index 7bae83b7c396..646af6f8b838 100644
--- a/include/linux/platform_data/pixcir_i2c_ts.h
+++ b/include/linux/platform_data/pixcir_i2c_ts.h
@@ -57,7 +57,6 @@ struct pixcir_i2c_chip_data {
 struct pixcir_ts_platform_data {
 	int x_max;
 	int y_max;
-	int gpio_attb;		/* GPIO connected to ATTB line */
 	struct pixcir_i2c_chip_data chip;
 };
 
-- 
cgit v1.2.3-70-g09d2


From d3b58c47d330de8c29898fe9746f7530408f8a59 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Fri, 26 Jun 2015 11:58:19 +0200
Subject: can: replace timestamp as unique skb attribute

Commit 514ac99c64b "can: fix multiple delivery of a single CAN frame for
overlapping CAN filters" requires the skb->tstamp to be set to check for
identical CAN skbs.

Without timestamping to be required by user space applications this timestamp
was not generated which lead to commit 36c01245eb8 "can: fix loss of CAN frames
in raw_rcv" - which forces the timestamp to be set in all CAN related skbuffs
by introducing several __net_timestamp() calls.

This forces e.g. out of tree drivers which are not using alloc_can{,fd}_skb()
to add __net_timestamp() after skbuff creation to prevent the frame loss fixed
in mainline Linux.

This patch removes the timestamp dependency and uses an atomic counter to
create an unique identifier together with the skbuff pointer.

Btw: the new skbcnt element introduced in struct can_skb_priv has to be
initialized with zero in out-of-tree drivers which are not using
alloc_can{,fd}_skb() too.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: linux-stable <stable@vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c   |  7 ++-----
 drivers/net/can/slcan.c |  2 +-
 drivers/net/can/vcan.c  |  3 ---
 include/linux/can/skb.h |  2 ++
 net/can/af_can.c        | 12 +++++++-----
 net/can/bcm.c           |  2 ++
 net/can/raw.c           |  7 ++++---
 7 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index e9b1810d319f..aede704605c6 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -440,9 +440,6 @@ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx)
 		struct can_frame *cf = (struct can_frame *)skb->data;
 		u8 dlc = cf->can_dlc;
 
-		if (!(skb->tstamp.tv64))
-			__net_timestamp(skb);
-
 		netif_rx(priv->echo_skb[idx]);
 		priv->echo_skb[idx] = NULL;
 
@@ -578,7 +575,6 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
 	if (unlikely(!skb))
 		return NULL;
 
-	__net_timestamp(skb);
 	skb->protocol = htons(ETH_P_CAN);
 	skb->pkt_type = PACKET_BROADCAST;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -589,6 +585,7 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	*cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
 	memset(*cf, 0, sizeof(struct can_frame));
@@ -607,7 +604,6 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 	if (unlikely(!skb))
 		return NULL;
 
-	__net_timestamp(skb);
 	skb->protocol = htons(ETH_P_CANFD);
 	skb->pkt_type = PACKET_BROADCAST;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -618,6 +614,7 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	*cfd = (struct canfd_frame *)skb_put(skb, sizeof(struct canfd_frame));
 	memset(*cfd, 0, sizeof(struct canfd_frame));
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index f64f5290d6f8..a23a7af8eb9a 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -207,7 +207,6 @@ static void slc_bump(struct slcan *sl)
 	if (!skb)
 		return;
 
-	__net_timestamp(skb);
 	skb->dev = sl->dev;
 	skb->protocol = htons(ETH_P_CAN);
 	skb->pkt_type = PACKET_BROADCAST;
@@ -215,6 +214,7 @@ static void slc_bump(struct slcan *sl)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = sl->dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	memcpy(skb_put(skb, sizeof(struct can_frame)),
 	       &cf, sizeof(struct can_frame));
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index 0ce868de855d..674f367087c5 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -78,9 +78,6 @@ static void vcan_rx(struct sk_buff *skb, struct net_device *dev)
 	skb->dev       = dev;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	if (!(skb->tstamp.tv64))
-		__net_timestamp(skb);
-
 	netif_rx_ni(skb);
 }
 
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index b6a52a4b457a..51bb6532785c 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -27,10 +27,12 @@
 /**
  * struct can_skb_priv - private additional data inside CAN sk_buffs
  * @ifindex:	ifindex of the first interface the CAN frame appeared on
+ * @skbcnt:	atomic counter to have an unique id together with skb pointer
  * @cf:		align to the following CAN frame at skb->data
  */
 struct can_skb_priv {
 	int ifindex;
+	int skbcnt;
 	struct can_frame cf[0];
 };
 
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 7933e62a7318..166d436196c1 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -89,6 +89,8 @@ struct timer_list can_stattimer;   /* timer for statistics update */
 struct s_stats    can_stats;       /* packet statistics */
 struct s_pstats   can_pstats;      /* receive list statistics */
 
+static atomic_t skbcounter = ATOMIC_INIT(0);
+
 /*
  * af_can socket functions
  */
@@ -310,12 +312,8 @@ int can_send(struct sk_buff *skb, int loop)
 		return err;
 	}
 
-	if (newskb) {
-		if (!(newskb->tstamp.tv64))
-			__net_timestamp(newskb);
-
+	if (newskb)
 		netif_rx_ni(newskb);
-	}
 
 	/* update statistics */
 	can_stats.tx_frames++;
@@ -683,6 +681,10 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
 	can_stats.rx_frames++;
 	can_stats.rx_frames_delta++;
 
+	/* create non-zero unique skb identifier together with *skb */
+	while (!(can_skb_prv(skb)->skbcnt))
+		can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter);
+
 	rcu_read_lock();
 
 	/* deliver the packet to sockets listening on all devices */
diff --git a/net/can/bcm.c b/net/can/bcm.c
index b523453585be..a1ba6875c2a2 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -261,6 +261,7 @@ static void bcm_can_tx(struct bcm_op *op)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	memcpy(skb_put(skb, CFSIZ), cf, CFSIZ);
 
@@ -1217,6 +1218,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
 	}
 
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 	skb->dev = dev;
 	can_skb_set_owner(skb, sk);
 	err = can_send(skb, 1); /* send with loopback */
diff --git a/net/can/raw.c b/net/can/raw.c
index 31b9748cbb4e..2e67b1423cd3 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -75,7 +75,7 @@ MODULE_ALIAS("can-proto-1");
  */
 
 struct uniqframe {
-	ktime_t tstamp;
+	int skbcnt;
 	const struct sk_buff *skb;
 	unsigned int join_rx_count;
 };
@@ -133,7 +133,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
 
 	/* eliminate multiple filter matches for the same skb */
 	if (this_cpu_ptr(ro->uniq)->skb == oskb &&
-	    ktime_equal(this_cpu_ptr(ro->uniq)->tstamp, oskb->tstamp)) {
+	    this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) {
 		if (ro->join_filters) {
 			this_cpu_inc(ro->uniq->join_rx_count);
 			/* drop frame until all enabled filters matched */
@@ -144,7 +144,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
 		}
 	} else {
 		this_cpu_ptr(ro->uniq)->skb = oskb;
-		this_cpu_ptr(ro->uniq)->tstamp = oskb->tstamp;
+		this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt;
 		this_cpu_ptr(ro->uniq)->join_rx_count = 1;
 		/* drop first frame to check all enabled filters? */
 		if (ro->join_filters && ro->count > 1)
@@ -749,6 +749,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	err = memcpy_from_msg(skb_put(skb, size), msg, size);
 	if (err < 0)
-- 
cgit v1.2.3-70-g09d2


From 29d01b22eaa18d8b46091d3c98c6001c49f78e4a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Sat, 11 Jul 2015 06:43:02 -0400
Subject: locks: new helpers - flock_lock_inode_wait and posix_lock_inode_wait

Allow callers to pass in an inode instead of a filp.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: "J. Bruce Fields" <bfields@fieldses.org>
Tested-by: "J. Bruce Fields" <bfields@fieldses.org>
---
 fs/locks.c         | 50 ++++++++++++++++++++++++++++++++++++++------------
 include/linux/fs.h | 14 ++++++++++++++
 2 files changed, 52 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 4366b7c54e6d..ba268a503c1b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1163,20 +1163,19 @@ int posix_lock_file(struct file *filp, struct file_lock *fl,
 EXPORT_SYMBOL(posix_lock_file);
 
 /**
- * posix_lock_file_wait - Apply a POSIX-style lock to a file
- * @filp: The file to apply the lock to
+ * posix_lock_inode_wait - Apply a POSIX-style lock to a file
+ * @inode: inode of file to which lock request should be applied
  * @fl: The lock to be applied
  *
- * Add a POSIX style lock to a file.
- * We merge adjacent & overlapping locks whenever possible.
- * POSIX locks are sorted by owner task, then by starting address
+ * Variant of posix_lock_file_wait that does not take a filp, and so can be
+ * used after the filp has already been torn down.
  */
-int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int error;
 	might_sleep ();
 	for (;;) {
-		error = posix_lock_file(filp, fl, NULL);
+		error = __posix_lock_file(inode, fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1188,6 +1187,21 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 	}
 	return error;
 }
+EXPORT_SYMBOL(posix_lock_inode_wait);
+
+/**
+ * posix_lock_file_wait - Apply a POSIX-style lock to a file
+ * @filp: The file to apply the lock to
+ * @fl: The lock to be applied
+ *
+ * Add a POSIX style lock to a file.
+ * We merge adjacent & overlapping locks whenever possible.
+ * POSIX locks are sorted by owner task, then by starting address
+ */
+int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	return posix_lock_inode_wait(file_inode(filp), fl);
+}
 EXPORT_SYMBOL(posix_lock_file_wait);
 
 /**
@@ -1850,18 +1864,18 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 }
 
 /**
- * flock_lock_file_wait - Apply a FLOCK-style lock to a file
- * @filp: The file to apply the lock to
+ * flock_lock_inode_wait - Apply a FLOCK-style lock to a file
+ * @inode: inode of the file to apply to
  * @fl: The lock to be applied
  *
- * Add a FLOCK style lock to a file.
+ * Apply a FLOCK style lock request to an inode.
  */
-int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int error;
 	might_sleep();
 	for (;;) {
-		error = flock_lock_inode(file_inode(filp), fl);
+		error = flock_lock_inode(inode, fl);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1873,7 +1887,19 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
 	}
 	return error;
 }
+EXPORT_SYMBOL(flock_lock_inode_wait);
 
+/**
+ * flock_lock_file_wait - Apply a FLOCK-style lock to a file
+ * @filp: The file to apply the lock to
+ * @fl: The lock to be applied
+ *
+ * Add a FLOCK style lock to a file.
+ */
+int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	return flock_lock_inode_wait(file_inode(filp), fl);
+}
 EXPORT_SYMBOL(flock_lock_file_wait);
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0653e560c26..4c990edd1377 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1046,11 +1046,13 @@ extern void locks_remove_file(struct file *);
 extern void locks_release_private(struct file_lock *);
 extern void posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
+extern int posix_lock_inode_wait(struct inode *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern int posix_unblock_lock(struct file_lock *);
 extern int vfs_test_lock(struct file *, struct file_lock *);
 extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
 extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
+extern int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl);
 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
 extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
 extern void lease_get_mtime(struct inode *, struct timespec *time);
@@ -1137,6 +1139,12 @@ static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
 	return -ENOLCK;
 }
 
+static inline int posix_lock_inode_wait(struct inode *inode,
+					struct file_lock *fl)
+{
+	return -ENOLCK;
+}
+
 static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 {
 	return -ENOLCK;
@@ -1163,6 +1171,12 @@ static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 	return 0;
 }
 
+static inline int flock_lock_inode_wait(struct inode *inode,
+					struct file_lock *request)
+{
+	return -ENOLCK;
+}
+
 static inline int flock_lock_file_wait(struct file *filp,
 				       struct file_lock *request)
 {
-- 
cgit v1.2.3-70-g09d2


From ee296d7c5709440f8abd36b5b65c6b3e388538d9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Sat, 11 Jul 2015 06:43:03 -0400
Subject: locks: inline posix_lock_file_wait and flock_lock_file_wait

They just call file_inode and then the corresponding *_inode_file_wait
function. Just make them static inlines instead.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/locks.c         | 28 ----------------------------
 include/linux/fs.h | 32 ++++++++++++++------------------
 2 files changed, 14 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index ba268a503c1b..d3d558ba4da7 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1189,21 +1189,6 @@ int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 }
 EXPORT_SYMBOL(posix_lock_inode_wait);
 
-/**
- * posix_lock_file_wait - Apply a POSIX-style lock to a file
- * @filp: The file to apply the lock to
- * @fl: The lock to be applied
- *
- * Add a POSIX style lock to a file.
- * We merge adjacent & overlapping locks whenever possible.
- * POSIX locks are sorted by owner task, then by starting address
- */
-int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
-{
-	return posix_lock_inode_wait(file_inode(filp), fl);
-}
-EXPORT_SYMBOL(posix_lock_file_wait);
-
 /**
  * locks_mandatory_locked - Check for an active lock
  * @file: the file to check
@@ -1889,19 +1874,6 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 }
 EXPORT_SYMBOL(flock_lock_inode_wait);
 
-/**
- * flock_lock_file_wait - Apply a FLOCK-style lock to a file
- * @filp: The file to apply the lock to
- * @fl: The lock to be applied
- *
- * Add a FLOCK style lock to a file.
- */
-int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
-{
-	return flock_lock_inode_wait(file_inode(filp), fl);
-}
-EXPORT_SYMBOL(flock_lock_file_wait);
-
 /**
  *	sys_flock: - flock() system call.
  *	@fd: the file descriptor to lock.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4c990edd1377..cc008c338f5a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1047,13 +1047,11 @@ extern void locks_release_private(struct file_lock *);
 extern void posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
 extern int posix_lock_inode_wait(struct inode *, struct file_lock *);
-extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern int posix_unblock_lock(struct file_lock *);
 extern int vfs_test_lock(struct file *, struct file_lock *);
 extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
 extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
 extern int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl);
-extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
 extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
 extern void lease_get_mtime(struct inode *, struct timespec *time);
 extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
@@ -1145,11 +1143,6 @@ static inline int posix_lock_inode_wait(struct inode *inode,
 	return -ENOLCK;
 }
 
-static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
-{
-	return -ENOLCK;
-}
-
 static inline int posix_unblock_lock(struct file_lock *waiter)
 {
 	return -ENOENT;
@@ -1177,12 +1170,6 @@ static inline int flock_lock_inode_wait(struct inode *inode,
 	return -ENOLCK;
 }
 
-static inline int flock_lock_file_wait(struct file *filp,
-				       struct file_lock *request)
-{
-	return -ENOLCK;
-}
-
 static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	return 0;
@@ -1216,6 +1203,20 @@ static inline void show_fd_locks(struct seq_file *f,
 			struct file *filp, struct files_struct *files) {}
 #endif /* !CONFIG_FILE_LOCKING */
 
+static inline struct inode *file_inode(const struct file *f)
+{
+	return f->f_inode;
+}
+
+static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	return posix_lock_inode_wait(file_inode(filp), fl);
+}
+
+static inline int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	return flock_lock_inode_wait(file_inode(filp), fl);
+}
 
 struct fasync_struct {
 	spinlock_t		fa_lock;
@@ -2025,11 +2026,6 @@ extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern int generic_update_time(struct inode *, struct timespec *, int);
 
-static inline struct inode *file_inode(const struct file *f)
-{
-	return f->f_inode;
-}
-
 /* /sys/fs */
 extern struct kobject *fs_kobj;
 
-- 
cgit v1.2.3-70-g09d2


From 671a2781ff01abf4fdc8904881fc3abd3a8279af Mon Sep 17 00:00:00 2001
From: Jeff Vander Stoep <jeffv@google.com>
Date: Fri, 10 Jul 2015 17:19:55 -0400
Subject: security: add ioctl specific auditing to lsm_audit

Add information about ioctl calls to the LSM audit data. Log the
file path and command number.

Signed-off-by: Jeff Vander Stoep <jeffv@google.com>
Acked-by: Nick Kralevich <nnk@google.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 include/linux/lsm_audit.h |  7 +++++++
 security/lsm_audit.c      | 15 +++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 1cc89e9df480..ffb9c9da4f39 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -40,6 +40,11 @@ struct lsm_network_audit {
 	} fam;
 };
 
+struct lsm_ioctlop_audit {
+	struct path path;
+	u16 cmd;
+};
+
 /* Auxiliary data to use in generating the audit record. */
 struct common_audit_data {
 	char type;
@@ -53,6 +58,7 @@ struct common_audit_data {
 #define LSM_AUDIT_DATA_KMOD	8
 #define LSM_AUDIT_DATA_INODE	9
 #define LSM_AUDIT_DATA_DENTRY	10
+#define LSM_AUDIT_DATA_IOCTL_OP	11
 	union 	{
 		struct path path;
 		struct dentry *dentry;
@@ -68,6 +74,7 @@ struct common_audit_data {
 		} key_struct;
 #endif
 		char *kmod_name;
+		struct lsm_ioctlop_audit *op;
 	} u;
 	/* this union contains LSM specific data */
 	union {
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 1d34277dc402..9f6c649c65e9 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -245,6 +245,21 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 		}
 		break;
 	}
+	case LSM_AUDIT_DATA_IOCTL_OP: {
+		struct inode *inode;
+
+		audit_log_d_path(ab, " path=", &a->u.op->path);
+
+		inode = a->u.op->path.dentry->d_inode;
+		if (inode) {
+			audit_log_format(ab, " dev=");
+			audit_log_untrustedstring(ab, inode->i_sb->s_id);
+			audit_log_format(ab, " ino=%lu", inode->i_ino);
+		}
+
+		audit_log_format(ab, " ioctlcmd=%hx", a->u.op->cmd);
+		break;
+	}
 	case LSM_AUDIT_DATA_DENTRY: {
 		struct inode *inode;
 
-- 
cgit v1.2.3-70-g09d2


From 7bd393543287b921f964a350166bf2866527a1b5 Mon Sep 17 00:00:00 2001
From: James Ban <james.ban.opensource@diasemi.com>
Date: Tue, 30 Jun 2015 13:39:39 +0900
Subject: regulator: da9211: support da9215

This is a patch for supporting da9215 buck converter.

Signed-off-by: James Ban <james.ban.opensource@diasemi.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 .../devicetree/bindings/regulator/da9211.txt       | 32 +++++++++++++++--
 drivers/regulator/Kconfig                          |  6 ++--
 drivers/regulator/da9211-regulator.c               | 40 ++++++++++++++++------
 drivers/regulator/da9211-regulator.h               | 18 +++++-----
 include/linux/regulator/da9211.h                   | 19 +++++-----
 5 files changed, 81 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/regulator/da9211.txt b/Documentation/devicetree/bindings/regulator/da9211.txt
index eb618907c7de..c620493e8dbe 100644
--- a/Documentation/devicetree/bindings/regulator/da9211.txt
+++ b/Documentation/devicetree/bindings/regulator/da9211.txt
@@ -1,7 +1,7 @@
-* Dialog Semiconductor DA9211/DA9213 Voltage Regulator
+* Dialog Semiconductor DA9211/DA9213/DA9215 Voltage Regulator
 
 Required properties:
-- compatible: "dlg,da9211" or "dlg,da9213".
+- compatible: "dlg,da9211" or "dlg,da9213" or "dlg,da9215"
 - reg: I2C slave address, usually 0x68.
 - interrupts: the interrupt outputs of the controller
 - regulators: A node that houses a sub-node for each regulator within the
@@ -66,3 +66,31 @@ Example 2) DA9213
 			};
 		};
 	};
+
+
+Example 3) DA9215
+	pmic: da9215@68 {
+		compatible = "dlg,da9215";
+		reg = <0x68>;
+		interrupts = <3 27>;
+
+		regulators {
+			BUCKA {
+				regulator-name = "VBUCKA";
+				regulator-min-microvolt = < 300000>;
+				regulator-max-microvolt = <1570000>;
+				regulator-min-microamp 	= <4000000>;
+				regulator-max-microamp 	= <7000000>;
+				enable-gpios = <&gpio 27 0>;
+			};
+			BUCKB {
+				regulator-name = "VBUCKB";
+				regulator-min-microvolt = < 300000>;
+				regulator-max-microvolt = <1570000>;
+				regulator-min-microamp 	= <4000000>;
+				regulator-max-microamp 	= <7000000>;
+				enable-gpios = <&gpio 17 0>;
+			};
+		};
+	};
+
diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index bef3bde6971b..23496da101de 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -209,13 +209,13 @@ config REGULATOR_DA9210
 	  interface.
 
 config REGULATOR_DA9211
-	tristate "Dialog Semiconductor DA9211/DA9212/DA9213/DA9214 regulator"
+	tristate "Dialog Semiconductor DA9211/DA9212/DA9213/DA9214/DA9215 regulator"
 	depends on I2C
 	select REGMAP_I2C
 	help
 	  Say y here to support for the Dialog Semiconductor DA9211/DA9212
-	  /DA9213/DA9214.
-	  The DA9211/DA9212/DA9213/DA9214 is a multi-phase synchronous
+	  /DA9213/DA9214/DA9215.
+	  The DA9211/DA9212/DA9213/DA9214/DA9215 is a multi-phase synchronous
 	  step down converter 12A or 16A DC-DC Buck controlled through an I2C
 	  interface.
 
diff --git a/drivers/regulator/da9211-regulator.c b/drivers/regulator/da9211-regulator.c
index df79e4b1946e..0858100d2d03 100644
--- a/drivers/regulator/da9211-regulator.c
+++ b/drivers/regulator/da9211-regulator.c
@@ -1,6 +1,6 @@
 /*
- * da9211-regulator.c - Regulator device driver for DA9211/DA9213
- * Copyright (C) 2014  Dialog Semiconductor Ltd.
+ * da9211-regulator.c - Regulator device driver for DA9211/DA9213/DA9215
+ * Copyright (C) 2015  Dialog Semiconductor Ltd.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Library General Public
@@ -32,6 +32,7 @@
 /* DEVICE IDs */
 #define DA9211_DEVICE_ID	0x22
 #define DA9213_DEVICE_ID	0x23
+#define DA9215_DEVICE_ID	0x24
 
 #define DA9211_BUCK_MODE_SLEEP	1
 #define DA9211_BUCK_MODE_SYNC	2
@@ -90,6 +91,13 @@ static const int da9213_current_limits[] = {
 	3000000, 3200000, 3400000, 3600000, 3800000, 4000000, 4200000, 4400000,
 	4600000, 4800000, 5000000, 5200000, 5400000, 5600000, 5800000, 6000000
 };
+/* Current limits for DA9215 buck (uA) indices
+ * corresponds with register values
+ */
+static const int da9215_current_limits[] = {
+	4000000, 4200000, 4400000, 4600000, 4800000, 5000000, 5200000, 5400000,
+	5600000, 5800000, 6000000, 6200000, 6400000, 6600000, 6800000, 7000000
+};
 
 static unsigned int da9211_buck_get_mode(struct regulator_dev *rdev)
 {
@@ -157,6 +165,10 @@ static int da9211_set_current_limit(struct regulator_dev *rdev, int min,
 		current_limits = da9213_current_limits;
 		max_size = ARRAY_SIZE(da9213_current_limits)-1;
 		break;
+	case DA9215:
+		current_limits = da9215_current_limits;
+		max_size = ARRAY_SIZE(da9215_current_limits)-1;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -189,6 +201,9 @@ static int da9211_get_current_limit(struct regulator_dev *rdev)
 	case DA9213:
 		current_limits = da9213_current_limits;
 		break;
+	case DA9215:
+		current_limits = da9215_current_limits;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -350,13 +365,11 @@ static int da9211_regulator_init(struct da9211 *chip)
 	/* If configuration for 1/2 bucks is different between platform data
 	 * and the register, driver should exit.
 	 */
-	if ((chip->pdata->num_buck == 2 && data == 0x40)
-		|| (chip->pdata->num_buck == 1 && data == 0x00)) {
-		if (data == 0)
-			chip->num_regulator = 1;
-		else
-			chip->num_regulator = 2;
-	} else {
+	if (chip->pdata->num_buck == 1 && data == 0x00)
+		chip->num_regulator = 1;
+	else if (chip->pdata->num_buck == 2 && data != 0x00)
+		chip->num_regulator = 2;
+	else {
 		dev_err(chip->dev, "Configuration is mismatched\n");
 		return -EINVAL;
 	}
@@ -438,6 +451,9 @@ static int da9211_i2c_probe(struct i2c_client *i2c,
 	case DA9213_DEVICE_ID:
 		chip->chip_id = DA9213;
 		break;
+	case DA9215_DEVICE_ID:
+		chip->chip_id = DA9215;
+		break;
 	default:
 		dev_err(chip->dev, "Unsupported device id = 0x%x.\n", data);
 		return -ENODEV;
@@ -478,6 +494,7 @@ static int da9211_i2c_probe(struct i2c_client *i2c,
 static const struct i2c_device_id da9211_i2c_id[] = {
 	{"da9211", DA9211},
 	{"da9213", DA9213},
+	{"da9215", DA9215},
 	{},
 };
 MODULE_DEVICE_TABLE(i2c, da9211_i2c_id);
@@ -486,6 +503,7 @@ MODULE_DEVICE_TABLE(i2c, da9211_i2c_id);
 static const struct of_device_id da9211_dt_ids[] = {
 	{ .compatible = "dlg,da9211", .data = &da9211_i2c_id[0] },
 	{ .compatible = "dlg,da9213", .data = &da9211_i2c_id[1] },
+	{ .compatible = "dlg,da9215", .data = &da9211_i2c_id[2] },
 	{},
 };
 MODULE_DEVICE_TABLE(of, da9211_dt_ids);
@@ -504,5 +522,5 @@ static struct i2c_driver da9211_regulator_driver = {
 module_i2c_driver(da9211_regulator_driver);
 
 MODULE_AUTHOR("James Ban <James.Ban.opensource@diasemi.com>");
-MODULE_DESCRIPTION("Regulator device driver for Dialog DA9211/DA9213");
-MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Regulator device driver for Dialog DA9211/DA9213/DA9215");
+MODULE_LICENSE("GPL");
diff --git a/drivers/regulator/da9211-regulator.h b/drivers/regulator/da9211-regulator.h
index 93fa9df2721c..d6ad96fc64d3 100644
--- a/drivers/regulator/da9211-regulator.h
+++ b/drivers/regulator/da9211-regulator.h
@@ -1,16 +1,16 @@
 /*
- * da9211-regulator.h - Regulator definitions for DA9211/DA9213
- * Copyright (C) 2014  Dialog Semiconductor Ltd.
+ * da9211-regulator.h - Regulator definitions for DA9211/DA9213/DA9215
+ * Copyright (C) 2015  Dialog Semiconductor Ltd.
  *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- * This library is distributed in the hope that it will be useful,
+ * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  */
 
 #ifndef __DA9211_REGISTERS_H__
diff --git a/include/linux/regulator/da9211.h b/include/linux/regulator/da9211.h
index 5dd65acc2a69..a43a5ca1167b 100644
--- a/include/linux/regulator/da9211.h
+++ b/include/linux/regulator/da9211.h
@@ -1,16 +1,16 @@
 /*
- * da9211.h - Regulator device driver for DA9211/DA9213
- * Copyright (C) 2014  Dialog Semiconductor Ltd.
+ * da9211.h - Regulator device driver for DA9211/DA9213/DA9215
+ * Copyright (C) 2015  Dialog Semiconductor Ltd.
  *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- * This library is distributed in the hope that it will be useful,
+ * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  */
 
 #ifndef __LINUX_REGULATOR_DA9211_H
@@ -23,6 +23,7 @@
 enum da9211_chip_id {
 	DA9211,
 	DA9213,
+	DA9215,
 };
 
 struct da9211_pdata {
-- 
cgit v1.2.3-70-g09d2


From 7e47682ea555e7c1edef1d8fd96e2aa4c12abe59 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 9 Jun 2015 21:32:09 +1000
Subject: cgroup: allow a cgroup subsystem to reject a fork

Add a new cgroup subsystem callback can_fork that conditionally
states whether or not the fork is accepted or rejected by a cgroup
policy. In addition, add a cancel_fork callback so that if an error
occurs later in the forking process, any state modified by can_fork can
be reverted.

Allow for a private opaque pointer to be passed from cgroup_can_fork to
cgroup_post_fork, allowing for the fork state to be stored by each
subsystem separately.

Also add a tagging system for cgroup_subsys.h to allow for CGROUP_<TAG>
enumerations to be be defined and used. In addition, explicitly add a
CGROUP_CANFORK_COUNT macro to make arrays easier to define.

This is in preparation for implementing the pids cgroup subsystem.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h   | 10 +++++-
 include/linux/cgroup.h        | 15 +++++++--
 include/linux/cgroup_subsys.h | 23 ++++++++++++++
 kernel/cgroup.c               | 73 +++++++++++++++++++++++++++++++++++++++++--
 kernel/cgroup_freezer.c       |  2 +-
 kernel/fork.c                 | 17 ++++++++--
 kernel/sched/core.c           |  2 +-
 7 files changed, 133 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 93755a629299..83e37d8c4d80 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -34,12 +34,17 @@ struct seq_file;
 
 /* define the enumeration of all cgroup subsystems */
 #define SUBSYS(_x) _x ## _cgrp_id,
+#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
+	__unused_tag_ ## _t = CGROUP_ ## _t - 1,
 enum cgroup_subsys_id {
 #include <linux/cgroup_subsys.h>
 	CGROUP_SUBSYS_COUNT,
 };
+#undef SUBSYS_TAG
 #undef SUBSYS
 
+#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
+
 /* bits in struct cgroup_subsys_state flags field */
 enum {
 	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
@@ -406,7 +411,9 @@ struct cgroup_subsys {
 			      struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_subsys_state *css,
 		       struct cgroup_taskset *tset);
-	void (*fork)(struct task_struct *task);
+	int (*can_fork)(struct task_struct *task, void **priv_p);
+	void (*cancel_fork)(struct task_struct *task, void *priv);
+	void (*fork)(struct task_struct *task, void *priv);
 	void (*exit)(struct cgroup_subsys_state *css,
 		     struct cgroup_subsys_state *old_css,
 		     struct task_struct *task);
@@ -491,6 +498,7 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 
 #else	/* CONFIG_CGROUPS */
 
+#define CGROUP_CANFORK_COUNT 0
 #define CGROUP_SUBSYS_COUNT 0
 
 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a593e299162e..a71fe2a3984e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -62,7 +62,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
 
 void cgroup_fork(struct task_struct *p);
-void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+			   void *ss_priv[CGROUP_CANFORK_COUNT]);
+extern void cgroup_cancel_fork(struct task_struct *p,
+			       void *ss_priv[CGROUP_CANFORK_COUNT]);
+extern void cgroup_post_fork(struct task_struct *p,
+			     void *old_ss_priv[CGROUP_CANFORK_COUNT]);
 void cgroup_exit(struct task_struct *p);
 
 int cgroup_init_early(void);
@@ -524,7 +529,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 				    struct dentry *dentry) { return -EINVAL; }
 
 static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+				  void *ss_priv[CGROUP_CANFORK_COUNT])
+{ return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+				      void *ss_priv[CGROUP_CANFORK_COUNT]) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+				    void *ss_priv[CGROUP_CANFORK_COUNT]) {}
 static inline void cgroup_exit(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb14403..ec43bce7e1ea 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,6 +3,17 @@
  *
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
+
+/*
+ * This file *must* be included with SUBSYS() defined.
+ * SUBSYS_TAG() is a noop if undefined.
+ */
+
+#ifndef SUBSYS_TAG
+#define __TMP_SUBSYS_TAG
+#define SUBSYS_TAG(_x)
+#endif
+
 #if IS_ENABLED(CONFIG_CPUSETS)
 SUBSYS(cpuset)
 #endif
@@ -47,12 +58,24 @@ SUBSYS(net_prio)
 SUBSYS(hugetlb)
 #endif
 
+/*
+ * Subsystems that implement the can_fork() family of callbacks.
+ */
+SUBSYS_TAG(CANFORK_START)
+SUBSYS_TAG(CANFORK_END)
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
 #if IS_ENABLED(CONFIG_CGROUP_DEBUG)
 SUBSYS(debug)
 #endif
+
+#ifdef __TMP_SUBSYS_TAG
+#undef __TMP_SUBSYS_TAG
+#undef SUBSYS_TAG
+#endif
+
 /*
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..a59dd1a6b74a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -186,6 +186,9 @@ static u64 css_serial_nr_next = 1;
 static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
 
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
+
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
@@ -4955,6 +4958,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
+	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -5197,6 +5201,19 @@ static const struct file_operations proc_cgroupstats_operations = {
 	.release = single_release,
 };
 
+static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+	if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+		return &ss_priv[i - CGROUP_CANFORK_START];
+	return NULL;
+}
+
+static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+	void **private = subsys_canfork_priv_p(ss_priv, i);
+	return private ? *private : NULL;
+}
+
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
@@ -5211,6 +5228,57 @@ void cgroup_fork(struct task_struct *child)
 	INIT_LIST_HEAD(&child->cg_list);
 }
 
+/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+		    void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+	struct cgroup_subsys *ss;
+	int i, j, ret;
+
+	for_each_subsys_which(ss, i, &have_canfork_callback) {
+		ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+		if (ret)
+			goto out_revert;
+	}
+
+	return 0;
+
+out_revert:
+	for_each_subsys(ss, j) {
+		if (j >= i)
+			break;
+		if (ss->cancel_fork)
+			ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+	}
+
+	return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+			void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+	struct cgroup_subsys *ss;
+	int i;
+
+	for_each_subsys(ss, i)
+		if (ss->cancel_fork)
+			ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+}
+
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
@@ -5221,7 +5289,8 @@ void cgroup_fork(struct task_struct *child)
  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
  * list.
  */
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+		      void *old_ss_priv[CGROUP_CANFORK_COUNT])
 {
 	struct cgroup_subsys *ss;
 	int i;
@@ -5266,7 +5335,7 @@ void cgroup_post_fork(struct task_struct *child)
 	 * and addition to css_set.
 	 */
 	for_each_subsys_which(ss, i, &have_fork_callback)
-		ss->fork(child);
+		ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
 }
 
 /**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
  * to do anything as freezer_attach() will put @task into the appropriate
  * state.
  */
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
 {
 	struct freezer *freezer;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..40e3af12c55e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1239,6 +1239,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p;
+	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1512,6 +1513,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
+	/*
+	 * Ensure that the cgroup subsystem policies allow the new process to be
+	 * forked. It should be noted the the new process's css_set can be changed
+	 * between here and cgroup_post_fork() if an organisation operation is in
+	 * progress.
+	 */
+	retval = cgroup_can_fork(p, cgrp_ss_priv);
+	if (retval)
+		goto bad_fork_free_pid;
+
 	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
@@ -1548,7 +1559,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_pid;
+		goto bad_fork_cancel_cgroup;
 	}
 
 	if (likely(p->pid)) {
@@ -1590,7 +1601,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 
 	proc_fork_connector(p);
-	cgroup_post_fork(p);
+	cgroup_post_fork(p, cgrp_ss_priv);
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_change_end(current);
 	perf_event_fork(p);
@@ -1600,6 +1611,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	return p;
 
+bad_fork_cancel_cgroup:
+	cgroup_cancel_fork(p, cgrp_ss_priv);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..d811652fe6f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8068,7 +8068,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 	sched_offline_group(tg);
 }
 
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
 {
 	sched_move_task(task);
 }
-- 
cgit v1.2.3-70-g09d2


From 49b786ea146f69c371df18e81ce0a2d5839f865c Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 9 Jun 2015 21:32:10 +1000
Subject: cgroup: implement the PIDs subsystem

Adds a new single-purpose PIDs subsystem to limit the number of
tasks that can be forked inside a cgroup. Essentially this is an
implementation of RLIMIT_NPROC that applies to a cgroup rather than a
process tree.

However, it should be noted that organisational operations (adding and
removing tasks from a PIDs hierarchy) will *not* be prevented. Rather,
the number of tasks in the hierarchy cannot exceed the limit through
forking. This is due to the fact that, in the unified hierarchy, attach
cannot fail (and it is not possible for a task to overcome its PIDs
cgroup policy limit by attaching to a child cgroup -- even if migrating
mid-fork it must be able to fork in the parent first).

PIDs are fundamentally a global resource, and it is possible to reach
PID exhaustion inside a cgroup without hitting any reasonable kmemcg
policy. Once you've hit PID exhaustion, you're only in a marginally
better state than OOM. This subsystem allows PID exhaustion inside a
cgroup to be prevented.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 CREDITS                       |   5 +
 include/linux/cgroup_subsys.h |   5 +
 init/Kconfig                  |  16 ++
 kernel/Makefile               |   1 +
 kernel/cgroup_pids.c          | 366 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 393 insertions(+)
 create mode 100644 kernel/cgroup_pids.c

(limited to 'include/linux')

diff --git a/CREDITS b/CREDITS
index 1d616640bbf6..4fcf9cd8544c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3219,6 +3219,11 @@ S: 69 rue Dunois
 S: 75013 Paris
 S: France
 
+N: Aleksa Sarai
+E: cyphar@cyphar.com
+W: https://www.cyphar.com/
+D: `pids` cgroup subsystem
+
 N: Dipankar Sarma
 E: dipankar@in.ibm.com
 D: RCU
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ec43bce7e1ea..1f36945fd23d 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -62,6 +62,11 @@ SUBSYS(hugetlb)
  * Subsystems that implement the can_fork() family of callbacks.
  */
 SUBSYS_TAG(CANFORK_START)
+
+#if IS_ENABLED(CONFIG_CGROUP_PIDS)
+SUBSYS(pids)
+#endif
+
 SUBSYS_TAG(CANFORK_END)
 
 /*
diff --git a/init/Kconfig b/init/Kconfig
index af09b4fb43d2..2184b34cbf73 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -955,6 +955,22 @@ config CGROUP_FREEZER
 	  Provides a way to freeze and unfreeze all tasks in a
 	  cgroup.
 
+config CGROUP_PIDS
+	bool "PIDs cgroup subsystem"
+	help
+	  Provides enforcement of process number limits in the scope of a
+	  cgroup. Any attempt to fork more processes than is allowed in the
+	  cgroup will fail. PIDs are fundamentally a global resource because it
+	  is fairly trivial to reach PID exhaustion before you reach even a
+	  conservative kmemcg limit. As a result, it is possible to grind a
+	  system to halt without being limited by other cgroup policies. The
+	  PIDs cgroup subsystem is designed to stop this from happening.
+
+	  It should be noted that organisational operations (such as attaching
+	  to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+	  since the PIDs limit only affects a process's ability to fork, not to
+	  attach to a cgroup.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..718fb8afab7a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..d75488824ae2
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,366 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+	struct cgroup_subsys_state	css;
+
+	/*
+	 * Use 64-bit types so that we can safely represent "max" as
+	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+	 */
+	atomic64_t			counter;
+	int64_t				limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+	return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct pids_cgroup *pids;
+
+	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+	if (!pids)
+		return ERR_PTR(-ENOMEM);
+
+	pids->limit = PIDS_MAX;
+	atomic64_set(&pids->counter, 0);
+	return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+	/*
+	 * A negative count (or overflow for that matter) is invalid,
+	 * and indicates a bug in the `pids` controller proper.
+	 */
+	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p;
+
+	for (p = pids; p; p = parent_pids(p))
+		pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p;
+
+	for (p = pids; p; p = parent_pids(p))
+		atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p, *q;
+
+	for (p = pids; p; p = parent_pids(p)) {
+		int64_t new = atomic64_add_return(num, &p->counter);
+
+		/*
+		 * Since new is capped to the maximum number of pid_t, if
+		 * p->limit is %PIDS_MAX then we know that this test will never
+		 * fail.
+		 */
+		if (new > p->limit)
+			goto revert;
+	}
+
+	return 0;
+
+revert:
+	for (q = pids; q != p; q = parent_pids(q))
+		pids_cancel(q, num);
+	pids_cancel(p, num);
+
+	return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+			   struct cgroup_taskset *tset)
+{
+	struct pids_cgroup *pids = css_pids(css);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, tset) {
+		struct cgroup_subsys_state *old_css;
+		struct pids_cgroup *old_pids;
+
+		/*
+		 * Grab a ref to each task's css. We don't drop the ref until
+		 * we either fail and hit ->cancel_attach() or succeed and hit
+		 * ->attach().
+		 */
+		old_css = task_get_css(task, pids_cgrp_id);
+		old_pids = css_pids(old_css);
+
+		pids_charge(pids, 1);
+		pids_uncharge(old_pids, 1);
+	}
+
+	return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+			       struct cgroup_taskset *tset)
+{
+	struct pids_cgroup *pids = css_pids(css);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, tset) {
+		struct cgroup_subsys_state *old_css;
+		struct pids_cgroup *old_pids;
+
+		old_css = task_css(task, pids_cgrp_id);
+		old_pids = css_pids(old_css);
+
+		pids_charge(old_pids, 1);
+		pids_uncharge(pids, 1);
+		css_put(old_css);
+	}
+}
+
+static void pids_attach(struct cgroup_subsys_state *css,
+			struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, tset)
+		css_put(task_css(task, pids_cgrp_id));
+}
+
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+	struct cgroup_subsys_state *css;
+	struct pids_cgroup *pids;
+	int err;
+
+	/*
+	 * Use the "current" task_css for the pids subsystem as the tentative
+	 * css. It is possible we will charge the wrong hierarchy, in which
+	 * case we will forcefully revert/reapply the charge on the right
+	 * hierarchy after it is committed to the task proper.
+	 */
+	css = task_get_css(current, pids_cgrp_id);
+	pids = css_pids(css);
+
+	err = pids_try_charge(pids, 1);
+	if (err)
+		goto err_css_put;
+
+	*priv_p = css;
+	return 0;
+
+err_css_put:
+	css_put(css);
+	return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+	struct cgroup_subsys_state *css = priv;
+	struct pids_cgroup *pids = css_pids(css);
+
+	pids_uncharge(pids, 1);
+	css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *priv)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *old_css = priv;
+	struct pids_cgroup *pids;
+	struct pids_cgroup *old_pids = css_pids(old_css);
+
+	css = task_get_css(task, pids_cgrp_id);
+	pids = css_pids(css);
+
+	/*
+	 * If the association has changed, we have to revert and reapply the
+	 * charge/uncharge on the wrong hierarchy to the current one. Since
+	 * the association can only change due to an organisation event, its
+	 * okay for us to ignore the limit in this case.
+	 */
+	if (pids != old_pids) {
+		pids_uncharge(old_pids, 1);
+		pids_charge(pids, 1);
+	}
+
+	css_put(css);
+	css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+		      struct cgroup_subsys_state *old_css,
+		      struct task_struct *task)
+{
+	struct pids_cgroup *pids = css_pids(old_css);
+
+	pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct cgroup_subsys_state *css = of_css(of);
+	struct pids_cgroup *pids = css_pids(css);
+	int64_t limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, PIDS_MAX_STR)) {
+		limit = PIDS_MAX;
+		goto set_limit;
+	}
+
+	err = kstrtoll(buf, 0, &limit);
+	if (err)
+		return err;
+
+	if (limit < 0 || limit >= PIDS_MAX)
+		return -EINVAL;
+
+set_limit:
+	/*
+	 * Limit updates don't need to be mutex'd, since it isn't
+	 * critical that any racing fork()s follow the new limit.
+	 */
+	pids->limit = limit;
+	return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(sf);
+	struct pids_cgroup *pids = css_pids(css);
+	int64_t limit = pids->limit;
+
+	if (limit >= PIDS_MAX)
+		seq_printf(sf, "%s\n", PIDS_MAX_STR);
+	else
+		seq_printf(sf, "%lld\n", limit);
+
+	return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	struct pids_cgroup *pids = css_pids(css);
+
+	return atomic64_read(&pids->counter);
+}
+
+static struct cftype pids_files[] = {
+	{
+		.name = "max",
+		.write = pids_max_write,
+		.seq_show = pids_max_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.read_s64 = pids_current_read,
+	},
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+	.css_alloc	= pids_css_alloc,
+	.css_free	= pids_css_free,
+	.attach		= pids_attach,
+	.can_attach 	= pids_can_attach,
+	.cancel_attach 	= pids_cancel_attach,
+	.can_fork	= pids_can_fork,
+	.cancel_fork	= pids_cancel_fork,
+	.fork		= pids_fork,
+	.exit		= pids_exit,
+	.legacy_cftypes	= pids_files,
+	.dfl_cftypes	= pids_files,
+};
-- 
cgit v1.2.3-70-g09d2


From 83cb8557e8d2c8e5eddc64840c437299343a7960 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 26 Jun 2015 17:21:28 -0400
Subject: percpu: update incorrect comment for this_cpu_*() operations

this_cpu_*() ops have been protected against both preemption and
interrupts for quite a while now.  We apparently forgot to update the
comment.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
---
 include/linux/percpu-defs.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 57f3a1c550dc..8f16299ca068 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -488,10 +488,8 @@ do {									\
 #define __this_cpu_dec_return(pcp)	__this_cpu_add_return(pcp, -1)
 
 /*
- * Operations with implied preemption protection.  These operations can be
- * used without worrying about preemption.  Note that interrupts may still
- * occur while an operation is in progress and if the interrupt modifies
- * the variable too then RMW actions may not be reliable.
+ * Operations with implied preemption/interrupt protection.  These
+ * operations can be used without worrying about preemption or interrupt.
  */
 #define this_cpu_read(pcp)		__pcpu_size_call_return(this_cpu_read_, pcp)
 #define this_cpu_write(pcp, val)	__pcpu_size_call(this_cpu_write_, pcp, val)
-- 
cgit v1.2.3-70-g09d2


From 4c62360d7562a20c996836d163259c87d9378120 Mon Sep 17 00:00:00 2001
From: "Luck, Tony" <tony.luck@intel.com>
Date: Tue, 30 Jun 2015 15:57:51 -0700
Subject: efi: Handle memory error structures produced based on old versions of
 standard

The memory error record structure includes as its first field a
bitmask of which subsequent fields are valid. The allows new fields
to be added to the structure while keeping compatibility with older
software that parses these records. This mechanism was used between
versions 2.2 and 2.3 to add four new fields, growing the size of the
structure from 73 bytes to 80. But Linux just added all the new
fields so this test:
	if (gdata->error_data_length >= sizeof(*mem_err))
		cper_print_mem(newpfx, mem_err);
	else
		goto err_section_too_small;
now make Linux complain about old format records being too short.

Add a definition for the old format of the structure and use that
for the minimum size check. Pass the actual size to cper_print_mem()
so it can sanity check the validation_bits field to ensure that if
a BIOS using the old format sets bits as if it were new, we won't
access fields beyond the end of the structure.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 drivers/firmware/efi/cper.c | 15 ++++++++++++---
 include/linux/cper.h        | 22 +++++++++++++++++++++-
 2 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 4fd9961d552e..d42537425438 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -305,10 +305,17 @@ const char *cper_mem_err_unpack(struct trace_seq *p,
 	return ret;
 }
 
-static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
+static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem,
+	int len)
 {
 	struct cper_mem_err_compact cmem;
 
+	/* Don't trust UEFI 2.1/2.2 structure with bad validation bits */
+	if (len == sizeof(struct cper_sec_mem_err_old) &&
+	    (mem->validation_bits & ~(CPER_MEM_VALID_RANK_NUMBER - 1))) {
+		pr_err(FW_WARN "valid bits set for fields beyond structure\n");
+		return;
+	}
 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
 		printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
 	if (mem->validation_bits & CPER_MEM_VALID_PA)
@@ -405,8 +412,10 @@ static void cper_estatus_print_section(
 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
 		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
 		printk("%s""section_type: memory error\n", newpfx);
-		if (gdata->error_data_length >= sizeof(*mem_err))
-			cper_print_mem(newpfx, mem_err);
+		if (gdata->error_data_length >=
+		    sizeof(struct cper_sec_mem_err_old))
+			cper_print_mem(newpfx, mem_err,
+				       gdata->error_data_length);
 		else
 			goto err_section_too_small;
 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 76abba4b238e..dcacb1a72e26 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -340,7 +340,27 @@ struct cper_ia_proc_ctx {
 	__u64	mm_reg_addr;
 };
 
-/* Memory Error Section */
+/* Old Memory Error Section UEFI 2.1, 2.2 */
+struct cper_sec_mem_err_old {
+	__u64	validation_bits;
+	__u64	error_status;
+	__u64	physical_addr;
+	__u64	physical_addr_mask;
+	__u16	node;
+	__u16	card;
+	__u16	module;
+	__u16	bank;
+	__u16	device;
+	__u16	row;
+	__u16	column;
+	__u16	bit_pos;
+	__u64	requestor_id;
+	__u64	responder_id;
+	__u64	target_id;
+	__u8	error_type;
+};
+
+/* Memory Error Section UEFI >= 2.3 */
 struct cper_sec_mem_err {
 	__u64	validation_bits;
 	__u64	error_status;
-- 
cgit v1.2.3-70-g09d2


From 71d126fd28de2d4d9b7b2088dbccd7ca62fad6e0 Mon Sep 17 00:00:00 2001
From: Arne Fitzenreiter <arne_f@ipfire.org>
Date: Wed, 15 Jul 2015 13:54:36 +0200
Subject: libata: add ATA_HORKAGE_NOTRIM

Some devices lose data on TRIM whether queued or not.  This patch adds
a horkage to disable TRIM.

tj: Collapsed unnecessary if() nesting.

Signed-off-by: Arne Fitzenreiter <arne_f@ipfire.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 drivers/ata/libata-scsi.c      | 3 ++-
 drivers/ata/libata-transport.c | 2 ++
 include/linux/libata.h         | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 3131adcc1f87..641a61a59e89 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -2568,7 +2568,8 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
 		rbuf[14] = (lowest_aligned >> 8) & 0x3f;
 		rbuf[15] = lowest_aligned;
 
-		if (ata_id_has_trim(args->id)) {
+		if (ata_id_has_trim(args->id) &&
+		    !(dev->horkage & ATA_HORKAGE_NOTRIM)) {
 			rbuf[14] |= 0x80; /* LBPME */
 
 			if (ata_id_has_zero_after_trim(args->id) &&
diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c
index d6c37bcd416d..e2d94972962d 100644
--- a/drivers/ata/libata-transport.c
+++ b/drivers/ata/libata-transport.c
@@ -569,6 +569,8 @@ show_ata_dev_trim(struct device *dev,
 
 	if (!ata_id_has_trim(ata_dev->id))
 		mode = "unsupported";
+	else if (ata_dev->horkage & ATA_HORKAGE_NOTRIM)
+		mode = "forced_unsupported";
 	else if (ata_dev->horkage & ATA_HORKAGE_NO_NCQ_TRIM)
 			mode = "forced_unqueued";
 	else if (ata_fpdma_dsm_supported(ata_dev))
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 36ce37bcc963..5c8bac6225a6 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -431,6 +431,8 @@ enum {
 	ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21),	/* some WDs have broken LPM */
 	ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */
 	ATA_HORKAGE_NO_NCQ_LOG	= (1 << 23),	/* don't use NCQ for log read */
+	ATA_HORKAGE_NOTRIM	= (1 << 24),	/* don't use TRIM */
+
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3-70-g09d2


From af34d637637eabaf49406eb35c948cd51ba262a6 Mon Sep 17 00:00:00 2001
From: David Milburn <dmilburn@redhat.com>
Date: Mon, 13 Jul 2015 11:48:23 -0500
Subject: libata: add ATA_HORKAGE_MAX_SEC_1024 to revert back to previous
 max_sectors limit

Since no longer limiting max_sectors to BLK_DEF_MAX_SECTORS (commit 34b48db66e08),
data corruption may occur on ST380013AS drive configured on 82801JI (ICH10 Family)
SATA controller. This patch will allow the driver to limit max_sectors as before

 # cat /sys/block/sdb/queue/max_sectors_kb
 512

I was able to double the max_sectors_kb value up to 16384 on linux-4.2.0-rc2
before seeing corruption, but seems safer to use previous limit. Without this
patch max_sectors_kb will be 32767.

tj: Minor comment update.

Reported-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David Milburn <dmilburn@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org # v3.19 and later
Fixes: 34b48db66e08 ("block: remove artifical max_hw_sectors cap")
---
 drivers/ata/libata-core.c | 10 ++++++++++
 include/linux/ata.h       |  1 +
 include/linux/libata.h    |  2 +-
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index ed2b218ea64d..68202a8a3a0b 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2478,6 +2478,10 @@ int ata_dev_configure(struct ata_device *dev)
 		dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_128,
 					 dev->max_sectors);
 
+	if (dev->horkage & ATA_HORKAGE_MAX_SEC_1024)
+		dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_1024,
+					 dev->max_sectors);
+
 	if (dev->horkage & ATA_HORKAGE_MAX_SEC_LBA48)
 		dev->max_sectors = ATA_MAX_SECTORS_LBA48;
 
@@ -4146,6 +4150,12 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	{ "Slimtype DVD A  DS8A8SH", NULL,	ATA_HORKAGE_MAX_SEC_LBA48 },
 	{ "Slimtype DVD A  DS8A9SH", NULL,	ATA_HORKAGE_MAX_SEC_LBA48 },
 
+	/*
+	 * Causes silent data corruption with higher max sects.
+	 * http://lkml.kernel.org/g/x49wpy40ysk.fsf@segfault.boston.devel.redhat.com
+	 */
+	{ "ST380013AS",		"3.20",		ATA_HORKAGE_MAX_SEC_1024 },
+
 	/* Devices we expect to fail diagnostics */
 
 	/* Devices where NCQ should be avoided */
diff --git a/include/linux/ata.h b/include/linux/ata.h
index fed36418dd1c..6c78956aa470 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -45,6 +45,7 @@ enum {
 	ATA_SECT_SIZE		= 512,
 	ATA_MAX_SECTORS_128	= 128,
 	ATA_MAX_SECTORS		= 256,
+	ATA_MAX_SECTORS_1024    = 1024,
 	ATA_MAX_SECTORS_LBA48	= 65535,/* TODO: 65536? */
 	ATA_MAX_SECTORS_TAPE	= 65535,
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5c8bac6225a6..c9cfbcdb8d14 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -432,7 +432,7 @@ enum {
 	ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */
 	ATA_HORKAGE_NO_NCQ_LOG	= (1 << 23),	/* don't use NCQ for log read */
 	ATA_HORKAGE_NOTRIM	= (1 << 24),	/* don't use TRIM */
-
+	ATA_HORKAGE_MAX_SEC_1024 = (1 << 25),	/* Limit max sects to 1024 */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3-70-g09d2


From 70aa996601335ca3069190ebcdae8870828086a8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 10 Jul 2015 18:13:20 -0500
Subject: netfilter: kill nf_hooks_active

The function obscures what is going on in nf_hook_thresh and it's existence
requires computing the hook list twice.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 00050dfd9f23..60e89348a91d 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -150,11 +150,6 @@ static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
 }
 #endif
 
-static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
-{
-	return nf_hook_list_active(&nf_hooks[pf][hook], pf, hook);
-}
-
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
 
 /**
@@ -172,10 +167,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int (*okfn)(struct sock *, struct sk_buff *),
 				 int thresh)
 {
-	if (nf_hooks_active(pf, hook)) {
+	struct list_head *nf_hook_list = &nf_hooks[pf][hook];
+
+	if (nf_hook_list_active(nf_hook_list, pf, hook)) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, &nf_hooks[pf][hook], hook, thresh,
+		nf_hook_state_init(&state, nf_hook_list, hook, thresh,
 				   pf, indev, outdev, sk, okfn);
 		return nf_hook_slow(skb, &state);
 	}
-- 
cgit v1.2.3-70-g09d2


From 085db2c04557d31db61541f361bd8b4de92c9939 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 10 Jul 2015 18:15:06 -0500
Subject: netfilter: Per network namespace netfilter hooks.

- Add a new set of functions for registering and unregistering per
  network namespace hooks.

- Modify the old global namespace hook functions to use the per
  network namespace hooks in their implementation, so their remains a
  single list that needs to be walked for any hook (this is important
  for keeping the hook priority working and for keeping the code
  walking the hooks simple).

- Only allow registering the per netdevice hooks in the network
  namespace where the network device lives.

- Dynamically allocate the structures in the per network namespace
  hook list in nf_register_net_hook, and unregister them in
  nf_unregister_net_hook.

  Dynamic allocate is required somewhere as the number of network
  namespaces are not fixed so we might as well allocate them in the
  registration function.

  The chain of registered hooks on any list is expected to be small so
  the cost of walking that list to find the entry we are unregistering
  should also be small.

  Performing the management of the dynamically allocated list entries
  in the registration and unregistration functions keeps the complexity
  from spreading.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/netfilter.h     |  14 +++-
 include/net/netns/netfilter.h |   1 +
 net/netfilter/core.c          | 182 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 173 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 60e89348a91d..9bbd110ec81b 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -11,6 +11,8 @@
 #include <linux/list.h>
 #include <linux/static_key.h>
 #include <linux/netfilter_defs.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
 
 #ifdef CONFIG_NETFILTER
 static inline int NF_DROP_GETERR(int verdict)
@@ -118,6 +120,13 @@ struct nf_sockopt_ops {
 };
 
 /* Function to register/unregister hook points. */
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops);
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops);
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			  unsigned int n);
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			     unsigned int n);
+
 int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
@@ -128,8 +137,6 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 int nf_register_sockopt(struct nf_sockopt_ops *reg);
 void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 
-extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
-
 #ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
@@ -167,7 +174,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int (*okfn)(struct sock *, struct sk_buff *),
 				 int thresh)
 {
-	struct list_head *nf_hook_list = &nf_hooks[pf][hook];
+	struct net *net = dev_net(indev ? indev : outdev);
+	struct list_head *nf_hook_list = &net->nf.hooks[pf][hook];
 
 	if (nf_hook_list_active(nf_hook_list, pf, hook)) {
 		struct nf_hook_state state;
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 532e4ba64f49..38aa4983e2a9 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -14,5 +14,6 @@ struct netns_nf {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header *nf_log_dir_header;
 #endif
+	struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 };
 #endif
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index fa4d3c111d3f..56ead1a1711c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -52,9 +52,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
 }
 EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
 
-struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
-EXPORT_SYMBOL(nf_hooks);
-
 #ifdef HAVE_JUMP_LABEL
 struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 EXPORT_SYMBOL(nf_hooks_needed);
@@ -62,27 +59,40 @@ EXPORT_SYMBOL(nf_hooks_needed);
 
 static DEFINE_MUTEX(nf_hook_mutex);
 
-static struct list_head *find_nf_hook_list(const struct nf_hook_ops *reg)
+static struct list_head *find_nf_hook_list(struct net *net,
+					   const struct nf_hook_ops *reg)
 {
 	struct list_head *nf_hook_list = NULL;
 
 	if (reg->pf != NFPROTO_NETDEV)
-		nf_hook_list = &nf_hooks[reg->pf][reg->hooknum];
+		nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
 	else if (reg->hooknum == NF_NETDEV_INGRESS) {
 #ifdef CONFIG_NETFILTER_INGRESS
-		if (reg->dev)
+		if (reg->dev && dev_net(reg->dev) == net)
 			nf_hook_list = &reg->dev->nf_hooks_ingress;
 #endif
 	}
 	return nf_hook_list;
 }
 
-int nf_register_hook(struct nf_hook_ops *reg)
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
 	struct list_head *nf_hook_list;
-	struct nf_hook_ops *elem;
+	struct nf_hook_ops *elem, *new;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
 
-	nf_hook_list = find_nf_hook_list(reg);
+	new->hook     = reg->hook;
+	new->dev      = reg->dev;
+	new->owner    = reg->owner;
+	new->priv     = reg->priv;
+	new->pf       = reg->pf;
+	new->hooknum  = reg->hooknum;
+	new->priority = reg->priority;
+
+	nf_hook_list = find_nf_hook_list(net, reg);
 	if (!nf_hook_list)
 		return -ENOENT;
 
@@ -91,7 +101,7 @@ int nf_register_hook(struct nf_hook_ops *reg)
 		if (reg->priority < elem->priority)
 			break;
 	}
-	list_add_rcu(&reg->list, elem->list.prev);
+	list_add_rcu(&new->list, elem->list.prev);
 	mutex_unlock(&nf_hook_mutex);
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
@@ -102,13 +112,35 @@ int nf_register_hook(struct nf_hook_ops *reg)
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(nf_register_hook);
+EXPORT_SYMBOL(nf_register_net_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
+	struct list_head *nf_hook_list;
+	struct nf_hook_ops *elem;
+
+	nf_hook_list = find_nf_hook_list(net, reg);
+	if (!nf_hook_list)
+		return;
+
 	mutex_lock(&nf_hook_mutex);
-	list_del_rcu(&reg->list);
+	list_for_each_entry(elem, nf_hook_list, list) {
+		if ((reg->hook     == elem->hook) &&
+		    (reg->dev      == elem->dev) &&
+		    (reg->owner    == elem->owner) &&
+		    (reg->priv     == elem->priv) &&
+		    (reg->pf       == elem->pf) &&
+		    (reg->hooknum  == elem->hooknum) &&
+		    (reg->priority == elem->priority)) {
+			list_del_rcu(&elem->list);
+			break;
+		}
+	}
 	mutex_unlock(&nf_hook_mutex);
+	if (&elem->list == nf_hook_list) {
+		WARN(1, "nf_unregister_net_hook: hook not found!\n");
+		return;
+	}
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 		net_dec_ingress_queue();
@@ -117,7 +149,77 @@ void nf_unregister_hook(struct nf_hook_ops *reg)
 	static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
 	synchronize_net();
-	nf_queue_nf_hook_drop(reg);
+	nf_queue_nf_hook_drop(elem);
+	kfree(elem);
+}
+EXPORT_SYMBOL(nf_unregister_net_hook);
+
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			  unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = nf_register_net_hook(net, &reg[i]);
+		if (err)
+			goto err;
+	}
+	return err;
+
+err:
+	if (i > 0)
+		nf_unregister_net_hooks(net, reg, i);
+	return err;
+}
+EXPORT_SYMBOL(nf_register_net_hooks);
+
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			     unsigned int n)
+{
+	while (n-- > 0)
+		nf_unregister_net_hook(net, &reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_net_hooks);
+
+static LIST_HEAD(nf_hook_list);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+	struct net *net, *last;
+	int ret;
+
+	rtnl_lock();
+	for_each_net(net) {
+		ret = nf_register_net_hook(net, reg);
+		if (ret && ret != -ENOENT)
+			goto rollback;
+	}
+	list_add_tail(&reg->list, &nf_hook_list);
+	rtnl_unlock();
+
+	return 0;
+rollback:
+	last = net;
+	for_each_net(net) {
+		if (net == last)
+			break;
+		nf_unregister_net_hook(net, reg);
+	}
+	rtnl_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+	struct net *net;
+
+	rtnl_lock();
+	list_del(&reg->list);
+	for_each_net(net)
+		nf_unregister_net_hook(net, reg);
+	rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
 
@@ -294,8 +396,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
 EXPORT_SYMBOL(nf_nat_decode_session_hook);
 #endif
 
+static int nf_register_hook_list(struct net *net)
+{
+	struct nf_hook_ops *elem;
+	int ret;
+
+	rtnl_lock();
+	list_for_each_entry(elem, &nf_hook_list, list) {
+		ret = nf_register_net_hook(net, elem);
+		if (ret && ret != -ENOENT)
+			goto out_undo;
+	}
+	rtnl_unlock();
+	return 0;
+
+out_undo:
+	list_for_each_entry_continue_reverse(elem, &nf_hook_list, list)
+		nf_unregister_net_hook(net, elem);
+	rtnl_unlock();
+	return ret;
+}
+
+static void nf_unregister_hook_list(struct net *net)
+{
+	struct nf_hook_ops *elem;
+
+	rtnl_lock();
+	list_for_each_entry(elem, &nf_hook_list, list)
+		nf_unregister_net_hook(net, elem);
+	rtnl_unlock();
+}
+
 static int __net_init netfilter_net_init(struct net *net)
 {
+	int i, h, ret;
+
+	for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
+		for (h = 0; h < NF_MAX_HOOKS; h++)
+			INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+	}
+
 #ifdef CONFIG_PROC_FS
 	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
 						net->proc_net);
@@ -306,11 +446,16 @@ static int __net_init netfilter_net_init(struct net *net)
 		return -ENOMEM;
 	}
 #endif
-	return 0;
+	ret = nf_register_hook_list(net);
+	if (ret)
+		remove_proc_entry("netfilter", net->proc_net);
+
+	return ret;
 }
 
 static void __net_exit netfilter_net_exit(struct net *net)
 {
+	nf_unregister_hook_list(net);
 	remove_proc_entry("netfilter", net->proc_net);
 }
 
@@ -321,12 +466,7 @@ static struct pernet_operations netfilter_net_ops = {
 
 int __init netfilter_init(void)
 {
-	int i, h, ret;
-
-	for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
-		for (h = 0; h < NF_MAX_HOOKS; h++)
-			INIT_LIST_HEAD(&nf_hooks[i][h]);
-	}
+	int ret;
 
 	ret = register_pernet_subsys(&netfilter_net_ops);
 	if (ret < 0)
-- 
cgit v1.2.3-70-g09d2


From e7c8899f3e6f2830136cf6e115c4a55ce7a3920a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 14 Jul 2015 17:51:07 +0200
Subject: netfilter: move tee_active to core

This prepares for a TEE like expression in nftables.
We want to ensure only one duplicate is sent, so both will
use the same percpu variable to detect duplication.

The other use case is detection of recursive call to xtables, but since
we don't want dependency from nft to xtables core its put into core.c
instead of the x_tables core.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 11 +++++++++++
 net/netfilter/core.c      |  3 +++
 net/netfilter/xt_TEE.c    | 13 ++++++-------
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9bbd110ec81b..e01da73ee6c4 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -390,4 +390,15 @@ extern struct nfq_ct_hook __rcu *nfq_ct_hook;
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 #endif
 
+/**
+ * nf_skb_duplicated - TEE target has sent a packet
+ *
+ * When a xtables target sends a packet, the OUTPUT and POSTROUTING
+ * hooks are traversed again, i.e. nft and xtables are invoked recursively.
+ *
+ * This is used by xtables TEE target to prevent the duplicated skb from
+ * being duplicated again.
+ */
+DECLARE_PER_CPU(bool, nf_skb_duplicated);
+
 #endif /*__LINUX_NETFILTER_H*/
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 56ead1a1711c..6896cee8b733 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -34,6 +34,9 @@ EXPORT_SYMBOL(nf_afinfo);
 const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ipv6_ops);
 
+DEFINE_PER_CPU(bool, nf_skb_duplicated);
+EXPORT_SYMBOL_GPL(nf_skb_duplicated);
+
 int nf_register_afinfo(const struct nf_afinfo *afinfo)
 {
 	mutex_lock(&afinfo_mutex);
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index a747eb475b68..8950e79c4dc9 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -37,7 +37,6 @@ struct xt_tee_priv {
 };
 
 static const union nf_inet_addr tee_zero_address;
-static DEFINE_PER_CPU(bool, tee_active);
 
 static struct net *pick_net(struct sk_buff *skb)
 {
@@ -88,7 +87,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_tee_tginfo *info = par->targinfo;
 	struct iphdr *iph;
 
-	if (__this_cpu_read(tee_active))
+	if (__this_cpu_read(nf_skb_duplicated))
 		return XT_CONTINUE;
 	/*
 	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
@@ -125,9 +124,9 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	ip_send_check(iph);
 
 	if (tee_tg_route4(skb, info)) {
-		__this_cpu_write(tee_active, true);
+		__this_cpu_write(nf_skb_duplicated, true);
 		ip_local_out(skb);
-		__this_cpu_write(tee_active, false);
+		__this_cpu_write(nf_skb_duplicated, false);
 	} else {
 		kfree_skb(skb);
 	}
@@ -170,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_tee_tginfo *info = par->targinfo;
 
-	if (__this_cpu_read(tee_active))
+	if (__this_cpu_read(nf_skb_duplicated))
 		return XT_CONTINUE;
 	skb = pskb_copy(skb, GFP_ATOMIC);
 	if (skb == NULL)
@@ -188,9 +187,9 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 		--iph->hop_limit;
 	}
 	if (tee_tg_route6(skb, info)) {
-		__this_cpu_write(tee_active, true);
+		__this_cpu_write(nf_skb_duplicated, true);
 		ip6_local_out(skb);
-		__this_cpu_write(tee_active, false);
+		__this_cpu_write(nf_skb_duplicated, false);
 	} else {
 		kfree_skb(skb);
 	}
-- 
cgit v1.2.3-70-g09d2


From 7814b6ec6d0d63444abdb49554166c8cfcbd063e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 14 Jul 2015 17:51:08 +0200
Subject: netfilter: xtables: don't save/restore jumpstack offset

In most cases there is no reentrancy into ip/ip6tables.

For skbs sent by REJECT or SYNPROXY targets, there is one level
of reentrancy, but its not relevant as those targets issue an absolute
verdict, i.e. the jumpstack can be clobbered since its not used
after the target issues absolute verdict (ACCEPT, DROP, STOLEN, etc).

So the only special case where it is relevant is the TEE target, which
returns XT_CONTINUE.

This patch changes ip(6)_do_table to always use the jump stack starting
from 0.

When we detect we're operating on an skb sent via TEE (percpu
nf_skb_duplicated is 1) we switch to an alternate stack to leave
the original one alone.

Since there is no TEE support for arptables, it doesn't need to
test if tee is active.

The jump stack overflow tests are no longer needed as well --
since ->stacksize is the largest call depth we cannot exceed it.

A much better alternative to the external jumpstack would be to just
declare a jumps[32] stack on the local stack frame, but that would mean
we'd have to reject iptables rulesets that used to work before.

Another alternative would be to start rejecting rulesets with a larger
call depth, e.g. 1000 -- in this case it would be feasible to allocate the
entire stack in the percpu area which would avoid one dereference.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  1 -
 net/ipv4/netfilter/arp_tables.c    | 11 +++--------
 net/ipv4/netfilter/ip_tables.c     | 37 ++++++++++++++++++++-----------------
 net/ipv6/netfilter/ip6_tables.c    | 26 ++++++++++++++------------
 net/netfilter/x_tables.c           | 22 +++++++++++-----------
 5 files changed, 48 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 286098a5667f..149284557ca7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -222,7 +222,6 @@ struct xt_table_info {
 	 * @stacksize jumps (number of user chains) can possibly be made.
 	 */
 	unsigned int stacksize;
-	unsigned int __percpu *stackptr;
 	void ***jumpstack;
 
 	unsigned char entries[0] __aligned(8);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ae6d0a124213..969fdbe6fbb5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -280,6 +280,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	table_base = private->entries;
 	jumpstack  = (struct arpt_entry **)private->jumpstack[cpu];
 
+	/* No TEE support for arptables, so no need to switch to alternate
+	 * stack.  All targets that reenter must return absolute verdicts.
+	 */
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	acpar.in      = state->in;
@@ -325,11 +328,6 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			}
 			if (table_base + v
 			    != arpt_next_entry(e)) {
-
-				if (stackidx >= private->stacksize) {
-					verdict = NF_DROP;
-					break;
-				}
 				jumpstack[stackidx++] = e;
 			}
 
@@ -337,9 +335,6 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			continue;
 		}
 
-		/* Targets which reenter must return
-		 * abs. verdicts
-		 */
 		acpar.target   = t->u.kernel.target;
 		acpar.targinfo = t->data;
 		verdict = t->u.kernel.target->target(skb, &acpar);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 5e44b35a8de8..a2e4b018a254 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -296,12 +296,13 @@ ipt_do_table(struct sk_buff *skb,
 	const char *indev, *outdev;
 	const void *table_base;
 	struct ipt_entry *e, **jumpstack;
-	unsigned int *stackptr, origptr, cpu;
+	unsigned int stackidx, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
 	unsigned int addend;
 
 	/* Initialization */
+	stackidx = 0;
 	ip = ip_hdr(skb);
 	indev = state->in ? state->in->name : nulldevname;
 	outdev = state->out ? state->out->name : nulldevname;
@@ -331,13 +332,20 @@ ipt_do_table(struct sk_buff *skb,
 	smp_read_barrier_depends();
 	table_base = private->entries;
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
-	stackptr   = per_cpu_ptr(private->stackptr, cpu);
-	origptr    = *stackptr;
+
+	/* Switch to alternate jumpstack if we're being invoked via TEE.
+	 * TEE issues XT_CONTINUE verdict on original skb so we must not
+	 * clobber the jumpstack.
+	 *
+	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
+	 * but it is no problem since absolute verdict is issued by these.
+	 */
+	jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
-	pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
-		 table->name, hook, origptr,
+	pr_debug("Entering %s(hook %u), UF %p\n",
+		 table->name, hook,
 		 get_entry(table_base, private->underflow[hook]));
 
 	do {
@@ -383,28 +391,24 @@ ipt_do_table(struct sk_buff *skb,
 					verdict = (unsigned int)(-v) - 1;
 					break;
 				}
-				if (*stackptr <= origptr) {
+				if (stackidx == 0) {
 					e = get_entry(table_base,
 					    private->underflow[hook]);
 					pr_debug("Underflow (this is normal) "
 						 "to %p\n", e);
 				} else {
-					e = jumpstack[--*stackptr];
+					e = jumpstack[--stackidx];
 					pr_debug("Pulled %p out from pos %u\n",
-						 e, *stackptr);
+						 e, stackidx);
 					e = ipt_next_entry(e);
 				}
 				continue;
 			}
 			if (table_base + v != ipt_next_entry(e) &&
 			    !(e->ip.flags & IPT_F_GOTO)) {
-				if (*stackptr >= private->stacksize) {
-					verdict = NF_DROP;
-					break;
-				}
-				jumpstack[(*stackptr)++] = e;
+				jumpstack[stackidx++] = e;
 				pr_debug("Pushed %p into pos %u\n",
-					 e, *stackptr - 1);
+					 e, stackidx - 1);
 			}
 
 			e = get_entry(table_base, v);
@@ -423,9 +427,8 @@ ipt_do_table(struct sk_buff *skb,
 			/* Verdict */
 			break;
 	} while (!acpar.hotdrop);
-	pr_debug("Exiting %s; resetting sp from %u to %u\n",
-		 __func__, *stackptr, origptr);
-	*stackptr = origptr;
+	pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
+
  	xt_write_recseq_end(addend);
  	local_bh_enable();
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index baf032179918..531281f0ff86 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -324,12 +324,13 @@ ip6t_do_table(struct sk_buff *skb,
 	const char *indev, *outdev;
 	const void *table_base;
 	struct ip6t_entry *e, **jumpstack;
-	unsigned int *stackptr, origptr, cpu;
+	unsigned int stackidx, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
 	unsigned int addend;
 
 	/* Initialization */
+	stackidx = 0;
 	indev = state->in ? state->in->name : nulldevname;
 	outdev = state->out ? state->out->name : nulldevname;
 	/* We handle fragments by dealing with the first fragment as
@@ -357,8 +358,15 @@ ip6t_do_table(struct sk_buff *skb,
 	cpu        = smp_processor_id();
 	table_base = private->entries;
 	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
-	stackptr   = per_cpu_ptr(private->stackptr, cpu);
-	origptr    = *stackptr;
+
+	/* Switch to alternate jumpstack if we're being invoked via TEE.
+	 * TEE issues XT_CONTINUE verdict on original skb so we must not
+	 * clobber the jumpstack.
+	 *
+	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
+	 * but it is no problem since absolute verdict is issued by these.
+	 */
+	jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -406,20 +414,16 @@ ip6t_do_table(struct sk_buff *skb,
 					verdict = (unsigned int)(-v) - 1;
 					break;
 				}
-				if (*stackptr <= origptr)
+				if (stackidx == 0)
 					e = get_entry(table_base,
 					    private->underflow[hook]);
 				else
-					e = ip6t_next_entry(jumpstack[--*stackptr]);
+					e = ip6t_next_entry(jumpstack[--stackidx]);
 				continue;
 			}
 			if (table_base + v != ip6t_next_entry(e) &&
 			    !(e->ipv6.flags & IP6T_F_GOTO)) {
-				if (*stackptr >= private->stacksize) {
-					verdict = NF_DROP;
-					break;
-				}
-				jumpstack[(*stackptr)++] = e;
+				jumpstack[stackidx++] = e;
 			}
 
 			e = get_entry(table_base, v);
@@ -437,8 +441,6 @@ ip6t_do_table(struct sk_buff *skb,
 			break;
 	} while (!acpar.hotdrop);
 
-	*stackptr = origptr;
-
  	xt_write_recseq_end(addend);
  	local_bh_enable();
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 4db7d60d42fa..154447e519ab 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
 	[NFPROTO_IPV6]   = "ip6",
 };
 
-/* Allow this many total (re)entries. */
-static const unsigned int xt_jumpstack_multiplier = 2;
-
 /* Registration hooks for targets. */
 int xt_register_target(struct xt_target *target)
 {
@@ -688,8 +685,6 @@ void xt_free_table_info(struct xt_table_info *info)
 		kvfree(info->jumpstack);
 	}
 
-	free_percpu(info->stackptr);
-
 	kvfree(info);
 }
 EXPORT_SYMBOL(xt_free_table_info);
@@ -737,10 +732,6 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
 	unsigned int size;
 	int cpu;
 
-	i->stackptr = alloc_percpu(unsigned int);
-	if (i->stackptr == NULL)
-		return -ENOMEM;
-
 	size = sizeof(void **) * nr_cpu_ids;
 	if (size > PAGE_SIZE)
 		i->jumpstack = vzalloc(size);
@@ -753,8 +744,17 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
 	if (i->stacksize == 0)
 		return 0;
 
-	i->stacksize *= xt_jumpstack_multiplier;
-	size = sizeof(void *) * i->stacksize;
+	/* Jumpstack needs to be able to record two full callchains, one
+	 * from the first rule set traversal, plus one table reentrancy
+	 * via -j TEE without clobbering the callchain that brought us to
+	 * TEE target.
+	 *
+	 * This is done by allocating two jumpstacks per cpu, on reentry
+	 * the upper half of the stack is used.
+	 *
+	 * see the jumpstack setup in ipt_do_table() for more details.
+	 */
+	size = sizeof(void *) * i->stacksize * 2u;
 	for_each_possible_cpu(cpu) {
 		if (size > PAGE_SIZE)
 			i->jumpstack[cpu] = vmalloc_node(size,
-- 
cgit v1.2.3-70-g09d2


From dcebd3153e0a7749bb054ab73fa4e1ca33e9d3f9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 14 Jul 2015 17:51:09 +0200
Subject: netfilter: add and use jump label for xt_tee

Don't bother testing if we need to switch to alternate stack
unless TEE target is used.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 7 +++++++
 net/ipv4/netfilter/ip_tables.c     | 3 ++-
 net/ipv6/netfilter/ip6_tables.c    | 3 ++-
 net/netfilter/x_tables.c           | 3 +++
 net/netfilter/xt_TEE.c             | 2 ++
 5 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 149284557ca7..b006b719183f 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -3,6 +3,7 @@
 
 
 #include <linux/netdevice.h>
+#include <linux/static_key.h>
 #include <uapi/linux/netfilter/x_tables.h>
 
 /**
@@ -280,6 +281,12 @@ void xt_free_table_info(struct xt_table_info *info);
  */
 DECLARE_PER_CPU(seqcount_t, xt_recseq);
 
+/* xt_tee_enabled - true if x_tables needs to handle reentrancy
+ *
+ * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
+ */
+extern struct static_key xt_tee_enabled;
+
 /**
  * xt_write_recseq_begin - start of a write section
  *
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a2e4b018a254..ff585bdbf850 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -340,7 +340,8 @@ ipt_do_table(struct sk_buff *skb,
 	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
 	 * but it is no problem since absolute verdict is issued by these.
 	 */
-	jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+	if (static_key_false(&xt_tee_enabled))
+		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 531281f0ff86..ea6d105063c2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -366,7 +366,8 @@ ip6t_do_table(struct sk_buff *skb,
 	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
 	 * but it is no problem since absolute verdict is issued by these.
 	 */
-	jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+	if (static_key_false(&xt_tee_enabled))
+		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 154447e519ab..9b42b5ea6dcd 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -727,6 +727,9 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock);
 DEFINE_PER_CPU(seqcount_t, xt_recseq);
 EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
 
+struct static_key xt_tee_enabled __read_mostly;
+EXPORT_SYMBOL_GPL(xt_tee_enabled);
+
 static int xt_jumpstack_alloc(struct xt_table_info *i)
 {
 	unsigned int size;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 8950e79c4dc9..c5d6556dbc5e 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -251,6 +251,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
 	} else
 		info->priv = NULL;
 
+	static_key_slow_inc(&xt_tee_enabled);
 	return 0;
 }
 
@@ -262,6 +263,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par)
 		unregister_netdevice_notifier(&info->priv->notifier);
 		kfree(info->priv);
 	}
+	static_key_slow_dec(&xt_tee_enabled);
 }
 
 static struct xt_target tee_tg_reg[] __read_mostly = {
-- 
cgit v1.2.3-70-g09d2


From 13c4a90119d28cfcb6b5bdd820c233b86c2b0237 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.andersen@canonical.com>
Date: Sat, 13 Jun 2015 09:02:48 -0600
Subject: seccomp: add ptrace options for suspend/resume

This patch is the first step in enabling checkpoint/restore of processes
with seccomp enabled.

One of the things CRIU does while dumping tasks is inject code into them
via ptrace to collect information that is only available to the process
itself. However, if we are in a seccomp mode where these processes are
prohibited from making these syscalls, then what CRIU does kills the task.

This patch adds a new ptrace option, PTRACE_O_SUSPEND_SECCOMP, that enables
a task from the init user namespace which has CAP_SYS_ADMIN and no seccomp
filters to disable (and re-enable) seccomp filters for another task so that
they can be successfully dumped (and restored). We restrict the set of
processes that can disable seccomp through ptrace because although today
ptrace can be used to bypass seccomp, there is some discussion of closing
this loophole in the future and we would like this patch to not depend on
that behavior and be future proofed for when it is removed.

Note that seccomp can be suspended before any filters are actually
installed; this behavior is useful on criu restore, so that we can suspend
seccomp, restore the filters, unmap our restore code from the restored
process' address space, and then resume the task by detaching and have the
filters resumed as well.

v2 changes:

* require that the tracer have no seccomp filters installed
* drop TIF_NOTSC manipulation from the patch
* change from ptrace command to a ptrace option and use this ptrace option
  as the flag to check. This means that as soon as the tracer
  detaches/dies, seccomp is re-enabled and as a corrollary that one can not
  disable seccomp across PTRACE_ATTACHs.

v3 changes:

* get rid of various #ifdefs everywhere
* report more sensible errors when PTRACE_O_SUSPEND_SECCOMP is incorrectly
  used

v4 changes:

* get rid of may_suspend_seccomp() in favor of a capable() check in ptrace
  directly

v5 changes:

* check that seccomp is not enabled (or suspended) on the tracer

Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com>
CC: Will Drewry <wad@chromium.org>
CC: Roland McGrath <roland@hack.frob.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
[kees: access seccomp.mode through seccomp_mode() instead]
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/ptrace.h      |  1 +
 include/uapi/linux/ptrace.h |  6 ++++--
 kernel/ptrace.c             | 13 +++++++++++++
 kernel/seccomp.c            |  8 ++++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 987a73a40ef8..061265f92876 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -34,6 +34,7 @@
 #define PT_TRACE_SECCOMP	PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
 
 #define PT_EXITKILL		(PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
+#define PT_SUSPEND_SECCOMP	(PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)
 
 /* single stepping state bits (used on ARM and PA-RISC) */
 #define PT_SINGLESTEP_BIT	31
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index cf1019e15f5b..a7a697986614 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -89,9 +89,11 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_O_TRACESECCOMP	(1 << PTRACE_EVENT_SECCOMP)
 
 /* eventless options */
-#define PTRACE_O_EXITKILL	(1 << 20)
+#define PTRACE_O_EXITKILL		(1 << 20)
+#define PTRACE_O_SUSPEND_SECCOMP	(1 << 21)
 
-#define PTRACE_O_MASK		(0x000000ff | PTRACE_O_EXITKILL)
+#define PTRACE_O_MASK		(\
+	0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP)
 
 #include <asm/ptrace.h>
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e050a36a..787320de68e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 	if (data & ~(unsigned long)PTRACE_O_MASK)
 		return -EINVAL;
 
+	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+		if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+		    !config_enabled(CONFIG_SECCOMP))
+			return -EINVAL;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+		    current->ptrace & PT_SUSPEND_SECCOMP)
+			return -EPERM;
+	}
+
 	/* Avoid intermediate state when all opts are cleared */
 	flags = child->ptrace;
 	flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 980fd26da22e..645e42d6fa4d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -590,6 +590,10 @@ void secure_computing_strict(int this_syscall)
 {
 	int mode = current->seccomp.mode;
 
+	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+		return;
+
 	if (mode == 0)
 		return;
 	else if (mode == SECCOMP_MODE_STRICT)
@@ -691,6 +695,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
 	int this_syscall = sd ? sd->nr :
 		syscall_get_nr(current, task_pt_regs(current));
 
+	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+		return SECCOMP_PHASE1_OK;
+
 	switch (mode) {
 	case SECCOMP_MODE_STRICT:
 		__secure_computing_strict(this_syscall);  /* may call do_exit */
-- 
cgit v1.2.3-70-g09d2


From 221272f97ca528048a577a3ff23d7774286ca5fd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 15 Jun 2015 15:29:16 -0700
Subject: seccomp: swap hard-coded zeros to defined name

For clarity, if CONFIG_SECCOMP isn't defined, seccomp_mode() is returning
"disabled". This makes that more clear, along with another 0-use, and
results in no operational change.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/seccomp.h | 2 +-
 kernel/seccomp.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index a19ddacdac30..f4265039a94c 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -78,7 +78,7 @@ static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
 
 static inline int seccomp_mode(struct seccomp *s)
 {
-	return 0;
+	return SECCOMP_MODE_DISABLED;
 }
 #endif /* CONFIG_SECCOMP */
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 645e42d6fa4d..383bd6caca81 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -594,7 +594,7 @@ void secure_computing_strict(int this_syscall)
 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
 		return;
 
-	if (mode == 0)
+	if (mode == SECCOMP_MODE_DISABLED)
 		return;
 	else if (mode == SECCOMP_MODE_STRICT)
 		__secure_computing_strict(this_syscall);
-- 
cgit v1.2.3-70-g09d2


From d5671f6bf2a672cfa72ef2cbac5cc53a4539690d Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Tue, 26 May 2015 17:48:34 +0200
Subject: rcu: Deinline rcu_read_lock_sched_held() if DEBUG_LOCK_ALLOC

DEBUG_LOCK_ALLOC=y is not a production setting, but it is
not very unusual either. Many developers routinely
use kernels built with it enabled.

Apart from being selected by hand, it is also auto-selected by
PROVE_LOCKING "Lock debugging: prove locking correctness" and
LOCK_STAT "Lock usage statistics" config options.
LOCK STAT is necessary for "perf lock" to work.

I wouldn't spend too much time optimizing it, but this particular
function has a very large cost in code size: when it is deinlined,
code size decreases by 830,000 bytes:

    text     data      bss       dec     hex filename
85674192 22294776 20627456 128596424 7aa39c8 vmlinux.before
84837612 22294424 20627456 127759492 79d7484 vmlinux

(with this config: http://busybox.net/~vda/kernel_config)

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Lai Jiangshan <laijs@cn.fujitsu.com>
CC: Tejun Heo <tj@kernel.org>
CC: Oleg Nesterov <oleg@redhat.com>
CC: linux-kernel@vger.kernel.org
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 40 ++-------------------------------------
 kernel/rcu/update.c      | 49 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 237f7b8d38ba..def6d45ad61c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -469,46 +469,10 @@ int rcu_read_lock_bh_held(void);
  * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
  * RCU-sched read-side critical section.  In absence of
  * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
- * critical section unless it can prove otherwise.  Note that disabling
- * of preemption (including disabling irqs) counts as an RCU-sched
- * read-side critical section.  This is useful for debug checks in functions
- * that required that they be called within an RCU-sched read-side
- * critical section.
- *
- * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
- * and while lockdep is disabled.
- *
- * Note that if the CPU is in the idle loop from an RCU point of
- * view (ie: that we are in the section between rcu_idle_enter() and
- * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
- * did an rcu_read_lock().  The reason for this is that RCU ignores CPUs
- * that are in such a section, considering these as in extended quiescent
- * state, so such a CPU is effectively never in an RCU read-side critical
- * section regardless of what RCU primitives it invokes.  This state of
- * affairs is required --- we need to keep an RCU-free window in idle
- * where the CPU may possibly enter into low power mode. This way we can
- * notice an extended quiescent state to other CPUs that started a grace
- * period. Otherwise we would delay any grace period as long as we run in
- * the idle task.
- *
- * Similarly, we avoid claiming an SRCU read lock held if the current
- * CPU is offline.
+ * critical section unless it can prove otherwise.
  */
 #ifdef CONFIG_PREEMPT_COUNT
-static inline int rcu_read_lock_sched_held(void)
-{
-	int lockdep_opinion = 0;
-
-	if (!debug_lockdep_rcu_enabled())
-		return 1;
-	if (!rcu_is_watching())
-		return 0;
-	if (!rcu_lockdep_current_cpu_online())
-		return 0;
-	if (debug_locks)
-		lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
-	return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
-}
+int rcu_read_lock_sched_held(void);
 #else /* #ifdef CONFIG_PREEMPT_COUNT */
 static inline int rcu_read_lock_sched_held(void)
 {
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index afaecb7a799a..fec5f48b8860 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate");
 
 module_param(rcu_expedited, int, 0);
 
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
+/**
+ * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * RCU-sched read-side critical section.  In absence of
+ * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
+ * critical section unless it can prove otherwise.  Note that disabling
+ * of preemption (including disabling irqs) counts as an RCU-sched
+ * read-side critical section.  This is useful for debug checks in functions
+ * that required that they be called within an RCU-sched read-side
+ * critical section.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that if the CPU is in the idle loop from an RCU point of
+ * view (ie: that we are in the section between rcu_idle_enter() and
+ * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
+ * did an rcu_read_lock().  The reason for this is that RCU ignores CPUs
+ * that are in such a section, considering these as in extended quiescent
+ * state, so such a CPU is effectively never in an RCU read-side critical
+ * section regardless of what RCU primitives it invokes.  This state of
+ * affairs is required --- we need to keep an RCU-free window in idle
+ * where the CPU may possibly enter into low power mode. This way we can
+ * notice an extended quiescent state to other CPUs that started a grace
+ * period. Otherwise we would delay any grace period as long as we run in
+ * the idle task.
+ *
+ * Similarly, we avoid claiming an SRCU read lock held if the current
+ * CPU is offline.
+ */
+int rcu_read_lock_sched_held(void)
+{
+	int lockdep_opinion = 0;
+
+	if (!debug_lockdep_rcu_enabled())
+		return 1;
+	if (!rcu_is_watching())
+		return 0;
+	if (!rcu_lockdep_current_cpu_online())
+		return 0;
+	if (debug_locks)
+		lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
+	return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
+}
+EXPORT_SYMBOL(rcu_read_lock_sched_held);
+#endif
+
 #ifndef CONFIG_TINY_RCU
 
 static atomic_t rcu_expedited_nesting =
-- 
cgit v1.2.3-70-g09d2


From 30bb6fb39e5c08b9db5bc592d6cbc9a5fc5e67a4 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 15 Jun 2015 13:31:33 +0200
Subject: gpio: Remove double "base" in comment

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index cc7ec129b329..c8393cd4d44f 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -45,7 +45,7 @@ struct seq_file;
  * @base: identifies the first GPIO number handled by this chip;
  *	or, if negative during registration, requests dynamic ID allocation.
  *	DEPRECATION: providing anything non-negative and nailing the base
- *	base offset of GPIO chips is deprecated. Please pass -1 as base to
+ *	offset of GPIO chips is deprecated. Please pass -1 as base to
  *	let gpiolib select the chip base in all possible cases. We want to
  *	get rid of the static GPIO number space in the long run.
  * @ngpio: the number of GPIOs handled by this controller; the last GPIO
-- 
cgit v1.2.3-70-g09d2


From d746d707a8b1421a4ba46b497cb5d59e20161645 Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Date: Tue, 14 Jul 2015 13:43:19 -0700
Subject: net core: Add protodown support.

This patch introduces the proto_down flag that can be used by user space
applications to notify switch drivers that errors have been detected on the
device.

The switch driver can react to protodown notification by doing a phys down
on the associated switch port.

Signed-off-by: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 14 ++++++++++++++
 net/core/dev.c            | 20 ++++++++++++++++++++
 net/core/net-sysfs.c      | 14 ++++++++++++++
 3 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e20979dfd6a9..45cfd797eb77 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1041,6 +1041,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *	TX queue.
  * int (*ndo_get_iflink)(const struct net_device *dev);
  *	Called to get the iflink value of this device.
+ * void (*ndo_change_proto_down)(struct net_device *dev,
+ *				  bool proto_down);
+ *	This function is used to pass protocol port error state information
+ *	to the switch driver. The switch driver can react to the proto_down
+ *      by doing a phys down on the associated switch port.
+ *
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1211,6 +1217,8 @@ struct net_device_ops {
 						      int queue_index,
 						      u32 maxrate);
 	int			(*ndo_get_iflink)(const struct net_device *dev);
+	int			(*ndo_change_proto_down)(struct net_device *dev,
+							 bool proto_down);
 };
 
 /**
@@ -1502,6 +1510,10 @@ enum netdev_priv_flags {
  *
  *	@qdisc_tx_busylock:	XXX: need comments on this one
  *
+ *	@proto_down:	protocol port state information can be sent to the
+ *			switch driver and used to set the phys state of the
+ *			switch port.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -1762,6 +1774,7 @@ struct net_device {
 #endif
 	struct phy_device *phydev;
 	struct lock_class_key *qdisc_tx_busylock;
+	bool proto_down;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -2982,6 +2995,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 			 struct netdev_phys_item_id *ppid);
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
+int dev_change_proto_down(struct net_device *dev, bool proto_down);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/net/core/dev.c b/net/core/dev.c
index 69445a33ace6..8810b6bbebfe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6074,6 +6074,26 @@ int dev_get_phys_port_name(struct net_device *dev,
 }
 EXPORT_SYMBOL(dev_get_phys_port_name);
 
+/**
+ *	dev_change_proto_down - update protocol port state information
+ *	@dev: device
+ *	@proto_down: new value
+ *
+ *	This info can be used by switch drivers to set the phys state of the
+ *	port.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_change_proto_down)
+		return -EOPNOTSUPP;
+	if (!netif_device_present(dev))
+		return -ENODEV;
+	return ops->ndo_change_proto_down(dev, proto_down);
+}
+EXPORT_SYMBOL(dev_change_proto_down);
+
 /**
  *	dev_new_index	-	allocate an ifindex
  *	@net: the applicable net namespace
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 18b34d771ed4..194c1d03b2b3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -404,6 +404,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr,
 NETDEVICE_SHOW(group, fmt_dec);
 static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
 
+static int change_proto_down(struct net_device *dev, unsigned long proto_down)
+{
+	return dev_change_proto_down(dev, (bool) proto_down);
+}
+
+static ssize_t proto_down_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_proto_down);
+}
+NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+
 static ssize_t phys_port_id_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
@@ -501,6 +514,7 @@ static struct attribute *net_class_attrs[] = {
 	&dev_attr_phys_port_id.attr,
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,
+	&dev_attr_proto_down.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);
-- 
cgit v1.2.3-70-g09d2


From 527b397a7a3647b8ba2eae2e7a12b237bf411476 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 23 Jun 2015 15:48:02 +0200
Subject: gpio: em: Remove obsolete platform data support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 59032702ead90562 ("ARM: shmobile: Remove legacy platform
devices from EMEV2 SoC code"), EMMA Mobile SoCs are only supported in
generic DT-only ARM multi-platform builds.  The driver doesn't need to
use platform data anymore, hence remove platform data configuration.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Tested-by: Niklas Söderlund <niso@kth.se>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-em.c                | 34 ++++++++--------------------------
 include/linux/platform_data/gpio-em.h | 11 -----------
 2 files changed, 8 insertions(+), 37 deletions(-)
 delete mode 100644 include/linux/platform_data/gpio-em.h

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-em.c b/drivers/gpio/gpio-em.c
index fbf287307c4c..a77f16c8d142 100644
--- a/drivers/gpio/gpio-em.c
+++ b/drivers/gpio/gpio-em.c
@@ -31,7 +31,6 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/pinctrl/consumer.h>
-#include <linux/platform_data/gpio-em.h>
 
 struct em_gio_priv {
 	void __iomem *base0;
@@ -273,13 +272,12 @@ static const struct irq_domain_ops em_gio_irq_domain_ops = {
 
 static int em_gio_probe(struct platform_device *pdev)
 {
-	struct gpio_em_config pdata_dt;
-	struct gpio_em_config *pdata = dev_get_platdata(&pdev->dev);
 	struct em_gio_priv *p;
 	struct resource *io[2], *irq[2];
 	struct gpio_chip *gpio_chip;
 	struct irq_chip *irq_chip;
 	const char *name = dev_name(&pdev->dev);
+	unsigned int ngpios;
 	int ret;
 
 	p = devm_kzalloc(&pdev->dev, sizeof(*p), GFP_KERNEL);
@@ -319,18 +317,10 @@ static int em_gio_probe(struct platform_device *pdev)
 		goto err0;
 	}
 
-	if (!pdata) {
-		memset(&pdata_dt, 0, sizeof(pdata_dt));
-		pdata = &pdata_dt;
-
-		if (of_property_read_u32(pdev->dev.of_node, "ngpios",
-					 &pdata->number_of_pins)) {
-			dev_err(&pdev->dev, "Missing ngpios OF property\n");
-			ret = -EINVAL;
-			goto err0;
-		}
-
-		pdata->gpio_base = -1;
+	if (of_property_read_u32(pdev->dev.of_node, "ngpios", &ngpios)) {
+		dev_err(&pdev->dev, "Missing ngpios OF property\n");
+		ret = -EINVAL;
+		goto err0;
 	}
 
 	gpio_chip = &p->gpio_chip;
@@ -345,8 +335,8 @@ static int em_gio_probe(struct platform_device *pdev)
 	gpio_chip->label = name;
 	gpio_chip->dev = &pdev->dev;
 	gpio_chip->owner = THIS_MODULE;
-	gpio_chip->base = pdata->gpio_base;
-	gpio_chip->ngpio = pdata->number_of_pins;
+	gpio_chip->base = -1;
+	gpio_chip->ngpio = ngpios;
 
 	irq_chip = &p->irq_chip;
 	irq_chip->name = name;
@@ -357,9 +347,7 @@ static int em_gio_probe(struct platform_device *pdev)
 	irq_chip->irq_release_resources = em_gio_irq_relres;
 	irq_chip->flags	= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND;
 
-	p->irq_domain = irq_domain_add_simple(pdev->dev.of_node,
-					      pdata->number_of_pins,
-					      pdata->irq_base,
+	p->irq_domain = irq_domain_add_simple(pdev->dev.of_node, ngpios, 0,
 					      &em_gio_irq_domain_ops, p);
 	if (!p->irq_domain) {
 		ret = -ENXIO;
@@ -387,12 +375,6 @@ static int em_gio_probe(struct platform_device *pdev)
 		goto err1;
 	}
 
-	if (pdata->pctl_name) {
-		ret = gpiochip_add_pin_range(gpio_chip, pdata->pctl_name, 0,
-					     gpio_chip->base, gpio_chip->ngpio);
-		if (ret < 0)
-			dev_warn(&pdev->dev, "failed to add pin range\n");
-	}
 	return 0;
 
 err1:
diff --git a/include/linux/platform_data/gpio-em.h b/include/linux/platform_data/gpio-em.h
deleted file mode 100644
index 7c5a519d2dcd..000000000000
--- a/include/linux/platform_data/gpio-em.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __GPIO_EM_H__
-#define __GPIO_EM_H__
-
-struct gpio_em_config {
-	unsigned int gpio_base;
-	unsigned int irq_base;
-	unsigned int number_of_pins;
-	const char *pctl_name;
-};
-
-#endif /* __GPIO_EM_H__ */
-- 
cgit v1.2.3-70-g09d2


From 9cf705de06a27cc99874626c9717b32e9874b3bb Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 16 Jul 2015 01:55:57 -0700
Subject: ARM: OMAP2+: Add support for initializing dm814x clocks

Let's add a minimal clocks for dm814x to get it booted. This is
mostly a placeholder and relies on the PLLs being on from the
bootloader.

Note that the divider clocks work the same way as on dm816x and
am335x.

Cc: Matthijs van Duin <matthijsvanduin@gmail.com>
Cc: Mike Turquette <mturquette@linaro.org>
Cc: Paul Walmsley <paul@pwsan.com>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Tero Kristo <t-kristo@ti.com>
Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/mach-omap2/io.c  |  4 ++--
 drivers/clk/ti/Makefile   |  2 +-
 drivers/clk/ti/clk-814x.c | 31 +++++++++++++++++++++++++++++++
 drivers/clk/ti/clk-816x.c |  2 +-
 include/linux/clk/ti.h    |  3 ++-
 5 files changed, 37 insertions(+), 5 deletions(-)
 create mode 100644 drivers/clk/ti/clk-814x.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c
index 6779a9ff0d10..596af73c7549 100644
--- a/arch/arm/mach-omap2/io.c
+++ b/arch/arm/mach-omap2/io.c
@@ -558,7 +558,7 @@ void __init ti814x_init_early(void)
 	ti81xx_hwmod_init();
 	omap_hwmod_init_postsetup();
 	if (of_have_populated_dt())
-		omap_clk_soc_init = ti81xx_dt_clk_init;
+		omap_clk_soc_init = dm814x_dt_clk_init;
 }
 
 void __init ti816x_init_early(void)
@@ -575,7 +575,7 @@ void __init ti816x_init_early(void)
 	ti81xx_hwmod_init();
 	omap_hwmod_init_postsetup();
 	if (of_have_populated_dt())
-		omap_clk_soc_init = ti81xx_dt_clk_init;
+		omap_clk_soc_init = dm816x_dt_clk_init;
 }
 #endif
 
diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile
index 105ffd0f5e79..80b42884a0e9 100644
--- a/drivers/clk/ti/Makefile
+++ b/drivers/clk/ti/Makefile
@@ -2,7 +2,7 @@ obj-y					+= clk.o autoidle.o clockdomain.o
 clk-common				= dpll.o composite.o divider.o gate.o \
 					  fixed-factor.o mux.o apll.o
 obj-$(CONFIG_SOC_AM33XX)		+= $(clk-common) clk-33xx.o
-obj-$(CONFIG_SOC_TI81XX)		+= $(clk-common) fapll.o clk-816x.o
+obj-$(CONFIG_SOC_TI81XX)		+= $(clk-common) fapll.o clk-814x.o clk-816x.o
 obj-$(CONFIG_ARCH_OMAP2)		+= $(clk-common) interface.o clk-2xxx.o
 obj-$(CONFIG_ARCH_OMAP3)		+= $(clk-common) interface.o \
 					   clk-3xxx.o
diff --git a/drivers/clk/ti/clk-814x.c b/drivers/clk/ti/clk-814x.c
new file mode 100644
index 000000000000..d490d427cc20
--- /dev/null
+++ b/drivers/clk/ti/clk-814x.c
@@ -0,0 +1,31 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ */
+
+#include <linux/kernel.h>
+#include <linux/clk-provider.h>
+#include <linux/clk/ti.h>
+
+static struct ti_dt_clk dm814_clks[] = {
+	DT_CLK(NULL, "devosc_ck", "devosc_ck"),
+	DT_CLK(NULL, "mpu_ck", "mpu_ck"),
+	DT_CLK(NULL, "sysclk4_ck", "sysclk4_ck"),
+	DT_CLK(NULL, "sysclk6_ck", "sysclk6_ck"),
+	DT_CLK(NULL, "sysclk10_ck", "sysclk10_ck"),
+	DT_CLK(NULL, "sysclk18_ck", "sysclk18_ck"),
+	DT_CLK(NULL, "timer_sys_ck", "devosc_ck"),
+	DT_CLK(NULL, "cpsw_125mhz_gclk", "cpsw_125mhz_gclk"),
+	DT_CLK(NULL, "cpsw_cpts_rft_clk", "cpsw_cpts_rft_clk"),
+	{ .node_name = NULL },
+};
+
+int __init dm814x_dt_clk_init(void)
+{
+	ti_dt_clocks_register(dm814_clks);
+	omap2_clk_disable_autoidle_all();
+	omap2_clk_enable_init_clocks(NULL, 0);
+
+	return 0;
+}
diff --git a/drivers/clk/ti/clk-816x.c b/drivers/clk/ti/clk-816x.c
index 9451e651a1ff..43d07456e78d 100644
--- a/drivers/clk/ti/clk-816x.c
+++ b/drivers/clk/ti/clk-816x.c
@@ -42,7 +42,7 @@ static const char *enable_init_clks[] = {
 	"ddr_pll_clk3",
 };
 
-int __init ti81xx_dt_clk_init(void)
+int __init dm816x_dt_clk_init(void)
 {
 	ti_dt_clocks_register(dm816x_clks);
 	omap2_clk_disable_autoidle_all();
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 79b76e13d904..1736e29cee1b 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -329,7 +329,8 @@ int ti_clk_add_component(struct device_node *node, struct clk_hw *hw, int type);
 int omap3430_dt_clk_init(void);
 int omap3630_dt_clk_init(void);
 int am35xx_dt_clk_init(void);
-int ti81xx_dt_clk_init(void);
+int dm814x_dt_clk_init(void);
+int dm816x_dt_clk_init(void);
 int omap4xxx_dt_clk_init(void);
 int omap5xxx_dt_clk_init(void);
 int dra7xx_dt_clk_init(void);
-- 
cgit v1.2.3-70-g09d2


From e1443d2849b146be4ed8d4ef89ae7e215aafaa5b Mon Sep 17 00:00:00 2001
From: Stephen Chandler Paul <cpaul@redhat.com>
Date: Wed, 15 Jul 2015 10:20:17 -0700
Subject: Input: i8042 - add unmask_kbd_data option

A big problem with the current i8042 debugging option is that it outputs
data going to and from the keyboard by default. As a result, many dmesg
logs uploaded by users will unintentionally contain sensitive information
such as their password, as such it's probably a good idea not to output
data coming from the keyboard unless specifically enabled by the user.

Signed-off-by: Stephen Chandler Paul <cpaul@redhat.com>
Reviewed-by: Andreas Mohr <andim2@users.sf.net>
Reviewed-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 Documentation/kernel-parameters.txt |  4 ++++
 drivers/input/serio/i8042.c         | 43 +++++++++++++++++++++++++++++++++----
 drivers/input/serio/i8042.h         | 13 +++++++++++
 drivers/input/serio/serio.c         |  5 ++---
 include/linux/serio.h               |  2 ++
 5 files changed, 60 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a62a7b4..fd0f7cd8e496 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1274,6 +1274,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			     <bus_id>,<clkrate>
 
 	i8042.debug	[HW] Toggle i8042 debug mode
+	i8042.unmask_kbd_data
+			[HW] Enable printing of interrupt data from the KBD port
+			     (disabled by default, and as a pre-condition
+			     requires that i8042.debug=1 be enabled)
 	i8042.direct	[HW] Put keyboard port into non-translated mode
 	i8042.dumbkbd	[HW] Pretend that controller can only read data from
 			     keyboard and cannot control its state
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index cb5ece77fd7d..c9c98f0ab284 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -88,6 +88,10 @@ MODULE_PARM_DESC(nopnp, "Do not use PNP to detect controller settings");
 static bool i8042_debug;
 module_param_named(debug, i8042_debug, bool, 0600);
 MODULE_PARM_DESC(debug, "Turn i8042 debugging mode on and off");
+
+static bool i8042_unmask_kbd_data;
+module_param_named(unmask_kbd_data, i8042_unmask_kbd_data, bool, 0600);
+MODULE_PARM_DESC(unmask_kbd_data, "Unconditional enable (may reveal sensitive data) of normally sanitize-filtered kbd data traffic debug log [pre-condition: i8042.debug=1 enabled]");
 #endif
 
 static bool i8042_bypass_aux_irq_test;
@@ -116,6 +120,7 @@ struct i8042_port {
 	struct serio *serio;
 	int irq;
 	bool exists;
+	bool driver_bound;
 	signed char mux;
 };
 
@@ -133,6 +138,7 @@ static bool i8042_kbd_irq_registered;
 static bool i8042_aux_irq_registered;
 static unsigned char i8042_suppress_kbd_ack;
 static struct platform_device *i8042_platform_device;
+static struct notifier_block i8042_kbd_bind_notifier_block;
 
 static irqreturn_t i8042_interrupt(int irq, void *dev_id);
 static bool (*i8042_platform_filter)(unsigned char data, unsigned char str,
@@ -528,10 +534,10 @@ static irqreturn_t i8042_interrupt(int irq, void *dev_id)
 	port = &i8042_ports[port_no];
 	serio = port->exists ? port->serio : NULL;
 
-	dbg("%02x <- i8042 (interrupt, %d, %d%s%s)\n",
-	    data, port_no, irq,
-	    dfl & SERIO_PARITY ? ", bad parity" : "",
-	    dfl & SERIO_TIMEOUT ? ", timeout" : "");
+	filter_dbg(port->driver_bound, data, "<- i8042 (interrupt, %d, %d%s%s)\n",
+		   port_no, irq,
+		   dfl & SERIO_PARITY ? ", bad parity" : "",
+		   dfl & SERIO_TIMEOUT ? ", timeout" : "");
 
 	filtered = i8042_filter(data, str, serio);
 
@@ -1438,6 +1444,29 @@ static int __init i8042_setup_kbd(void)
 	return error;
 }
 
+static int i8042_kbd_bind_notifier(struct notifier_block *nb,
+				   unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct serio *serio = to_serio_port(dev);
+	struct i8042_port *port = serio->port_data;
+
+	if (serio != i8042_ports[I8042_KBD_PORT_NO].serio)
+		return 0;
+
+	switch (action) {
+	case BUS_NOTIFY_BOUND_DRIVER:
+		port->driver_bound = true;
+		break;
+
+	case BUS_NOTIFY_UNBIND_DRIVER:
+		port->driver_bound = false;
+		break;
+	}
+
+	return 0;
+}
+
 static int __init i8042_probe(struct platform_device *dev)
 {
 	int error;
@@ -1507,6 +1536,10 @@ static struct platform_driver i8042_driver = {
 	.shutdown	= i8042_shutdown,
 };
 
+static struct notifier_block i8042_kbd_bind_notifier_block = {
+	.notifier_call = i8042_kbd_bind_notifier,
+};
+
 static int __init i8042_init(void)
 {
 	struct platform_device *pdev;
@@ -1528,6 +1561,7 @@ static int __init i8042_init(void)
 		goto err_platform_exit;
 	}
 
+	bus_register_notifier(&serio_bus, &i8042_kbd_bind_notifier_block);
 	panic_blink = i8042_panic_blink;
 
 	return 0;
@@ -1543,6 +1577,7 @@ static void __exit i8042_exit(void)
 	platform_driver_unregister(&i8042_driver);
 	i8042_platform_exit();
 
+	bus_unregister_notifier(&serio_bus, &i8042_kbd_bind_notifier_block);
 	panic_blink = NULL;
 }
 
diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h
index fc080beffedc..1db0a40c9bab 100644
--- a/drivers/input/serio/i8042.h
+++ b/drivers/input/serio/i8042.h
@@ -73,6 +73,17 @@ static unsigned long i8042_start_time;
 			printk(KERN_DEBUG KBUILD_MODNAME ": [%d] " format,	\
 			       (int) (jiffies - i8042_start_time), ##arg);	\
 	} while (0)
+
+#define filter_dbg(filter, data, format, args...)		\
+	do {							\
+		if (!i8042_debug)				\
+			break;					\
+								\
+		if (!filter || i8042_unmask_kbd_data)		\
+			dbg("%02x " format, data, ##args);	\
+		else						\
+			dbg("** " format, ##args);		\
+	} while (0)
 #else
 #define dbg_init() do { } while (0)
 #define dbg(format, arg...)							\
@@ -80,6 +91,8 @@ static unsigned long i8042_start_time;
 		if (0)								\
 			printk(KERN_DEBUG pr_fmt(format), ##arg);		\
 	} while (0)
+
+#define filter_dbg(filter, data, format, args...) do { } while (0)
 #endif
 
 #endif /* _I8042_H */
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index a05a5179da32..8f828975ab10 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -49,8 +49,6 @@ static DEFINE_MUTEX(serio_mutex);
 
 static LIST_HEAD(serio_list);
 
-static struct bus_type serio_bus;
-
 static void serio_add_port(struct serio *serio);
 static int serio_reconnect_port(struct serio *serio);
 static void serio_disconnect_port(struct serio *serio);
@@ -1017,7 +1015,7 @@ irqreturn_t serio_interrupt(struct serio *serio,
 }
 EXPORT_SYMBOL(serio_interrupt);
 
-static struct bus_type serio_bus = {
+struct bus_type serio_bus = {
 	.name		= "serio",
 	.drv_groups	= serio_driver_groups,
 	.match		= serio_bus_match,
@@ -1029,6 +1027,7 @@ static struct bus_type serio_bus = {
 	.pm		= &serio_pm_ops,
 #endif
 };
+EXPORT_SYMBOL(serio_bus);
 
 static int __init serio_init(void)
 {
diff --git a/include/linux/serio.h b/include/linux/serio.h
index 9f779c7a2da4..df4ab5de1586 100644
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -18,6 +18,8 @@
 #include <linux/mod_devicetable.h>
 #include <uapi/linux/serio.h>
 
+extern struct bus_type serio_bus;
+
 struct serio {
 	void *port_data;
 
-- 
cgit v1.2.3-70-g09d2


From 0afab670bda6f3c9980be9e6de0effcc2c6d456c Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Wed, 15 Jul 2015 21:59:47 +0900
Subject: mfd/extcon: max77693: Remove unused extern declarations and
 max77693_dev members

Clean up the max77693 private header file by removing:
1. Left-overs from previous way of interrupt handling (driver uses
   regmap_irq_chip).
2. Unused members of struct 'max77693_dev' related to interrupts in
   extcon driver.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/extcon/extcon-max77693.c     | 19 -------------------
 include/linux/mfd/max77693-private.h |  8 --------
 2 files changed, 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c
index f4f3b3d53928..770db3a72a6a 100644
--- a/drivers/extcon/extcon-max77693.c
+++ b/drivers/extcon/extcon-max77693.c
@@ -1164,28 +1164,9 @@ static int max77693_muic_probe(struct platform_device *pdev)
 	}
 
 	for (i = 0; i < num_init_data; i++) {
-		enum max77693_irq_source irq_src
-				= MAX77693_IRQ_GROUP_NR;
-
 		regmap_write(info->max77693->regmap_muic,
 				init_data[i].addr,
 				init_data[i].data);
-
-		switch (init_data[i].addr) {
-		case MAX77693_MUIC_REG_INTMASK1:
-			irq_src = MUIC_INT1;
-			break;
-		case MAX77693_MUIC_REG_INTMASK2:
-			irq_src = MUIC_INT2;
-			break;
-		case MAX77693_MUIC_REG_INTMASK3:
-			irq_src = MUIC_INT3;
-			break;
-		}
-
-		if (irq_src < MAX77693_IRQ_GROUP_NR)
-			info->max77693->irq_masks_cur[irq_src]
-				= init_data[i].data;
 	}
 
 	if (pdata && pdata->muic_data) {
diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index 51633ea6f910..ad67b8235a8d 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -547,18 +547,10 @@ struct max77693_dev {
 	struct regmap_irq_chip_data *irq_data_muic;
 
 	int irq;
-	int irq_gpio;
-	struct mutex irqlock;
-	int irq_masks_cur[MAX77693_IRQ_GROUP_NR];
-	int irq_masks_cache[MAX77693_IRQ_GROUP_NR];
 };
 
 enum max77693_types {
 	TYPE_MAX77693,
 };
 
-extern int max77693_irq_init(struct max77693_dev *max77686);
-extern void max77693_irq_exit(struct max77693_dev *max77686);
-extern int max77693_irq_resume(struct max77693_dev *max77686);
-
 #endif /*  __LINUX_MFD_MAX77693_PRIV_H */
-- 
cgit v1.2.3-70-g09d2


From b3b58cee8aced52e3d7fdb387f40c782a4511198 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Wed, 15 Jul 2015 21:59:48 +0900
Subject: mfd: max77693: Store I2C device type as enum and add default unknown

Store the device type (obtained from i2c_device_id) as an enum and add a
default type of unknown to distinguish from case when this is not set
at all.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/max77693-private.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index ad67b8235a8d..e3c0afff38d3 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -529,13 +529,18 @@ enum max77693_irq_muic {
 	MAX77693_MUIC_IRQ_NR,
 };
 
+enum max77693_types {
+	TYPE_MAX77693_UNKNOWN,
+	TYPE_MAX77693,
+};
+
 struct max77693_dev {
 	struct device *dev;
 	struct i2c_client *i2c;		/* 0xCC , PMIC, Charger, Flash LED */
 	struct i2c_client *muic;	/* 0x4A , MUIC */
 	struct i2c_client *haptic;	/* 0x90 , Haptic */
 
-	int type;
+	enum max77693_types type;
 
 	struct regmap *regmap;
 	struct regmap *regmap_muic;
@@ -549,8 +554,4 @@ struct max77693_dev {
 	int irq;
 };
 
-enum max77693_types {
-	TYPE_MAX77693,
-};
-
 #endif /*  __LINUX_MFD_MAX77693_PRIV_H */
-- 
cgit v1.2.3-70-g09d2


From 61b305cd2ae747b8c9a2e4467dea2575a390162c Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Wed, 15 Jul 2015 21:59:50 +0900
Subject: drivers: max77693: Move state container to common header

This prepares for merging some of the drivers between max77693 and
max77843 so the child MFD driver can be attached to any parent MFD main
driver.

Move the state container to common header file. Additionally add
consistent 'i2c' prefixes to its members (of 'struct i2c_client' type).

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Acked-by: Sebastian Reichel <sre@kernel.org>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Acked-by: Jacek Anaszewski <j.anaszewski@samsung.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/extcon/extcon-max77693.c     |  3 ++-
 drivers/input/misc/max77693-haptic.c |  1 +
 drivers/leds/leds-max77693.c         |  1 +
 drivers/mfd/max77693.c               | 31 +++++++++++++------------
 drivers/power/max77693_charger.c     |  1 +
 drivers/regulator/max77693.c         |  1 +
 include/linux/mfd/max77693-common.h  | 44 ++++++++++++++++++++++++++++++++++++
 include/linux/mfd/max77693-private.h | 25 --------------------
 8 files changed, 66 insertions(+), 41 deletions(-)
 create mode 100644 include/linux/mfd/max77693-common.h

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c
index 770db3a72a6a..c7bb180cfff4 100644
--- a/drivers/extcon/extcon-max77693.c
+++ b/drivers/extcon/extcon-max77693.c
@@ -24,6 +24,7 @@
 #include <linux/err.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77693-private.h>
 #include <linux/extcon.h>
 #include <linux/regmap.h>
@@ -1077,7 +1078,7 @@ static int max77693_muic_probe(struct platform_device *pdev)
 		dev_dbg(&pdev->dev, "allocate register map\n");
 	} else {
 		info->max77693->regmap_muic = devm_regmap_init_i2c(
-						info->max77693->muic,
+						info->max77693->i2c_muic,
 						&max77693_muic_regmap_config);
 		if (IS_ERR(info->max77693->regmap_muic)) {
 			ret = PTR_ERR(info->max77693->regmap_muic);
diff --git a/drivers/input/misc/max77693-haptic.c b/drivers/input/misc/max77693-haptic.c
index 39e930c10ebb..4524499ea72f 100644
--- a/drivers/input/misc/max77693-haptic.c
+++ b/drivers/input/misc/max77693-haptic.c
@@ -24,6 +24,7 @@
 #include <linux/workqueue.h>
 #include <linux/regulator/consumer.h>
 #include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77693-private.h>
 
 #define MAX_MAGNITUDE_SHIFT	16
diff --git a/drivers/leds/leds-max77693.c b/drivers/leds/leds-max77693.c
index b8b0eec7b540..df348a06d8c7 100644
--- a/drivers/leds/leds-max77693.c
+++ b/drivers/leds/leds-max77693.c
@@ -13,6 +13,7 @@
 
 #include <linux/led-class-flash.h>
 #include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77693-private.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
index cb14afa97e6f..67bc53fdc389 100644
--- a/drivers/mfd/max77693.c
+++ b/drivers/mfd/max77693.c
@@ -33,6 +33,7 @@
 #include <linux/mutex.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77693-private.h>
 #include <linux/regulator/machine.h>
 #include <linux/regmap.h>
@@ -193,22 +194,22 @@ static int max77693_i2c_probe(struct i2c_client *i2c,
 	} else
 		dev_info(max77693->dev, "device ID: 0x%x\n", reg_data);
 
-	max77693->muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC);
-	if (!max77693->muic) {
+	max77693->i2c_muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC);
+	if (!max77693->i2c_muic) {
 		dev_err(max77693->dev, "Failed to allocate I2C device for MUIC\n");
 		return -ENODEV;
 	}
-	i2c_set_clientdata(max77693->muic, max77693);
+	i2c_set_clientdata(max77693->i2c_muic, max77693);
 
-	max77693->haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC);
-	if (!max77693->haptic) {
+	max77693->i2c_haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC);
+	if (!max77693->i2c_haptic) {
 		dev_err(max77693->dev, "Failed to allocate I2C device for Haptic\n");
 		ret = -ENODEV;
 		goto err_i2c_haptic;
 	}
-	i2c_set_clientdata(max77693->haptic, max77693);
+	i2c_set_clientdata(max77693->i2c_haptic, max77693);
 
-	max77693->regmap_haptic = devm_regmap_init_i2c(max77693->haptic,
+	max77693->regmap_haptic = devm_regmap_init_i2c(max77693->i2c_haptic,
 					&max77693_regmap_haptic_config);
 	if (IS_ERR(max77693->regmap_haptic)) {
 		ret = PTR_ERR(max77693->regmap_haptic);
@@ -222,7 +223,7 @@ static int max77693_i2c_probe(struct i2c_client *i2c,
 	 * instance of MUIC device when irq of max77693 is initialized
 	 * before call max77693-muic probe() function.
 	 */
-	max77693->regmap_muic = devm_regmap_init_i2c(max77693->muic,
+	max77693->regmap_muic = devm_regmap_init_i2c(max77693->i2c_muic,
 					 &max77693_regmap_muic_config);
 	if (IS_ERR(max77693->regmap_muic)) {
 		ret = PTR_ERR(max77693->regmap_muic);
@@ -255,7 +256,7 @@ static int max77693_i2c_probe(struct i2c_client *i2c,
 				IRQF_ONESHOT | IRQF_SHARED |
 				IRQF_TRIGGER_FALLING, 0,
 				&max77693_charger_irq_chip,
-				&max77693->irq_data_charger);
+				&max77693->irq_data_chg);
 	if (ret) {
 		dev_err(max77693->dev, "failed to add irq chip: %d\n", ret);
 		goto err_irq_charger;
@@ -296,15 +297,15 @@ err_mfd:
 err_intsrc:
 	regmap_del_irq_chip(max77693->irq, max77693->irq_data_muic);
 err_irq_muic:
-	regmap_del_irq_chip(max77693->irq, max77693->irq_data_charger);
+	regmap_del_irq_chip(max77693->irq, max77693->irq_data_chg);
 err_irq_charger:
 	regmap_del_irq_chip(max77693->irq, max77693->irq_data_topsys);
 err_irq_topsys:
 	regmap_del_irq_chip(max77693->irq, max77693->irq_data_led);
 err_regmap:
-	i2c_unregister_device(max77693->haptic);
+	i2c_unregister_device(max77693->i2c_haptic);
 err_i2c_haptic:
-	i2c_unregister_device(max77693->muic);
+	i2c_unregister_device(max77693->i2c_muic);
 	return ret;
 }
 
@@ -315,12 +316,12 @@ static int max77693_i2c_remove(struct i2c_client *i2c)
 	mfd_remove_devices(max77693->dev);
 
 	regmap_del_irq_chip(max77693->irq, max77693->irq_data_muic);
-	regmap_del_irq_chip(max77693->irq, max77693->irq_data_charger);
+	regmap_del_irq_chip(max77693->irq, max77693->irq_data_chg);
 	regmap_del_irq_chip(max77693->irq, max77693->irq_data_topsys);
 	regmap_del_irq_chip(max77693->irq, max77693->irq_data_led);
 
-	i2c_unregister_device(max77693->muic);
-	i2c_unregister_device(max77693->haptic);
+	i2c_unregister_device(max77693->i2c_muic);
+	i2c_unregister_device(max77693->i2c_haptic);
 
 	return 0;
 }
diff --git a/drivers/power/max77693_charger.c b/drivers/power/max77693_charger.c
index 754879eb59f6..060cab5ae3aa 100644
--- a/drivers/power/max77693_charger.c
+++ b/drivers/power/max77693_charger.c
@@ -20,6 +20,7 @@
 #include <linux/power_supply.h>
 #include <linux/regmap.h>
 #include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77693-private.h>
 
 #define MAX77693_CHARGER_NAME				"max77693-charger"
diff --git a/drivers/regulator/max77693.c b/drivers/regulator/max77693.c
index 236851ab575a..c6ab440a74b7 100644
--- a/drivers/regulator/max77693.c
+++ b/drivers/regulator/max77693.c
@@ -29,6 +29,7 @@
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
 #include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77693-private.h>
 #include <linux/regulator/of_regulator.h>
 #include <linux/regmap.h>
diff --git a/include/linux/mfd/max77693-common.h b/include/linux/mfd/max77693-common.h
new file mode 100644
index 000000000000..7da4cc38e982
--- /dev/null
+++ b/include/linux/mfd/max77693-common.h
@@ -0,0 +1,44 @@
+/*
+ * Common data shared between Maxim 77693 and 77843 drivers
+ *
+ * Copyright (C) 2015 Samsung Electronics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_MFD_MAX77693_COMMON_H
+#define __LINUX_MFD_MAX77693_COMMON_H
+
+enum max77693_types {
+	TYPE_MAX77693_UNKNOWN,
+	TYPE_MAX77693,
+};
+
+/*
+ * Shared also with max77843.
+ */
+struct max77693_dev {
+	struct device *dev;
+	struct i2c_client *i2c;		/* 0xCC , PMIC, Charger, Flash LED */
+	struct i2c_client *i2c_muic;	/* 0x4A , MUIC */
+	struct i2c_client *i2c_haptic;	/* MAX77693: 0x90 , Haptic */
+
+	enum max77693_types type;
+
+	struct regmap *regmap;
+	struct regmap *regmap_muic;
+	struct regmap *regmap_haptic;	/* Only MAX77693 */
+
+	struct regmap_irq_chip_data *irq_data_led;
+	struct regmap_irq_chip_data *irq_data_topsys;
+	struct regmap_irq_chip_data *irq_data_chg; /* Only MAX77693 */
+	struct regmap_irq_chip_data *irq_data_muic;
+
+	int irq;
+};
+
+
+#endif /*  __LINUX_MFD_MAX77693_COMMON_H */
diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index e3c0afff38d3..8c4143c0c651 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -529,29 +529,4 @@ enum max77693_irq_muic {
 	MAX77693_MUIC_IRQ_NR,
 };
 
-enum max77693_types {
-	TYPE_MAX77693_UNKNOWN,
-	TYPE_MAX77693,
-};
-
-struct max77693_dev {
-	struct device *dev;
-	struct i2c_client *i2c;		/* 0xCC , PMIC, Charger, Flash LED */
-	struct i2c_client *muic;	/* 0x4A , MUIC */
-	struct i2c_client *haptic;	/* 0x90 , Haptic */
-
-	enum max77693_types type;
-
-	struct regmap *regmap;
-	struct regmap *regmap_muic;
-	struct regmap *regmap_haptic;
-
-	struct regmap_irq_chip_data *irq_data_led;
-	struct regmap_irq_chip_data *irq_data_topsys;
-	struct regmap_irq_chip_data *irq_data_charger;
-	struct regmap_irq_chip_data *irq_data_muic;
-
-	int irq;
-};
-
 #endif /*  __LINUX_MFD_MAX77693_PRIV_H */
-- 
cgit v1.2.3-70-g09d2


From bc1aadc18621ccf93fb33ecbb847b422c354899d Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Wed, 15 Jul 2015 21:59:51 +0900
Subject: drivers: max77843: Switch to common max77693 state container

Switch to the same definition of state container as in MAX77693 drivers.
This will allow usage of one regulator driver in both devices: MAX77693
and MAX77843.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/extcon/extcon-max77843.c     | 17 +++++++++--------
 drivers/input/misc/max77843-haptic.c |  3 ++-
 drivers/mfd/max77843.c               | 20 +++++++++++---------
 drivers/regulator/max77843.c         |  6 ++++--
 include/linux/mfd/max77693-common.h  |  5 +++++
 include/linux/mfd/max77843-private.h | 20 --------------------
 6 files changed, 31 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-max77843.c b/drivers/extcon/extcon-max77843.c
index fac2f1417a79..4dfe0a6337d8 100644
--- a/drivers/extcon/extcon-max77843.c
+++ b/drivers/extcon/extcon-max77843.c
@@ -15,6 +15,7 @@
 #include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77843-private.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
@@ -32,7 +33,7 @@ enum max77843_muic_status {
 
 struct max77843_muic_info {
 	struct device *dev;
-	struct max77843 *max77843;
+	struct max77693_dev *max77843;
 	struct extcon_dev *edev;
 
 	struct mutex mutex;
@@ -198,7 +199,7 @@ static const struct regmap_irq_chip max77843_muic_irq_chip = {
 static int max77843_muic_set_path(struct max77843_muic_info *info,
 		u8 val, bool attached)
 {
-	struct max77843 *max77843 = info->max77843;
+	struct max77693_dev *max77843 = info->max77843;
 	int ret = 0;
 	unsigned int ctrl1, ctrl2;
 
@@ -539,7 +540,7 @@ static void max77843_muic_irq_work(struct work_struct *work)
 {
 	struct max77843_muic_info *info = container_of(work,
 			struct max77843_muic_info, irq_work);
-	struct max77843 *max77843 = info->max77843;
+	struct max77693_dev *max77843 = info->max77843;
 	int ret = 0;
 
 	mutex_lock(&info->mutex);
@@ -615,7 +616,7 @@ static void max77843_muic_detect_cable_wq(struct work_struct *work)
 {
 	struct max77843_muic_info *info = container_of(to_delayed_work(work),
 			struct max77843_muic_info, wq_detcable);
-	struct max77843 *max77843 = info->max77843;
+	struct max77693_dev *max77843 = info->max77843;
 	int chg_type, adc, ret;
 	bool attached;
 
@@ -656,7 +657,7 @@ err_cable_wq:
 static int max77843_muic_set_debounce_time(struct max77843_muic_info *info,
 		enum max77843_muic_adc_debounce_time time)
 {
-	struct max77843 *max77843 = info->max77843;
+	struct max77693_dev *max77843 = info->max77843;
 	int ret;
 
 	switch (time) {
@@ -681,7 +682,7 @@ static int max77843_muic_set_debounce_time(struct max77843_muic_info *info,
 	return 0;
 }
 
-static int max77843_init_muic_regmap(struct max77843 *max77843)
+static int max77843_init_muic_regmap(struct max77693_dev *max77843)
 {
 	int ret;
 
@@ -720,7 +721,7 @@ err_muic_i2c:
 
 static int max77843_muic_probe(struct platform_device *pdev)
 {
-	struct max77843 *max77843 = dev_get_drvdata(pdev->dev.parent);
+	struct max77693_dev *max77843 = dev_get_drvdata(pdev->dev.parent);
 	struct max77843_muic_info *info;
 	unsigned int id;
 	int i, ret;
@@ -821,7 +822,7 @@ err_muic_irq:
 static int max77843_muic_remove(struct platform_device *pdev)
 {
 	struct max77843_muic_info *info = platform_get_drvdata(pdev);
-	struct max77843 *max77843 = info->max77843;
+	struct max77693_dev *max77843 = info->max77843;
 
 	cancel_work_sync(&info->irq_work);
 	regmap_del_irq_chip(max77843->irq, max77843->irq_data_muic);
diff --git a/drivers/input/misc/max77843-haptic.c b/drivers/input/misc/max77843-haptic.c
index dccbb465a055..30da81ab5a21 100644
--- a/drivers/input/misc/max77843-haptic.c
+++ b/drivers/input/misc/max77843-haptic.c
@@ -14,6 +14,7 @@
 #include <linux/i2c.h>
 #include <linux/init.h>
 #include <linux/input.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77843-private.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
@@ -243,7 +244,7 @@ static void max77843_haptic_close(struct input_dev *dev)
 
 static int max77843_haptic_probe(struct platform_device *pdev)
 {
-	struct max77843 *max77843 = dev_get_drvdata(pdev->dev.parent);
+	struct max77693_dev *max77843 = dev_get_drvdata(pdev->dev.parent);
 	struct max77843_haptic *haptic;
 	int error;
 
diff --git a/drivers/mfd/max77843.c b/drivers/mfd/max77843.c
index a354ac677ec7..c52162ea3d0a 100644
--- a/drivers/mfd/max77843.c
+++ b/drivers/mfd/max77843.c
@@ -17,6 +17,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/mfd/core.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77843-private.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
@@ -71,7 +72,7 @@ static const struct regmap_irq_chip max77843_irq_chip = {
 };
 
 /* Charger and Charger regulator use same regmap. */
-static int max77843_chg_init(struct max77843 *max77843)
+static int max77843_chg_init(struct max77693_dev *max77843)
 {
 	int ret;
 
@@ -101,7 +102,7 @@ err_chg_i2c:
 static int max77843_probe(struct i2c_client *i2c,
 			  const struct i2c_device_id *id)
 {
-	struct max77843 *max77843;
+	struct max77693_dev *max77843;
 	unsigned int reg_data;
 	int ret;
 
@@ -113,6 +114,7 @@ static int max77843_probe(struct i2c_client *i2c,
 	max77843->dev = &i2c->dev;
 	max77843->i2c = i2c;
 	max77843->irq = i2c->irq;
+	max77843->type = id->driver_data;
 
 	max77843->regmap = devm_regmap_init_i2c(i2c,
 			&max77843_regmap_config);
@@ -123,7 +125,7 @@ static int max77843_probe(struct i2c_client *i2c,
 
 	ret = regmap_add_irq_chip(max77843->regmap, max77843->irq,
 			IRQF_TRIGGER_LOW | IRQF_ONESHOT | IRQF_SHARED,
-			0, &max77843_irq_chip, &max77843->irq_data);
+			0, &max77843_irq_chip, &max77843->irq_data_topsys);
 	if (ret) {
 		dev_err(&i2c->dev, "Failed to add TOPSYS IRQ chip\n");
 		return ret;
@@ -164,18 +166,18 @@ static int max77843_probe(struct i2c_client *i2c,
 	return 0;
 
 err_pmic_id:
-	regmap_del_irq_chip(max77843->irq, max77843->irq_data);
+	regmap_del_irq_chip(max77843->irq, max77843->irq_data_topsys);
 
 	return ret;
 }
 
 static int max77843_remove(struct i2c_client *i2c)
 {
-	struct max77843 *max77843 = i2c_get_clientdata(i2c);
+	struct max77693_dev *max77843 = i2c_get_clientdata(i2c);
 
 	mfd_remove_devices(max77843->dev);
 
-	regmap_del_irq_chip(max77843->irq, max77843->irq_data);
+	regmap_del_irq_chip(max77843->irq, max77843->irq_data_topsys);
 
 	i2c_unregister_device(max77843->i2c_chg);
 
@@ -188,7 +190,7 @@ static const struct of_device_id max77843_dt_match[] = {
 };
 
 static const struct i2c_device_id max77843_id[] = {
-	{ "max77843", },
+	{ "max77843", TYPE_MAX77843, },
 	{ },
 };
 MODULE_DEVICE_TABLE(i2c, max77843_id);
@@ -196,7 +198,7 @@ MODULE_DEVICE_TABLE(i2c, max77843_id);
 static int __maybe_unused max77843_suspend(struct device *dev)
 {
 	struct i2c_client *i2c = container_of(dev, struct i2c_client, dev);
-	struct max77843 *max77843 = i2c_get_clientdata(i2c);
+	struct max77693_dev *max77843 = i2c_get_clientdata(i2c);
 
 	disable_irq(max77843->irq);
 	if (device_may_wakeup(dev))
@@ -208,7 +210,7 @@ static int __maybe_unused max77843_suspend(struct device *dev)
 static int __maybe_unused max77843_resume(struct device *dev)
 {
 	struct i2c_client *i2c = container_of(dev, struct i2c_client, dev);
-	struct max77843 *max77843 = i2c_get_clientdata(i2c);
+	struct max77693_dev *max77843 = i2c_get_clientdata(i2c);
 
 	if (device_may_wakeup(dev))
 		disable_irq_wake(max77843->irq);
diff --git a/drivers/regulator/max77843.c b/drivers/regulator/max77843.c
index f4fd0d3cfa6e..9926247aae6b 100644
--- a/drivers/regulator/max77843.c
+++ b/drivers/regulator/max77843.c
@@ -15,6 +15,7 @@
 #include <linux/platform_device.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
+#include <linux/mfd/max77693-common.h>
 #include <linux/mfd/max77843-private.h>
 #include <linux/regulator/of_regulator.h>
 
@@ -130,7 +131,8 @@ static const struct regulator_desc max77843_supported_regulators[] = {
 	},
 };
 
-static struct regmap *max77843_get_regmap(struct max77843 *max77843, int reg_id)
+static struct regmap *max77843_get_regmap(struct max77693_dev *max77843,
+					  int reg_id)
 {
 	switch (reg_id) {
 	case MAX77843_SAFEOUT1:
@@ -145,7 +147,7 @@ static struct regmap *max77843_get_regmap(struct max77843 *max77843, int reg_id)
 
 static int max77843_regulator_probe(struct platform_device *pdev)
 {
-	struct max77843 *max77843 = dev_get_drvdata(pdev->dev.parent);
+	struct max77693_dev *max77843 = dev_get_drvdata(pdev->dev.parent);
 	struct regulator_config config = {};
 	int i;
 
diff --git a/include/linux/mfd/max77693-common.h b/include/linux/mfd/max77693-common.h
index 7da4cc38e982..095b121aa725 100644
--- a/include/linux/mfd/max77693-common.h
+++ b/include/linux/mfd/max77693-common.h
@@ -15,6 +15,9 @@
 enum max77693_types {
 	TYPE_MAX77693_UNKNOWN,
 	TYPE_MAX77693,
+	TYPE_MAX77843,
+
+	TYPE_MAX77693_NUM,
 };
 
 /*
@@ -25,12 +28,14 @@ struct max77693_dev {
 	struct i2c_client *i2c;		/* 0xCC , PMIC, Charger, Flash LED */
 	struct i2c_client *i2c_muic;	/* 0x4A , MUIC */
 	struct i2c_client *i2c_haptic;	/* MAX77693: 0x90 , Haptic */
+	struct i2c_client *i2c_chg;	/* MAX77843: 0xD2, Charger */
 
 	enum max77693_types type;
 
 	struct regmap *regmap;
 	struct regmap *regmap_muic;
 	struct regmap *regmap_haptic;	/* Only MAX77693 */
+	struct regmap *regmap_chg;	/* Only MAX77843 */
 
 	struct regmap_irq_chip_data *irq_data_led;
 	struct regmap_irq_chip_data *irq_data_topsys;
diff --git a/include/linux/mfd/max77843-private.h b/include/linux/mfd/max77843-private.h
index 7178ace8379e..0121d9440340 100644
--- a/include/linux/mfd/max77843-private.h
+++ b/include/linux/mfd/max77843-private.h
@@ -431,24 +431,4 @@ enum max77843_irq_muic {
 #define MAX77843_REG_SAFEOUTCTRL_SAFEOUT2_MASK \
 		(0x3 << SAFEOUTCTRL_SAFEOUT2_SHIFT)
 
-struct max77843 {
-	struct device *dev;
-
-	struct i2c_client *i2c;
-	struct i2c_client *i2c_chg;
-	struct i2c_client *i2c_fuel;
-	struct i2c_client *i2c_muic;
-
-	struct regmap *regmap;
-	struct regmap *regmap_chg;
-	struct regmap *regmap_fuel;
-	struct regmap *regmap_muic;
-
-	struct regmap_irq_chip_data *irq_data;
-	struct regmap_irq_chip_data *irq_data_chg;
-	struct regmap_irq_chip_data *irq_data_fuel;
-	struct regmap_irq_chip_data *irq_data_muic;
-
-	int irq;
-};
 #endif /* __MAX77843_H__ */
-- 
cgit v1.2.3-70-g09d2


From cceb433a1e2930301b33c79016eff147eb555cea Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Wed, 15 Jul 2015 21:59:52 +0900
Subject: mfd/extcon: max77693: Rename defines to allow inclusion with max77843

Add MAX77693 prefix to some of the defines used in max77693 extcon
driver so the max77693-private.h can be included simultaneously with
max77843-private.h.

Additionally use BIT() macro in header.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/extcon/extcon-max77693.c     |  72 +++++++++++++------------
 include/linux/mfd/max77693-private.h | 102 +++++++++++++++++------------------
 2 files changed, 89 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c
index c7bb180cfff4..35b9e118b2fb 100644
--- a/drivers/extcon/extcon-max77693.c
+++ b/drivers/extcon/extcon-max77693.c
@@ -43,7 +43,7 @@ static struct max77693_reg_data default_init_data[] = {
 	{
 		/* STATUS2 - [3]ChgDetRun */
 		.addr = MAX77693_MUIC_REG_STATUS2,
-		.data = STATUS2_CHGDETRUN_MASK,
+		.data = MAX77693_STATUS2_CHGDETRUN_MASK,
 	}, {
 		/* INTMASK1 - Unmask [3]ADC1KM,[0]ADCM */
 		.addr = MAX77693_MUIC_REG_INTMASK1,
@@ -236,7 +236,7 @@ static int max77693_muic_set_debounce_time(struct max77693_muic_info *info,
 		 */
 		ret = regmap_write(info->max77693->regmap_muic,
 				  MAX77693_MUIC_REG_CTRL3,
-				  time << CONTROL3_ADCDBSET_SHIFT);
+				  time << MAX77693_CONTROL3_ADCDBSET_SHIFT);
 		if (ret) {
 			dev_err(info->dev, "failed to set ADC debounce time\n");
 			return ret;
@@ -269,7 +269,7 @@ static int max77693_muic_set_path(struct max77693_muic_info *info,
 	if (attached)
 		ctrl1 = val;
 	else
-		ctrl1 = CONTROL1_SW_OPEN;
+		ctrl1 = MAX77693_CONTROL1_SW_OPEN;
 
 	ret = regmap_update_bits(info->max77693->regmap_muic,
 			MAX77693_MUIC_REG_CTRL1, COMP_SW_MASK, ctrl1);
@@ -279,13 +279,14 @@ static int max77693_muic_set_path(struct max77693_muic_info *info,
 	}
 
 	if (attached)
-		ctrl2 |= CONTROL2_CPEN_MASK;	/* LowPwr=0, CPEn=1 */
+		ctrl2 |= MAX77693_CONTROL2_CPEN_MASK;	/* LowPwr=0, CPEn=1 */
 	else
-		ctrl2 |= CONTROL2_LOWPWR_MASK;	/* LowPwr=1, CPEn=0 */
+		ctrl2 |= MAX77693_CONTROL2_LOWPWR_MASK;	/* LowPwr=1, CPEn=0 */
 
 	ret = regmap_update_bits(info->max77693->regmap_muic,
 			MAX77693_MUIC_REG_CTRL2,
-			CONTROL2_LOWPWR_MASK | CONTROL2_CPEN_MASK, ctrl2);
+			MAX77693_CONTROL2_LOWPWR_MASK | MAX77693_CONTROL2_CPEN_MASK,
+			ctrl2);
 	if (ret < 0) {
 		dev_err(info->dev, "failed to update MUIC register\n");
 		return ret;
@@ -327,8 +328,8 @@ static int max77693_muic_get_cable_type(struct max77693_muic_info *info,
 		 * Read ADC value to check cable type and decide cable state
 		 * according to cable type
 		 */
-		adc = info->status[0] & STATUS1_ADC_MASK;
-		adc >>= STATUS1_ADC_SHIFT;
+		adc = info->status[0] & MAX77693_STATUS1_ADC_MASK;
+		adc >>= MAX77693_STATUS1_ADC_SHIFT;
 
 		/*
 		 * Check current cable state/cable type and store cable type
@@ -351,8 +352,8 @@ static int max77693_muic_get_cable_type(struct max77693_muic_info *info,
 		 * Read ADC value to check cable type and decide cable state
 		 * according to cable type
 		 */
-		adc = info->status[0] & STATUS1_ADC_MASK;
-		adc >>= STATUS1_ADC_SHIFT;
+		adc = info->status[0] & MAX77693_STATUS1_ADC_MASK;
+		adc >>= MAX77693_STATUS1_ADC_SHIFT;
 
 		/*
 		 * Check current cable state/cable type and store cable type
@@ -367,13 +368,13 @@ static int max77693_muic_get_cable_type(struct max77693_muic_info *info,
 		} else {
 			*attached = true;
 
-			adclow = info->status[0] & STATUS1_ADCLOW_MASK;
-			adclow >>= STATUS1_ADCLOW_SHIFT;
-			adc1k = info->status[0] & STATUS1_ADC1K_MASK;
-			adc1k >>= STATUS1_ADC1K_SHIFT;
+			adclow = info->status[0] & MAX77693_STATUS1_ADCLOW_MASK;
+			adclow >>= MAX77693_STATUS1_ADCLOW_SHIFT;
+			adc1k = info->status[0] & MAX77693_STATUS1_ADC1K_MASK;
+			adc1k >>= MAX77693_STATUS1_ADC1K_SHIFT;
 
-			vbvolt = info->status[1] & STATUS2_VBVOLT_MASK;
-			vbvolt >>= STATUS2_VBVOLT_SHIFT;
+			vbvolt = info->status[1] & MAX77693_STATUS2_VBVOLT_MASK;
+			vbvolt >>= MAX77693_STATUS2_VBVOLT_SHIFT;
 
 			/**
 			 * [0x1|VBVolt|ADCLow|ADC1K]
@@ -398,8 +399,8 @@ static int max77693_muic_get_cable_type(struct max77693_muic_info *info,
 		 * Read charger type to check cable type and decide cable state
 		 * according to type of charger cable.
 		 */
-		chg_type = info->status[1] & STATUS2_CHGTYP_MASK;
-		chg_type >>= STATUS2_CHGTYP_SHIFT;
+		chg_type = info->status[1] & MAX77693_STATUS2_CHGTYP_MASK;
+		chg_type >>= MAX77693_STATUS2_CHGTYP_SHIFT;
 
 		if (chg_type == MAX77693_CHARGER_TYPE_NONE) {
 			*attached = false;
@@ -423,10 +424,10 @@ static int max77693_muic_get_cable_type(struct max77693_muic_info *info,
 		 * Read ADC value to check cable type and decide cable state
 		 * according to cable type
 		 */
-		adc = info->status[0] & STATUS1_ADC_MASK;
-		adc >>= STATUS1_ADC_SHIFT;
-		chg_type = info->status[1] & STATUS2_CHGTYP_MASK;
-		chg_type >>= STATUS2_CHGTYP_SHIFT;
+		adc = info->status[0] & MAX77693_STATUS1_ADC_MASK;
+		adc >>= MAX77693_STATUS1_ADC_SHIFT;
+		chg_type = info->status[1] & MAX77693_STATUS2_CHGTYP_MASK;
+		chg_type >>= MAX77693_STATUS2_CHGTYP_SHIFT;
 
 		if (adc == MAX77693_MUIC_ADC_OPEN
 				&& chg_type == MAX77693_CHARGER_TYPE_NONE)
@@ -438,8 +439,8 @@ static int max77693_muic_get_cable_type(struct max77693_muic_info *info,
 		 * Read vbvolt field, if vbvolt is 1,
 		 * this cable is used for charging.
 		 */
-		vbvolt = info->status[1] & STATUS2_VBVOLT_MASK;
-		vbvolt >>= STATUS2_VBVOLT_SHIFT;
+		vbvolt = info->status[1] & MAX77693_STATUS2_VBVOLT_MASK;
+		vbvolt >>= MAX77693_STATUS2_VBVOLT_SHIFT;
 
 		cable_type = vbvolt;
 		break;
@@ -521,7 +522,8 @@ static int max77693_muic_dock_handler(struct max77693_muic_info *info,
 	}
 
 	/* Dock-Car/Desk/Audio, PATH:AUDIO */
-	ret = max77693_muic_set_path(info, CONTROL1_SW_AUDIO, attached);
+	ret = max77693_muic_set_path(info, MAX77693_CONTROL1_SW_AUDIO,
+					attached);
 	if (ret < 0)
 		return ret;
 	extcon_set_cable_state_(info->edev, dock_id, attached);
@@ -586,14 +588,16 @@ static int max77693_muic_adc_ground_handler(struct max77693_muic_info *info)
 	case MAX77693_MUIC_GND_USB_HOST:
 	case MAX77693_MUIC_GND_USB_HOST_VB:
 		/* USB_HOST, PATH: AP_USB */
-		ret = max77693_muic_set_path(info, CONTROL1_SW_USB, attached);
+		ret = max77693_muic_set_path(info, MAX77693_CONTROL1_SW_USB,
+						attached);
 		if (ret < 0)
 			return ret;
 		extcon_set_cable_state_(info->edev, EXTCON_USB_HOST, attached);
 		break;
 	case MAX77693_MUIC_GND_AV_CABLE_LOAD:
 		/* Audio Video Cable with load, PATH:AUDIO */
-		ret = max77693_muic_set_path(info, CONTROL1_SW_AUDIO, attached);
+		ret = max77693_muic_set_path(info, MAX77693_CONTROL1_SW_AUDIO,
+						attached);
 		if (ret < 0)
 			return ret;
 		extcon_set_cable_state_(info->edev, EXTCON_USB, attached);
@@ -616,7 +620,7 @@ static int max77693_muic_jig_handler(struct max77693_muic_info *info,
 		int cable_type, bool attached)
 {
 	int ret = 0;
-	u8 path = CONTROL1_SW_OPEN;
+	u8 path = MAX77693_CONTROL1_SW_OPEN;
 
 	dev_info(info->dev,
 		"external connector is %s (adc:0x%02x)\n",
@@ -626,12 +630,12 @@ static int max77693_muic_jig_handler(struct max77693_muic_info *info,
 	case MAX77693_MUIC_ADC_FACTORY_MODE_USB_OFF:	/* ADC_JIG_USB_OFF */
 	case MAX77693_MUIC_ADC_FACTORY_MODE_USB_ON:	/* ADC_JIG_USB_ON */
 		/* PATH:AP_USB */
-		path = CONTROL1_SW_USB;
+		path = MAX77693_CONTROL1_SW_USB;
 		break;
 	case MAX77693_MUIC_ADC_FACTORY_MODE_UART_OFF:	/* ADC_JIG_UART_OFF */
 	case MAX77693_MUIC_ADC_FACTORY_MODE_UART_ON:	/* ADC_JIG_UART_ON */
 		/* PATH:AP_UART */
-		path = CONTROL1_SW_UART;
+		path = MAX77693_CONTROL1_SW_UART;
 		break;
 	default:
 		dev_err(info->dev, "failed to detect %s jig cable\n",
@@ -1181,12 +1185,12 @@ static int max77693_muic_probe(struct platform_device *pdev)
 		if (muic_pdata->path_uart)
 			info->path_uart = muic_pdata->path_uart;
 		else
-			info->path_uart = CONTROL1_SW_UART;
+			info->path_uart = MAX77693_CONTROL1_SW_UART;
 
 		if (muic_pdata->path_usb)
 			info->path_usb = muic_pdata->path_usb;
 		else
-			info->path_usb = CONTROL1_SW_USB;
+			info->path_usb = MAX77693_CONTROL1_SW_USB;
 
 		/*
 		 * Default delay time for detecting cable state
@@ -1198,8 +1202,8 @@ static int max77693_muic_probe(struct platform_device *pdev)
 		else
 			delay_jiffies = msecs_to_jiffies(DELAY_MS_DEFAULT);
 	} else {
-		info->path_usb = CONTROL1_SW_USB;
-		info->path_uart = CONTROL1_SW_UART;
+		info->path_usb = MAX77693_CONTROL1_SW_USB;
+		info->path_uart = MAX77693_CONTROL1_SW_UART;
 		delay_jiffies = msecs_to_jiffies(DELAY_MS_DEFAULT);
 	}
 
diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index 8c4143c0c651..3c7a63b98ad6 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -310,30 +310,30 @@ enum max77693_muic_reg {
 #define INTMASK2_CHGTYP_MASK		(1 << INTMASK2_CHGTYP_SHIFT)
 
 /* MAX77693 MUIC - STATUS1~3 Register */
-#define STATUS1_ADC_SHIFT		(0)
-#define STATUS1_ADCLOW_SHIFT		(5)
-#define STATUS1_ADCERR_SHIFT		(6)
-#define STATUS1_ADC1K_SHIFT		(7)
-#define STATUS1_ADC_MASK		(0x1f << STATUS1_ADC_SHIFT)
-#define STATUS1_ADCLOW_MASK		(0x1 << STATUS1_ADCLOW_SHIFT)
-#define STATUS1_ADCERR_MASK		(0x1 << STATUS1_ADCERR_SHIFT)
-#define STATUS1_ADC1K_MASK		(0x1 << STATUS1_ADC1K_SHIFT)
-
-#define STATUS2_CHGTYP_SHIFT		(0)
-#define STATUS2_CHGDETRUN_SHIFT		(3)
-#define STATUS2_DCDTMR_SHIFT		(4)
-#define STATUS2_DXOVP_SHIFT		(5)
-#define STATUS2_VBVOLT_SHIFT		(6)
-#define STATUS2_VIDRM_SHIFT		(7)
-#define STATUS2_CHGTYP_MASK		(0x7 << STATUS2_CHGTYP_SHIFT)
-#define STATUS2_CHGDETRUN_MASK		(0x1 << STATUS2_CHGDETRUN_SHIFT)
-#define STATUS2_DCDTMR_MASK		(0x1 << STATUS2_DCDTMR_SHIFT)
-#define STATUS2_DXOVP_MASK		(0x1 << STATUS2_DXOVP_SHIFT)
-#define STATUS2_VBVOLT_MASK		(0x1 << STATUS2_VBVOLT_SHIFT)
-#define STATUS2_VIDRM_MASK		(0x1 << STATUS2_VIDRM_SHIFT)
-
-#define STATUS3_OVP_SHIFT		(2)
-#define STATUS3_OVP_MASK		(0x1 << STATUS3_OVP_SHIFT)
+#define MAX77693_STATUS1_ADC_SHIFT		0
+#define MAX77693_STATUS1_ADCLOW_SHIFT		5
+#define MAX77693_STATUS1_ADCERR_SHIFT		6
+#define MAX77693_STATUS1_ADC1K_SHIFT		7
+#define MAX77693_STATUS1_ADC_MASK		(0x1f << MAX77693_STATUS1_ADC_SHIFT)
+#define MAX77693_STATUS1_ADCLOW_MASK		BIT(MAX77693_STATUS1_ADCLOW_SHIFT)
+#define MAX77693_STATUS1_ADCERR_MASK		BIT(MAX77693_STATUS1_ADCERR_SHIFT)
+#define MAX77693_STATUS1_ADC1K_MASK		BIT(MAX77693_STATUS1_ADC1K_SHIFT)
+
+#define MAX77693_STATUS2_CHGTYP_SHIFT		0
+#define MAX77693_STATUS2_CHGDETRUN_SHIFT	3
+#define MAX77693_STATUS2_DCDTMR_SHIFT		4
+#define MAX77693_STATUS2_DXOVP_SHIFT		5
+#define MAX77693_STATUS2_VBVOLT_SHIFT		6
+#define MAX77693_STATUS2_VIDRM_SHIFT		7
+#define MAX77693_STATUS2_CHGTYP_MASK		(0x7 << MAX77693_STATUS2_CHGTYP_SHIFT)
+#define MAX77693_STATUS2_CHGDETRUN_MASK		BIT(MAX77693_STATUS2_CHGDETRUN_SHIFT)
+#define MAX77693_STATUS2_DCDTMR_MASK		BIT(MAX77693_STATUS2_DCDTMR_SHIFT)
+#define MAX77693_STATUS2_DXOVP_MASK		BIT(MAX77693_STATUS2_DXOVP_SHIFT)
+#define MAX77693_STATUS2_VBVOLT_MASK		BIT(MAX77693_STATUS2_VBVOLT_SHIFT)
+#define MAX77693_STATUS2_VIDRM_MASK		BIT(MAX77693_STATUS2_VIDRM_SHIFT)
+
+#define MAX77693_STATUS3_OVP_SHIFT		2
+#define MAX77693_STATUS3_OVP_MASK		BIT(MAX77693_STATUS3_OVP_SHIFT)
 
 /* MAX77693 CDETCTRL1~2 register */
 #define CDETCTRL1_CHGDETEN_SHIFT	(0)
@@ -362,38 +362,38 @@ enum max77693_muic_reg {
 #define COMN1SW_MASK			(0x7 << COMN1SW_SHIFT)
 #define COMP2SW_MASK			(0x7 << COMP2SW_SHIFT)
 #define COMP_SW_MASK			(COMP2SW_MASK | COMN1SW_MASK)
-#define CONTROL1_SW_USB			((1 << COMP2SW_SHIFT) \
+#define MAX77693_CONTROL1_SW_USB	((1 << COMP2SW_SHIFT) \
 						| (1 << COMN1SW_SHIFT))
-#define CONTROL1_SW_AUDIO		((2 << COMP2SW_SHIFT) \
+#define MAX77693_CONTROL1_SW_AUDIO	((2 << COMP2SW_SHIFT) \
 						| (2 << COMN1SW_SHIFT))
-#define CONTROL1_SW_UART		((3 << COMP2SW_SHIFT) \
+#define MAX77693_CONTROL1_SW_UART	((3 << COMP2SW_SHIFT) \
 						| (3 << COMN1SW_SHIFT))
-#define CONTROL1_SW_OPEN		((0 << COMP2SW_SHIFT) \
+#define MAX77693_CONTROL1_SW_OPEN	((0 << COMP2SW_SHIFT) \
 						| (0 << COMN1SW_SHIFT))
 
-#define CONTROL2_LOWPWR_SHIFT		(0)
-#define CONTROL2_ADCEN_SHIFT		(1)
-#define CONTROL2_CPEN_SHIFT		(2)
-#define CONTROL2_SFOUTASRT_SHIFT	(3)
-#define CONTROL2_SFOUTORD_SHIFT		(4)
-#define CONTROL2_ACCDET_SHIFT		(5)
-#define CONTROL2_USBCPINT_SHIFT		(6)
-#define CONTROL2_RCPS_SHIFT		(7)
-#define CONTROL2_LOWPWR_MASK		(0x1 << CONTROL2_LOWPWR_SHIFT)
-#define CONTROL2_ADCEN_MASK		(0x1 << CONTROL2_ADCEN_SHIFT)
-#define CONTROL2_CPEN_MASK		(0x1 << CONTROL2_CPEN_SHIFT)
-#define CONTROL2_SFOUTASRT_MASK		(0x1 << CONTROL2_SFOUTASRT_SHIFT)
-#define CONTROL2_SFOUTORD_MASK		(0x1 << CONTROL2_SFOUTORD_SHIFT)
-#define CONTROL2_ACCDET_MASK		(0x1 << CONTROL2_ACCDET_SHIFT)
-#define CONTROL2_USBCPINT_MASK		(0x1 << CONTROL2_USBCPINT_SHIFT)
-#define CONTROL2_RCPS_MASK		(0x1 << CONTROL2_RCPS_SHIFT)
-
-#define CONTROL3_JIGSET_SHIFT		(0)
-#define CONTROL3_BTLDSET_SHIFT		(2)
-#define CONTROL3_ADCDBSET_SHIFT		(4)
-#define CONTROL3_JIGSET_MASK		(0x3 << CONTROL3_JIGSET_SHIFT)
-#define CONTROL3_BTLDSET_MASK		(0x3 << CONTROL3_BTLDSET_SHIFT)
-#define CONTROL3_ADCDBSET_MASK		(0x3 << CONTROL3_ADCDBSET_SHIFT)
+#define MAX77693_CONTROL2_LOWPWR_SHIFT		0
+#define MAX77693_CONTROL2_ADCEN_SHIFT		1
+#define MAX77693_CONTROL2_CPEN_SHIFT		2
+#define MAX77693_CONTROL2_SFOUTASRT_SHIFT	3
+#define MAX77693_CONTROL2_SFOUTORD_SHIFT	4
+#define MAX77693_CONTROL2_ACCDET_SHIFT		5
+#define MAX77693_CONTROL2_USBCPINT_SHIFT	6
+#define MAX77693_CONTROL2_RCPS_SHIFT		7
+#define MAX77693_CONTROL2_LOWPWR_MASK		BIT(MAX77693_CONTROL2_LOWPWR_SHIFT)
+#define MAX77693_CONTROL2_ADCEN_MASK		BIT(MAX77693_CONTROL2_ADCEN_SHIFT)
+#define MAX77693_CONTROL2_CPEN_MASK		BIT(MAX77693_CONTROL2_CPEN_SHIFT)
+#define MAX77693_CONTROL2_SFOUTASRT_MASK	BIT(MAX77693_CONTROL2_SFOUTASRT_SHIFT)
+#define MAX77693_CONTROL2_SFOUTORD_MASK		BIT(MAX77693_CONTROL2_SFOUTORD_SHIFT)
+#define MAX77693_CONTROL2_ACCDET_MASK		BIT(MAX77693_CONTROL2_ACCDET_SHIFT)
+#define MAX77693_CONTROL2_USBCPINT_MASK		BIT(MAX77693_CONTROL2_USBCPINT_SHIFT)
+#define MAX77693_CONTROL2_RCPS_MASK		BIT(MAX77693_CONTROL2_RCPS_SHIFT)
+
+#define MAX77693_CONTROL3_JIGSET_SHIFT		0
+#define MAX77693_CONTROL3_BTLDSET_SHIFT		2
+#define MAX77693_CONTROL3_ADCDBSET_SHIFT	4
+#define MAX77693_CONTROL3_JIGSET_MASK		(0x3 << MAX77693_CONTROL3_JIGSET_SHIFT)
+#define MAX77693_CONTROL3_BTLDSET_MASK		(0x3 << MAX77693_CONTROL3_BTLDSET_SHIFT)
+#define MAX77693_CONTROL3_ADCDBSET_MASK		(0x3 << MAX77693_CONTROL3_ADCDBSET_SHIFT)
 
 /* Slave addr = 0x90: Haptic */
 enum max77693_haptic_reg {
-- 
cgit v1.2.3-70-g09d2


From 309a3e00a511a233acb25eec567a4b11c99d016a Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Wed, 15 Jul 2015 21:59:53 +0900
Subject: mfd/extcon: max77843: Rename defines to allow inclusion with max77693

Add MAX77843_MUIC prefix to some of the defines used in max77843 extcon
driver so the max77693-private.h can be included simultaneously with
max77843-private.h.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/extcon/extcon-max77843.c     |  49 +++++++----
 include/linux/mfd/max77843-private.h | 154 +++++++++++++++++------------------
 2 files changed, 109 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-max77843.c b/drivers/extcon/extcon-max77843.c
index 4dfe0a6337d8..f652c4199870 100644
--- a/drivers/extcon/extcon-max77843.c
+++ b/drivers/extcon/extcon-max77843.c
@@ -206,11 +206,11 @@ static int max77843_muic_set_path(struct max77843_muic_info *info,
 	if (attached)
 		ctrl1 = val;
 	else
-		ctrl1 = CONTROL1_SW_OPEN;
+		ctrl1 = MAX77843_MUIC_CONTROL1_SW_OPEN;
 
 	ret = regmap_update_bits(max77843->regmap_muic,
 			MAX77843_MUIC_REG_CONTROL1,
-			CONTROL1_COM_SW, ctrl1);
+			MAX77843_MUIC_CONTROL1_COM_SW, ctrl1);
 	if (ret < 0) {
 		dev_err(info->dev, "Cannot switch MUIC port\n");
 		return ret;
@@ -244,7 +244,7 @@ static int max77843_muic_get_cable_type(struct max77843_muic_info *info,
 
 	adc = info->status[MAX77843_MUIC_STATUS1] &
 			MAX77843_MUIC_STATUS1_ADC_MASK;
-	adc >>= STATUS1_ADC_SHIFT;
+	adc >>= MAX77843_MUIC_STATUS1_ADC_SHIFT;
 
 	switch (group) {
 	case MAX77843_CABLE_GROUP_ADC:
@@ -310,7 +310,7 @@ static int max77843_muic_get_cable_type(struct max77843_muic_info *info,
 			/* Get VBVolt register bit */
 			gnd_type |= (info->status[MAX77843_MUIC_STATUS2] &
 					MAX77843_MUIC_STATUS2_VBVOLT_MASK);
-			gnd_type >>= STATUS2_VBVOLT_SHIFT;
+			gnd_type >>= MAX77843_MUIC_STATUS2_VBVOLT_SHIFT;
 
 			/* Offset of GND cable */
 			gnd_type |= MAX77843_MUIC_GND_USB_HOST;
@@ -339,7 +339,9 @@ static int max77843_muic_adc_gnd_handler(struct max77843_muic_info *info)
 	switch (gnd_cable_type) {
 	case MAX77843_MUIC_GND_USB_HOST:
 	case MAX77843_MUIC_GND_USB_HOST_VB:
-		ret = max77843_muic_set_path(info, CONTROL1_SW_USB, attached);
+		ret = max77843_muic_set_path(info,
+					     MAX77843_MUIC_CONTROL1_SW_USB,
+					     attached);
 		if (ret < 0)
 			return ret;
 
@@ -347,7 +349,9 @@ static int max77843_muic_adc_gnd_handler(struct max77843_muic_info *info)
 		break;
 	case MAX77843_MUIC_GND_MHL_VB:
 	case MAX77843_MUIC_GND_MHL:
-		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
+		ret = max77843_muic_set_path(info,
+					     MAX77843_MUIC_CONTROL1_SW_OPEN,
+					     attached);
 		if (ret < 0)
 			return ret;
 
@@ -366,7 +370,7 @@ static int max77843_muic_jig_handler(struct max77843_muic_info *info,
 		int cable_type, bool attached)
 {
 	int ret;
-	u8 path = CONTROL1_SW_OPEN;
+	u8 path = MAX77843_MUIC_CONTROL1_SW_OPEN;
 
 	dev_dbg(info->dev, "external connector is %s (adc:0x%02x)\n",
 			attached ? "attached" : "detached", cable_type);
@@ -374,10 +378,10 @@ static int max77843_muic_jig_handler(struct max77843_muic_info *info,
 	switch (cable_type) {
 	case MAX77843_MUIC_ADC_FACTORY_MODE_USB_OFF:
 	case MAX77843_MUIC_ADC_FACTORY_MODE_USB_ON:
-		path = CONTROL1_SW_USB;
+		path = MAX77843_MUIC_CONTROL1_SW_USB;
 		break;
 	case MAX77843_MUIC_ADC_FACTORY_MODE_UART_OFF:
-		path = CONTROL1_SW_UART;
+		path = MAX77843_MUIC_CONTROL1_SW_UART;
 		break;
 	default:
 		return -EINVAL;
@@ -475,14 +479,18 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info)
 
 	switch (chg_type) {
 	case MAX77843_MUIC_CHG_USB:
-		ret = max77843_muic_set_path(info, CONTROL1_SW_USB, attached);
+		ret = max77843_muic_set_path(info,
+					     MAX77843_MUIC_CONTROL1_SW_USB,
+					     attached);
 		if (ret < 0)
 			return ret;
 
 		extcon_set_cable_state_(info->edev, EXTCON_USB, attached);
 		break;
 	case MAX77843_MUIC_CHG_DOWNSTREAM:
-		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
+		ret = max77843_muic_set_path(info,
+					     MAX77843_MUIC_CONTROL1_SW_OPEN,
+					     attached);
 		if (ret < 0)
 			return ret;
 
@@ -490,14 +498,18 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info)
 					attached);
 		break;
 	case MAX77843_MUIC_CHG_DEDICATED:
-		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
+		ret = max77843_muic_set_path(info,
+					     MAX77843_MUIC_CONTROL1_SW_OPEN,
+					     attached);
 		if (ret < 0)
 			return ret;
 
 		extcon_set_cable_state_(info->edev, EXTCON_TA, attached);
 		break;
 	case MAX77843_MUIC_CHG_SPECIAL_500MA:
-		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
+		ret = max77843_muic_set_path(info,
+					     MAX77843_MUIC_CONTROL1_SW_OPEN,
+					     attached);
 		if (ret < 0)
 			return ret;
 
@@ -505,7 +517,9 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info)
 					attached);
 		break;
 	case MAX77843_MUIC_CHG_SPECIAL_1A:
-		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
+		ret = max77843_muic_set_path(info,
+					     MAX77843_MUIC_CONTROL1_SW_OPEN,
+					     attached);
 		if (ret < 0)
 			return ret;
 
@@ -529,7 +543,8 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info)
 			"failed to detect %s accessory (chg_type:0x%x)\n",
 			attached ? "attached" : "detached", chg_type);
 
-		max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
+		max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_OPEN,
+				       attached);
 		return -EINVAL;
 	}
 
@@ -668,7 +683,7 @@ static int max77843_muic_set_debounce_time(struct max77843_muic_info *info,
 		ret = regmap_update_bits(max77843->regmap_muic,
 				MAX77843_MUIC_REG_CONTROL4,
 				MAX77843_MUIC_CONTROL4_ADCDBSET_MASK,
-				time << CONTROL4_ADCDBSET_SHIFT);
+				time << MAX77843_MUIC_CONTROL4_ADCDBSET_SHIFT);
 		if (ret < 0) {
 			dev_err(info->dev, "Cannot write MUIC regmap\n");
 			return ret;
@@ -769,7 +784,7 @@ static int max77843_muic_probe(struct platform_device *pdev)
 	max77843_muic_set_debounce_time(info, MAX77843_DEBOUNCE_TIME_25MS);
 
 	/* Set initial path for UART */
-	max77843_muic_set_path(info, CONTROL1_SW_UART, true);
+	max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_UART, true);
 
 	/* Check revision number of MUIC device */
 	ret = regmap_read(max77843->regmap_muic, MAX77843_MUIC_REG_ID, &id);
diff --git a/include/linux/mfd/max77843-private.h b/include/linux/mfd/max77843-private.h
index 0121d9440340..c19303b0ccfd 100644
--- a/include/linux/mfd/max77843-private.h
+++ b/include/linux/mfd/max77843-private.h
@@ -318,62 +318,62 @@ enum max77843_irq_muic {
 	MAX77843_INTSRCMASK_SYS_MASK | MAX77843_INTSRCMASK_CHGR_MASK)
 
 /* MAX77843 STATUS register*/
-#define STATUS1_ADC_SHIFT			0
-#define STATUS1_ADCERROR_SHIFT			6
-#define STATUS1_ADC1K_SHIFT			7
-#define STATUS2_CHGTYP_SHIFT			0
-#define STATUS2_CHGDETRUN_SHIFT			3
-#define STATUS2_DCDTMR_SHIFT			4
-#define STATUS2_DXOVP_SHIFT			5
-#define STATUS2_VBVOLT_SHIFT			6
-#define STATUS3_VBADC_SHIFT			0
-#define STATUS3_VDNMON_SHIFT			4
-#define STATUS3_DNRES_SHIFT			5
-#define STATUS3_MPNACK_SHIFT			6
-
-#define MAX77843_MUIC_STATUS1_ADC_MASK		(0x1f << STATUS1_ADC_SHIFT)
-#define MAX77843_MUIC_STATUS1_ADCERROR_MASK	BIT(STATUS1_ADCERROR_SHIFT)
-#define MAX77843_MUIC_STATUS1_ADC1K_MASK	BIT(STATUS1_ADC1K_SHIFT)
-#define MAX77843_MUIC_STATUS2_CHGTYP_MASK	(0x7 << STATUS2_CHGTYP_SHIFT)
-#define MAX77843_MUIC_STATUS2_CHGDETRUN_MASK	BIT(STATUS2_CHGDETRUN_SHIFT)
-#define MAX77843_MUIC_STATUS2_DCDTMR_MASK	BIT(STATUS2_DCDTMR_SHIFT)
-#define MAX77843_MUIC_STATUS2_DXOVP_MASK	BIT(STATUS2_DXOVP_SHIFT)
-#define MAX77843_MUIC_STATUS2_VBVOLT_MASK	BIT(STATUS2_VBVOLT_SHIFT)
-#define MAX77843_MUIC_STATUS3_VBADC_MASK	(0xf << STATUS3_VBADC_SHIFT)
-#define MAX77843_MUIC_STATUS3_VDNMON_MASK	BIT(STATUS3_VDNMON_SHIFT)
-#define MAX77843_MUIC_STATUS3_DNRES_MASK	BIT(STATUS3_DNRES_SHIFT)
-#define MAX77843_MUIC_STATUS3_MPNACK_MASK	BIT(STATUS3_MPNACK_SHIFT)
+#define MAX77843_MUIC_STATUS1_ADC_SHIFT		0
+#define MAX77843_MUIC_STATUS1_ADCERROR_SHIFT	6
+#define MAX77843_MUIC_STATUS1_ADC1K_SHIFT	7
+#define MAX77843_MUIC_STATUS2_CHGTYP_SHIFT	0
+#define MAX77843_MUIC_STATUS2_CHGDETRUN_SHIFT	3
+#define MAX77843_MUIC_STATUS2_DCDTMR_SHIFT	4
+#define MAX77843_MUIC_STATUS2_DXOVP_SHIFT	5
+#define MAX77843_MUIC_STATUS2_VBVOLT_SHIFT	6
+#define MAX77843_MUIC_STATUS3_VBADC_SHIFT	0
+#define MAX77843_MUIC_STATUS3_VDNMON_SHIFT	4
+#define MAX77843_MUIC_STATUS3_DNRES_SHIFT	5
+#define MAX77843_MUIC_STATUS3_MPNACK_SHIFT	6
+
+#define MAX77843_MUIC_STATUS1_ADC_MASK		(0x1f << MAX77843_MUIC_STATUS1_ADC_SHIFT)
+#define MAX77843_MUIC_STATUS1_ADCERROR_MASK	BIT(MAX77843_MUIC_STATUS1_ADCERROR_SHIFT)
+#define MAX77843_MUIC_STATUS1_ADC1K_MASK	BIT(MAX77843_MUIC_STATUS1_ADC1K_SHIFT)
+#define MAX77843_MUIC_STATUS2_CHGTYP_MASK	(0x7 << MAX77843_MUIC_STATUS2_CHGTYP_SHIFT)
+#define MAX77843_MUIC_STATUS2_CHGDETRUN_MASK	BIT(MAX77843_MUIC_STATUS2_CHGDETRUN_SHIFT)
+#define MAX77843_MUIC_STATUS2_DCDTMR_MASK	BIT(MAX77843_MUIC_STATUS2_DCDTMR_SHIFT)
+#define MAX77843_MUIC_STATUS2_DXOVP_MASK	BIT(MAX77843_MUIC_STATUS2_DXOVP_SHIFT)
+#define MAX77843_MUIC_STATUS2_VBVOLT_MASK	BIT(MAX77843_MUIC_STATUS2_VBVOLT_SHIFT)
+#define MAX77843_MUIC_STATUS3_VBADC_MASK	(0xf << MAX77843_MUIC_STATUS3_VBADC_SHIFT)
+#define MAX77843_MUIC_STATUS3_VDNMON_MASK	BIT(MAX77843_MUIC_STATUS3_VDNMON_SHIFT)
+#define MAX77843_MUIC_STATUS3_DNRES_MASK	BIT(MAX77843_MUIC_STATUS3_DNRES_SHIFT)
+#define MAX77843_MUIC_STATUS3_MPNACK_MASK	BIT(MAX77843_MUIC_STATUS3_MPNACK_SHIFT)
 
 /* MAX77843 CONTROL register */
-#define CONTROL1_COMP1SW_SHIFT			0
-#define CONTROL1_COMP2SW_SHIFT			3
-#define CONTROL1_IDBEN_SHIFT			7
-#define CONTROL2_LOWPWR_SHIFT			0
-#define CONTROL2_ADCEN_SHIFT			1
-#define CONTROL2_CPEN_SHIFT			2
-#define CONTROL2_ACC_DET_SHIFT			5
-#define CONTROL2_USBCPINT_SHIFT			6
-#define CONTROL2_RCPS_SHIFT			7
-#define CONTROL3_JIGSET_SHIFT			0
-#define CONTROL4_ADCDBSET_SHIFT			0
-#define CONTROL4_USBAUTO_SHIFT			4
-#define CONTROL4_FCTAUTO_SHIFT			5
-#define CONTROL4_ADCMODE_SHIFT			6
-
-#define MAX77843_MUIC_CONTROL1_COMP1SW_MASK	(0x7 << CONTROL1_COMP1SW_SHIFT)
-#define MAX77843_MUIC_CONTROL1_COMP2SW_MASK	(0x7 << CONTROL1_COMP2SW_SHIFT)
-#define MAX77843_MUIC_CONTROL1_IDBEN_MASK	BIT(CONTROL1_IDBEN_SHIFT)
-#define MAX77843_MUIC_CONTROL2_LOWPWR_MASK	BIT(CONTROL2_LOWPWR_SHIFT)
-#define MAX77843_MUIC_CONTROL2_ADCEN_MASK	BIT(CONTROL2_ADCEN_SHIFT)
-#define MAX77843_MUIC_CONTROL2_CPEN_MASK	BIT(CONTROL2_CPEN_SHIFT)
-#define MAX77843_MUIC_CONTROL2_ACC_DET_MASK	BIT(CONTROL2_ACC_DET_SHIFT)
-#define MAX77843_MUIC_CONTROL2_USBCPINT_MASK	BIT(CONTROL2_USBCPINT_SHIFT)
-#define MAX77843_MUIC_CONTROL2_RCPS_MASK	BIT(CONTROL2_RCPS_SHIFT)
-#define MAX77843_MUIC_CONTROL3_JIGSET_MASK	(0x3 << CONTROL3_JIGSET_SHIFT)
-#define MAX77843_MUIC_CONTROL4_ADCDBSET_MASK	(0x3 << CONTROL4_ADCDBSET_SHIFT)
-#define MAX77843_MUIC_CONTROL4_USBAUTO_MASK	BIT(CONTROL4_USBAUTO_SHIFT)
-#define MAX77843_MUIC_CONTROL4_FCTAUTO_MASK	BIT(CONTROL4_FCTAUTO_SHIFT)
-#define MAX77843_MUIC_CONTROL4_ADCMODE_MASK	(0x3 << CONTROL4_ADCMODE_SHIFT)
+#define MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT	0
+#define MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT	3
+#define MAX77843_MUIC_CONTROL1_IDBEN_SHIFT	7
+#define MAX77843_MUIC_CONTROL2_LOWPWR_SHIFT	0
+#define MAX77843_MUIC_CONTROL2_ADCEN_SHIFT	1
+#define MAX77843_MUIC_CONTROL2_CPEN_SHIFT	2
+#define MAX77843_MUIC_CONTROL2_ACC_DET_SHIFT	5
+#define MAX77843_MUIC_CONTROL2_USBCPINT_SHIFT	6
+#define MAX77843_MUIC_CONTROL2_RCPS_SHIFT	7
+#define MAX77843_MUIC_CONTROL3_JIGSET_SHIFT	0
+#define MAX77843_MUIC_CONTROL4_ADCDBSET_SHIFT	0
+#define MAX77843_MUIC_CONTROL4_USBAUTO_SHIFT	4
+#define MAX77843_MUIC_CONTROL4_FCTAUTO_SHIFT	5
+#define MAX77843_MUIC_CONTROL4_ADCMODE_SHIFT	6
+
+#define MAX77843_MUIC_CONTROL1_COMP1SW_MASK	(0x7 << MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT)
+#define MAX77843_MUIC_CONTROL1_COMP2SW_MASK	(0x7 << MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT)
+#define MAX77843_MUIC_CONTROL1_IDBEN_MASK	BIT(MAX77843_MUIC_CONTROL1_IDBEN_SHIFT)
+#define MAX77843_MUIC_CONTROL2_LOWPWR_MASK	BIT(MAX77843_MUIC_CONTROL2_LOWPWR_SHIFT)
+#define MAX77843_MUIC_CONTROL2_ADCEN_MASK	BIT(MAX77843_MUIC_CONTROL2_ADCEN_SHIFT)
+#define MAX77843_MUIC_CONTROL2_CPEN_MASK	BIT(MAX77843_MUIC_CONTROL2_CPEN_SHIFT)
+#define MAX77843_MUIC_CONTROL2_ACC_DET_MASK	BIT(MAX77843_MUIC_CONTROL2_ACC_DET_SHIFT)
+#define MAX77843_MUIC_CONTROL2_USBCPINT_MASK	BIT(MAX77843_MUIC_CONTROL2_USBCPINT_SHIFT)
+#define MAX77843_MUIC_CONTROL2_RCPS_MASK	BIT(MAX77843_MUIC_CONTROL2_RCPS_SHIFT)
+#define MAX77843_MUIC_CONTROL3_JIGSET_MASK	(0x3 << MAX77843_MUIC_CONTROL3_JIGSET_SHIFT)
+#define MAX77843_MUIC_CONTROL4_ADCDBSET_MASK	(0x3 << MAX77843_MUIC_CONTROL4_ADCDBSET_SHIFT)
+#define MAX77843_MUIC_CONTROL4_USBAUTO_MASK	BIT(MAX77843_MUIC_CONTROL4_USBAUTO_SHIFT)
+#define MAX77843_MUIC_CONTROL4_FCTAUTO_MASK	BIT(MAX77843_MUIC_CONTROL4_FCTAUTO_SHIFT)
+#define MAX77843_MUIC_CONTROL4_ADCMODE_MASK	(0x3 << MAX77843_MUIC_CONTROL4_ADCMODE_SHIFT)
 
 /* MAX77843 switch port */
 #define COM_OPEN				0
@@ -383,38 +383,38 @@ enum max77843_irq_muic {
 #define COM_AUX_USB				4
 #define COM_AUX_UART				5
 
-#define CONTROL1_COM_SW \
+#define MAX77843_MUIC_CONTROL1_COM_SW \
 	((MAX77843_MUIC_CONTROL1_COMP1SW_MASK | \
 	 MAX77843_MUIC_CONTROL1_COMP2SW_MASK))
 
-#define CONTROL1_SW_OPEN \
-	((COM_OPEN << CONTROL1_COMP1SW_SHIFT | \
-	 COM_OPEN << CONTROL1_COMP2SW_SHIFT))
-#define CONTROL1_SW_USB \
-	((COM_USB << CONTROL1_COMP1SW_SHIFT | \
-	 COM_USB << CONTROL1_COMP2SW_SHIFT))
-#define CONTROL1_SW_AUDIO \
-	((COM_AUDIO << CONTROL1_COMP1SW_SHIFT | \
-	 COM_AUDIO << CONTROL1_COMP2SW_SHIFT))
-#define CONTROL1_SW_UART \
-	((COM_UART << CONTROL1_COMP1SW_SHIFT | \
-	 COM_UART << CONTROL1_COMP2SW_SHIFT))
-#define CONTROL1_SW_AUX_USB \
-	((COM_AUX_USB << CONTROL1_COMP1SW_SHIFT | \
-	 COM_AUX_USB << CONTROL1_COMP2SW_SHIFT))
-#define CONTROL1_SW_AUX_UART \
-	((COM_AUX_UART << CONTROL1_COMP1SW_SHIFT | \
-	 COM_AUX_UART << CONTROL1_COMP2SW_SHIFT))
+#define MAX77843_MUIC_CONTROL1_SW_OPEN \
+	((COM_OPEN << MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT | \
+	 COM_OPEN << MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT))
+#define MAX77843_MUIC_CONTROL1_SW_USB \
+	((COM_USB << MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT | \
+	 COM_USB << MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT))
+#define MAX77843_MUIC_CONTROL1_SW_AUDIO \
+	((COM_AUDIO << MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT | \
+	 COM_AUDIO << MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT))
+#define MAX77843_MUIC_CONTROL1_SW_UART \
+	((COM_UART << MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT | \
+	 COM_UART << MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT))
+#define MAX77843_MUIC_CONTROL1_SW_AUX_USB \
+	((COM_AUX_USB << MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT | \
+	 COM_AUX_USB << MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT))
+#define MAX77843_MUIC_CONTROL1_SW_AUX_UART \
+	((COM_AUX_UART << MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT | \
+	 COM_AUX_UART << MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT))
 
 #define MAX77843_DISABLE			0
 #define MAX77843_ENABLE				1
 
 #define CONTROL4_AUTO_DISABLE \
-	((MAX77843_DISABLE << CONTROL4_USBAUTO_SHIFT) | \
-	(MAX77843_DISABLE << CONTROL4_FCTAUTO_SHIFT))
+	((MAX77843_DISABLE << MAX77843_MUIC_CONTROL4_USBAUTO_SHIFT) | \
+	(MAX77843_DISABLE << MAX77843_MUIC_CONTROL4_FCTAUTO_SHIFT))
 #define CONTROL4_AUTO_ENABLE \
-	((MAX77843_ENABLE << CONTROL4_USBAUTO_SHIFT) | \
-	(MAX77843_ENABLE << CONTROL4_FCTAUTO_SHIFT))
+	((MAX77843_ENABLE << MAX77843_MUIC_CONTROL4_USBAUTO_SHIFT) | \
+	(MAX77843_ENABLE << MAX77843_MUIC_CONTROL4_FCTAUTO_SHIFT))
 
 /* MAX77843 SAFEOUT LDO Control register */
 #define SAFEOUTCTRL_SAFEOUT1_SHIFT		0
-- 
cgit v1.2.3-70-g09d2


From 8019ff6cfc0440415fcfb6352c58c3951e6ab053 Mon Sep 17 00:00:00 2001
From: Nariman Poushin <nariman@opensource.wolfsonmicro.com>
Date: Thu, 16 Jul 2015 16:36:21 +0100
Subject: regmap: Use reg_sequence for multi_reg_write / register_patch

Separate the functionality using sequences of register writes from the
functions that take register defaults. This change renames the arguments
in order to support the extension of reg_sequence to take an optional
delay to be applied after any given register in a sequence is written.
This avoids adding an int to all register defaults, which could
substantially increase memory usage for regmaps with large default tables.

This also updates all the clients of multi_reg_write/register_patch.

Signed-off-by: Nariman Poushin <nariman@opensource.wolfsonmicro.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/internal.h |  2 +-
 drivers/base/regmap/regmap.c   | 22 +++++++++++-----------
 drivers/gpu/drm/i2c/adv7511.c  |  2 +-
 drivers/input/misc/drv260x.c   |  6 +++---
 drivers/input/misc/drv2665.c   |  2 +-
 drivers/input/misc/drv2667.c   |  4 ++--
 drivers/mfd/arizona-core.c     |  2 +-
 drivers/mfd/twl6040.c          |  2 +-
 drivers/mfd/wm5102-tables.c    |  6 +++---
 drivers/mfd/wm5110-tables.c    |  6 +++---
 drivers/mfd/wm8994-core.c      |  8 ++++----
 drivers/mfd/wm8997-tables.c    |  2 +-
 include/linux/regmap.h         | 17 ++++++++++++++---
 sound/soc/codecs/arizona.c     |  2 +-
 sound/soc/codecs/cs35l32.c     |  2 +-
 sound/soc/codecs/cs42l52.c     |  2 +-
 sound/soc/codecs/da7210.c      |  4 ++--
 sound/soc/codecs/rt5640.c      |  2 +-
 sound/soc/codecs/rt5645.c      |  4 ++--
 sound/soc/codecs/rt5651.c      |  2 +-
 sound/soc/codecs/rt5670.c      |  2 +-
 sound/soc/codecs/rt5677.c      |  2 +-
 sound/soc/codecs/tlv320aic3x.c |  2 +-
 sound/soc/codecs/wm2200.c      |  2 +-
 sound/soc/codecs/wm5100.c      |  2 +-
 sound/soc/codecs/wm8962.c      |  2 +-
 sound/soc/codecs/wm8993.c      |  2 +-
 27 files changed, 62 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index b2b2849fc6d3..873ddf91c9d3 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -136,7 +136,7 @@ struct regmap {
 	/* if set, the HW registers are known to match map->reg_defaults */
 	bool no_sync_defaults;
 
-	struct reg_default *patch;
+	struct reg_sequence *patch;
 	int patch_regs;
 
 	/* if set, converts bulk rw to single rw */
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 7111d04f2621..2cbb4502747d 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1743,7 +1743,7 @@ EXPORT_SYMBOL_GPL(regmap_bulk_write);
  * relative. The page register has been written if that was neccessary.
  */
 static int _regmap_raw_multi_reg_write(struct regmap *map,
-				       const struct reg_default *regs,
+				       const struct reg_sequence *regs,
 				       size_t num_regs)
 {
 	int ret;
@@ -1800,12 +1800,12 @@ static unsigned int _regmap_register_page(struct regmap *map,
 }
 
 static int _regmap_range_multi_paged_reg_write(struct regmap *map,
-					       struct reg_default *regs,
+					       struct reg_sequence *regs,
 					       size_t num_regs)
 {
 	int ret;
 	int i, n;
-	struct reg_default *base;
+	struct reg_sequence *base;
 	unsigned int this_page = 0;
 	/*
 	 * the set of registers are not neccessarily in order, but
@@ -1843,7 +1843,7 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map,
 }
 
 static int _regmap_multi_reg_write(struct regmap *map,
-				   const struct reg_default *regs,
+				   const struct reg_sequence *regs,
 				   size_t num_regs)
 {
 	int i;
@@ -1895,8 +1895,8 @@ static int _regmap_multi_reg_write(struct regmap *map,
 		struct regmap_range_node *range;
 		range = _regmap_range_lookup(map, reg);
 		if (range) {
-			size_t len = sizeof(struct reg_default)*num_regs;
-			struct reg_default *base = kmemdup(regs, len,
+			size_t len = sizeof(struct reg_sequence)*num_regs;
+			struct reg_sequence *base = kmemdup(regs, len,
 							   GFP_KERNEL);
 			if (!base)
 				return -ENOMEM;
@@ -1929,7 +1929,7 @@ static int _regmap_multi_reg_write(struct regmap *map,
  * A value of zero will be returned on success, a negative errno will be
  * returned in error cases.
  */
-int regmap_multi_reg_write(struct regmap *map, const struct reg_default *regs,
+int regmap_multi_reg_write(struct regmap *map, const struct reg_sequence *regs,
 			   int num_regs)
 {
 	int ret;
@@ -1962,7 +1962,7 @@ EXPORT_SYMBOL_GPL(regmap_multi_reg_write);
  * be returned in error cases.
  */
 int regmap_multi_reg_write_bypassed(struct regmap *map,
-				    const struct reg_default *regs,
+				    const struct reg_sequence *regs,
 				    int num_regs)
 {
 	int ret;
@@ -2552,10 +2552,10 @@ EXPORT_SYMBOL_GPL(regmap_async_complete);
  * The caller must ensure that this function cannot be called
  * concurrently with either itself or regcache_sync().
  */
-int regmap_register_patch(struct regmap *map, const struct reg_default *regs,
+int regmap_register_patch(struct regmap *map, const struct reg_sequence *regs,
 			  int num_regs)
 {
-	struct reg_default *p;
+	struct reg_sequence *p;
 	int ret;
 	bool bypass;
 
@@ -2564,7 +2564,7 @@ int regmap_register_patch(struct regmap *map, const struct reg_default *regs,
 		return 0;
 
 	p = krealloc(map->patch,
-		     sizeof(struct reg_default) * (map->patch_regs + num_regs),
+		     sizeof(struct reg_sequence) * (map->patch_regs + num_regs),
 		     GFP_KERNEL);
 	if (p) {
 		memcpy(p + map->patch_regs, regs, num_regs * sizeof(*regs));
diff --git a/drivers/gpu/drm/i2c/adv7511.c b/drivers/gpu/drm/i2c/adv7511.c
index 2aaa3c88999e..00416f23b5cb 100644
--- a/drivers/gpu/drm/i2c/adv7511.c
+++ b/drivers/gpu/drm/i2c/adv7511.c
@@ -54,7 +54,7 @@ static struct adv7511 *encoder_to_adv7511(struct drm_encoder *encoder)
 }
 
 /* ADI recommended values for proper operation. */
-static const struct reg_default adv7511_fixed_registers[] = {
+static const struct reg_sequence adv7511_fixed_registers[] = {
 	{ 0x98, 0x03 },
 	{ 0x9a, 0xe0 },
 	{ 0x9c, 0x30 },
diff --git a/drivers/input/misc/drv260x.c b/drivers/input/misc/drv260x.c
index e5d60ecd29a4..f5c9cf2f4073 100644
--- a/drivers/input/misc/drv260x.c
+++ b/drivers/input/misc/drv260x.c
@@ -313,14 +313,14 @@ static void drv260x_close(struct input_dev *input)
 	gpiod_set_value(haptics->enable_gpio, 0);
 }
 
-static const struct reg_default drv260x_lra_cal_regs[] = {
+static const struct reg_sequence drv260x_lra_cal_regs[] = {
 	{ DRV260X_MODE, DRV260X_AUTO_CAL },
 	{ DRV260X_CTRL3, DRV260X_NG_THRESH_2 },
 	{ DRV260X_FEEDBACK_CTRL, DRV260X_FB_REG_LRA_MODE |
 		DRV260X_BRAKE_FACTOR_4X | DRV260X_LOOP_GAIN_HIGH },
 };
 
-static const struct reg_default drv260x_lra_init_regs[] = {
+static const struct reg_sequence drv260x_lra_init_regs[] = {
 	{ DRV260X_MODE, DRV260X_RT_PLAYBACK },
 	{ DRV260X_A_TO_V_CTRL, DRV260X_AUDIO_HAPTICS_PEAK_20MS |
 		DRV260X_AUDIO_HAPTICS_FILTER_125HZ },
@@ -337,7 +337,7 @@ static const struct reg_default drv260x_lra_init_regs[] = {
 	{ DRV260X_CTRL4, DRV260X_AUTOCAL_TIME_500MS },
 };
 
-static const struct reg_default drv260x_erm_cal_regs[] = {
+static const struct reg_sequence drv260x_erm_cal_regs[] = {
 	{ DRV260X_MODE, DRV260X_AUTO_CAL },
 	{ DRV260X_A_TO_V_MIN_INPUT, DRV260X_AUDIO_HAPTICS_MIN_IN_VOLT },
 	{ DRV260X_A_TO_V_MAX_INPUT, DRV260X_AUDIO_HAPTICS_MAX_IN_VOLT },
diff --git a/drivers/input/misc/drv2665.c b/drivers/input/misc/drv2665.c
index 0afaa33de07d..924456e3ca75 100644
--- a/drivers/input/misc/drv2665.c
+++ b/drivers/input/misc/drv2665.c
@@ -132,7 +132,7 @@ static void drv2665_close(struct input_dev *input)
 			"Failed to enter standby mode: %d\n", error);
 }
 
-static const struct reg_default drv2665_init_regs[] = {
+static const struct reg_sequence drv2665_init_regs[] = {
 	{ DRV2665_CTRL_2, 0 | DRV2665_10_MS_IDLE_TOUT },
 	{ DRV2665_CTRL_1, DRV2665_25_VPP_GAIN },
 };
diff --git a/drivers/input/misc/drv2667.c b/drivers/input/misc/drv2667.c
index fc0fddf0896a..047136aa646f 100644
--- a/drivers/input/misc/drv2667.c
+++ b/drivers/input/misc/drv2667.c
@@ -262,14 +262,14 @@ static void drv2667_close(struct input_dev *input)
 			"Failed to enter standby mode: %d\n", error);
 }
 
-static const struct reg_default drv2667_init_regs[] = {
+static const struct reg_sequence drv2667_init_regs[] = {
 	{ DRV2667_CTRL_2, 0 },
 	{ DRV2667_CTRL_1, DRV2667_25_VPP_GAIN },
 	{ DRV2667_WV_SEQ_0, 1 },
 	{ DRV2667_WV_SEQ_1, 0 }
 };
 
-static const struct reg_default drv2667_page1_init[] = {
+static const struct reg_sequence drv2667_page1_init[] = {
 	{ DRV2667_RAM_HDR_SZ, 0x05 },
 	{ DRV2667_RAM_START_HI, 0x80 },
 	{ DRV2667_RAM_START_LO, 0x06 },
diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c
index bebf58a06a6b..66d50be11960 100644
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -392,7 +392,7 @@ err:
  * Register patch to some of the CODECs internal write sequences
  * to ensure a clean exit from the low power sleep state.
  */
-static const struct reg_default wm5110_sleep_patch[] = {
+static const struct reg_sequence wm5110_sleep_patch[] = {
 	{ 0x337A, 0xC100 },
 	{ 0x337B, 0x0041 },
 	{ 0x3300, 0xA210 },
diff --git a/drivers/mfd/twl6040.c b/drivers/mfd/twl6040.c
index c5265c1262c5..583dc33432f3 100644
--- a/drivers/mfd/twl6040.c
+++ b/drivers/mfd/twl6040.c
@@ -86,7 +86,7 @@ static const struct reg_default twl6040_defaults[] = {
 	{ 0x2E, 0x00 }, /* REG_STATUS	(ro) */
 };
 
-static struct reg_default twl6040_patch[] = {
+static struct reg_sequence twl6040_patch[] = {
 	/*
 	 * Select I2C bus access to dual access registers
 	 * Interrupt register is cleared on read
diff --git a/drivers/mfd/wm5102-tables.c b/drivers/mfd/wm5102-tables.c
index aeae6ec123b3..423fb3730dc7 100644
--- a/drivers/mfd/wm5102-tables.c
+++ b/drivers/mfd/wm5102-tables.c
@@ -21,7 +21,7 @@
 #define WM5102_NUM_AOD_ISR 2
 #define WM5102_NUM_ISR 5
 
-static const struct reg_default wm5102_reva_patch[] = {
+static const struct reg_sequence wm5102_reva_patch[] = {
 	{ 0x80, 0x0003 },
 	{ 0x221, 0x0090 },
 	{ 0x211, 0x0014 },
@@ -57,7 +57,7 @@ static const struct reg_default wm5102_reva_patch[] = {
 	{ 0x80, 0x0000 },
 };
 
-static const struct reg_default wm5102_revb_patch[] = {
+static const struct reg_sequence wm5102_revb_patch[] = {
 	{ 0x19, 0x0001 },
 	{ 0x80, 0x0003 },
 	{ 0x081, 0xE022 },
@@ -80,7 +80,7 @@ static const struct reg_default wm5102_revb_patch[] = {
 /* We use a function so we can use ARRAY_SIZE() */
 int wm5102_patch(struct arizona *arizona)
 {
-	const struct reg_default *wm5102_patch;
+	const struct reg_sequence *wm5102_patch;
 	int patch_size;
 
 	switch (arizona->rev) {
diff --git a/drivers/mfd/wm5110-tables.c b/drivers/mfd/wm5110-tables.c
index 12cad94b4035..26ce14f903fe 100644
--- a/drivers/mfd/wm5110-tables.c
+++ b/drivers/mfd/wm5110-tables.c
@@ -21,7 +21,7 @@
 #define WM5110_NUM_AOD_ISR 2
 #define WM5110_NUM_ISR 5
 
-static const struct reg_default wm5110_reva_patch[] = {
+static const struct reg_sequence wm5110_reva_patch[] = {
 	{ 0x80, 0x3 },
 	{ 0x44, 0x20 },
 	{ 0x45, 0x40 },
@@ -134,7 +134,7 @@ static const struct reg_default wm5110_reva_patch[] = {
 	{ 0x209, 0x002A },
 };
 
-static const struct reg_default wm5110_revb_patch[] = {
+static const struct reg_sequence wm5110_revb_patch[] = {
 	{ 0x80, 0x3 },
 	{ 0x36e, 0x0210 },
 	{ 0x370, 0x0210 },
@@ -224,7 +224,7 @@ static const struct reg_default wm5110_revb_patch[] = {
 	{ 0x80, 0x0 },
 };
 
-static const struct reg_default wm5110_revd_patch[] = {
+static const struct reg_sequence wm5110_revd_patch[] = {
 	{ 0x80, 0x3 },
 	{ 0x80, 0x3 },
 	{ 0x393, 0x27 },
diff --git a/drivers/mfd/wm8994-core.c b/drivers/mfd/wm8994-core.c
index 53ae5af5d6e4..0f4169a3a5d4 100644
--- a/drivers/mfd/wm8994-core.c
+++ b/drivers/mfd/wm8994-core.c
@@ -243,21 +243,21 @@ static int wm8994_ldo_in_use(struct wm8994_pdata *pdata, int ldo)
 }
 #endif
 
-static const struct reg_default wm8994_revc_patch[] = {
+static const struct reg_sequence wm8994_revc_patch[] = {
 	{ 0x102, 0x3 },
 	{ 0x56, 0x3 },
 	{ 0x817, 0x0 },
 	{ 0x102, 0x0 },
 };
 
-static const struct reg_default wm8958_reva_patch[] = {
+static const struct reg_sequence wm8958_reva_patch[] = {
 	{ 0x102, 0x3 },
 	{ 0xcb, 0x81 },
 	{ 0x817, 0x0 },
 	{ 0x102, 0x0 },
 };
 
-static const struct reg_default wm1811_reva_patch[] = {
+static const struct reg_sequence wm1811_reva_patch[] = {
 	{ 0x102, 0x3 },
 	{ 0x56, 0xc07 },
 	{ 0x5d, 0x7e },
@@ -326,7 +326,7 @@ static int wm8994_device_init(struct wm8994 *wm8994, int irq)
 {
 	struct wm8994_pdata *pdata;
 	struct regmap_config *regmap_config;
-	const struct reg_default *regmap_patch = NULL;
+	const struct reg_sequence *regmap_patch = NULL;
 	const char *devname;
 	int ret, i, patch_regs = 0;
 	int pulls = 0;
diff --git a/drivers/mfd/wm8997-tables.c b/drivers/mfd/wm8997-tables.c
index c0c25d75aacc..cab2c68f1737 100644
--- a/drivers/mfd/wm8997-tables.c
+++ b/drivers/mfd/wm8997-tables.c
@@ -17,7 +17,7 @@
 
 #include "arizona.h"
 
-static const struct reg_default wm8997_reva_patch[] = {
+static const struct reg_sequence wm8997_reva_patch[] = {
 	{ 0x80, 0x0003 },
 	{ 0x214, 0x0008 },
 	{ 0x458, 0x0000 },
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 59c55ea0f0b5..c9ef2ec69142 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -50,6 +50,17 @@ struct reg_default {
 	unsigned int def;
 };
 
+/**
+ * Register/value pairs for sequences of writes
+ *
+ * @reg: Register address.
+ * @def: Register value.
+ */
+struct reg_sequence {
+	unsigned int reg;
+	unsigned int def;
+};
+
 #ifdef CONFIG_REGMAP
 
 enum regmap_endian {
@@ -410,10 +421,10 @@ int regmap_raw_write(struct regmap *map, unsigned int reg,
 		     const void *val, size_t val_len);
 int regmap_bulk_write(struct regmap *map, unsigned int reg, const void *val,
 			size_t val_count);
-int regmap_multi_reg_write(struct regmap *map, const struct reg_default *regs,
+int regmap_multi_reg_write(struct regmap *map, const struct reg_sequence *regs,
 			int num_regs);
 int regmap_multi_reg_write_bypassed(struct regmap *map,
-				    const struct reg_default *regs,
+				    const struct reg_sequence *regs,
 				    int num_regs);
 int regmap_raw_write_async(struct regmap *map, unsigned int reg,
 			   const void *val, size_t val_len);
@@ -450,7 +461,7 @@ void regcache_mark_dirty(struct regmap *map);
 bool regmap_check_range_table(struct regmap *map, unsigned int reg,
 			      const struct regmap_access_table *table);
 
-int regmap_register_patch(struct regmap *map, const struct reg_default *regs,
+int regmap_register_patch(struct regmap *map, const struct reg_sequence *regs,
 			  int num_regs);
 int regmap_parse_val(struct regmap *map, const void *buf,
 				unsigned int *val);
diff --git a/sound/soc/codecs/arizona.c b/sound/soc/codecs/arizona.c
index 802e05eae3e9..5edd33fcd68c 100644
--- a/sound/soc/codecs/arizona.c
+++ b/sound/soc/codecs/arizona.c
@@ -1366,7 +1366,7 @@ static void arizona_wm5102_set_dac_comp(struct snd_soc_codec *codec,
 {
 	struct arizona_priv *priv = snd_soc_codec_get_drvdata(codec);
 	struct arizona *arizona = priv->arizona;
-	struct reg_default dac_comp[] = {
+	struct reg_sequence dac_comp[] = {
 		{ 0x80, 0x3 },
 		{ ARIZONA_DAC_COMP_1, 0 },
 		{ ARIZONA_DAC_COMP_2, 0 },
diff --git a/sound/soc/codecs/cs35l32.c b/sound/soc/codecs/cs35l32.c
index 8f40025b7e7c..2813a1b0c949 100644
--- a/sound/soc/codecs/cs35l32.c
+++ b/sound/soc/codecs/cs35l32.c
@@ -276,7 +276,7 @@ static const struct snd_soc_codec_driver soc_codec_dev_cs35l32 = {
 };
 
 /* Current and threshold powerup sequence Pg37 in datasheet */
-static const struct reg_default cs35l32_monitor_patch[] = {
+static const struct reg_sequence cs35l32_monitor_patch[] = {
 
 	{ 0x00, 0x99 },
 	{ 0x48, 0x17 },
diff --git a/sound/soc/codecs/cs42l52.c b/sound/soc/codecs/cs42l52.c
index 4de52c9957ac..8b2d05933594 100644
--- a/sound/soc/codecs/cs42l52.c
+++ b/sound/soc/codecs/cs42l52.c
@@ -1118,7 +1118,7 @@ static const struct snd_soc_codec_driver soc_codec_dev_cs42l52 = {
 };
 
 /* Current and threshold powerup sequence Pg37 */
-static const struct reg_default cs42l52_threshold_patch[] = {
+static const struct reg_sequence cs42l52_threshold_patch[] = {
 
 	{ 0x00, 0x99 },
 	{ 0x3E, 0xBA },
diff --git a/sound/soc/codecs/da7210.c b/sound/soc/codecs/da7210.c
index 21810e5f3321..bf0fb3d4df22 100644
--- a/sound/soc/codecs/da7210.c
+++ b/sound/soc/codecs/da7210.c
@@ -1182,7 +1182,7 @@ static struct snd_soc_codec_driver soc_codec_dev_da7210 = {
 
 #if IS_ENABLED(CONFIG_I2C)
 
-static struct reg_default da7210_regmap_i2c_patch[] = {
+static struct reg_sequence da7210_regmap_i2c_patch[] = {
 
 	/* System controller master disable */
 	{ DA7210_STARTUP1, 0x00 },
@@ -1269,7 +1269,7 @@ static struct i2c_driver da7210_i2c_driver = {
 
 #if defined(CONFIG_SPI_MASTER)
 
-static struct reg_default da7210_regmap_spi_patch[] = {
+static struct reg_sequence da7210_regmap_spi_patch[] = {
 	/* Dummy read to give two pulses over nCS for SPI */
 	{ DA7210_AUX2, 0x00 },
 	{ DA7210_AUX2, 0x00 },
diff --git a/sound/soc/codecs/rt5640.c b/sound/soc/codecs/rt5640.c
index 9bc78e57513d..1ed1f8895e12 100644
--- a/sound/soc/codecs/rt5640.c
+++ b/sound/soc/codecs/rt5640.c
@@ -51,7 +51,7 @@ static const struct regmap_range_cfg rt5640_ranges[] = {
 	  .window_len = 0x1, },
 };
 
-static const struct reg_default init_list[] = {
+static const struct reg_sequence init_list[] = {
 	{RT5640_PR_BASE + 0x3d,	0x3600},
 	{RT5640_PR_BASE + 0x12,	0x0aa8},
 	{RT5640_PR_BASE + 0x14,	0x0aaa},
diff --git a/sound/soc/codecs/rt5645.c b/sound/soc/codecs/rt5645.c
index 9ce311e088fc..c0f4be430e70 100644
--- a/sound/soc/codecs/rt5645.c
+++ b/sound/soc/codecs/rt5645.c
@@ -54,7 +54,7 @@ static const struct regmap_range_cfg rt5645_ranges[] = {
 	},
 };
 
-static const struct reg_default init_list[] = {
+static const struct reg_sequence init_list[] = {
 	{RT5645_PR_BASE + 0x3d,	0x3600},
 	{RT5645_PR_BASE + 0x1c,	0xfd20},
 	{RT5645_PR_BASE + 0x20,	0x611f},
@@ -63,7 +63,7 @@ static const struct reg_default init_list[] = {
 };
 #define RT5645_INIT_REG_LEN ARRAY_SIZE(init_list)
 
-static const struct reg_default rt5650_init_list[] = {
+static const struct reg_sequence rt5650_init_list[] = {
 	{0xf6,	0x0100},
 };
 
diff --git a/sound/soc/codecs/rt5651.c b/sound/soc/codecs/rt5651.c
index a3506e193abc..db9b8667f136 100644
--- a/sound/soc/codecs/rt5651.c
+++ b/sound/soc/codecs/rt5651.c
@@ -46,7 +46,7 @@ static const struct regmap_range_cfg rt5651_ranges[] = {
 	  .window_len = 0x1, },
 };
 
-static struct reg_default init_list[] = {
+static struct reg_sequence init_list[] = {
 	{RT5651_PR_BASE + 0x3d,	0x3e00},
 };
 
diff --git a/sound/soc/codecs/rt5670.c b/sound/soc/codecs/rt5670.c
index a9123d414178..462a91f7cf68 100644
--- a/sound/soc/codecs/rt5670.c
+++ b/sound/soc/codecs/rt5670.c
@@ -51,7 +51,7 @@ static const struct regmap_range_cfg rt5670_ranges[] = {
 	  .window_len = 0x1, },
 };
 
-static const struct reg_default init_list[] = {
+static const struct reg_sequence init_list[] = {
 	{ RT5670_PR_BASE + 0x14, 0x9a8a },
 	{ RT5670_PR_BASE + 0x38, 0x3ba1 },
 	{ RT5670_PR_BASE + 0x3d, 0x3640 },
diff --git a/sound/soc/codecs/rt5677.c b/sound/soc/codecs/rt5677.c
index 31d969ac1192..b89775251470 100644
--- a/sound/soc/codecs/rt5677.c
+++ b/sound/soc/codecs/rt5677.c
@@ -54,7 +54,7 @@ static const struct regmap_range_cfg rt5677_ranges[] = {
 	},
 };
 
-static const struct reg_default init_list[] = {
+static const struct reg_sequence init_list[] = {
 	{RT5677_ASRC_12,	0x0018},
 	{RT5677_PR_BASE + 0x3d,	0x364d},
 	{RT5677_PR_BASE + 0x17,	0x4fc0},
diff --git a/sound/soc/codecs/tlv320aic3x.c b/sound/soc/codecs/tlv320aic3x.c
index a7cf19b53fb2..83ae1eb44d4f 100644
--- a/sound/soc/codecs/tlv320aic3x.c
+++ b/sound/soc/codecs/tlv320aic3x.c
@@ -1668,7 +1668,7 @@ static const struct i2c_device_id aic3x_i2c_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, aic3x_i2c_id);
 
-static const struct reg_default aic3007_class_d[] = {
+static const struct reg_sequence aic3007_class_d[] = {
 	/* Class-D speaker driver init; datasheet p. 46 */
 	{ AIC3X_PAGE_SELECT, 0x0D },
 	{ 0xD, 0x0D },
diff --git a/sound/soc/codecs/wm2200.c b/sound/soc/codecs/wm2200.c
index c83083285e53..6c607928fb9b 100644
--- a/sound/soc/codecs/wm2200.c
+++ b/sound/soc/codecs/wm2200.c
@@ -897,7 +897,7 @@ static bool wm2200_readable_register(struct device *dev, unsigned int reg)
 	}
 }
 
-static const struct reg_default wm2200_reva_patch[] = {
+static const struct reg_sequence wm2200_reva_patch[] = {
 	{ 0x07, 0x0003 },
 	{ 0x102, 0x0200 },
 	{ 0x203, 0x0084 },
diff --git a/sound/soc/codecs/wm5100.c b/sound/soc/codecs/wm5100.c
index 4c10cd88c1af..26d79bbb7599 100644
--- a/sound/soc/codecs/wm5100.c
+++ b/sound/soc/codecs/wm5100.c
@@ -1247,7 +1247,7 @@ static const struct snd_soc_dapm_route wm5100_dapm_routes[] = {
 	{ "PWM2", NULL, "PWM2 Driver" },
 };
 
-static const struct reg_default wm5100_reva_patches[] = {
+static const struct reg_sequence wm5100_reva_patches[] = {
 	{ WM5100_AUDIO_IF_1_10, 0 },
 	{ WM5100_AUDIO_IF_1_11, 1 },
 	{ WM5100_AUDIO_IF_1_12, 2 },
diff --git a/sound/soc/codecs/wm8962.c b/sound/soc/codecs/wm8962.c
index c5748fd4f296..05492e826aea 100644
--- a/sound/soc/codecs/wm8962.c
+++ b/sound/soc/codecs/wm8962.c
@@ -3495,7 +3495,7 @@ static struct snd_soc_codec_driver soc_codec_dev_wm8962 = {
 };
 
 /* Improve power consumption for IN4 DC measurement mode */
-static const struct reg_default wm8962_dc_measure[] = {
+static const struct reg_sequence wm8962_dc_measure[] = {
 	{ 0xfd, 0x1 },
 	{ 0xcc, 0x40 },
 	{ 0xfd, 0 },
diff --git a/sound/soc/codecs/wm8993.c b/sound/soc/codecs/wm8993.c
index 8a8db8605dc2..52ec64d8502d 100644
--- a/sound/soc/codecs/wm8993.c
+++ b/sound/soc/codecs/wm8993.c
@@ -1595,7 +1595,7 @@ static int wm8993_resume(struct snd_soc_codec *codec)
 #endif
 
 /* Tune DC servo configuration */
-static struct reg_default wm8993_regmap_patch[] = {
+static struct reg_sequence wm8993_regmap_patch[] = {
 	{ 0x44, 3 },
 	{ 0x56, 3 },
 	{ 0x44, 0 },
-- 
cgit v1.2.3-70-g09d2


From 2de9d6006c190bb0f706e8404de94cd94293801f Mon Sep 17 00:00:00 2001
From: Nariman Poushin <nariman@opensource.wolfsonmicro.com>
Date: Thu, 16 Jul 2015 16:36:22 +0100
Subject: regmap: Apply optional delay in multi_reg_write/register_patch

Add an optional delay_us field in reg_sequence to allow the client to
specify a delay (in microseconds) to be applied after any given write
in a sequence of writes.

We treat a delay in a sequence the same way we treat a page change as
they are logically similar in that you can coalesce all write before
a delay (in the same way you can coalesce all writes before a page
change is needed)

Signed-off-by: Nariman Poushin <nariman@opensource.wolfsonmicro.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 54 +++++++++++++++++++++++++++++++++++++++-----
 include/linux/regmap.h       |  5 +++-
 2 files changed, 52 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 2cbb4502747d..b3a5aa5cd580 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -18,6 +18,7 @@
 #include <linux/of.h>
 #include <linux/rbtree.h>
 #include <linux/sched.h>
+#include <linux/delay.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -1807,10 +1808,12 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map,
 	int i, n;
 	struct reg_sequence *base;
 	unsigned int this_page = 0;
+	unsigned int page_change = 0;
 	/*
 	 * the set of registers are not neccessarily in order, but
 	 * since the order of write must be preserved this algorithm
-	 * chops the set each time the page changes
+	 * chops the set each time the page changes. This also applies
+	 * if there is a delay required at any point in the sequence.
 	 */
 	base = regs;
 	for (i = 0, n = 0; i < num_regs; i++, n++) {
@@ -1826,16 +1829,48 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map,
 				this_page = win_page;
 			if (win_page != this_page) {
 				this_page = win_page;
+				page_change = 1;
+			}
+		}
+
+		/* If we have both a page change and a delay make sure to
+		 * write the regs and apply the delay before we change the
+		 * page.
+		 */
+
+		if (page_change || regs[i].delay_us) {
+
+				/* For situations where the first write requires
+				 * a delay we need to make sure we don't call
+				 * raw_multi_reg_write with n=0
+				 * This can't occur with page breaks as we
+				 * never write on the first iteration
+				 */
+				if (regs[i].delay_us && i == 0)
+					n = 1;
+
 				ret = _regmap_raw_multi_reg_write(map, base, n);
 				if (ret != 0)
 					return ret;
+
+				if (regs[i].delay_us)
+					udelay(regs[i].delay_us);
+
 				base += n;
 				n = 0;
-			}
-			ret = _regmap_select_page(map, &base[n].reg, range, 1);
-			if (ret != 0)
-				return ret;
+
+				if (page_change) {
+					ret = _regmap_select_page(map,
+								  &base[n].reg,
+								  range, 1);
+					if (ret != 0)
+						return ret;
+
+					page_change = 0;
+				}
+
 		}
+
 	}
 	if (n > 0)
 		return _regmap_raw_multi_reg_write(map, base, n);
@@ -1854,6 +1889,9 @@ static int _regmap_multi_reg_write(struct regmap *map,
 			ret = _regmap_write(map, regs[i].reg, regs[i].def);
 			if (ret != 0)
 				return ret;
+
+			if (regs[i].delay_us)
+				udelay(regs[i].delay_us);
 		}
 		return 0;
 	}
@@ -1893,8 +1931,12 @@ static int _regmap_multi_reg_write(struct regmap *map,
 	for (i = 0; i < num_regs; i++) {
 		unsigned int reg = regs[i].reg;
 		struct regmap_range_node *range;
+
+		/* Coalesce all the writes between a page break or a delay
+		 * in a sequence
+		 */
 		range = _regmap_range_lookup(map, reg);
-		if (range) {
+		if (range || regs[i].delay_us) {
 			size_t len = sizeof(struct reg_sequence)*num_regs;
 			struct reg_sequence *base = kmemdup(regs, len,
 							   GFP_KERNEL);
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index c9ef2ec69142..5a7cf2136c81 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -51,14 +51,17 @@ struct reg_default {
 };
 
 /**
- * Register/value pairs for sequences of writes
+ * Register/value pairs for sequences of writes with an optional delay in
+ * microseconds to be applied after each write.
  *
  * @reg: Register address.
  * @def: Register value.
+ * @delay_us: Delay to be applied after the register write in microseconds
  */
 struct reg_sequence {
 	unsigned int reg;
 	unsigned int def;
+	unsigned int delay_us;
 };
 
 #ifdef CONFIG_REGMAP
-- 
cgit v1.2.3-70-g09d2


From c391f262bee9d0d6424a99c85183a06c50e307ee Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:41 +0800
Subject: genirq: Rename irq_data_get_msi() as irq_data_get_msi_desc()

Rename irq_data_get_msi() as irq_data_get_msi_desc() to keep consistency
with other irq_data access helpers.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/host/pcie-designware.c | 2 +-
 drivers/pci/msi.c                  | 2 +-
 include/linux/irq.h                | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/host/pcie-designware.c b/drivers/pci/host/pcie-designware.c
index 69486be7181e..85c7735d7511 100644
--- a/drivers/pci/host/pcie-designware.c
+++ b/drivers/pci/host/pcie-designware.c
@@ -326,7 +326,7 @@ static int dw_msi_setup_irq(struct msi_controller *chip, struct pci_dev *pdev,
 static void dw_msi_teardown_irq(struct msi_controller *chip, unsigned int irq)
 {
 	struct irq_data *data = irq_get_irq_data(irq);
-	struct msi_desc *msi = irq_data_get_msi(data);
+	struct msi_desc *msi = irq_data_get_msi_desc(data);
 	struct pcie_port *pp = sys_to_pcie(msi->dev->bus->sysdata);
 
 	clear_irq_range(pp, irq, 1, data->hwirq);
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index f66be868ad21..64673f13bbb9 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -249,7 +249,7 @@ static void msix_mask_irq(struct msi_desc *desc, u32 flag)
 
 static void msi_set_mask_bit(struct irq_data *data, u32 flag)
 {
-	struct msi_desc *desc = irq_data_get_msi(data);
+	struct msi_desc *desc = irq_data_get_msi_desc(data);
 
 	if (desc->msi_attrib.is_msix) {
 		msix_mask_irq(desc, flag);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 429ac266c7c6..5284cb166d90 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -638,7 +638,7 @@ static inline struct msi_desc *irq_get_msi_desc(unsigned int irq)
 	return d ? d->msi_desc : NULL;
 }
 
-static inline struct msi_desc *irq_data_get_msi(struct irq_data *d)
+static inline struct msi_desc *irq_data_get_msi_desc(struct irq_data *d)
 {
 	return d->msi_desc;
 }
-- 
cgit v1.2.3-70-g09d2


From b2c0b2cbb282f0cf42518ffacbe197e6f2884168 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Wed, 3 Sep 2014 23:57:13 +0100
Subject: nmi: create generic NMI backtrace implementation

x86s NMI backtrace implementation (for arch_trigger_all_cpu_backtrace())
is fairly generic in nature - the only architecture specific bits are
the act of raising the NMI to other CPUs, and reporting the status of
the NMI handler.

These are fairly simple to factor out, and produce a generic
implementation which can be shared between ARM and x86.

Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 include/linux/nmi.h |   6 ++
 lib/Makefile        |   2 +-
 lib/nmi_backtrace.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 lib/nmi_backtrace.c

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index f94da0e65dea..5791e3229068 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -49,6 +49,12 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
 	arch_trigger_all_cpu_backtrace(false);
 	return true;
 }
+
+/* generic implementation */
+void nmi_trigger_all_cpu_backtrace(bool include_self,
+				   void (*raise)(cpumask_t *mask));
+bool nmi_cpu_backtrace(struct pt_regs *regs);
+
 #else
 static inline bool trigger_all_cpu_backtrace(void)
 {
diff --git a/lib/Makefile b/lib/Makefile
index 6897b527581a..392169c5bc4e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o argv_split.o \
 	 proportions.o flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o
+	 earlycpio.o seq_buf.o nmi_backtrace.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
new file mode 100644
index 000000000000..88d3d32e5923
--- /dev/null
+++ b/lib/nmi_backtrace.c
@@ -0,0 +1,162 @@
+/*
+ *  NMI backtrace support
+ *
+ * Gratuitously copied from arch/x86/kernel/apic/hw_nmi.c by Russell King,
+ * with the following header:
+ *
+ *  HW NMI watchdog support
+ *
+ *  started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ *  Arch specific calls to support NMI watchdog
+ *
+ *  Bits copied from original nmi.c file
+ */
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/seq_buf.h>
+
+#ifdef arch_trigger_all_cpu_backtrace
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+static cpumask_t printtrace_mask;
+
+#define NMI_BUF_SIZE		4096
+
+struct nmi_seq_buf {
+	unsigned char		buffer[NMI_BUF_SIZE];
+	struct seq_buf		seq;
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk("%.*s", (end - start) + 1, buf);
+}
+
+void nmi_trigger_all_cpu_backtrace(bool include_self,
+				   void (*raise)(cpumask_t *mask))
+{
+	struct nmi_seq_buf *s;
+	int i, cpu, this_cpu = get_cpu();
+
+	if (test_and_set_bit(0, &backtrace_flag)) {
+		/*
+		 * If there is already a trigger_all_cpu_backtrace() in progress
+		 * (backtrace_flag == 1), don't output double cpu dump infos.
+		 */
+		put_cpu();
+		return;
+	}
+
+	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+	if (!include_self)
+		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
+
+	cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
+
+	/*
+	 * Set up per_cpu seq_buf buffers that the NMIs running on the other
+	 * CPUs will write to.
+	 */
+	for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
+		s = &per_cpu(nmi_print_seq, cpu);
+		seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
+	}
+
+	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+		pr_info("Sending NMI to %s CPUs:\n",
+			(include_self ? "all" : "other"));
+		raise(to_cpumask(backtrace_mask));
+	}
+
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpumask_empty(to_cpumask(backtrace_mask)))
+			break;
+		mdelay(1);
+		touch_softlockup_watchdog();
+	}
+
+	/*
+	 * Now that all the NMIs have triggered, we can dump out their
+	 * back traces safely to the console.
+	 */
+	for_each_cpu(cpu, &printtrace_mask) {
+		int len, last_i = 0;
+
+		s = &per_cpu(nmi_print_seq, cpu);
+		len = seq_buf_used(&s->seq);
+		if (!len)
+			continue;
+
+		/* Print line by line. */
+		for (i = 0; i < len; i++) {
+			if (s->buffer[i] == '\n') {
+				print_seq_line(s, last_i, i);
+				last_i = i + 1;
+			}
+		}
+		/* Check if there was a partial line. */
+		if (last_i < len) {
+			print_seq_line(s, last_i, len - 1);
+			pr_cont("\n");
+		}
+	}
+
+	clear_bit(0, &backtrace_flag);
+	smp_mb__after_atomic();
+	put_cpu();
+}
+
+/*
+ * It is not safe to call printk() directly from NMI handlers.
+ * It may be fine if the NMI detected a lock up and we have no choice
+ * but to do so, but doing a NMI on all other CPUs to get a back trace
+ * can be done with a sysrq-l. We don't want that to lock up, which
+ * can happen if the NMI interrupts a printk in progress.
+ *
+ * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
+ * the content into a per cpu seq_buf buffer. Then when the NMIs are
+ * all done, we can safely dump the contents of the seq_buf to a printk()
+ * from a non NMI context.
+ */
+static int nmi_vprintk(const char *fmt, va_list args)
+{
+	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+	unsigned int len = seq_buf_used(&s->seq);
+
+	seq_buf_vprintf(&s->seq, fmt, args);
+	return seq_buf_used(&s->seq) - len;
+}
+
+bool nmi_cpu_backtrace(struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+
+	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+		printk_func_t printk_func_save = this_cpu_read(printk_func);
+
+		/* Replace printk to write into the NMI seq */
+		this_cpu_write(printk_func, nmi_vprintk);
+		pr_warn("NMI backtrace for cpu %d\n", cpu);
+		show_regs(regs);
+		this_cpu_write(printk_func, printk_func_save);
+
+		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+		return true;
+	}
+
+	return false;
+}
+NOKPROBE_SYMBOL(nmi_cpu_backtrace);
+#endif
-- 
cgit v1.2.3-70-g09d2


From b54e5ed8f285d62c0d242c4ef9da90937994db02 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2015 11:16:44 +0800
Subject: block: partition: introduce hd_free_part()

So the helper can be used in both generic partition
case and part0 case.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c             | 3 +--
 block/partition-generic.c | 3 +--
 include/linux/genhd.h     | 6 ++++++
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 59a1395eedac..85df45292dba 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1110,8 +1110,7 @@ static void disk_release(struct device *dev)
 	disk_release_events(disk);
 	kfree(disk->random);
 	disk_replace_part_tbl(disk, NULL);
-	free_part_stats(&disk->part0);
-	free_part_info(&disk->part0);
+	hd_free_part(&disk->part0);
 	if (disk->queue)
 		blk_put_queue(disk->queue);
 	kfree(disk);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 0d9e5f97f0a8..eca0d02a607c 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -212,8 +212,7 @@ static void part_release(struct device *dev)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	blk_free_devt(dev->devt);
-	free_part_stats(p);
-	free_part_info(p);
+	hd_free_part(p);
 	kfree(p);
 }
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index ec274e0f4ed2..a221220ffcb2 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -663,6 +663,12 @@ static inline void hd_struct_put(struct hd_struct *part)
 		__delete_partition(part);
 }
 
+static inline void hd_free_part(struct hd_struct *part)
+{
+	free_part_stats(part);
+	free_part_info(part);
+}
+
 /*
  * Any access of part->nr_sects which is not protected by partition
  * bd_mutex or gendisk bdev bd_mutex, should be done using this
-- 
cgit v1.2.3-70-g09d2


From 6c71013ecb7e2bddbed9f5b95e7aed22c491daa9 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2015 11:16:45 +0800
Subject: block: partition: convert percpu ref

Percpu refcount is the perfect match for partition's case,
and the conversion is quite straight.

With the convertion, one pair of atomic inc/dec can be saved
for accounting block I/O, which is run in hot path of block I/O.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c             |  6 +++++-
 block/partition-generic.c |  9 +++++----
 include/linux/genhd.h     | 27 +++++++++++++++++----------
 3 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 85df45292dba..0c706f33a599 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1284,7 +1284,11 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 		 * converted to make use of bd_mutex and sequence counters.
 		 */
 		seqcount_init(&disk->part0.nr_sects_seq);
-		hd_ref_init(&disk->part0);
+		if (hd_ref_init(&disk->part0)) {
+			hd_free_part(&disk->part0);
+			kfree(disk);
+			return NULL;
+		}
 
 		disk->minors = minors;
 		rand_initialize_disk(disk);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index eca0d02a607c..e7711133284e 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -232,8 +232,9 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
 	put_device(part_to_dev(part));
 }
 
-void __delete_partition(struct hd_struct *part)
+void __delete_partition(struct percpu_ref *ref)
 {
+	struct hd_struct *part = container_of(ref, struct hd_struct, ref);
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
@@ -254,7 +255,7 @@ void delete_partition(struct gendisk *disk, int partno)
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
-	hd_struct_put(part);
+	hd_struct_kill(part);
 }
 
 static ssize_t whole_disk_show(struct device *dev,
@@ -355,8 +356,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	if (!dev_get_uevent_suppress(ddev))
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
-	hd_ref_init(p);
-	return p;
+	if (!hd_ref_init(p))
+		return p;
 
 out_free_info:
 	free_part_info(p);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a221220ffcb2..2adbfa6d02bc 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -13,6 +13,7 @@
 #include <linux/kdev_t.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
+#include <linux/percpu-refcount.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -124,7 +125,7 @@ struct hd_struct {
 #else
 	struct disk_stats dkstats;
 #endif
-	atomic_t ref;
+	struct percpu_ref ref;
 	struct rcu_head rcu_head;
 };
 
@@ -611,7 +612,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
 						     sector_t len, int flags,
 						     struct partition_meta_info
 						       *info);
-extern void __delete_partition(struct hd_struct *);
+extern void __delete_partition(struct percpu_ref *);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
@@ -640,33 +641,39 @@ extern ssize_t part_fail_store(struct device *dev,
 			       const char *buf, size_t count);
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
 
-static inline void hd_ref_init(struct hd_struct *part)
+static inline int hd_ref_init(struct hd_struct *part)
 {
-	atomic_set(&part->ref, 1);
-	smp_mb();
+	if (percpu_ref_init(&part->ref, __delete_partition, 0,
+				GFP_KERNEL))
+		return -ENOMEM;
+	return 0;
 }
 
 static inline void hd_struct_get(struct hd_struct *part)
 {
-	atomic_inc(&part->ref);
-	smp_mb__after_atomic();
+	percpu_ref_get(&part->ref);
 }
 
 static inline int hd_struct_try_get(struct hd_struct *part)
 {
-	return atomic_inc_not_zero(&part->ref);
+	return percpu_ref_tryget_live(&part->ref);
 }
 
 static inline void hd_struct_put(struct hd_struct *part)
 {
-	if (atomic_dec_and_test(&part->ref))
-		__delete_partition(part);
+	percpu_ref_put(&part->ref);
+}
+
+static inline void hd_struct_kill(struct hd_struct *part)
+{
+	percpu_ref_kill(&part->ref);
 }
 
 static inline void hd_free_part(struct hd_struct *part)
 {
 	free_part_stats(part);
 	free_part_info(part);
+	percpu_ref_exit(&part->ref);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 0034af036554c39eefd14d835a8ec3496ac46712 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 16 Jul 2015 09:14:26 -0600
Subject: block: make /sys/block/<dev>/queue/discard_max_bytes writeable

Lots of devices support huge discard sizes these days. Depending
on how the device handles them internally, huge discards can
introduce massive latencies (hundreds of msec) on the device side.

We have a sysfs file, discard_max_bytes, that advertises the max
hardware supported discard size. Make this writeable, and split
the settings into a soft and hard limit. This can be set from
'discard_granularity' and up to the hardware limit.

Add a new sysfs file, 'discard_max_hw_bytes', that shows the hw
set limit.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/queue-sysfs.txt | 10 +++++++++-
 block/blk-settings.c                |  4 ++++
 block/blk-sysfs.c                   | 40 ++++++++++++++++++++++++++++++++++++-
 include/linux/blkdev.h              |  1 +
 4 files changed, 53 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 3a29f8914df9..e5d914845be6 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -20,7 +20,7 @@ This shows the size of internal allocation of the device in bytes, if
 reported by the device. A value of '0' means device does not support
 the discard functionality.
 
-discard_max_bytes (RO)
+discard_max_hw_bytes (RO)
 ----------------------
 Devices that support discard functionality may have internal limits on
 the number of bytes that can be trimmed or unmapped in a single operation.
@@ -29,6 +29,14 @@ number of bytes that can be discarded in a single operation. Discard
 requests issued to the device must not exceed this limit. A discard_max_bytes
 value of 0 means that the device does not support discard functionality.
 
+discard_max_bytes (RW)
+----------------------
+While discard_max_hw_bytes is the hardware limit for the device, this
+setting is the software limit. Some devices exhibit large latencies when
+large discards are issued, setting this value lower will make Linux issue
+smaller discards and potentially help reduce latencies induced by large
+discard operations.
+
 discard_zeroes_data (RO)
 ------------------------
 When read, this file will show if the discarded block are zeroed by the
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 12600bfffca9..b38d8d723276 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -116,6 +116,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->chunk_sectors = 0;
 	lim->max_write_same_sectors = 0;
 	lim->max_discard_sectors = 0;
+	lim->max_hw_discard_sectors = 0;
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
@@ -303,6 +304,7 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors);
 void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors)
 {
+	q->limits.max_hw_discard_sectors = max_discard_sectors;
 	q->limits.max_discard_sectors = max_discard_sectors;
 }
 EXPORT_SYMBOL(blk_queue_max_discard_sectors);
@@ -641,6 +643,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 		t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
 						      b->max_discard_sectors);
+		t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors,
+							 b->max_hw_discard_sectors);
 		t->discard_granularity = max(t->discard_granularity,
 					     b->discard_granularity);
 		t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) %
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 6264b382d4d1..b1f34e463c0f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -145,12 +145,43 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
 	return queue_var_show(q->limits.discard_granularity, page);
 }
 
+static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
+{
+	unsigned long long val;
+
+	val = q->limits.max_hw_discard_sectors << 9;
+	return sprintf(page, "%llu\n", val);
+}
+
 static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
 {
 	return sprintf(page, "%llu\n",
 		       (unsigned long long)q->limits.max_discard_sectors << 9);
 }
 
+static ssize_t queue_discard_max_store(struct request_queue *q,
+				       const char *page, size_t count)
+{
+	unsigned long max_discard;
+	ssize_t ret = queue_var_store(&max_discard, page, count);
+
+	if (ret < 0)
+		return ret;
+
+	if (max_discard & (q->limits.discard_granularity - 1))
+		return -EINVAL;
+
+	max_discard >>= 9;
+	if (max_discard > UINT_MAX)
+		return -EINVAL;
+
+	if (max_discard > q->limits.max_hw_discard_sectors)
+		max_discard = q->limits.max_hw_discard_sectors;
+
+	q->limits.max_discard_sectors = max_discard;
+	return ret;
+}
+
 static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(queue_discard_zeroes_data(q), page);
@@ -360,9 +391,15 @@ static struct queue_sysfs_entry queue_discard_granularity_entry = {
 	.show = queue_discard_granularity_show,
 };
 
+static struct queue_sysfs_entry queue_discard_max_hw_entry = {
+	.attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO },
+	.show = queue_discard_max_hw_show,
+};
+
 static struct queue_sysfs_entry queue_discard_max_entry = {
-	.attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
+	.attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_discard_max_show,
+	.store = queue_discard_max_store,
 };
 
 static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
@@ -421,6 +458,7 @@ static struct attribute *default_attrs[] = {
 	&queue_io_opt_entry.attr,
 	&queue_discard_granularity_entry.attr,
 	&queue_discard_max_entry.attr,
+	&queue_discard_max_hw_entry.attr,
 	&queue_discard_zeroes_data_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_nonrot_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d4068c17d0df..243f29e779ec 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -268,6 +268,7 @@ struct queue_limits {
 	unsigned int		io_min;
 	unsigned int		io_opt;
 	unsigned int		max_discard_sectors;
+	unsigned int		max_hw_discard_sectors;
 	unsigned int		max_write_same_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
-- 
cgit v1.2.3-70-g09d2


From 2531c8cf56a640cd7d17057df8484e570716a450 Mon Sep 17 00:00:00 2001
From: Dominik Dingel <dingel@linux.vnet.ibm.com>
Date: Fri, 17 Jul 2015 16:23:37 -0700
Subject: mm: hugetlb: allow hugepages_supported to be architecture specific

s390 has a constant hugepage size, by setting HPAGE_SHIFT we also change
e.g. the pageblock_order, which should be independent in respect to
hugepage support.

With this patch every architecture is free to define how to check
for hugepage support.

Signed-off-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 205026175c42..d891f949466a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -460,15 +460,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 	return &mm->page_table_lock;
 }
 
-static inline bool hugepages_supported(void)
-{
-	/*
-	 * Some platform decide whether they support huge pages at boot
-	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
-	 * there is no such support
-	 */
-	return HPAGE_SHIFT != 0;
-}
+#ifndef hugepages_supported
+/*
+ * Some platform decide whether they support huge pages at boot
+ * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
+ * when there is no such support
+ */
+#define hugepages_supported() (HPAGE_SHIFT != 0)
+#endif
 
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
-- 
cgit v1.2.3-70-g09d2


From 8db1486065141e619e4855b84e350ef32064f7e1 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Fri, 17 Jul 2015 16:23:42 -0700
Subject: include, lib: add __printf attributes to several function prototypes

Using __printf attributes helps to detect several format string issues
at compile time (even though -Wformat-security is currently disabled in
Makefile).  For example it can detect when formatting a pointer as a
number, like the issue fixed in commit a3fa71c40f18 ("wl18xx: show
rx_frames_per_rates as an array as it really is"), or when the arguments
do not match the format string, c.f.  for example commit 5ce1aca81435
("reiserfs: fix __RASSERT format string").

To prevent similar bugs in the future, add a __printf attribute to every
function prototype which needs one in include/linux/ and lib/.  These
functions were mostly found by using gcc's -Wsuggest-attribute=format
flag.

Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/clkdev.h    |  7 ++++---
 include/linux/compat.h    |  2 +-
 include/linux/configfs.h  |  3 ++-
 include/linux/cpu.h       |  7 ++++---
 include/linux/dcache.h    |  3 ++-
 include/linux/device.h    | 15 +++++++--------
 include/linux/iommu.h     |  2 +-
 include/linux/kernel.h    |  9 +++++----
 include/linux/kobject.h   |  5 +++--
 include/linux/mmiotrace.h |  2 +-
 include/linux/printk.h    |  6 +++---
 lib/kobject.c             |  5 +++--
 12 files changed, 36 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index a240b18e86fa..08bffcc466de 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -33,18 +33,19 @@ struct clk_lookup {
 	}
 
 struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id,
-	const char *dev_fmt, ...);
+	const char *dev_fmt, ...) __printf(3, 4);
 
 void clkdev_add(struct clk_lookup *cl);
 void clkdev_drop(struct clk_lookup *cl);
 
 struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id,
-	const char *dev_fmt, ...);
+	const char *dev_fmt, ...) __printf(3, 4);
 
 void clkdev_add_table(struct clk_lookup *, size_t);
 int clk_add_alias(const char *, const char *, const char *, struct device *);
 
-int clk_register_clkdev(struct clk *, const char *, const char *, ...);
+int clk_register_clkdev(struct clk *, const char *, const char *, ...)
+	__printf(3, 4);
 int clk_register_clkdevs(struct clk *, struct clk_lookup *, size_t);
 
 #ifdef CONFIG_COMMON_CLK
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ab25814690bc..a76c9172b2eb 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -424,7 +424,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
-extern int compat_printk(const char *fmt, ...);
+extern __printf(1, 2) int compat_printk(const char *fmt, ...);
 extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
 extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set);
 
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index c9e5c57e4edf..63a36e89d0eb 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -64,7 +64,8 @@ struct config_item {
 	struct dentry		*ci_dentry;
 };
 
-extern int config_item_set_name(struct config_item *, const char *, ...);
+extern __printf(2, 3)
+int config_item_set_name(struct config_item *, const char *, ...);
 
 static inline char *config_item_name(struct config_item * item)
 {
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index c0fb6b1b4712..23c30bdcca86 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -40,9 +40,10 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr);
 extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
 extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);
 
-extern struct device *cpu_device_create(struct device *parent, void *drvdata,
-					const struct attribute_group **groups,
-					const char *fmt, ...);
+extern __printf(4, 5)
+struct device *cpu_device_create(struct device *parent, void *drvdata,
+				 const struct attribute_group **groups,
+				 const char *fmt, ...);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *cpu);
 extern ssize_t arch_cpu_probe(const char *, size_t);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d2d50249b7b2..d67ae119cf4e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -327,7 +327,8 @@ static inline unsigned d_count(const struct dentry *dentry)
 /*
  * helper function for dentry_operations.d_dname() members
  */
-extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
+extern __printf(4, 5)
+char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 extern char *simple_dname(struct dentry *, char *, int);
 
 extern char *__d_path(const struct path *, const struct path *, char *, int);
diff --git a/include/linux/device.h b/include/linux/device.h
index 5a31bf3a4024..a2b4ea70a946 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -637,8 +637,9 @@ extern int devres_release_group(struct device *dev, void *id);
 
 /* managed devm_k.alloc/kfree for device drivers */
 extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp);
-extern char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
-			     va_list ap);
+extern __printf(3, 0)
+char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
+		      va_list ap);
 extern __printf(3, 4)
 char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...);
 static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
@@ -1011,12 +1012,10 @@ extern int __must_check device_reprobe(struct device *dev);
 /*
  * Easy functions for dynamically creating devices on the fly
  */
-extern struct device *device_create_vargs(struct class *cls,
-					  struct device *parent,
-					  dev_t devt,
-					  void *drvdata,
-					  const char *fmt,
-					  va_list vargs);
+extern __printf(5, 0)
+struct device *device_create_vargs(struct class *cls, struct device *parent,
+				   dev_t devt, void *drvdata,
+				   const char *fmt, va_list vargs);
 extern __printf(5, 6)
 struct device *device_create(struct class *cls, struct device *parent,
 			     dev_t devt, void *drvdata,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index dc767f7c3704..f9c1b6d0f2e4 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -258,7 +258,7 @@ extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
 struct device *iommu_device_create(struct device *parent, void *drvdata,
 				   const struct attribute_group **groups,
-				   const char *fmt, ...);
+				   const char *fmt, ...) __printf(4, 5);
 void iommu_device_destroy(struct device *dev);
 int iommu_device_link(struct device *dev, struct device *link);
 void iommu_device_unlink(struct device *dev, struct device *link);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5f0be58640ea..5582410727cb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -411,7 +411,8 @@ extern __printf(3, 0)
 int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
 extern __printf(2, 3)
 char *kasprintf(gfp_t gfp, const char *fmt, ...);
-extern char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
+extern __printf(2, 0)
+char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
 
 extern __scanf(2, 3)
 int sscanf(const char *, const char *, ...);
@@ -679,10 +680,10 @@ do {									\
 		__ftrace_vprintk(_THIS_IP_, fmt, vargs);		\
 } while (0)
 
-extern int
+extern __printf(2, 0) int
 __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
 
-extern int
+extern __printf(2, 0) int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
 
 extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
@@ -702,7 +703,7 @@ int trace_printk(const char *fmt, ...)
 {
 	return 0;
 }
-static inline int
+static __printf(1, 0) inline int
 ftrace_vprintk(const char *fmt, va_list ap)
 {
 	return 0;
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 2d61b909f414..637f67002c5a 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -80,8 +80,9 @@ struct kobject {
 
 extern __printf(2, 3)
 int kobject_set_name(struct kobject *kobj, const char *name, ...);
-extern int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
-				  va_list vargs);
+extern __printf(2, 0)
+int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
+			   va_list vargs);
 
 static inline const char *kobject_name(const struct kobject *kobj)
 {
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index c5d52780d6a0..3ba327af055c 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -106,6 +106,6 @@ extern void enable_mmiotrace(void);
 extern void disable_mmiotrace(void);
 extern void mmio_trace_rw(struct mmiotrace_rw *rw);
 extern void mmio_trace_mapping(struct mmiotrace_map *map);
-extern int mmio_trace_printk(const char *fmt, va_list args);
+extern __printf(1, 0) int mmio_trace_printk(const char *fmt, va_list args);
 
 #endif /* _LINUX_MMIOTRACE_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 58b1fec40d37..a6298b27ac99 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -122,7 +122,7 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
-typedef int(*printk_func_t)(const char *fmt, va_list args);
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(5, 0)
@@ -166,7 +166,7 @@ char *log_buf_addr_get(void);
 u32 log_buf_len_get(void);
 void log_buf_kexec_setup(void);
 void __init setup_log_buf(int early);
-void dump_stack_set_arch_desc(const char *fmt, ...);
+__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
 void dump_stack_print_info(const char *log_lvl);
 void show_regs_print_info(const char *log_lvl);
 #else
@@ -217,7 +217,7 @@ static inline void setup_log_buf(int early)
 {
 }
 
-static inline void dump_stack_set_arch_desc(const char *fmt, ...)
+static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...)
 {
 }
 
diff --git a/lib/kobject.c b/lib/kobject.c
index 2e3bd01964a9..3e3a5c3cb330 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -337,8 +337,9 @@ error:
 }
 EXPORT_SYMBOL(kobject_init);
 
-static int kobject_add_varg(struct kobject *kobj, struct kobject *parent,
-			    const char *fmt, va_list vargs)
+static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
+					   struct kobject *parent,
+					   const char *fmt, va_list vargs)
 {
 	int retval;
 
-- 
cgit v1.2.3-70-g09d2


From da89947b47a3a355f33a75d7672892c147ed880d Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 17 Jul 2015 16:23:50 -0700
Subject: Update Viresh Kumar's email address

Switch to my kernel.org alias instead of a badly named gmail address,
which I rarely use.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .mailmap                                      |  4 +++-
 Documentation/arm/SPEAr/overview.txt          |  2 +-
 MAINTAINERS                                   | 12 ++++++------
 arch/arm/boot/dts/spear1310-evb.dts           |  2 +-
 arch/arm/boot/dts/spear1310.dtsi              |  2 +-
 arch/arm/boot/dts/spear1340-evb.dts           |  2 +-
 arch/arm/boot/dts/spear1340.dtsi              |  2 +-
 arch/arm/boot/dts/spear13xx.dtsi              |  2 +-
 arch/arm/boot/dts/spear300-evb.dts            |  2 +-
 arch/arm/boot/dts/spear300.dtsi               |  2 +-
 arch/arm/boot/dts/spear310-evb.dts            |  2 +-
 arch/arm/boot/dts/spear310.dtsi               |  2 +-
 arch/arm/boot/dts/spear320-evb.dts            |  2 +-
 arch/arm/boot/dts/spear320.dtsi               |  2 +-
 arch/arm/boot/dts/spear3xx.dtsi               |  2 +-
 arch/arm/mach-spear/generic.h                 |  2 +-
 arch/arm/mach-spear/include/mach/irqs.h       |  2 +-
 arch/arm/mach-spear/include/mach/misc_regs.h  |  2 +-
 arch/arm/mach-spear/include/mach/spear.h      |  2 +-
 arch/arm/mach-spear/include/mach/uncompress.h |  2 +-
 arch/arm/mach-spear/pl080.c                   |  2 +-
 arch/arm/mach-spear/pl080.h                   |  2 +-
 arch/arm/mach-spear/restart.c                 |  2 +-
 arch/arm/mach-spear/spear1310.c               |  2 +-
 arch/arm/mach-spear/spear1340.c               |  2 +-
 arch/arm/mach-spear/spear13xx.c               |  2 +-
 arch/arm/mach-spear/spear300.c                |  2 +-
 arch/arm/mach-spear/spear310.c                |  2 +-
 arch/arm/mach-spear/spear320.c                |  2 +-
 arch/arm/mach-spear/spear3xx.c                |  2 +-
 drivers/ata/pata_arasan_cf.c                  |  4 ++--
 drivers/clk/spear/clk-aux-synth.c             |  2 +-
 drivers/clk/spear/clk-frac-synth.c            |  2 +-
 drivers/clk/spear/clk-gpt-synth.c             |  2 +-
 drivers/clk/spear/clk-vco-pll.c               |  2 +-
 drivers/clk/spear/clk.c                       |  2 +-
 drivers/clk/spear/clk.h                       |  2 +-
 drivers/clk/spear/spear1310_clock.c           |  2 +-
 drivers/clk/spear/spear1340_clock.c           |  2 +-
 drivers/clk/spear/spear3xx_clock.c            |  2 +-
 drivers/clk/spear/spear6xx_clock.c            |  2 +-
 drivers/dma/dw/core.c                         |  2 +-
 drivers/irqchip/spear-shirq.c                 |  2 +-
 drivers/mfd/stmpe-i2c.c                       |  2 +-
 drivers/mfd/stmpe-spi.c                       |  4 ++--
 drivers/mmc/host/sdhci-spear.c                |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear.c         |  2 +-
 drivers/pinctrl/spear/pinctrl-spear.h         |  2 +-
 drivers/pinctrl/spear/pinctrl-spear1310.c     |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear1340.c     |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear300.c      |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear310.c      |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear320.c      |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear3xx.c      |  2 +-
 drivers/pinctrl/spear/pinctrl-spear3xx.h      |  2 +-
 drivers/watchdog/sp805_wdt.c                  |  4 ++--
 include/linux/amba/sp810.h                    |  2 +-
 include/linux/pata_arasan_cf_data.h           |  2 +-
 58 files changed, 74 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/.mailmap b/.mailmap
index 977f958eedbe..8d8ad17dcf9c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -125,7 +125,9 @@ Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
 Uwe Kleine-König <ukl@pengutronix.de>
 Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
 Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
-Viresh Kumar <viresh.linux@gmail.com> <viresh.kumar@st.com>
+Viresh Kumar <vireshk@kernel.org> <viresh.kumar@st.com>
+Viresh Kumar <vireshk@kernel.org> <viresh.linux@gmail.com>
+Viresh Kumar <vireshk@kernel.org> <viresh.kumar2@arm.com>
 Takashi YOSHII <takashi.yoshii.zj@renesas.com>
 Yusuke Goda <goda.yusuke@renesas.com>
 Gustavo Padovan <gustavo@las.ic.unicamp.br>
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt
index 65610bf52ebf..1b049be6c84f 100644
--- a/Documentation/arm/SPEAr/overview.txt
+++ b/Documentation/arm/SPEAr/overview.txt
@@ -60,4 +60,4 @@ Introduction
   Document Author
   ---------------
 
-  Viresh Kumar <viresh.linux@gmail.com>, (c) 2010-2012 ST Microelectronics
+  Viresh Kumar <vireshk@kernel.org>, (c) 2010-2012 ST Microelectronics
diff --git a/MAINTAINERS b/MAINTAINERS
index 2de150384670..093da8a3f8b2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6090,7 +6090,7 @@ F:	include/linux/ata.h
 F:	include/linux/libata.h
 
 LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 L:	linux-ide@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/libata.git
 S:	Maintained
@@ -7996,7 +7996,7 @@ S:	Maintained
 F:	drivers/pinctrl/samsung/
 
 PIN CONTROLLER - ST SPEAR
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 L:	spear-devel@list.st.com
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 W:	http://www.st.com/spear
@@ -8895,7 +8895,7 @@ S:	Maintained
 F:	drivers/tty/serial/
 
 SYNOPSYS DESIGNWARE DMAC DRIVER
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 M:	Andy Shevchenko <andriy.shevchenko@linux.intel.com>
 S:	Maintained
 F:	include/linux/dma/dw.h
@@ -9062,7 +9062,7 @@ S:	Maintained
 F:	drivers/mmc/host/sdhci-s3c*
 
 SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) ST SPEAR DRIVER
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 L:	spear-devel@list.st.com
 L:	linux-mmc@vger.kernel.org
 S:	Maintained
@@ -9600,7 +9600,7 @@ S:	Maintained
 F:	include/linux/compiler.h
 
 SPEAR PLATFORM SUPPORT
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 M:	Shiraz Hashim <shiraz.linux.kernel@gmail.com>
 L:	spear-devel@list.st.com
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@ -9609,7 +9609,7 @@ S:	Maintained
 F:	arch/arm/mach-spear/
 
 SPEAR CLOCK FRAMEWORK SUPPORT
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 L:	spear-devel@list.st.com
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 W:	http://www.st.com/spear
diff --git a/arch/arm/boot/dts/spear1310-evb.dts b/arch/arm/boot/dts/spear1310-evb.dts
index d42c84b1df8d..e48857249ce7 100644
--- a/arch/arm/boot/dts/spear1310-evb.dts
+++ b/arch/arm/boot/dts/spear1310-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr1310 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear1310.dtsi b/arch/arm/boot/dts/spear1310.dtsi
index 9d342920695a..54bc6d3cf290 100644
--- a/arch/arm/boot/dts/spear1310.dtsi
+++ b/arch/arm/boot/dts/spear1310.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for all SPEAr1310 SoCs
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear1340-evb.dts b/arch/arm/boot/dts/spear1340-evb.dts
index b23e05ed1d60..c611f5606dfe 100644
--- a/arch/arm/boot/dts/spear1340-evb.dts
+++ b/arch/arm/boot/dts/spear1340-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr1340 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear1340.dtsi b/arch/arm/boot/dts/spear1340.dtsi
index 13e1aa33daa2..df2232d767ed 100644
--- a/arch/arm/boot/dts/spear1340.dtsi
+++ b/arch/arm/boot/dts/spear1340.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for all SPEAr1340 SoCs
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear13xx.dtsi b/arch/arm/boot/dts/spear13xx.dtsi
index 40accc87e3a2..14594ce8c18a 100644
--- a/arch/arm/boot/dts/spear13xx.dtsi
+++ b/arch/arm/boot/dts/spear13xx.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for all SPEAr13xx SoCs
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear300-evb.dts b/arch/arm/boot/dts/spear300-evb.dts
index 5de1431653e4..e859e8288bcd 100644
--- a/arch/arm/boot/dts/spear300-evb.dts
+++ b/arch/arm/boot/dts/spear300-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr300 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear300.dtsi b/arch/arm/boot/dts/spear300.dtsi
index f79b3dfaabe6..f4e92e599729 100644
--- a/arch/arm/boot/dts/spear300.dtsi
+++ b/arch/arm/boot/dts/spear300.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr300 SoC
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear310-evb.dts b/arch/arm/boot/dts/spear310-evb.dts
index b09632963d15..070f2c1b7851 100644
--- a/arch/arm/boot/dts/spear310-evb.dts
+++ b/arch/arm/boot/dts/spear310-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr310 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear310.dtsi b/arch/arm/boot/dts/spear310.dtsi
index 95372080eea6..da210b454753 100644
--- a/arch/arm/boot/dts/spear310.dtsi
+++ b/arch/arm/boot/dts/spear310.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr310 SoC
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear320-evb.dts b/arch/arm/boot/dts/spear320-evb.dts
index fdedbb514102..1b1034477923 100644
--- a/arch/arm/boot/dts/spear320-evb.dts
+++ b/arch/arm/boot/dts/spear320-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr320 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear320.dtsi b/arch/arm/boot/dts/spear320.dtsi
index ffea342aeec9..22be6e5edaac 100644
--- a/arch/arm/boot/dts/spear320.dtsi
+++ b/arch/arm/boot/dts/spear320.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr320 SoC
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear3xx.dtsi b/arch/arm/boot/dts/spear3xx.dtsi
index f0e3fcf8e323..118135d75899 100644
--- a/arch/arm/boot/dts/spear3xx.dtsi
+++ b/arch/arm/boot/dts/spear3xx.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for all SPEAr3xx SoCs
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/mach-spear/generic.h b/arch/arm/mach-spear/generic.h
index a99d90a4d09c..06640914d9a0 100644
--- a/arch/arm/mach-spear/generic.h
+++ b/arch/arm/mach-spear/generic.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2009-2012 ST Microelectronics
  * Rajeev Kumar <rajeev-dlh.kumar@st.com>
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/include/mach/irqs.h b/arch/arm/mach-spear/include/mach/irqs.h
index 92da0a8c6bce..7058720c5278 100644
--- a/arch/arm/mach-spear/include/mach/irqs.h
+++ b/arch/arm/mach-spear/include/mach/irqs.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2009-2012 ST Microelectronics
  * Rajeev Kumar <rajeev-dlh.kumar@st.com>
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/include/mach/misc_regs.h b/arch/arm/mach-spear/include/mach/misc_regs.h
index 935639ce59ba..cfaf7c665b58 100644
--- a/arch/arm/mach-spear/include/mach/misc_regs.h
+++ b/arch/arm/mach-spear/include/mach/misc_regs.h
@@ -4,7 +4,7 @@
  * Miscellaneous registers definitions for SPEAr3xx machine family
  *
  * Copyright (C) 2009 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/include/mach/spear.h b/arch/arm/mach-spear/include/mach/spear.h
index f2d6a0176575..5ed841ccf8a3 100644
--- a/arch/arm/mach-spear/include/mach/spear.h
+++ b/arch/arm/mach-spear/include/mach/spear.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2009,2012 ST Microelectronics
  * Rajeev Kumar<rajeev-dlh.kumar@st.com>
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/include/mach/uncompress.h b/arch/arm/mach-spear/include/mach/uncompress.h
index 51b2dc93e4da..8439b9c12edb 100644
--- a/arch/arm/mach-spear/include/mach/uncompress.h
+++ b/arch/arm/mach-spear/include/mach/uncompress.h
@@ -4,7 +4,7 @@
  * Serial port stubs for kernel decompress status messages
  *
  * Copyright (C) 2009 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/pl080.c b/arch/arm/mach-spear/pl080.c
index cfa1199d0f4a..b4529f3e0ee9 100644
--- a/arch/arm/mach-spear/pl080.c
+++ b/arch/arm/mach-spear/pl080.c
@@ -4,7 +4,7 @@
  * DMAC pl080 definitions for SPEAr platform
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/pl080.h b/arch/arm/mach-spear/pl080.h
index eb6590ded40d..608dec6725ae 100644
--- a/arch/arm/mach-spear/pl080.h
+++ b/arch/arm/mach-spear/pl080.h
@@ -4,7 +4,7 @@
  * DMAC pl080 definitions for SPEAr platform
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/restart.c b/arch/arm/mach-spear/restart.c
index ce5e098c4888..b4342155a783 100644
--- a/arch/arm/mach-spear/restart.c
+++ b/arch/arm/mach-spear/restart.c
@@ -4,7 +4,7 @@
  * SPEAr platform specific restart functions
  *
  * Copyright (C) 2009 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear1310.c b/arch/arm/mach-spear/spear1310.c
index d9ce4d8000f0..cd5d375d91f0 100644
--- a/arch/arm/mach-spear/spear1310.c
+++ b/arch/arm/mach-spear/spear1310.c
@@ -4,7 +4,7 @@
  * SPEAr1310 machine source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear1340.c b/arch/arm/mach-spear/spear1340.c
index 3f3c0f124bd3..94594d5a446c 100644
--- a/arch/arm/mach-spear/spear1340.c
+++ b/arch/arm/mach-spear/spear1340.c
@@ -4,7 +4,7 @@
  * SPEAr1340 machine source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear13xx.c b/arch/arm/mach-spear/spear13xx.c
index 2e463a93468d..b7afce6795f4 100644
--- a/arch/arm/mach-spear/spear13xx.c
+++ b/arch/arm/mach-spear/spear13xx.c
@@ -4,7 +4,7 @@
  * SPEAr13XX machines common source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear300.c b/arch/arm/mach-spear/spear300.c
index b52e48f342f4..5b32edda2276 100644
--- a/arch/arm/mach-spear/spear300.c
+++ b/arch/arm/mach-spear/spear300.c
@@ -4,7 +4,7 @@
  * SPEAr300 machine source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear310.c b/arch/arm/mach-spear/spear310.c
index ed2029db391f..86a44ac7ff67 100644
--- a/arch/arm/mach-spear/spear310.c
+++ b/arch/arm/mach-spear/spear310.c
@@ -4,7 +4,7 @@
  * SPEAr310 machine source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear320.c b/arch/arm/mach-spear/spear320.c
index bf634b32a930..d45d751926c5 100644
--- a/arch/arm/mach-spear/spear320.c
+++ b/arch/arm/mach-spear/spear320.c
@@ -4,7 +4,7 @@
  * SPEAr320 machine source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear3xx.c b/arch/arm/mach-spear/spear3xx.c
index bf3b1fd8cb23..23394ac76cf2 100644
--- a/arch/arm/mach-spear/spear3xx.c
+++ b/arch/arm/mach-spear/spear3xx.c
@@ -4,7 +4,7 @@
  * SPEAr3XX machines common source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/ata/pata_arasan_cf.c b/drivers/ata/pata_arasan_cf.c
index a9b0c820f2eb..5d9ee99c2148 100644
--- a/drivers/ata/pata_arasan_cf.c
+++ b/drivers/ata/pata_arasan_cf.c
@@ -4,7 +4,7 @@
  * Arasan Compact Flash host controller source file
  *
  * Copyright (C) 2011 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -968,7 +968,7 @@ static struct platform_driver arasan_cf_driver = {
 
 module_platform_driver(arasan_cf_driver);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("Arasan ATA Compact Flash driver");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:" DRIVER_NAME);
diff --git a/drivers/clk/spear/clk-aux-synth.c b/drivers/clk/spear/clk-aux-synth.c
index bdfb4421c643..f271c350ef94 100644
--- a/drivers/clk/spear/clk-aux-synth.c
+++ b/drivers/clk/spear/clk-aux-synth.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk-frac-synth.c b/drivers/clk/spear/clk-frac-synth.c
index dffd4ce6c8b5..58d678b5b40a 100644
--- a/drivers/clk/spear/clk-frac-synth.c
+++ b/drivers/clk/spear/clk-frac-synth.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk-gpt-synth.c b/drivers/clk/spear/clk-gpt-synth.c
index 1afc18c4effc..1a722e99e76e 100644
--- a/drivers/clk/spear/clk-gpt-synth.c
+++ b/drivers/clk/spear/clk-gpt-synth.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk-vco-pll.c b/drivers/clk/spear/clk-vco-pll.c
index 1b9b65bca51e..5ebddc528145 100644
--- a/drivers/clk/spear/clk-vco-pll.c
+++ b/drivers/clk/spear/clk-vco-pll.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk.c b/drivers/clk/spear/clk.c
index 628b6d5ed3d9..157fe099ea6a 100644
--- a/drivers/clk/spear/clk.c
+++ b/drivers/clk/spear/clk.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk.h b/drivers/clk/spear/clk.h
index 931737677dfa..9834944f08b1 100644
--- a/drivers/clk/spear/clk.h
+++ b/drivers/clk/spear/clk.h
@@ -2,7 +2,7 @@
  * Clock framework definitions for SPEAr platform
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/spear1310_clock.c b/drivers/clk/spear/spear1310_clock.c
index 4daa5977793a..222ce108b41a 100644
--- a/drivers/clk/spear/spear1310_clock.c
+++ b/drivers/clk/spear/spear1310_clock.c
@@ -4,7 +4,7 @@
  * SPEAr1310 machine clock framework source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/spear1340_clock.c b/drivers/clk/spear/spear1340_clock.c
index 5a5c6648308d..973c9d3fbcf8 100644
--- a/drivers/clk/spear/spear1340_clock.c
+++ b/drivers/clk/spear/spear1340_clock.c
@@ -4,7 +4,7 @@
  * SPEAr1340 machine clock framework source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/spear3xx_clock.c b/drivers/clk/spear/spear3xx_clock.c
index bb5f387774e2..404a55edd613 100644
--- a/drivers/clk/spear/spear3xx_clock.c
+++ b/drivers/clk/spear/spear3xx_clock.c
@@ -2,7 +2,7 @@
  * SPEAr3xx machines clock framework source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/spear6xx_clock.c b/drivers/clk/spear/spear6xx_clock.c
index 4f649c9cb094..231061fa73a4 100644
--- a/drivers/clk/spear/spear6xx_clock.c
+++ b/drivers/clk/spear/spear6xx_clock.c
@@ -2,7 +2,7 @@
  * SPEAr6xx machines clock framework source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 1022c2e1a2b0..cf1c87fa1edd 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -1746,4 +1746,4 @@ EXPORT_SYMBOL_GPL(dw_dma_enable);
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Synopsys DesignWare DMA Controller core driver");
 MODULE_AUTHOR("Haavard Skinnemoen (Atmel)");
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
diff --git a/drivers/irqchip/spear-shirq.c b/drivers/irqchip/spear-shirq.c
index a45121546caf..acb721b31bcf 100644
--- a/drivers/irqchip/spear-shirq.c
+++ b/drivers/irqchip/spear-shirq.c
@@ -2,7 +2,7 @@
  * SPEAr platform shared irq layer source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * Copyright (C) 2012 ST Microelectronics
  * Shiraz Hashim <shiraz.linux.kernel@gmail.com>
diff --git a/drivers/mfd/stmpe-i2c.c b/drivers/mfd/stmpe-i2c.c
index 5c054031c3f8..e14c8c9d189b 100644
--- a/drivers/mfd/stmpe-i2c.c
+++ b/drivers/mfd/stmpe-i2c.c
@@ -6,7 +6,7 @@
  *
  * License Terms: GNU General Public License, version 2
  * Author: Rabin Vincent <rabin.vincent@stericsson.com> for ST-Ericsson
- * Author: Viresh Kumar <viresh.linux@gmail.com> for ST Microelectronics
+ * Author: Viresh Kumar <vireshk@kernel.org> for ST Microelectronics
  */
 
 #include <linux/i2c.h>
diff --git a/drivers/mfd/stmpe-spi.c b/drivers/mfd/stmpe-spi.c
index a81badbaa917..6fdb30e84a2b 100644
--- a/drivers/mfd/stmpe-spi.c
+++ b/drivers/mfd/stmpe-spi.c
@@ -4,7 +4,7 @@
  * Copyright (C) ST Microelectronics SA 2011
  *
  * License Terms: GNU General Public License, version 2
- * Author: Viresh Kumar <viresh.linux@gmail.com> for ST Microelectronics
+ * Author: Viresh Kumar <vireshk@kernel.org> for ST Microelectronics
  */
 
 #include <linux/spi/spi.h>
@@ -146,4 +146,4 @@ module_exit(stmpe_exit);
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("STMPE MFD SPI Interface Driver");
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c
index df088343d60f..255a896769b8 100644
--- a/drivers/mmc/host/sdhci-spear.c
+++ b/drivers/mmc/host/sdhci-spear.c
@@ -4,7 +4,7 @@
  * Support of SDHCI platform devices for spear soc family
  *
  * Copyright (C) 2010 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * Inspired by sdhci-pltfm.c
  *
@@ -211,5 +211,5 @@ static struct platform_driver sdhci_driver = {
 module_platform_driver(sdhci_driver);
 
 MODULE_DESCRIPTION("SPEAr Secure Digital Host Controller Interface driver");
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/pinctrl/spear/pinctrl-spear.c b/drivers/pinctrl/spear/pinctrl-spear.c
index f87a5eaf75da..0afaf79a4e51 100644
--- a/drivers/pinctrl/spear/pinctrl-spear.c
+++ b/drivers/pinctrl/spear/pinctrl-spear.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * Inspired from:
  * - U300 Pinctl drivers
diff --git a/drivers/pinctrl/spear/pinctrl-spear.h b/drivers/pinctrl/spear/pinctrl-spear.h
index dc8bf85ecb2a..27c2cc8d83ad 100644
--- a/drivers/pinctrl/spear/pinctrl-spear.h
+++ b/drivers/pinctrl/spear/pinctrl-spear.h
@@ -2,7 +2,7 @@
  * Driver header file for the ST Microelectronics SPEAr pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/pinctrl/spear/pinctrl-spear1310.c b/drivers/pinctrl/spear/pinctrl-spear1310.c
index a7bdc537efa7..92611bb757ac 100644
--- a/drivers/pinctrl/spear/pinctrl-spear1310.c
+++ b/drivers/pinctrl/spear/pinctrl-spear1310.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr1310 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -2730,7 +2730,7 @@ static void __exit spear1310_pinctrl_exit(void)
 }
 module_exit(spear1310_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr1310 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear1310_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear1340.c b/drivers/pinctrl/spear/pinctrl-spear1340.c
index f43ec85a0328..f842e9dc40d0 100644
--- a/drivers/pinctrl/spear/pinctrl-spear1340.c
+++ b/drivers/pinctrl/spear/pinctrl-spear1340.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr1340 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -2046,7 +2046,7 @@ static void __exit spear1340_pinctrl_exit(void)
 }
 module_exit(spear1340_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr1340 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear1340_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear300.c b/drivers/pinctrl/spear/pinctrl-spear300.c
index da8990a8eeef..d998a2ccff48 100644
--- a/drivers/pinctrl/spear/pinctrl-spear300.c
+++ b/drivers/pinctrl/spear/pinctrl-spear300.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr300 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -703,7 +703,7 @@ static void __exit spear300_pinctrl_exit(void)
 }
 module_exit(spear300_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr300 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear300_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear310.c b/drivers/pinctrl/spear/pinctrl-spear310.c
index 31ede51e819b..609b18aceb16 100644
--- a/drivers/pinctrl/spear/pinctrl-spear310.c
+++ b/drivers/pinctrl/spear/pinctrl-spear310.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr310 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -426,7 +426,7 @@ static void __exit spear310_pinctrl_exit(void)
 }
 module_exit(spear310_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr310 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear310_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear320.c b/drivers/pinctrl/spear/pinctrl-spear320.c
index 506e40b641e0..c07114431bd4 100644
--- a/drivers/pinctrl/spear/pinctrl-spear320.c
+++ b/drivers/pinctrl/spear/pinctrl-spear320.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr320 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -3467,7 +3467,7 @@ static void __exit spear320_pinctrl_exit(void)
 }
 module_exit(spear320_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr320 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear320_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear3xx.c b/drivers/pinctrl/spear/pinctrl-spear3xx.c
index 12ee21af766b..d3119aafe709 100644
--- a/drivers/pinctrl/spear/pinctrl-spear3xx.c
+++ b/drivers/pinctrl/spear/pinctrl-spear3xx.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr3xx pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/pinctrl/spear/pinctrl-spear3xx.h b/drivers/pinctrl/spear/pinctrl-spear3xx.h
index 7860b36053c4..ce19dcf8f08b 100644
--- a/drivers/pinctrl/spear/pinctrl-spear3xx.h
+++ b/drivers/pinctrl/spear/pinctrl-spear3xx.h
@@ -2,7 +2,7 @@
  * Header file for the ST Microelectronics SPEAr3xx pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c
index c1b03f4235b9..4e7fec36f5c3 100644
--- a/drivers/watchdog/sp805_wdt.c
+++ b/drivers/watchdog/sp805_wdt.c
@@ -4,7 +4,7 @@
  * Watchdog driver for ARM SP805 watchdog module
  *
  * Copyright (C) 2010 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2 or later. This program is licensed "as is" without any
@@ -303,6 +303,6 @@ static struct amba_driver sp805_wdt_driver = {
 
 module_amba_driver(sp805_wdt_driver);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ARM SP805 Watchdog Driver");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/amba/sp810.h b/include/linux/amba/sp810.h
index c7df89f99115..58fe9e8b6fd7 100644
--- a/include/linux/amba/sp810.h
+++ b/include/linux/amba/sp810.h
@@ -2,7 +2,7 @@
  * ARM PrimeXsys System Controller SP810 header file
  *
  * Copyright (C) 2009 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/include/linux/pata_arasan_cf_data.h b/include/linux/pata_arasan_cf_data.h
index 3cc21c9cc1e8..9fade5dd2e86 100644
--- a/include/linux/pata_arasan_cf_data.h
+++ b/include/linux/pata_arasan_cf_data.h
@@ -4,7 +4,7 @@
  * Arasan Compact Flash host controller platform data header file
  *
  * Copyright (C) 2011 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
-- 
cgit v1.2.3-70-g09d2


From e2cfc91120fa01e3458167054af993fb83d7d0ec Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Fri, 17 Jul 2015 16:24:18 -0700
Subject: mm/page_owner: set correct gfp_mask on page_owner

Currently, we set wrong gfp_mask to page_owner info in case of isolated
freepage by compaction and split page.  It causes incorrect mixed
pageblock report that we can get from '/proc/pagetypeinfo'.  This metric
is really useful to measure fragmentation effect so should be accurate.
This patch fixes it by setting correct information.

Without this patch, after kernel build workload is finished, number of
mixed pageblock is 112 among roughly 210 movable pageblocks.

But, with this fix, output shows that mixed pageblock is just 57.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_owner.h | 13 +++++++++++++
 mm/page_alloc.c            |  8 +++++---
 mm/page_owner.c            |  7 +++++++
 3 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index b48c3471c254..cacaabea8a09 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -8,6 +8,7 @@ extern struct page_ext_operations page_owner_ops;
 extern void __reset_page_owner(struct page *page, unsigned int order);
 extern void __set_page_owner(struct page *page,
 			unsigned int order, gfp_t gfp_mask);
+extern gfp_t __get_page_owner_gfp(struct page *page);
 
 static inline void reset_page_owner(struct page *page, unsigned int order)
 {
@@ -25,6 +26,14 @@ static inline void set_page_owner(struct page *page,
 
 	__set_page_owner(page, order, gfp_mask);
 }
+
+static inline gfp_t get_page_owner_gfp(struct page *page)
+{
+	if (likely(!page_owner_inited))
+		return 0;
+
+	return __get_page_owner_gfp(page);
+}
 #else
 static inline void reset_page_owner(struct page *page, unsigned int order)
 {
@@ -33,6 +42,10 @@ static inline void set_page_owner(struct page *page,
 			unsigned int order, gfp_t gfp_mask)
 {
 }
+static inline gfp_t get_page_owner_gfp(struct page *page)
+{
+	return 0;
+}
 
 #endif /* CONFIG_PAGE_OWNER */
 #endif /* __LINUX_PAGE_OWNER_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fbba675a0bd9..ef19f22b2b7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1948,6 +1948,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
 void split_page(struct page *page, unsigned int order)
 {
 	int i;
+	gfp_t gfp_mask;
 
 	VM_BUG_ON_PAGE(PageCompound(page), page);
 	VM_BUG_ON_PAGE(!page_count(page), page);
@@ -1961,10 +1962,11 @@ void split_page(struct page *page, unsigned int order)
 		split_page(virt_to_page(page[0].shadow), order);
 #endif
 
-	set_page_owner(page, 0, 0);
+	gfp_mask = get_page_owner_gfp(page);
+	set_page_owner(page, 0, gfp_mask);
 	for (i = 1; i < (1 << order); i++) {
 		set_page_refcounted(page + i);
-		set_page_owner(page + i, 0, 0);
+		set_page_owner(page + i, 0, gfp_mask);
 	}
 }
 EXPORT_SYMBOL_GPL(split_page);
@@ -1994,7 +1996,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	set_page_owner(page, order, 0);
+	set_page_owner(page, order, __GFP_MOVABLE);
 
 	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
diff --git a/mm/page_owner.c b/mm/page_owner.c
index bd5f842b56d2..983c3a10fa07 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -76,6 +76,13 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
 	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
 }
 
+gfp_t __get_page_owner_gfp(struct page *page)
+{
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	return page_ext->gfp_mask;
+}
+
 static ssize_t
 print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 		struct page *page, struct page_ext *page_ext)
-- 
cgit v1.2.3-70-g09d2


From 0c8c0f03e3a292e031596484275c14cf39c0ab7a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@sr71.net>
Date: Fri, 17 Jul 2015 12:28:11 +0200
Subject: x86/fpu, sched: Dynamically allocate 'struct fpu'

The FPU rewrite removed the dynamic allocations of 'struct fpu'.
But, this potentially wastes massive amounts of memory (2k per
task on systems that do not have AVX-512 for instance).

Instead of having a separate slab, this patch just appends the
space that we need to the 'task_struct' which we dynamically
allocate already.  This saves from doing an extra slab
allocation at fork().

The only real downside here is that we have to stick everything
and the end of the task_struct.  But, I think the
BUILD_BUG_ON()s I stuck in there should keep that from being too
fragile.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437128892-9831-2-git-send-email-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/types.h | 72 +++++++++++++++++++++-------------------
 arch/x86/include/asm/processor.h | 10 ++++--
 arch/x86/kernel/fpu/init.c       | 39 ++++++++++++++++++++++
 arch/x86/kernel/process.c        |  2 +-
 fs/proc/kcore.c                  |  4 +--
 include/linux/sched.h            | 12 +++++--
 kernel/fork.c                    |  8 ++++-
 7 files changed, 104 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
 	struct fxregs_state		fxsave;
 	struct swregs_state		soft;
 	struct xregs_state		xsave;
+	u8 __padding[PAGE_SIZE];
 };
 
 /*
@@ -197,40 +198,6 @@ union fpregs_state {
  * state fields:
  */
 struct fpu {
-	/*
-	 * @state:
-	 *
-	 * In-memory copy of all FPU registers that we save/restore
-	 * over context switches. If the task is using the FPU then
-	 * the registers in the FPU are more recent than this state
-	 * copy. If the task context-switches away then they get
-	 * saved here and represent the FPU state.
-	 *
-	 * After context switches there may be a (short) time period
-	 * during which the in-FPU hardware registers are unchanged
-	 * and still perfectly match this state, if the tasks
-	 * scheduled afterwards are not using the FPU.
-	 *
-	 * This is the 'lazy restore' window of optimization, which
-	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
-	 *
-	 * We detect whether a subsequent task uses the FPU via setting
-	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
-	 *
-	 * During this window, if the task gets scheduled again, we
-	 * might be able to skip having to do a restore from this
-	 * memory buffer to the hardware registers - at the cost of
-	 * incurring the overhead of #NM fault traps.
-	 *
-	 * Note that on modern CPUs that support the XSAVEOPT (or other
-	 * optimized XSAVE instructions), we don't use #NM traps anymore,
-	 * as the hardware can track whether FPU registers need saving
-	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
-	 * logic, which unconditionally saves/restores all FPU state
-	 * across context switches. (if FPU state exists.)
-	 */
-	union fpregs_state		state;
-
 	/*
 	 * @last_cpu:
 	 *
@@ -288,6 +255,43 @@ struct fpu {
 	 * deal with bursty apps that only use the FPU for a short time:
 	 */
 	unsigned char			counter;
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state		state;
+	/*
+	 * WARNING: 'state' is dynamically-sized.  Do not put
+	 * anything after it here.
+	 */
 };
 
 #endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..944f1785ed0d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -390,9 +390,6 @@ struct thread_struct {
 #endif
 	unsigned long		gs;
 
-	/* Floating point and extended processor state */
-	struct fpu		fpu;
-
 	/* Save middle states of ptrace breakpoints */
 	struct perf_event	*ptrace_bps[HBP_NUM];
 	/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+	/*
+	 * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
+	 * the end.
+	 */
 };
 
 /*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 32826791e675..deacbfa6b33e 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -136,6 +136,45 @@ static void __init fpu__init_system_generic(void)
 unsigned int xstate_size;
 EXPORT_SYMBOL_GPL(xstate_size);
 
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)	\
+	BUILD_BUG_ON((sizeof(TYPE) -			\
+			offsetof(TYPE, MEMBER) -	\
+			sizeof(((TYPE *)0)->MEMBER)) > 	\
+			0)				\
+
+/*
+ * We append the 'struct fpu' to the task_struct.
+ */
+int __weak arch_task_struct_size(void)
+{
+	int task_size = sizeof(struct task_struct);
+
+	/*
+	 * Subtract off the static size of the register state.
+	 * It potentially has a bunch of padding.
+	 */
+	task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+	/*
+	 * Add back the dynamically-calculated register state
+	 * size.
+	 */
+	task_size += xstate_size;
+
+	/*
+	 * We dynamically size 'struct fpu', so we require that
+	 * it be at the end of 'thread_struct' and that
+	 * 'thread_struct' be at the end of 'task_struct'.  If
+	 * you hit a compile error here, check the structure to
+	 * see if something got added to the end.
+	 */
+	CHECK_MEMBER_AT_END_OF(struct fpu, state);
+	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+	return task_size;
+}
+
 /*
  * Set up the xstate_size based on the legacy FPU context size.
  *
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..975420eac105 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	*dst = *src;
+	memcpy(dst, src, arch_task_struct_size());
 
 	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426321..a0fe99485687 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			     roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(sizeof(struct task_struct), 4);
+			roundup(arch_task_struct_size(), 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].datasz	= arch_task_struct_size();
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae21f1591615..e43a41d892b6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1522,8 +1522,6 @@ struct task_struct {
 /* hung task detection */
 	unsigned long last_switch_count;
 #endif
-/* CPU-specific state of this task */
-	struct thread_struct thread;
 /* filesystem information */
 	struct fs_struct *fs;
 /* open file information */
@@ -1778,8 +1776,18 @@ struct task_struct {
 	unsigned long	task_state_change;
 #endif
 	int pagefault_disabled;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+/*
+ * WARNING: on x86, 'thread_struct' contains a variable-sized
+ * structure.  It *MUST* be at the end of 'task_struct'.
+ *
+ * Do not put anything below here!
+ */
 };
 
+extern int arch_task_struct_size(void);
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..431b67a6098c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,15 +287,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
+int __weak arch_task_struct_size(void)
+{
+	return sizeof(struct task_struct);
+}
+
 void __init fork_init(void)
 {
+	int task_struct_size = arch_task_struct_size();
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 5aaeb5c01c5b6c0be7b7aadbf3ace9f3a4458c3d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 17 Jul 2015 12:28:12 +0200
Subject: x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and
 use it on x86

Don't burden architectures without dynamic task_struct sizing
with the overhead of dynamic sizing.

Also optimize the x86 code a bit by caching task_struct_size.

Acked-and-Tested-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437128892-9831-3-git-send-email-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/Kconfig               |  4 ++++
 arch/x86/Kconfig           |  1 +
 arch/x86/kernel/fpu/init.c | 17 +++++++++--------
 arch/x86/kernel/process.c  |  2 +-
 fs/proc/kcore.c            |  4 ++--
 include/linux/sched.h      |  6 +++++-
 kernel/fork.c              | 11 +++++------
 7 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index bec6666a3cc4..8a8ea7110de8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR
 config ARCH_THREAD_INFO_ALLOCATOR
 	bool
 
+# Select if arch wants to size task_struct dynamically via arch_task_struct_size:
+config ARCH_WANTS_DYNAMIC_TASK_STRUCT
+	bool
+
 config HAVE_REGS_AND_STACK_ACCESS_API
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3dbb7e7909ca..b3a1a5d77d92 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF		if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION	if X86_32
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index deacbfa6b33e..0b39173dd971 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
 #include <asm/fpu/internal.h>
 #include <asm/tlbflush.h>
 
+#include <linux/sched.h>
+
 /*
  * Initialize the TS bit in CR0 according to the style of context-switches
  * we are using:
@@ -136,16 +138,14 @@ static void __init fpu__init_system_generic(void)
 unsigned int xstate_size;
 EXPORT_SYMBOL_GPL(xstate_size);
 
-#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)	\
-	BUILD_BUG_ON((sizeof(TYPE) -			\
-			offsetof(TYPE, MEMBER) -	\
-			sizeof(((TYPE *)0)->MEMBER)) > 	\
-			0)				\
+/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+	BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
 
 /*
- * We append the 'struct fpu' to the task_struct.
+ * We append the 'struct fpu' to the task_struct:
  */
-int __weak arch_task_struct_size(void)
+static void __init fpu__init_task_struct_size(void)
 {
 	int task_size = sizeof(struct task_struct);
 
@@ -172,7 +172,7 @@ int __weak arch_task_struct_size(void)
 	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
 	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
 
-	return task_size;
+	arch_task_struct_size = task_size;
 }
 
 /*
@@ -326,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
 	fpu__init_system_generic();
 	fpu__init_system_xstate_size_legacy();
 	fpu__init_system_xstate();
+	fpu__init_task_struct_size();
 
 	fpu__init_system_ctx_switch();
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 975420eac105..397688beed4b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	memcpy(dst, src, arch_task_struct_size());
+	memcpy(dst, src, arch_task_struct_size);
 
 	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a0fe99485687..92e6726f6e37 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			     roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(arch_task_struct_size(), 4);
+			roundup(arch_task_struct_size, 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= arch_task_struct_size();
+	notes[2].datasz	= arch_task_struct_size;
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e43a41d892b6..04b5ada460b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1786,7 +1786,11 @@ struct task_struct {
  */
 };
 
-extern int arch_task_struct_size(void);
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+extern int arch_task_struct_size __read_mostly;
+#else
+# define arch_task_struct_size (sizeof(struct task_struct))
+#endif
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
diff --git a/kernel/fork.c b/kernel/fork.c
index 431b67a6098c..dbd9b8d7b7cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,21 +287,20 @@ static void set_max_threads(unsigned int max_threads_suggested)
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
-int __weak arch_task_struct_size(void)
-{
-	return sizeof(struct task_struct);
-}
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
 
 void __init fork_init(void)
 {
-	int task_struct_size = arch_task_struct_size();
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", task_struct_size,
+		kmem_cache_create("task_struct", arch_task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 5c31252c4a86dc591c23f1a951edd52ad791ef0e Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 1 Jul 2015 10:21:47 +0200
Subject: pwm: Add the pwm_is_enabled() helper

Some PWM drivers are testing the PWMF_ENABLED flag. Create a helper
function to hide the logic behind enabled test. This will allow us to
smoothly move from the current approach to an atomic PWM update
approach.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c            |  4 ++--
 drivers/pwm/pwm-atmel-tcb.c   |  2 +-
 drivers/pwm/pwm-atmel.c       |  6 +++---
 drivers/pwm/pwm-bcm-kona.c    |  4 ++--
 drivers/pwm/pwm-ep93xx.c      |  4 ++--
 drivers/pwm/pwm-imx.c         |  2 +-
 drivers/pwm/pwm-mxs.c         |  4 ++--
 drivers/pwm/pwm-renesas-tpu.c |  2 +-
 drivers/pwm/pwm-tegra.c       |  6 +++---
 drivers/pwm/pwm-tiecap.c      | 10 +++++-----
 drivers/pwm/pwm-tiehrpwm.c    |  6 +++---
 drivers/pwm/sysfs.c           |  2 +-
 include/linux/pwm.h           |  5 +++++
 13 files changed, 31 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 3a7769fe53de..f7c11d2dec37 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -455,7 +455,7 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
 	if (!pwm->chip->ops->set_polarity)
 		return -ENOSYS;
 
-	if (test_bit(PWMF_ENABLED, &pwm->flags))
+	if (pwm_is_enabled(pwm))
 		return -EBUSY;
 
 	err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity);
@@ -853,7 +853,7 @@ static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
 		if (test_bit(PWMF_REQUESTED, &pwm->flags))
 			seq_puts(s, " requested");
 
-		if (test_bit(PWMF_ENABLED, &pwm->flags))
+		if (pwm_is_enabled(pwm))
 			seq_puts(s, " enabled");
 
 		seq_puts(s, "\n");
diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c
index d14e0677c92d..6da01b3bf6f4 100644
--- a/drivers/pwm/pwm-atmel-tcb.c
+++ b/drivers/pwm/pwm-atmel-tcb.c
@@ -347,7 +347,7 @@ static int atmel_tcb_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	tcbpwm->duty = duty;
 
 	/* If the PWM is enabled, call enable to apply the new conf */
-	if (test_bit(PWMF_ENABLED, &pwm->flags))
+	if (pwm_is_enabled(pwm))
 		atmel_tcb_pwm_enable(chip, pwm);
 
 	return 0;
diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c
index a947c9095d9d..b3b294de88e0 100644
--- a/drivers/pwm/pwm-atmel.c
+++ b/drivers/pwm/pwm-atmel.c
@@ -114,7 +114,7 @@ static int atmel_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	u32 val;
 	int ret;
 
-	if (test_bit(PWMF_ENABLED, &pwm->flags) && (period_ns != pwm->period)) {
+	if (pwm_is_enabled(pwm) && (period_ns != pwm->period)) {
 		dev_err(chip->dev, "cannot change PWM period while enabled\n");
 		return -EBUSY;
 	}
@@ -176,7 +176,7 @@ static void atmel_pwm_config_v1(struct pwm_chip *chip, struct pwm_device *pwm,
 	 * If the PWM channel is enabled, only update CDTY by using the update
 	 * register, it needs to set bit 10 of CMR to 0
 	 */
-	if (test_bit(PWMF_ENABLED, &pwm->flags))
+	if (pwm_is_enabled(pwm))
 		return;
 	/*
 	 * If the PWM channel is disabled, write value to duty and period
@@ -191,7 +191,7 @@ static void atmel_pwm_config_v2(struct pwm_chip *chip, struct pwm_device *pwm,
 {
 	struct atmel_pwm_chip *atmel_pwm = to_atmel_pwm_chip(chip);
 
-	if (test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (pwm_is_enabled(pwm)) {
 		/*
 		 * If the PWM channel is enabled, using the duty update register
 		 * to update the value.
diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c
index 7af8fea2dc5b..dfdcf88279ae 100644
--- a/drivers/pwm/pwm-bcm-kona.c
+++ b/drivers/pwm/pwm-bcm-kona.c
@@ -134,7 +134,7 @@ static int kona_pwmc_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	}
 
 	/* If the PWM channel is enabled, write the settings to the HW */
-	if (test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (pwm_is_enabled(pwm)) {
 		value = readl(kp->base + PRESCALE_OFFSET);
 		value &= ~PRESCALE_MASK(chan);
 		value |= prescale << PRESCALE_SHIFT(chan);
@@ -287,7 +287,7 @@ static int kona_pwmc_remove(struct platform_device *pdev)
 	unsigned int chan;
 
 	for (chan = 0; chan < kp->chip.npwm; chan++)
-		if (test_bit(PWMF_ENABLED, &kp->chip.pwms[chan].flags))
+		if (pwm_is_enabled(&kp->chip.pwms[chan]))
 			clk_disable_unprepare(kp->clk);
 
 	return pwmchip_remove(&kp->chip);
diff --git a/drivers/pwm/pwm-ep93xx.c b/drivers/pwm/pwm-ep93xx.c
index e593e9c45c51..bbf10ae02f0e 100644
--- a/drivers/pwm/pwm-ep93xx.c
+++ b/drivers/pwm/pwm-ep93xx.c
@@ -82,7 +82,7 @@ static int ep93xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	 * The clock needs to be enabled to access the PWM registers.
 	 * Configuration can be changed at any time.
 	 */
-	if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (!pwm_is_enabled(pwm)) {
 		ret = clk_enable(ep93xx_pwm->clk);
 		if (ret)
 			return ret;
@@ -113,7 +113,7 @@ static int ep93xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 		ret = -EINVAL;
 	}
 
-	if (!test_bit(PWMF_ENABLED, &pwm->flags))
+	if (!pwm_is_enabled(pwm))
 		clk_disable(ep93xx_pwm->clk);
 
 	return ret;
diff --git a/drivers/pwm/pwm-imx.c b/drivers/pwm/pwm-imx.c
index 66d6f0c5c421..008dc646225e 100644
--- a/drivers/pwm/pwm-imx.c
+++ b/drivers/pwm/pwm-imx.c
@@ -114,7 +114,7 @@ static int imx_pwm_config_v2(struct pwm_chip *chip,
 	unsigned long long c;
 	unsigned long period_cycles, duty_cycles, prescale;
 	unsigned int period_ms;
-	bool enable = test_bit(PWMF_ENABLED, &pwm->flags);
+	bool enable = pwm_is_enabled(pwm);
 	int wait_count = 0, fifoav;
 	u32 cr, sr;
 
diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c
index b430811e14f5..9a596324ebef 100644
--- a/drivers/pwm/pwm-mxs.c
+++ b/drivers/pwm/pwm-mxs.c
@@ -77,7 +77,7 @@ static int mxs_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	 * If the PWM channel is disabled, make sure to turn on the clock
 	 * before writing the register. Otherwise, keep it enabled.
 	 */
-	if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (!pwm_is_enabled(pwm)) {
 		ret = clk_prepare_enable(mxs->clk);
 		if (ret)
 			return ret;
@@ -92,7 +92,7 @@ static int mxs_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	/*
 	 * If the PWM is not enabled, turn the clock off again to save power.
 	 */
-	if (!test_bit(PWMF_ENABLED, &pwm->flags))
+	if (!pwm_is_enabled(pwm))
 		clk_disable_unprepare(mxs->clk);
 
 	return 0;
diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c
index ee63f9e9d0fb..075c1a764ba2 100644
--- a/drivers/pwm/pwm-renesas-tpu.c
+++ b/drivers/pwm/pwm-renesas-tpu.c
@@ -301,7 +301,7 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm,
 	pwm->duty = duty;
 
 	/* If the channel is disabled we're done. */
-	if (!test_bit(PWMF_ENABLED, &_pwm->flags))
+	if (!pwm_is_enabled(_pwm))
 		return 0;
 
 	if (duty_only && pwm->timer_on) {
diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c
index cabd7d8e05cc..d4de0607b502 100644
--- a/drivers/pwm/pwm-tegra.c
+++ b/drivers/pwm/pwm-tegra.c
@@ -112,7 +112,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	 * If the PWM channel is disabled, make sure to turn on the clock
 	 * before writing the register. Otherwise, keep it enabled.
 	 */
-	if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (!pwm_is_enabled(pwm)) {
 		err = clk_prepare_enable(pc->clk);
 		if (err < 0)
 			return err;
@@ -124,7 +124,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	/*
 	 * If the PWM is not enabled, turn the clock off again to save power.
 	 */
-	if (!test_bit(PWMF_ENABLED, &pwm->flags))
+	if (!pwm_is_enabled(pwm))
 		clk_disable_unprepare(pc->clk);
 
 	return 0;
@@ -214,7 +214,7 @@ static int tegra_pwm_remove(struct platform_device *pdev)
 	for (i = 0; i < NUM_PWM; i++) {
 		struct pwm_device *pwm = &pc->chip.pwms[i];
 
-		if (!test_bit(PWMF_ENABLED, &pwm->flags))
+		if (!pwm_is_enabled(pwm))
 			if (clk_prepare_enable(pc->clk) < 0)
 				continue;
 
diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c
index e557befdf4e6..616af764a276 100644
--- a/drivers/pwm/pwm-tiecap.c
+++ b/drivers/pwm/pwm-tiecap.c
@@ -97,7 +97,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 
 	writew(reg_val, pc->mmio_base + ECCTL2);
 
-	if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (!pwm_is_enabled(pwm)) {
 		/* Update active registers if not running */
 		writel(duty_cycles, pc->mmio_base + CAP2);
 		writel(period_cycles, pc->mmio_base + CAP1);
@@ -111,7 +111,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 		writel(period_cycles, pc->mmio_base + CAP3);
 	}
 
-	if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (!pwm_is_enabled(pwm)) {
 		reg_val = readw(pc->mmio_base + ECCTL2);
 		/* Disable APWM mode to put APWM output Low */
 		reg_val &= ~ECCTL2_APWM_MODE;
@@ -179,7 +179,7 @@ static void ecap_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
 
 static void ecap_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
 {
-	if (test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (pwm_is_enabled(pwm)) {
 		dev_warn(chip->dev, "Removing PWM device without disabling\n");
 		pm_runtime_put_sync(chip->dev);
 	}
@@ -306,7 +306,7 @@ static int ecap_pwm_suspend(struct device *dev)
 	ecap_pwm_save_context(pc);
 
 	/* Disable explicitly if PWM is running */
-	if (test_bit(PWMF_ENABLED, &pwm->flags))
+	if (pwm_is_enabled(pwm))
 		pm_runtime_put_sync(dev);
 
 	return 0;
@@ -318,7 +318,7 @@ static int ecap_pwm_resume(struct device *dev)
 	struct pwm_device *pwm = pc->chip.pwms;
 
 	/* Enable explicitly if PWM was running */
-	if (test_bit(PWMF_ENABLED, &pwm->flags))
+	if (pwm_is_enabled(pwm))
 		pm_runtime_get_sync(dev);
 
 	ecap_pwm_restore_context(pc);
diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c
index 694b3cf7694b..6a41e66015b6 100644
--- a/drivers/pwm/pwm-tiehrpwm.c
+++ b/drivers/pwm/pwm-tiehrpwm.c
@@ -407,7 +407,7 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
 {
 	struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip);
 
-	if (test_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (pwm_is_enabled(pwm)) {
 		dev_warn(chip->dev, "Removing PWM device without disabling\n");
 		pm_runtime_put_sync(chip->dev);
 	}
@@ -565,7 +565,7 @@ static int ehrpwm_pwm_suspend(struct device *dev)
 	for (i = 0; i < pc->chip.npwm; i++) {
 		struct pwm_device *pwm = &pc->chip.pwms[i];
 
-		if (!test_bit(PWMF_ENABLED, &pwm->flags))
+		if (!pwm_is_enabled(pwm))
 			continue;
 
 		/* Disable explicitly if PWM is running */
@@ -582,7 +582,7 @@ static int ehrpwm_pwm_resume(struct device *dev)
 	for (i = 0; i < pc->chip.npwm; i++) {
 		struct pwm_device *pwm = &pc->chip.pwms[i];
 
-		if (!test_bit(PWMF_ENABLED, &pwm->flags))
+		if (!pwm_is_enabled(pwm))
 			continue;
 
 		/* Enable explicitly if PWM was running */
diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c
index 4bd0c639e16d..eecf21d68108 100644
--- a/drivers/pwm/sysfs.c
+++ b/drivers/pwm/sysfs.c
@@ -97,7 +97,7 @@ static ssize_t pwm_enable_show(struct device *child,
 			       char *buf)
 {
 	const struct pwm_device *pwm = child_to_pwm_device(child);
-	int enabled = test_bit(PWMF_ENABLED, &pwm->flags);
+	int enabled = pwm_is_enabled(pwm);
 
 	return sprintf(buf, "%d\n", enabled);
 }
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 36262d08a9da..ec34f4d9a9ee 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -92,6 +92,11 @@ struct pwm_device {
 	enum pwm_polarity	polarity;
 };
 
+static inline bool pwm_is_enabled(const struct pwm_device *pwm)
+{
+	return test_bit(PWMF_ENABLED, &pwm->flags);
+}
+
 static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period)
 {
 	if (pwm)
-- 
cgit v1.2.3-70-g09d2


From a1cf42171a2e3c33cbc12bb037795caf0589149b Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 1 Jul 2015 10:21:48 +0200
Subject: pwm: Constify PWM device where possible

The PWM argument is not modified in PWM property accessors, make it a
const argument so that the accessors can be used from sysfs.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index ec34f4d9a9ee..d8f691339a45 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -103,7 +103,7 @@ static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period)
 		pwm->period = period;
 }
 
-static inline unsigned int pwm_get_period(struct pwm_device *pwm)
+static inline unsigned int pwm_get_period(const struct pwm_device *pwm)
 {
 	return pwm ? pwm->period : 0;
 }
@@ -114,7 +114,7 @@ static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty)
 		pwm->duty_cycle = duty;
 }
 
-static inline unsigned int pwm_get_duty_cycle(struct pwm_device *pwm)
+static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm)
 {
 	return pwm ? pwm->duty_cycle : 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 011e76314818b6a24d5347b2d83b8a577e6aaae6 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 1 Jul 2015 10:21:49 +0200
Subject: pwm: Add pwm_get_polarity() helper function

Some drivers are directly accessing the ->polarity field in pwm_device.
Add a helper to retrieve the current polarity so that we can easily move
this field elsewhere (required to support atomic update).

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index d8f691339a45..6f286df30021 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -124,6 +124,11 @@ static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm)
  */
 int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity);
 
+static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm)
+{
+	return pwm ? pwm->polarity : PWM_POLARITY_NORMAL;
+}
+
 /**
  * struct pwm_ops - PWM controller operations
  * @request: optional hook for requesting a PWM
-- 
cgit v1.2.3-70-g09d2


From 87dc11220df266d7d4b6dc594b55d0729b92809d Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Wed, 15 Jul 2015 11:21:40 -0700
Subject: clockevents: Remove clockevents_notify() prototype

This function no longer exists after commit a49b116dcb12
(clockevents: Cleanup dead cpu explicitely, 2015-04-03). Remove
the prototype and the stub function.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Cc: trivial@kernel.org
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: http://lkml.kernel.org/r/1436984500-5425-1-git-send-email-sboyd@codeaurora.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clockchips.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 597a1e836f22..31ce435981fe 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -234,13 +234,10 @@ static inline int tick_check_broadcast_expired(void) { return 0; }
 static inline void tick_setup_hrtimer_broadcast(void) { }
 # endif
 
-extern int clockevents_notify(unsigned long reason, void *arg);
-
 #else /* !CONFIG_GENERIC_CLOCKEVENTS: */
 
 static inline void clockevents_suspend(void) { }
 static inline void clockevents_resume(void) { }
-static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
 static inline int tick_check_broadcast_expired(void) { return 0; }
 static inline void tick_setup_hrtimer_broadcast(void) { }
 
-- 
cgit v1.2.3-70-g09d2


From 0642ef6f2992eba46c41abb5ceb7d4fa14ba888e Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
Date: Tue, 23 Jun 2015 14:32:54 +0100
Subject: debugfs: Export bool read/write functions

The file read/write functions for bools have no special dependencies
on debugfs internals and are sufficiently non-trivial to be worth
exporting so clients can re-use the implementation.

Signed-off-by: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 fs/debugfs/file.c       | 14 ++++++++------
 include/linux/debugfs.h | 20 ++++++++++++++++++++
 2 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 284f9aa0028b..6c55ade071c3 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
 
-static ssize_t read_file_bool(struct file *file, char __user *user_buf,
-			      size_t count, loff_t *ppos)
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+			       size_t count, loff_t *ppos)
 {
 	char buf[3];
 	u32 *val = file->private_data;
@@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
 	buf[2] = 0x00;
 	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
 }
+EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
 
-static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
-			       size_t count, loff_t *ppos)
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+				size_t count, loff_t *ppos)
 {
 	char buf[32];
 	size_t buf_size;
@@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
 
 	return count;
 }
+EXPORT_SYMBOL_GPL(debugfs_write_file_bool);
 
 static const struct file_operations fops_bool = {
-	.read =		read_file_bool,
-	.write =	write_file_bool,
+	.read =		debugfs_read_file_bool,
+	.write =	debugfs_write_file_bool,
 	.open =		simple_open,
 	.llseek =	default_llseek,
 };
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 420311bcee38..9beb636b97eb 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -116,6 +116,12 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
 
 bool debugfs_initialized(void);
 
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+			       size_t count, loff_t *ppos);
+
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+				size_t count, loff_t *ppos);
+
 #else
 
 #include <linux/err.h>
@@ -282,6 +288,20 @@ static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev,
 	return ERR_PTR(-ENODEV);
 }
 
+static inline ssize_t debugfs_read_file_bool(struct file *file,
+					     char __user *user_buf,
+					     size_t count, loff_t *ppos)
+{
+	return -ENODEV;
+}
+
+static inline ssize_t debugfs_write_file_bool(struct file *file,
+					      const char __user *user_buf,
+					      size_t count, loff_t *ppos)
+{
+	return -ENODEV;
+}
+
 #endif
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 584ac4e935a1f905d67c8fa3fbe8e32d384721f1 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 19 Jun 2015 15:00:46 -0700
Subject: clk: tegra: Properly include clk.h

Clock provider drivers generally shouldn't include clk.h because
it's the consumer API. Only include clk.h in files that are using
it. Also add in a clkdev.h include that was missing in a file
using clkdev APIs.

Cc: Peter De Schrijver <pdeschrijver@nvidia.com>
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/tegra/clk-divider.c          | 1 -
 drivers/clk/tegra/clk-periph-gate.c      | 1 -
 drivers/clk/tegra/clk-periph.c           | 1 -
 drivers/clk/tegra/clk-pll-out.c          | 1 -
 drivers/clk/tegra/clk-pll.c              | 2 +-
 drivers/clk/tegra/clk-super.c            | 1 -
 drivers/clk/tegra/clk-tegra-audio.c      | 1 -
 drivers/clk/tegra/clk-tegra-fixed.c      | 1 -
 drivers/clk/tegra/clk-tegra-periph.c     | 1 -
 drivers/clk/tegra/clk-tegra-pmc.c        | 1 -
 drivers/clk/tegra/clk-tegra-super-gen4.c | 1 -
 drivers/clk/tegra/clk-tegra114.c         | 2 --
 drivers/clk/tegra/clk-tegra124.c         | 1 -
 drivers/clk/tegra/clk-tegra20.c          | 1 -
 drivers/clk/tegra/clk-tegra30.c          | 1 -
 drivers/clk/tegra/clk.c                  | 1 +
 include/linux/clk/tegra.h                | 3 ++-
 17 files changed, 4 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/tegra/clk-divider.c b/drivers/clk/tegra/clk-divider.c
index 59a5714dfe18..48c83efda4cf 100644
--- a/drivers/clk/tegra/clk-divider.c
+++ b/drivers/clk/tegra/clk-divider.c
@@ -19,7 +19,6 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/clk-provider.h>
-#include <linux/clk.h>
 
 #include "clk.h"
 
diff --git a/drivers/clk/tegra/clk-periph-gate.c b/drivers/clk/tegra/clk-periph-gate.c
index 0aa8830ae7cc..d28d6e95020f 100644
--- a/drivers/clk/tegra/clk-periph-gate.c
+++ b/drivers/clk/tegra/clk-periph-gate.c
@@ -14,7 +14,6 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/slab.h>
 #include <linux/io.h>
diff --git a/drivers/clk/tegra/clk-periph.c b/drivers/clk/tegra/clk-periph.c
index d84ae49d0e05..ec5b6113b012 100644
--- a/drivers/clk/tegra/clk-periph.c
+++ b/drivers/clk/tegra/clk-periph.c
@@ -14,7 +14,6 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/export.h>
 #include <linux/slab.h>
diff --git a/drivers/clk/tegra/clk-pll-out.c b/drivers/clk/tegra/clk-pll-out.c
index 3598987a451d..257cae0c1488 100644
--- a/drivers/clk/tegra/clk-pll-out.c
+++ b/drivers/clk/tegra/clk-pll-out.c
@@ -20,7 +20,6 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/clk-provider.h>
-#include <linux/clk.h>
 
 #include "clk.h"
 
diff --git a/drivers/clk/tegra/clk-pll.c b/drivers/clk/tegra/clk-pll.c
index 05c6d08a6695..63499c461482 100644
--- a/drivers/clk/tegra/clk-pll.c
+++ b/drivers/clk/tegra/clk-pll.c
@@ -18,8 +18,8 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 #include <linux/err.h>
-#include <linux/clk-provider.h>
 #include <linux/clk.h>
+#include <linux/clk-provider.h>
 
 #include "clk.h"
 
diff --git a/drivers/clk/tegra/clk-super.c b/drivers/clk/tegra/clk-super.c
index 2fd924d38606..131d1b5085e2 100644
--- a/drivers/clk/tegra/clk-super.c
+++ b/drivers/clk/tegra/clk-super.c
@@ -20,7 +20,6 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/clk-provider.h>
-#include <linux/clk.h>
 
 #include "clk.h"
 
diff --git a/drivers/clk/tegra/clk-tegra-audio.c b/drivers/clk/tegra/clk-tegra-audio.c
index 5c38aab2c5b8..11e3ad7ad7a3 100644
--- a/drivers/clk/tegra/clk-tegra-audio.c
+++ b/drivers/clk/tegra/clk-tegra-audio.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/io.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
diff --git a/drivers/clk/tegra/clk-tegra-fixed.c b/drivers/clk/tegra/clk-tegra-fixed.c
index 605676d368eb..da0b5941c89f 100644
--- a/drivers/clk/tegra/clk-tegra-fixed.c
+++ b/drivers/clk/tegra/clk-tegra-fixed.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/io.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
diff --git a/drivers/clk/tegra/clk-tegra-periph.c b/drivers/clk/tegra/clk-tegra-periph.c
index 46af9244ba74..cb6ab830941d 100644
--- a/drivers/clk/tegra/clk-tegra-periph.c
+++ b/drivers/clk/tegra/clk-tegra-periph.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/io.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/clkdev.h>
 #include <linux/of.h>
diff --git a/drivers/clk/tegra/clk-tegra-pmc.c b/drivers/clk/tegra/clk-tegra-pmc.c
index 08b21c1ee867..91377abfefa1 100644
--- a/drivers/clk/tegra/clk-tegra-pmc.c
+++ b/drivers/clk/tegra/clk-tegra-pmc.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/io.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/clkdev.h>
 #include <linux/of.h>
diff --git a/drivers/clk/tegra/clk-tegra-super-gen4.c b/drivers/clk/tegra/clk-tegra-super-gen4.c
index feb3201c85ce..ecd7ff736b74 100644
--- a/drivers/clk/tegra/clk-tegra-super-gen4.c
+++ b/drivers/clk/tegra/clk-tegra-super-gen4.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/io.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
diff --git a/drivers/clk/tegra/clk-tegra114.c b/drivers/clk/tegra/clk-tegra114.c
index 8237d16b4075..db5871519bf5 100644
--- a/drivers/clk/tegra/clk-tegra114.c
+++ b/drivers/clk/tegra/clk-tegra114.c
@@ -15,9 +15,7 @@
  */
 
 #include <linux/io.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
-#include <linux/clkdev.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/delay.h>
diff --git a/drivers/clk/tegra/clk-tegra124.c b/drivers/clk/tegra/clk-tegra124.c
index e8cca3eac007..0c44cc7f8558 100644
--- a/drivers/clk/tegra/clk-tegra124.c
+++ b/drivers/clk/tegra/clk-tegra124.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/io.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/clkdev.h>
 #include <linux/of.h>
diff --git a/drivers/clk/tegra/clk-tegra20.c b/drivers/clk/tegra/clk-tegra20.c
index 41272dcc9e22..bf004f0e4f65 100644
--- a/drivers/clk/tegra/clk-tegra20.c
+++ b/drivers/clk/tegra/clk-tegra20.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/io.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/clkdev.h>
 #include <linux/of.h>
diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c
index 0af3e834dd24..fad561a5896b 100644
--- a/drivers/clk/tegra/clk-tegra30.c
+++ b/drivers/clk/tegra/clk-tegra30.c
@@ -16,7 +16,6 @@
 
 #include <linux/io.h>
 #include <linux/delay.h>
-#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/clkdev.h>
 #include <linux/of.h>
diff --git a/drivers/clk/tegra/clk.c b/drivers/clk/tegra/clk.c
index 41cd87c67be6..22aa8b18c840 100644
--- a/drivers/clk/tegra/clk.c
+++ b/drivers/clk/tegra/clk.c
@@ -14,6 +14,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/clkdev.h>
 #include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/of.h>
diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index 19c4208f4752..57bf7aab4516 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -17,7 +17,8 @@
 #ifndef __LINUX_CLK_TEGRA_H_
 #define __LINUX_CLK_TEGRA_H_
 
-#include <linux/clk.h>
+#include <linux/types.h>
+#include <linux/bug.h>
 
 /*
  * Tegra CPU clock and reset control ops
-- 
cgit v1.2.3-70-g09d2


From 61ae76563ec3b506235d5dd69c6fdacea321254d Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Mon, 22 Jun 2015 17:13:49 -0700
Subject: clk: Remove clk.h from clk-provider.h

Remove clk.h from clk-provider.h so that we can clearly split clk
providers from clk consumers. This will allow us to quickly
detect when clock providers are using the consumer APIs by
looking at the includes.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 include/linux/clk-provider.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 78842f46f152..36fa555ff431 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -11,7 +11,6 @@
 #ifndef __LINUX_CLK_PROVIDER_H
 #define __LINUX_CLK_PROVIDER_H
 
-#include <linux/clk.h>
 #include <linux/io.h>
 #include <linux/of.h>
 
@@ -33,6 +32,7 @@
 #define CLK_GET_ACCURACY_NOCACHE BIT(8) /* do not use the cached clk accuracy */
 #define CLK_RECALC_NEW_RATES	BIT(9) /* recalc rates after notifications */
 
+struct clk;
 struct clk_hw;
 struct clk_core;
 struct dentry;
-- 
cgit v1.2.3-70-g09d2


From 10dc4512185741a298cd7bc87e9968944f31a50d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 9 Jul 2015 16:45:28 -0400
Subject: svcrdma: Clean up svc_rdma_get_reply_array()

Kernel coding conventions frown upon having large nontrivial
functions in header files, and the preference these days is to
allow the compiler to make inlining decisions if possible.

As these functions are re-homed into a .c file, be sure that
comparisons with fields in struct rpcrdma_msg are with be32
constants.

This is a refactoring change; no behavior change is intended.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h       | 81 +----------------------------------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 73 +++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index cb94ee4181d4..ca4d86a6c947 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -213,6 +213,8 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
 
 /* svc_rdma_sendto.c */
 extern int svc_rdma_sendto(struct svc_rqst *);
+extern struct rpcrdma_read_chunk *
+	svc_rdma_get_read_chunk(struct rpcrdma_msg *);
 
 /* svc_rdma_transport.c */
 extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
@@ -238,83 +240,4 @@ extern void svc_rdma_prep_reply_hdr(struct svc_rqst *);
 extern int svc_rdma_init(void);
 extern void svc_rdma_cleanup(void);
 
-/*
- * Returns the address of the first read chunk or <nul> if no read chunk is
- * present
- */
-static inline struct rpcrdma_read_chunk *
-svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
-{
-	struct rpcrdma_read_chunk *ch =
-		(struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
-
-	if (ch->rc_discrim == 0)
-		return NULL;
-
-	return ch;
-}
-
-/*
- * Returns the address of the first read write array element or <nul> if no
- * write array list is present
- */
-static inline struct rpcrdma_write_array *
-svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
-{
-	if (rmsgp->rm_body.rm_chunks[0] != 0
-	    || rmsgp->rm_body.rm_chunks[1] == 0)
-		return NULL;
-
-	return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
-}
-
-/*
- * Returns the address of the first reply array element or <nul> if no
- * reply array is present
- */
-static inline struct rpcrdma_write_array *
-svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
-{
-	struct rpcrdma_read_chunk *rch;
-	struct rpcrdma_write_array *wr_ary;
-	struct rpcrdma_write_array *rp_ary;
-
-	/* XXX: Need to fix when reply list may occur with read-list and/or
-	 * write list */
-	if (rmsgp->rm_body.rm_chunks[0] != 0 ||
-	    rmsgp->rm_body.rm_chunks[1] != 0)
-		return NULL;
-
-	rch = svc_rdma_get_read_chunk(rmsgp);
-	if (rch) {
-		while (rch->rc_discrim)
-			rch++;
-
-		/* The reply list follows an empty write array located
-		 * at 'rc_position' here. The reply array is at rc_target.
-		 */
-		rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
-
-		goto found_it;
-	}
-
-	wr_ary = svc_rdma_get_write_array(rmsgp);
-	if (wr_ary) {
-		rp_ary = (struct rpcrdma_write_array *)
-			&wr_ary->
-			wc_array[ntohl(wr_ary->wc_nchunks)].wc_target.rs_length;
-
-		goto found_it;
-	}
-
-	/* No read list, no write list */
-	rp_ary = (struct rpcrdma_write_array *)
-		&rmsgp->rm_body.rm_chunks[2];
-
- found_it:
-	if (rp_ary->wc_discrim == 0)
-		return NULL;
-
-	return rp_ary;
-}
 #endif
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 95412abc95b0..1dfae8317065 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -136,6 +136,79 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
 	return dma_addr;
 }
 
+/* Returns the address of the first read chunk or <nul> if no read chunk
+ * is present
+ */
+struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+	struct rpcrdma_read_chunk *ch =
+		(struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+	if (ch->rc_discrim == xdr_zero)
+		return NULL;
+	return ch;
+}
+
+/* Returns the address of the first read write array element or <nul>
+ * if no write array list is present
+ */
+static struct rpcrdma_write_array *
+svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
+{
+	if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
+	    rmsgp->rm_body.rm_chunks[1] == xdr_zero)
+		return NULL;
+	return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+}
+
+/* Returns the address of the first reply array element or <nul> if no
+ * reply array is present
+ */
+static struct rpcrdma_write_array *
+svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
+{
+	struct rpcrdma_read_chunk *rch;
+	struct rpcrdma_write_array *wr_ary;
+	struct rpcrdma_write_array *rp_ary;
+
+	/* XXX: Need to fix when reply chunk may occur with read list
+	 *	and/or write list.
+	 */
+	if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
+	    rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+		return NULL;
+
+	rch = svc_rdma_get_read_chunk(rmsgp);
+	if (rch) {
+		while (rch->rc_discrim != xdr_zero)
+			rch++;
+
+		/* The reply chunk follows an empty write array located
+		 * at 'rc_position' here. The reply array is at rc_target.
+		 */
+		rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
+		goto found_it;
+	}
+
+	wr_ary = svc_rdma_get_write_array(rmsgp);
+	if (wr_ary) {
+		int chunk = be32_to_cpu(wr_ary->wc_nchunks);
+
+		rp_ary = (struct rpcrdma_write_array *)
+			 &wr_ary->wc_array[chunk].wc_target.rs_length;
+		goto found_it;
+	}
+
+	/* No read list, no write list */
+	rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2];
+
+ found_it:
+	if (rp_ary->wc_discrim == xdr_zero)
+		return NULL;
+	return rp_ary;
+}
+
 /* Assumptions:
  * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
  */
-- 
cgit v1.2.3-70-g09d2


From 31193fe5f6fb616711323f5d74ee5bb92aacba4a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 9 Jul 2015 16:45:37 -0400
Subject: svcrdma: Remove svc_rdma_fastreg()

Commit 0bf4828983df ("svcrdma: refactor marshalling logic") removed
the last call site for svc_rdma_fastreg().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          |  1 -
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 34 --------------------------------
 2 files changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ca4d86a6c947..13af61b70417 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -227,7 +227,6 @@ extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
 extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
 extern struct svc_rdma_req_map *svc_rdma_get_req_map(void);
 extern void svc_rdma_put_req_map(struct svc_rdma_req_map *);
-extern int svc_rdma_fastreg(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *);
 extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *);
 extern void svc_rdma_put_frmr(struct svcxprt_rdma *,
 			      struct svc_rdma_fastreg_mr *);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f4b973233977..4054a9de6a91 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1202,40 +1202,6 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp)
 	return 1;
 }
 
-/*
- * Attempt to register the kvec representing the RPC memory with the
- * device.
- *
- * Returns:
- *  NULL : The device does not support fastreg or there were no more
- *         fastreg mr.
- *  frmr : The kvec register request was successfully posted.
- *    <0 : An error was encountered attempting to register the kvec.
- */
-int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
-		     struct svc_rdma_fastreg_mr *frmr)
-{
-	struct ib_send_wr fastreg_wr;
-	u8 key;
-
-	/* Bump the key */
-	key = (u8)(frmr->mr->lkey & 0x000000FF);
-	ib_update_fast_reg_key(frmr->mr, ++key);
-
-	/* Prepare FASTREG WR */
-	memset(&fastreg_wr, 0, sizeof fastreg_wr);
-	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
-	fastreg_wr.send_flags = IB_SEND_SIGNALED;
-	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
-	fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
-	fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
-	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-	fastreg_wr.wr.fast_reg.length = frmr->map_len;
-	fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
-	fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
-	return svc_rdma_send(xprt, &fastreg_wr);
-}
-
 int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 {
 	struct ib_send_wr *bad_wr, *n_wr;
-- 
cgit v1.2.3-70-g09d2


From 0c4f691ff6791e55ac831666df0b49b1679c56e4 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sat, 18 Jul 2015 18:24:48 -0700
Subject: net: don't reforward packets already forwarded by offload device

Just before queuing skb for xmit on port, check if skb has been marked by
switchdev port driver as already fordwarded by device.  If so, drop skb.  A
non-zero skb->offload_fwd_mark field is set by the switchdev port
driver/device on ingress to indicate the skb has already been forwarded by
the device to egress ports with matching dev->skb_mark.  The switchdev port
driver would assign a non-zero dev->offload_skb_mark for each device port
netdev during registration, for example.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ++++++
 include/linux/skbuff.h    |  9 ++++++++-
 net/core/dev.c            | 10 ++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 45cfd797eb77..8364f29e08be 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1456,6 +1456,8 @@ enum netdev_priv_flags {
  *
  *	@xps_maps:	XXX: need comments on this one
  *
+ *	@offload_fwd_mark:	Offload device fwding mark
+ *
  *	@trans_start:		Time (in jiffies) of last Tx
  *	@watchdog_timeo:	Represents the timeout that is used by
  *				the watchdog ( see dev_watchdog() )
@@ -1697,6 +1699,10 @@ struct net_device {
 	struct xps_dev_maps __rcu *xps_maps;
 #endif
 
+#ifdef CONFIG_NET_SWITCHDEV
+	u32			offload_fwd_mark;
+#endif
+
 	/* These may be needed for future network-power-down code. */
 
 	/*
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d6cdd6e87d53..af7a09650fa2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -506,6 +506,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
   *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
+ *	@offload_fwd_mark: fwding offload mark
  *	@mark: Generic packet mark
  *	@vlan_proto: vlan encapsulation protocol
  *	@vlan_tci: vlan tag control information
@@ -650,9 +651,15 @@ struct sk_buff {
 		unsigned int	sender_cpu;
 	};
 #endif
+	union {
 #ifdef CONFIG_NETWORK_SECMARK
-	__u32			secmark;
+		__u32		secmark;
+#endif
+#ifdef CONFIG_NET_SWITCHDEV
+		__u32		offload_fwd_mark;
 #endif
+	};
+
 	union {
 		__u32		mark;
 		__u32		reserved_tailroom;
diff --git a/net/core/dev.c b/net/core/dev.c
index 8810b6bbebfe..2ee15afb412d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3061,6 +3061,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	else
 		skb_dst_force(skb);
 
+#ifdef CONFIG_NET_SWITCHDEV
+	/* Don't forward if offload device already forwarded */
+	if (skb->offload_fwd_mark &&
+	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
+		consume_skb(skb);
+		rc = NET_XMIT_SUCCESS;
+		goto out;
+	}
+#endif
+
 	txq = netdev_pick_tx(dev, skb, accel_priv);
 	q = rcu_dereference_bh(txq->qdisc);
 
-- 
cgit v1.2.3-70-g09d2


From d754f98b502ad9a8c7570d494e1eaa0e6bc0350c Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sat, 18 Jul 2015 18:24:49 -0700
Subject: net: add phys ID compare helper to test if two IDs are the same

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 7 +++++++
 net/switchdev/switchdev.c | 8 ++------
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8364f29e08be..607b5f41f46f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -766,6 +766,13 @@ struct netdev_phys_item_id {
 	unsigned char id_len;
 };
 
+static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
+					    struct netdev_phys_item_id *b)
+{
+	return a->id_len == b->id_len &&
+	       memcmp(a->id, b->id, a->id_len) == 0;
+}
+
 typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 				       struct sk_buff *skb);
 
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 9f2add3cba26..4e5bba50ccff 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
 		if (switchdev_port_attr_get(dev, &attr))
 			return NULL;
 
-		if (nhsel > 0) {
-			if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len)
+		if (nhsel > 0 &&
+		    !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
 				return NULL;
-			if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id,
-				   attr.u.ppid.id_len))
-				return NULL;
-		}
 
 		prev_attr = attr;
 	}
-- 
cgit v1.2.3-70-g09d2


From 6acc23266054a9969737b435fa012f87465dbc50 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 16 Jul 2015 21:50:50 +0200
Subject: net: remove skb_frag_add_head

It's not used anywhere.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index af7a09650fa2..6bd96fe9416a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2678,12 +2678,6 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
 	skb_shinfo(skb)->frag_list = NULL;
 }
 
-static inline void skb_frag_add_head(struct sk_buff *skb, struct sk_buff *frag)
-{
-	frag->next = skb_shinfo(skb)->frag_list;
-	skb_shinfo(skb)->frag_list = frag;
-}
-
 #define skb_walk_frags(skb, iter)	\
 	for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)
 
-- 
cgit v1.2.3-70-g09d2


From f4c190eb8b4f80b12dc98ce7d54a3bea0e4e7e69 Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Fri, 17 Jul 2015 00:26:12 +0200
Subject: stmmac: drop custom_* fields from plat_stmmacenet_data

Both of these fields are unused and has been unused since they
were added 3 and 5 years ago. Drop them since they are clearly
not very useful.

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/stmmac.txt | 4 ----
 include/linux/stmmac.h              | 2 --
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index e655e2453c98..5fddefa69baf 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -139,8 +139,6 @@ struct plat_stmmacenet_data {
 	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
-	void *custom_cfg;
-	void *custom_data;
 	void *bsp_priv;
 };
 
@@ -186,8 +184,6 @@ Where:
 	     which will be stored in bsp_priv, and then passed to init and
 	     exit callbacks. init/exit callbacks should not use or modify
 	     platform data.
- o custom_cfg/custom_data: this is a custom configuration that can be passed
-			   while initializing the resources.
  o bsp_priv: another private pointer.
 
 For MDIO bus The we have:
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c735f5c91eea..c86a20047cb1 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -123,8 +123,6 @@ struct plat_stmmacenet_data {
 	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
-	void *custom_cfg;
-	void *custom_data;
 	void *bsp_priv;
 };
 
-- 
cgit v1.2.3-70-g09d2


From 4e10df9a60d96ced321dd2af71da558c6b750078 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Mon, 20 Jul 2015 20:34:18 -0700
Subject: bpf: introduce bpf_skb_vlan_push/pop() helpers

Allow eBPF programs attached to TC qdiscs call skb_vlan_push/pop via
helper functions. These functions may change skb->data/hlen which are
cached by some JITs to improve performance of ld_abs/ld_ind instructions.
Therefore JITs need to recognize bpf_skb_vlan_push/pop() calls,
re-compute header len and re-cache skb->data/hlen back into cpu registers.
Note, skb->data/hlen are not directly accessible from the programs,
so any changes to skb->data done either by these helpers or by other
TC actions are safe.

eBPF JIT supported by three architectures:
- arm64 JIT is using bpf_load_pointer() without caching, so it's ok as-is.
- x64 JIT re-caches skb->data/hlen unconditionally after vlan_push/pop calls
  (experiments showed that conditional re-caching is slower).
- s390 JIT falls back to interpreter for now when bpf_skb_vlan_push() is present
  in the program (re-caching is tbd).

These helpers allow more scalable handling of vlan from the programs.
Instead of creating thousands of vlan netdevs on top of eth0 and attaching
TC+ingress+bpf to all of them, the program can be attached to eth0 directly
and manipulate vlans as necessary.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/s390/net/bpf_jit_comp.c |  4 +++
 arch/x86/net/bpf_jit_comp.c  | 80 +++++++++++++++++++++++---------------------
 include/linux/bpf.h          |  2 ++
 include/linux/filter.h       |  1 +
 include/uapi/linux/bpf.h     |  2 ++
 net/core/filter.c            | 48 ++++++++++++++++++++++++++
 6 files changed, 99 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index fee782acc2ee..79c731e8d178 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -973,6 +973,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		 */
 		const u64 func = (u64)__bpf_call_base + imm;
 
+		if (bpf_helper_changes_skb_data((void *)func))
+			/* TODO reload skb->data, hlen */
+			return -1;
+
 		REG_SET_SEEN(BPF_REG_5);
 		jit->seen |= SEEN_FUNC;
 		/* lg %w1,<d(imm)>(%l) */
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 579a8fd74be0..6c335a8fc086 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog)
 	*pprog = prog;
 }
 
+
+static void emit_load_skb_data_hlen(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* r9d = skb->len - skb->data_len (headlen)
+	 * r10 = skb->data
+	 */
+	/* mov %r9d, off32(%rdi) */
+	EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
+
+	/* sub %r9d, off32(%rdi) */
+	EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
+
+	/* mov %r10, off32(%rdi) */
+	EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
+	*pprog = prog;
+}
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  int oldproglen, struct jit_context *ctx)
 {
@@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 
 	emit_prologue(&prog);
 
-	if (seen_ld_abs) {
-		/* r9d : skb->len - skb->data_len (headlen)
-		 * r10 : skb->data
-		 */
-		if (is_imm8(offsetof(struct sk_buff, len)))
-			/* mov %r9d, off8(%rdi) */
-			EMIT4(0x44, 0x8b, 0x4f,
-			      offsetof(struct sk_buff, len));
-		else
-			/* mov %r9d, off32(%rdi) */
-			EMIT3_off32(0x44, 0x8b, 0x8f,
-				    offsetof(struct sk_buff, len));
-
-		if (is_imm8(offsetof(struct sk_buff, data_len)))
-			/* sub %r9d, off8(%rdi) */
-			EMIT4(0x44, 0x2b, 0x4f,
-			      offsetof(struct sk_buff, data_len));
-		else
-			EMIT3_off32(0x44, 0x2b, 0x8f,
-				    offsetof(struct sk_buff, data_len));
-
-		if (is_imm8(offsetof(struct sk_buff, data)))
-			/* mov %r10, off8(%rdi) */
-			EMIT4(0x4c, 0x8b, 0x57,
-			      offsetof(struct sk_buff, data));
-		else
-			/* mov %r10, off32(%rdi) */
-			EMIT3_off32(0x4c, 0x8b, 0x97,
-				    offsetof(struct sk_buff, data));
-	}
+	if (seen_ld_abs)
+		emit_load_skb_data_hlen(&prog);
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		const s32 imm32 = insn->imm;
@@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		u8 b1 = 0, b2 = 0, b3 = 0;
 		s64 jmp_offset;
 		u8 jmp_cond;
+		bool reload_skb_data;
 		int ilen;
 		u8 *func;
 
@@ -818,12 +811,18 @@ xadd:			if (is_imm8(insn->off))
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
 			if (seen_ld_abs) {
-				EMIT2(0x41, 0x52); /* push %r10 */
-				EMIT2(0x41, 0x51); /* push %r9 */
-				/* need to adjust jmp offset, since
-				 * pop %r9, pop %r10 take 4 bytes after call insn
-				 */
-				jmp_offset += 4;
+				reload_skb_data = bpf_helper_changes_skb_data(func);
+				if (reload_skb_data) {
+					EMIT1(0x57); /* push %rdi */
+					jmp_offset += 22; /* pop, mov, sub, mov */
+				} else {
+					EMIT2(0x41, 0x52); /* push %r10 */
+					EMIT2(0x41, 0x51); /* push %r9 */
+					/* need to adjust jmp offset, since
+					 * pop %r9, pop %r10 take 4 bytes after call insn
+					 */
+					jmp_offset += 4;
+				}
 			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
 				pr_err("unsupported bpf func %d addr %p image %p\n",
@@ -832,8 +831,13 @@ xadd:			if (is_imm8(insn->off))
 			}
 			EMIT1_off32(0xE8, jmp_offset);
 			if (seen_ld_abs) {
-				EMIT2(0x41, 0x59); /* pop %r9 */
-				EMIT2(0x41, 0x5A); /* pop %r10 */
+				if (reload_skb_data) {
+					EMIT1(0x5F); /* pop %rdi */
+					emit_load_skb_data_hlen(&prog);
+				} else {
+					EMIT2(0x41, 0x59); /* pop %r9 */
+					EMIT2(0x41, 0x5A); /* pop %r10 */
+				}
 			}
 			break;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476a0d48..139d6d2e123f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -192,5 +192,7 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
 extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
+extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
+extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 17724f6ea983..69d00555ce35 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -411,6 +411,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_int_jit_compile(struct bpf_prog *fp);
+bool bpf_helper_changes_skb_data(void *func);
 
 #ifdef CONFIG_BPF_JIT
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2de87e58b12b..2f6c83d714e9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -256,6 +256,8 @@ enum bpf_func_id {
 	 * Return: classid if != 0
 	 */
 	BPF_FUNC_get_cgroup_classid,
+	BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */
+	BPF_FUNC_skb_vlan_pop,  /* bpf_skb_vlan_pop(skb) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 247450a5e387..50338071fac4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1437,6 +1437,50 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
+static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	__be16 vlan_proto = (__force __be16) r2;
+
+	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
+		     vlan_proto != htons(ETH_P_8021AD)))
+		vlan_proto = htons(ETH_P_8021Q);
+
+	return skb_vlan_push(skb, vlan_proto, vlan_tci);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+	.func           = bpf_skb_vlan_push,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+	.arg3_type      = ARG_ANYTHING,
+};
+
+static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+
+	return skb_vlan_pop(skb);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+	.func           = bpf_skb_vlan_pop,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+};
+
+bool bpf_helper_changes_skb_data(void *func)
+{
+	if (func == bpf_skb_vlan_push)
+		return true;
+	if (func == bpf_skb_vlan_pop)
+		return true;
+	return false;
+}
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
@@ -1476,6 +1520,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_clone_redirect_proto;
 	case BPF_FUNC_get_cgroup_classid:
 		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_skb_vlan_push:
+		return &bpf_skb_vlan_push_proto;
+	case BPF_FUNC_skb_vlan_pop:
+		return &bpf_skb_vlan_pop_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.2.3-70-g09d2


From be9015abb8296d8dc72cef4da75fa30e88ab7c81 Mon Sep 17 00:00:00 2001
From: Shobhit Kumar <shobhit.kumar@intel.com>
Date: Fri, 26 Jun 2015 14:32:04 +0530
Subject: gpiolib: Add support for removing registered consumer lookup table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case we unload and load a driver module again that is registering a
lookup table, without this it will result in multiple entries. Provide
an option to remove the lookup table on driver unload

Cc: Samuel Ortiz <sameo@linux.intel.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Alexandre Courbot <gnurou@gmail.com>
Cc: Thierry Reding <thierry.reding@gmail.com>
Reviewed-by: Alexandre Courbot <acourbot@nvidia.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Tested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Shobhit Kumar <shobhit.kumar@intel.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpio/gpiolib.c       | 13 +++++++++++++
 include/linux/gpio/machine.h |  1 +
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index bf4bd1d120c3..f25dc880b007 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1672,6 +1672,19 @@ void gpiod_add_lookup_table(struct gpiod_lookup_table *table)
 	mutex_unlock(&gpio_lookup_lock);
 }
 
+/**
+ * gpiod_remove_lookup_table() - unregister GPIO device consumers
+ * @table: table of consumers to unregister
+ */
+void gpiod_remove_lookup_table(struct gpiod_lookup_table *table)
+{
+	mutex_lock(&gpio_lookup_lock);
+
+	list_del(&table->list);
+
+	mutex_unlock(&gpio_lookup_lock);
+}
+
 static struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id,
 				      unsigned int idx,
 				      enum gpio_lookup_flags *flags)
diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h
index e2706140eaff..c0d712d22b07 100644
--- a/include/linux/gpio/machine.h
+++ b/include/linux/gpio/machine.h
@@ -57,5 +57,6 @@ struct gpiod_lookup_table {
 }
 
 void gpiod_add_lookup_table(struct gpiod_lookup_table *table);
+void gpiod_remove_lookup_table(struct gpiod_lookup_table *table);
 
 #endif /* __LINUX_GPIO_MACHINE_H */
-- 
cgit v1.2.3-70-g09d2


From 3490565b633c705d2fb1f6ede51228952664663d Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Mon, 13 Jul 2015 20:31:03 +0200
Subject: locking/spinlocks: Force inlining of spinlock ops

With both gcc 4.7.2 and 4.9.2, sometimes GCC mysteriously
doesn't inline very small functions we expect to be inlined.
See:

    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

In particular, with this config:

   http://busybox.net/~vda/kernel_config

there are more than a thousand copies of tiny spinlock-related
functions:

  $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
    473 000000000000000b t spin_unlock_irqrestore
    292 000000000000000b t spin_unlock
    215 000000000000000b t spin_lock
    134 000000000000000b t spin_unlock_irq
    130 000000000000000b t spin_unlock_bh
    120 000000000000000b t spin_lock_irq
    106 000000000000000b t spin_lock_bh

Disassembly:

 ffffffff81004720 <spin_lock>:
 ffffffff81004720:       55                      push   %rbp
 ffffffff81004721:       48 89 e5                mov    %rsp,%rbp
 ffffffff81004724:       e8 f8 4e e2 02          callq <_raw_spin_lock>
 ffffffff81004729:       5d                      pop    %rbp
 ffffffff8100472a:       c3                      retq

This patch fixes this via s/inline/__always_inline/ in
spinlock.h. This decreases vmlinux by about 40k:

      text     data      bss       dec     hex filename
  82375570 22255544 20627456 125258570 7774b4a vmlinux.before
  82335059 22255416 20627456 125217931 776ac8b vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Graf <tgraf@suug.ch>
Link: http://lkml.kernel.org/r/1436812263-15243-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/spinlock.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 0063b24b4f36..ffcd053ca89a 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -296,7 +296,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
  */
 
-static inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
+static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
 {
 	return &lock->rlock;
 }
@@ -307,17 +307,17 @@ do {							\
 	raw_spin_lock_init(&(_lock)->rlock);		\
 } while (0)
 
-static inline void spin_lock(spinlock_t *lock)
+static __always_inline void spin_lock(spinlock_t *lock)
 {
 	raw_spin_lock(&lock->rlock);
 }
 
-static inline void spin_lock_bh(spinlock_t *lock)
+static __always_inline void spin_lock_bh(spinlock_t *lock)
 {
 	raw_spin_lock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock(spinlock_t *lock)
+static __always_inline int spin_trylock(spinlock_t *lock)
 {
 	return raw_spin_trylock(&lock->rlock);
 }
@@ -337,7 +337,7 @@ do {									\
 	raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);	\
 } while (0)
 
-static inline void spin_lock_irq(spinlock_t *lock)
+static __always_inline void spin_lock_irq(spinlock_t *lock)
 {
 	raw_spin_lock_irq(&lock->rlock);
 }
@@ -352,32 +352,32 @@ do {									\
 	raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
 } while (0)
 
-static inline void spin_unlock(spinlock_t *lock)
+static __always_inline void spin_unlock(spinlock_t *lock)
 {
 	raw_spin_unlock(&lock->rlock);
 }
 
-static inline void spin_unlock_bh(spinlock_t *lock)
+static __always_inline void spin_unlock_bh(spinlock_t *lock)
 {
 	raw_spin_unlock_bh(&lock->rlock);
 }
 
-static inline void spin_unlock_irq(spinlock_t *lock)
+static __always_inline void spin_unlock_irq(spinlock_t *lock)
 {
 	raw_spin_unlock_irq(&lock->rlock);
 }
 
-static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	raw_spin_unlock_irqrestore(&lock->rlock, flags);
 }
 
-static inline int spin_trylock_bh(spinlock_t *lock)
+static __always_inline int spin_trylock_bh(spinlock_t *lock)
 {
 	return raw_spin_trylock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock_irq(spinlock_t *lock)
+static __always_inline int spin_trylock_irq(spinlock_t *lock)
 {
 	return raw_spin_trylock_irq(&lock->rlock);
 }
@@ -387,22 +387,22 @@ static inline int spin_trylock_irq(spinlock_t *lock)
 	raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
 })
 
-static inline void spin_unlock_wait(spinlock_t *lock)
+static __always_inline void spin_unlock_wait(spinlock_t *lock)
 {
 	raw_spin_unlock_wait(&lock->rlock);
 }
 
-static inline int spin_is_locked(spinlock_t *lock)
+static __always_inline int spin_is_locked(spinlock_t *lock)
 {
 	return raw_spin_is_locked(&lock->rlock);
 }
 
-static inline int spin_is_contended(spinlock_t *lock)
+static __always_inline int spin_is_contended(spinlock_t *lock)
 {
 	return raw_spin_is_contended(&lock->rlock);
 }
 
-static inline int spin_can_lock(spinlock_t *lock)
+static __always_inline int spin_can_lock(spinlock_t *lock)
 {
 	return raw_spin_can_lock(&lock->rlock);
 }
-- 
cgit v1.2.3-70-g09d2


From 8ffaadf7429270914b8f146ec13cf305e01df20d Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Mon, 20 Jul 2015 10:14:09 -0600
Subject: NVMe: Use CMB for the IO SQes if available

Some controllers have a controller-side memory buffer available for use
for submissions, completions, lists, or data.

If a CMB is available, the entire CMB will be ioremapped and it will
attempt to map the IO SQes onto the CMB. The queues will be shrunk as
needed. The CMB will not be used if the queue depth is shrunk below some
threshold where it may have reduced performance over a larger queue
in system memory.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 122 ++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/nvme.h      |  17 +++++++
 2 files changed, 134 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e09ad6cc6dec..82b4ffb6eefa 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -72,6 +72,10 @@ module_param(nvme_char_major, int, 0);
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
+static bool use_cmb_sqes = true;
+module_param(use_cmb_sqes, bool, 0644);
+MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
+
 static DEFINE_SPINLOCK(dev_list_lock);
 static LIST_HEAD(dev_list);
 static struct task_struct *nvme_thread;
@@ -103,6 +107,7 @@ struct nvme_queue {
 	char irqname[24];	/* nvme4294967295-65535\0 */
 	spinlock_t q_lock;
 	struct nvme_command *sq_cmds;
+	struct nvme_command __iomem *sq_cmds_io;
 	volatile struct nvme_completion *cqes;
 	struct blk_mq_tags **tags;
 	dma_addr_t sq_dma_addr;
@@ -383,7 +388,11 @@ static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
 {
 	u16 tail = nvmeq->sq_tail;
 
-	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+	if (nvmeq->sq_cmds_io)
+		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
+	else
+		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+
 	if (++tail == nvmeq->q_depth)
 		tail = 0;
 	writel(tail, nvmeq->q_db);
@@ -1364,7 +1373,8 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
 	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
 				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
-	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+	if (nvmeq->sq_cmds)
+		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
 					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
 	kfree(nvmeq);
 }
@@ -1437,6 +1447,46 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
 	spin_unlock_irq(&nvmeq->q_lock);
 }
 
+static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
+				int entry_size)
+{
+	int q_depth = dev->q_depth;
+	unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size);
+
+	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
+		q_depth = rounddown(dev->cmb_size / nr_io_queues,
+					dev->page_size) / entry_size;
+
+		/*
+		 * Ensure the reduced q_depth is above some threshold where it
+		 * would be better to map queues in system memory with the
+		 * original depth
+		 */
+		if (q_depth < 64)
+			return -ENOMEM;
+	}
+
+	return q_depth;
+}
+
+static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+				int qid, int depth)
+{
+	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
+		unsigned offset = (qid - 1) *
+					roundup(SQ_SIZE(depth), dev->page_size);
+		nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
+		nvmeq->sq_cmds_io = dev->cmb + offset;
+	} else {
+		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+					&nvmeq->sq_dma_addr, GFP_KERNEL);
+		if (!nvmeq->sq_cmds)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 							int depth)
 {
@@ -1449,9 +1499,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	if (!nvmeq->cqes)
 		goto free_nvmeq;
 
-	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
-					&nvmeq->sq_dma_addr, GFP_KERNEL);
-	if (!nvmeq->sq_cmds)
+	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
 		goto free_cqdma;
 
 	nvmeq->q_dmadev = dev->dev;
@@ -2149,6 +2197,58 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 	return min(result & 0xffff, result >> 16) + 1;
 }
 
+static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
+{
+	u64 szu, size, offset;
+	u32 cmbloc;
+	resource_size_t bar_size;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	void __iomem *cmb;
+	dma_addr_t dma_addr;
+
+	if (!use_cmb_sqes)
+		return NULL;
+
+	dev->cmbsz = readl(&dev->bar->cmbsz);
+	if (!(NVME_CMB_SZ(dev->cmbsz)))
+		return NULL;
+
+	cmbloc = readl(&dev->bar->cmbloc);
+
+	szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
+	size = szu * NVME_CMB_SZ(dev->cmbsz);
+	offset = szu * NVME_CMB_OFST(cmbloc);
+	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));
+
+	if (offset > bar_size)
+		return NULL;
+
+	/*
+	 * Controllers may support a CMB size larger than their BAR,
+	 * for example, due to being behind a bridge. Reduce the CMB to
+	 * the reported size of the BAR
+	 */
+	if (size > bar_size - offset)
+		size = bar_size - offset;
+
+	dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
+	cmb = ioremap_wc(dma_addr, size);
+	if (!cmb)
+		return NULL;
+
+	dev->cmb_dma_addr = dma_addr;
+	dev->cmb_size = size;
+	return cmb;
+}
+
+static inline void nvme_release_cmb(struct nvme_dev *dev)
+{
+	if (dev->cmb) {
+		iounmap(dev->cmb);
+		dev->cmb = NULL;
+	}
+}
+
 static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 {
 	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
@@ -2167,6 +2267,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	if (result < nr_io_queues)
 		nr_io_queues = result;
 
+	if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
+		result = nvme_cmb_qdepth(dev, nr_io_queues,
+				sizeof(struct nvme_command));
+		if (result > 0)
+			dev->q_depth = result;
+		else
+			nvme_release_cmb(dev);
+	}
+
 	size = db_bar_size(dev, nr_io_queues);
 	if (size > 8192) {
 		iounmap(dev->bar);
@@ -2430,6 +2539,8 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
 	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
 	dev->dbs = ((void __iomem *)dev->bar) + 4096;
+	if (readl(&dev->bar->vs) >= NVME_VS(1, 2))
+		dev->cmb = nvme_map_cmb(dev);
 
 	return 0;
 
@@ -3135,6 +3246,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_remove_admin(dev);
 	device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
 	nvme_free_queues(dev, 0);
+	nvme_release_cmb(dev);
 	nvme_release_prp_pools(dev);
 	kref_put(&dev->kref, nvme_free_dev);
 }
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index c0d94ed8ce9a..fa3fe160c6cb 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -32,6 +32,8 @@ struct nvme_bar {
 	__u32			aqa;	/* Admin Queue Attributes */
 	__u64			asq;	/* Admin SQ Base Address */
 	__u64			acq;	/* Admin CQ Base Address */
+	__u32			cmbloc; /* Controller Memory Buffer Location */
+	__u32			cmbsz;  /* Controller Memory Buffer Size */
 };
 
 #define NVME_CAP_MQES(cap)	((cap) & 0xffff)
@@ -40,6 +42,17 @@ struct nvme_bar {
 #define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)
 #define NVME_CAP_MPSMAX(cap)	(((cap) >> 52) & 0xf)
 
+#define NVME_CMB_BIR(cmbloc)	((cmbloc) & 0x7)
+#define NVME_CMB_OFST(cmbloc)	(((cmbloc) >> 12) & 0xfffff)
+#define NVME_CMB_SZ(cmbsz)	(((cmbsz) >> 12) & 0xfffff)
+#define NVME_CMB_SZU(cmbsz)	(((cmbsz) >> 8) & 0xf)
+
+#define NVME_CMB_WDS(cmbsz)	((cmbsz) & 0x10)
+#define NVME_CMB_RDS(cmbsz)	((cmbsz) & 0x8)
+#define NVME_CMB_LISTS(cmbsz)	((cmbsz) & 0x4)
+#define NVME_CMB_CQS(cmbsz)	((cmbsz) & 0x2)
+#define NVME_CMB_SQS(cmbsz)	((cmbsz) & 0x1)
+
 enum {
 	NVME_CC_ENABLE		= 1 << 0,
 	NVME_CC_CSS_NVM		= 0 << 4,
@@ -100,6 +113,10 @@ struct nvme_dev {
 	u32 max_hw_sectors;
 	u32 stripe_size;
 	u32 page_size;
+	void __iomem *cmb;
+	dma_addr_t cmb_dma_addr;
+	u64 cmb_size;
+	u32 cmbsz;
 	u16 oncs;
 	u16 abort_limit;
 	u8 event_limit;
-- 
cgit v1.2.3-70-g09d2


From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:46 +0200
Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls

Provides infrastructure to parse/dump/store encap information for
light weight tunnels like mpls. Encap information for such tunnels
is associated with fib routes.

This infrastructure is based on previous suggestions from
Eric Biederman to follow the xfrm infrastructure.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lwtunnel.h      |   6 ++
 include/net/lwtunnel.h        | 132 +++++++++++++++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  15 ++++
 net/Kconfig                   |   7 ++
 net/core/Makefile             |   1 +
 net/core/lwtunnel.c           | 179 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 340 insertions(+)
 create mode 100644 include/linux/lwtunnel.h
 create mode 100644 include/net/lwtunnel.h
 create mode 100644 include/uapi/linux/lwtunnel.h
 create mode 100644 net/core/lwtunnel.c

(limited to 'include/linux')

diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h
new file mode 100644
index 000000000000..97f32f8b4ae1
--- /dev/null
+++ b/include/linux/lwtunnel.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_LWTUNNEL_H_
+#define _LINUX_LWTUNNEL_H_
+
+#include <uapi/linux/lwtunnel.h>
+
+#endif /* _LINUX_LWTUNNEL_H_ */
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
new file mode 100644
index 000000000000..df24b3611ff4
--- /dev/null
+++ b/include/net/lwtunnel.h
@@ -0,0 +1,132 @@
+#ifndef __NET_LWTUNNEL_H
+#define __NET_LWTUNNEL_H 1
+
+#include <linux/lwtunnel.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <net/route.h>
+
+#define LWTUNNEL_HASH_BITS   7
+#define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
+
+/* lw tunnel state flags */
+#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1
+
+struct lwtunnel_state {
+	__u16		type;
+	__u16		flags;
+	atomic_t	refcnt;
+	int             len;
+	__u8            data[0];
+};
+
+struct lwtunnel_encap_ops {
+	int (*build_state)(struct net_device *dev, struct nlattr *encap,
+			   struct lwtunnel_state **ts);
+	int (*output)(struct sock *sk, struct sk_buff *skb);
+	int (*fill_encap)(struct sk_buff *skb,
+			  struct lwtunnel_state *lwtstate);
+	int (*get_encap_size)(struct lwtunnel_state *lwtstate);
+	int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
+};
+
+extern const struct lwtunnel_encap_ops __rcu *
+		lwtun_encaps[LWTUNNEL_ENCAP_MAX+1];
+
+#ifdef CONFIG_LWTUNNEL
+static inline void lwtunnel_state_get(struct lwtunnel_state *lws)
+{
+	atomic_inc(&lws->refcnt);
+}
+
+static inline void lwtunnel_state_put(struct lwtunnel_state *lws)
+{
+	if (!lws)
+		return;
+
+	if (atomic_dec_and_test(&lws->refcnt))
+		kfree(lws);
+}
+
+static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
+{
+	if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT))
+		return true;
+
+	return false;
+}
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
+			   unsigned int num);
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
+			   unsigned int num);
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+			 struct nlattr *encap,
+			 struct lwtunnel_state **lws);
+int lwtunnel_fill_encap(struct sk_buff *skb,
+			struct lwtunnel_state *lwtstate);
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
+struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
+
+#else
+
+static inline void lwtunnel_state_get(struct lwtunnel_state *lws)
+{
+}
+
+static inline void lwtunnel_state_put(struct lwtunnel_state *lws)
+{
+}
+
+static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
+{
+	return false;
+}
+
+static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
+					 unsigned int num)
+{
+	return -EOPNOTSUPP;
+
+}
+
+static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
+					 unsigned int num)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+				       struct nlattr *encap,
+				       struct lwtunnel_state **lws)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int lwtunnel_fill_encap(struct sk_buff *skb,
+				      struct lwtunnel_state *lwtstate)
+{
+	return 0;
+}
+
+static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+	return 0;
+}
+
+static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len)
+{
+	return NULL;
+}
+
+static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a,
+				     struct lwtunnel_state *b)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
new file mode 100644
index 000000000000..aa611d931a31
--- /dev/null
+++ b/include/uapi/linux/lwtunnel.h
@@ -0,0 +1,15 @@
+#ifndef _UAPI_LWTUNNEL_H_
+#define _UAPI_LWTUNNEL_H_
+
+#include <linux/types.h>
+
+enum lwtunnel_encap_types {
+	LWTUNNEL_ENCAP_NONE,
+	LWTUNNEL_ENCAP_MPLS,
+	__LWTUNNEL_ENCAP_MAX,
+};
+
+#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1)
+
+
+#endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/net/Kconfig b/net/Kconfig
index 57a7c5af3175..7021c1bf44d6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -374,6 +374,13 @@ source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
 
+config LWTUNNEL
+	bool "Network light weight tunnels"
+	---help---
+	  This feature provides an infrastructure to support light weight
+	  tunnels like mpls. There is no netdevice associated with a light
+	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
+	  with light weight tunnel state associated with fib routes.
 
 endif   # if NET
 
diff --git a/net/core/Makefile b/net/core/Makefile
index fec0856dd6c0..086b01fbe1bd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
 obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
new file mode 100644
index 000000000000..d7ae3a235b4b
--- /dev/null
+++ b/net/core/lwtunnel.c
@@ -0,0 +1,179 @@
+/*
+ * lwtunnel	Infrastructure for light weight tunnels like mpls
+ *
+ * Authors:	Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/lwtunnel.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <net/lwtunnel.h>
+#include <net/rtnetlink.h>
+
+struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
+{
+	struct lwtunnel_state *lws;
+
+	lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC);
+
+	return lws;
+}
+EXPORT_SYMBOL(lwtunnel_state_alloc);
+
+const struct lwtunnel_encap_ops __rcu *
+		lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
+			   unsigned int num)
+{
+	if (num > LWTUNNEL_ENCAP_MAX)
+		return -ERANGE;
+
+	return !cmpxchg((const struct lwtunnel_encap_ops **)
+			&lwtun_encaps[num],
+			NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(lwtunnel_encap_add_ops);
+
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
+			   unsigned int encap_type)
+{
+	int ret;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE ||
+	    encap_type > LWTUNNEL_ENCAP_MAX)
+		return -ERANGE;
+
+	ret = (cmpxchg((const struct lwtunnel_encap_ops **)
+		       &lwtun_encaps[encap_type],
+		       ops, NULL) == ops) ? 0 : -1;
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_encap_del_ops);
+
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+			 struct nlattr *encap, struct lwtunnel_state **lws)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = -EINVAL;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE ||
+	    encap_type > LWTUNNEL_ENCAP_MAX)
+		return ret;
+
+	ret = -EOPNOTSUPP;
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[encap_type]);
+	if (likely(ops && ops->build_state))
+		ret = ops->build_state(dev, encap, lws);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+{
+	const struct lwtunnel_encap_ops *ops;
+	struct nlattr *nest;
+	int ret = -EINVAL;
+
+	if (!lwtstate)
+		return 0;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	ret = -EOPNOTSUPP;
+	nest = nla_nest_start(skb, RTA_ENCAP);
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->fill_encap))
+		ret = ops->fill_encap(skb, lwtstate);
+	rcu_read_unlock();
+
+	if (ret)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+	ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+	if (ret)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+
+	return (ret == -EOPNOTSUPP ? 0 : ret);
+}
+EXPORT_SYMBOL(lwtunnel_fill_encap);
+
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = 0;
+
+	if (!lwtstate)
+		return 0;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->get_encap_size))
+		ret = nla_total_size(ops->get_encap_size(lwtstate));
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_get_encap_size);
+
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = 0;
+
+	if (!a && !b)
+		return 0;
+
+	if (!a || !b)
+		return 1;
+
+	if (a->type != b->type)
+		return 1;
+
+	if (a->type == LWTUNNEL_ENCAP_NONE ||
+	    a->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[a->type]);
+	if (likely(ops && ops->cmp_encap))
+		ret = ops->cmp_encap(a, b);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_cmp_encap);
-- 
cgit v1.2.3-70-g09d2


From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:53 +0200
Subject: mpls: ip tunnel support

This implementation uses lwtunnel infrastructure to register
hooks for mpls tunnel encaps.

It picks cues from iptunnel_encaps infrastructure and previous
mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mpls_iptunnel.h      |   6 +
 include/net/mpls_iptunnel.h        |  29 +++++
 include/uapi/linux/mpls_iptunnel.h |  28 +++++
 net/mpls/Kconfig                   |   8 +-
 net/mpls/Makefile                  |   1 +
 net/mpls/mpls_iptunnel.c           | 233 +++++++++++++++++++++++++++++++++++++
 6 files changed, 304 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mpls_iptunnel.h
 create mode 100644 include/net/mpls_iptunnel.h
 create mode 100644 include/uapi/linux/mpls_iptunnel.h
 create mode 100644 net/mpls/mpls_iptunnel.c

(limited to 'include/linux')

diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h
new file mode 100644
index 000000000000..ef29eb2d6dfd
--- /dev/null
+++ b/include/linux/mpls_iptunnel.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_MPLS_IPTUNNEL_H
+#define _LINUX_MPLS_IPTUNNEL_H
+
+#include <uapi/linux/mpls_iptunnel.h>
+
+#endif  /* _LINUX_MPLS_IPTUNNEL_H */
diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
new file mode 100644
index 000000000000..4757997f76ed
--- /dev/null
+++ b/include/net/mpls_iptunnel.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2015 Cumulus Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _NET_MPLS_IPTUNNEL_H
+#define _NET_MPLS_IPTUNNEL_H 1
+
+#define MAX_NEW_LABELS 2
+
+struct mpls_iptunnel_encap {
+	u32	label[MAX_NEW_LABELS];
+	u32	labels;
+};
+
+static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate)
+{
+	return (struct mpls_iptunnel_encap *)lwtstate->data;
+}
+
+#endif
diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h
new file mode 100644
index 000000000000..d80a0498f77e
--- /dev/null
+++ b/include/uapi/linux/mpls_iptunnel.h
@@ -0,0 +1,28 @@
+/*
+ *	mpls tunnel api
+ *
+ *	Authors:
+ *		Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H
+#define _UAPI_LINUX_MPLS_IPTUNNEL_H
+
+/* MPLS tunnel attributes
+ * [RTA_ENCAP] = {
+ *     [MPLS_IPTUNNEL_DST]
+ * }
+ */
+enum {
+	MPLS_IPTUNNEL_UNSPEC,
+	MPLS_IPTUNNEL_DST,
+	__MPLS_IPTUNNEL_MAX,
+};
+#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1)
+
+#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
index 17bde799c854..5c467ef97311 100644
--- a/net/mpls/Kconfig
+++ b/net/mpls/Kconfig
@@ -24,7 +24,13 @@ config NET_MPLS_GSO
 
 config MPLS_ROUTING
 	tristate "MPLS: routing support"
-	help
+	---help---
 	 Add support for forwarding of mpls packets.
 
+config MPLS_IPTUNNEL
+	tristate "MPLS: IP over MPLS tunnel support"
+	depends on LWTUNNEL && MPLS_ROUTING
+	---help---
+	 mpls ip tunnel support.
+
 endif # MPLS
diff --git a/net/mpls/Makefile b/net/mpls/Makefile
index 65bbe68c72e6..9ca923625016 100644
--- a/net/mpls/Makefile
+++ b/net/mpls/Makefile
@@ -3,5 +3,6 @@
 #
 obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
 obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
+obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o
 
 mpls_router-y := af_mpls.o
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
new file mode 100644
index 000000000000..eea096f21ba5
--- /dev/null
+++ b/net/mpls/mpls_iptunnel.c
@@ -0,0 +1,233 @@
+/*
+ * mpls tunnels	An implementation mpls tunnels using the light weight tunnel
+ *		infrastructure
+ *
+ * Authors:	Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/dst.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/mpls_iptunnel.h>
+#include <linux/mpls_iptunnel.h>
+#include "internal.h"
+
+static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
+	[MPLS_IPTUNNEL_DST]	= { .type = NLA_U32 },
+};
+
+static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
+{
+	/* The size of the layer 2.5 labels to be added for this route */
+	return en->labels * sizeof(struct mpls_shim_hdr);
+}
+
+int mpls_output(struct sock *sk, struct sk_buff *skb)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	struct mpls_shim_hdr *hdr;
+	struct net_device *out_dev;
+	unsigned int hh_len;
+	unsigned int new_header_size;
+	unsigned int mtu;
+	struct dst_entry *dst = skb_dst(skb);
+	struct rtable *rt = NULL;
+	struct rt6_info *rt6 = NULL;
+	struct lwtunnel_state *lwtstate = NULL;
+	int err = 0;
+	bool bos;
+	int i;
+	unsigned int ttl;
+
+	/* Obtain the ttl */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ttl = ip_hdr(skb)->ttl;
+		rt = (struct rtable *)dst;
+		lwtstate = rt->rt_lwtstate;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ttl = ipv6_hdr(skb)->hop_limit;
+		rt6 = (struct rt6_info *)dst;
+		lwtstate = rt6->rt6i_lwtstate;
+	} else {
+		goto drop;
+	}
+
+	skb_orphan(skb);
+
+	/* Find the output device */
+	out_dev = rcu_dereference(dst->dev);
+	if (!mpls_output_possible(out_dev) ||
+	    !lwtstate || skb_warn_if_lro(skb))
+		goto drop;
+
+	skb_forward_csum(skb);
+
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	/* Verify the destination can hold the packet */
+	new_header_size = mpls_encap_size(tun_encap_info);
+	mtu = mpls_dev_mtu(out_dev);
+	if (mpls_pkt_too_big(skb, mtu - new_header_size))
+		goto drop;
+
+	hh_len = LL_RESERVED_SPACE(out_dev);
+	if (!out_dev->header_ops)
+		hh_len = 0;
+
+	/* Ensure there is enough space for the headers in the skb */
+	if (skb_cow(skb, hh_len + new_header_size))
+		goto drop;
+
+	skb_push(skb, new_header_size);
+	skb_reset_network_header(skb);
+
+	skb->dev = out_dev;
+	skb->protocol = htons(ETH_P_MPLS_UC);
+
+	/* Push the new labels */
+	hdr = mpls_hdr(skb);
+	bos = true;
+	for (i = tun_encap_info->labels - 1; i >= 0; i--) {
+		hdr[i] = mpls_entry_encode(tun_encap_info->label[i],
+					   ttl, 0, bos);
+		bos = false;
+	}
+
+	if (rt)
+		err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
+				 skb);
+	else if (rt6)
+		err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway,
+				 skb);
+	if (err)
+		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
+				    __func__, err);
+
+	return 0;
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+			    struct lwtunnel_state **ts)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
+	struct lwtunnel_state *newts;
+	int tun_encap_info_len;
+	int ret;
+
+	ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
+			       mpls_iptunnel_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[MPLS_IPTUNNEL_DST])
+		return -EINVAL;
+
+	tun_encap_info_len = sizeof(*tun_encap_info);
+
+	newts = lwtunnel_state_alloc(tun_encap_info_len);
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = tun_encap_info_len;
+	tun_encap_info = mpls_lwtunnel_encap(newts);
+	ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
+			     &tun_encap_info->labels, tun_encap_info->label);
+	if (ret)
+		goto errout;
+	newts->type = LWTUNNEL_ENCAP_MPLS;
+	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+	*ts = newts;
+
+	return 0;
+
+errout:
+	kfree(newts);
+	*ts = NULL;
+
+	return ret;
+}
+
+static int mpls_fill_encap_info(struct sk_buff *skb,
+				struct lwtunnel_state *lwtstate)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
+			   tun_encap_info->label))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	return nla_total_size(tun_encap_info->labels * 4);
+}
+
+static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a);
+	struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b);
+	int l;
+
+	if (a_hdr->labels != b_hdr->labels)
+		return 1;
+
+	for (l = 0; l < MAX_NEW_LABELS; l++)
+		if (a_hdr->label[l] != b_hdr->label[l])
+			return 1;
+	return 0;
+}
+
+static const struct lwtunnel_encap_ops mpls_iptun_ops = {
+	.build_state = mpls_build_state,
+	.output = mpls_output,
+	.fill_encap = mpls_fill_encap_info,
+	.get_encap_size = mpls_encap_nlsize,
+	.cmp_encap = mpls_encap_cmp,
+};
+
+static int __init mpls_iptunnel_init(void)
+{
+	return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_init(mpls_iptunnel_init);
+
+static void __exit mpls_iptunnel_exit(void)
+{
+	lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_exit(mpls_iptunnel_exit);
+
+MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-70-g09d2


From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:43:58 +0200
Subject: vxlan: Flow based tunneling

Allows putting a VXLAN device into a new flow-based mode in which
skbs with a ip_tunnel_info dst metadata attached will be encapsulated
according to the instructions stored in there with the VXLAN device
defaults taken into consideration.

Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is
set, the packet processing will populate a ip_tunnel_info struct for
each packet received and attach it to the skb using the new metadata
dst.  The metadata structure will contain the outer header and tunnel
header fields which have been stripped off. Layers further up in the
stack such as routing, tc or netfitler can later match on these fields
and perform forwarding. It is the responsibility of upper layers to
ensure that the flag is set if the metadata is needed. The flag limits
the additional cost of metadata collecting based on demand.

This prepares the VXLAN device to be steered by the routing and other
subsystems which allows to support encapsulation for a large number
of tunnel endpoints and tunnel ids through a single net_device which
improves the scalability.

It also allows for OVS to leverage this mode which in turn allows for
the removal of the OVS specific VXLAN code.

Because the skb is currently scrubed in vxlan_rcv(), the attachment of
the new dst metadata is postponed until after scrubing which requires
the temporary addition of a new member to vxlan_metadata. This member
is removed again in a later commit after the indirect VXLAN receive API
has been removed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 149 ++++++++++++++++++++++++++++++++++++-------
 include/linux/skbuff.h       |   1 +
 include/net/dst_metadata.h   |  13 ++++
 include/net/ip_tunnels.h     |  14 ++++
 include/net/vxlan.h          |  10 ++-
 include/uapi/linux/if_link.h |   1 +
 6 files changed, 165 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ec86a11743fd..06c092b05a51 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -49,6 +49,7 @@
 #include <net/ip6_tunnel.h>
 #include <net/ip6_checksum.h>
 #endif
+#include <net/dst_metadata.h>
 
 #define VXLAN_VERSION	"0.1"
 
@@ -140,6 +141,11 @@ struct vxlan_dev {
 static u32 vxlan_salt __read_mostly;
 static struct workqueue_struct *vxlan_wq;
 
+static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
+{
+	return vs->flags & VXLAN_F_COLLECT_METADATA;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline
 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+	struct metadata_dst *tun_dst = NULL;
+	struct ip_tunnel_info *info;
 	struct vxlan_sock *vs;
 	struct vxlanhdr *vxh;
 	u32 flags, vni;
-	struct vxlan_metadata md = {0};
+	struct vxlan_metadata _md;
+	struct vxlan_metadata *md = &_md;
 
 	/* Need Vxlan and inner Ethernet header to be present */
 	if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		vni &= VXLAN_VNI_MASK;
 	}
 
+	if (vxlan_collect_metadata(vs)) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
+		if (!tun_dst)
+			goto drop;
+
+		info = &tun_dst->u.tun_info;
+		info->key.ipv4_src = iph->saddr;
+		info->key.ipv4_dst = iph->daddr;
+		info->key.ipv4_tos = iph->tos;
+		info->key.ipv4_ttl = iph->ttl;
+		info->key.tp_src = udp_hdr(skb)->source;
+		info->key.tp_dst = udp_hdr(skb)->dest;
+
+		info->mode = IP_TUNNEL_INFO_RX;
+		info->key.tun_flags = TUNNEL_KEY;
+		info->key.tun_id = cpu_to_be64(vni >> 8);
+		if (udp_hdr(skb)->check != 0)
+			info->key.tun_flags |= TUNNEL_CSUM;
+
+		md = ip_tunnel_info_opts(info, sizeof(*md));
+		md->tun_dst = tun_dst;
+	} else {
+		memset(md, 0, sizeof(*md));
+	}
+
 	/* For backwards compatibility, only allow reserved fields to be
 	 * used by VXLAN extensions if explicitly requested.
 	 */
@@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		struct vxlanhdr_gbp *gbp;
 
 		gbp = (struct vxlanhdr_gbp *)vxh;
-		md.gbp = ntohs(gbp->policy_id);
+		md->gbp = ntohs(gbp->policy_id);
+
+		if (tun_dst)
+			info->key.tun_flags |= TUNNEL_VXLAN_OPT;
 
 		if (gbp->dont_learn)
-			md.gbp |= VXLAN_GBP_DONT_LEARN;
+			md->gbp |= VXLAN_GBP_DONT_LEARN;
 
 		if (gbp->policy_applied)
-			md.gbp |= VXLAN_GBP_POLICY_APPLIED;
+			md->gbp |= VXLAN_GBP_POLICY_APPLIED;
 
 		flags &= ~VXLAN_GBP_USED_BITS;
 	}
@@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		goto bad_flags;
 	}
 
-	md.vni = vxh->vx_vni;
-	vs->rcv(vs, skb, &md);
+	md->vni = vxh->vx_vni;
+	vs->rcv(vs, skb, md);
 	return 0;
 
 drop:
@@ -1247,6 +1286,9 @@ bad_flags:
 		   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
 
 error:
+	if (tun_dst)
+		dst_release((struct dst_entry *)tun_dst);
+
 	/* Return non vxlan pkt */
 	return 1;
 }
@@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	int err = 0;
 	union vxlan_addr *remote_ip;
 
-	vni = ntohl(md->vni) >> 8;
+	/* For flow based devices, map all packets to VNI 0 */
+	if (vs->flags & VXLAN_F_FLOW_BASED)
+		vni = 0;
+	else
+		vni = ntohl(md->vni) >> 8;
+
 	/* Is this VNI defined? */
 	vxlan = vxlan_vs_find_vni(vs, vni);
 	if (!vxlan)
@@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 #endif
 	}
 
+	if (md->tun_dst) {
+		skb_dst_set(skb, (struct dst_entry *)md->tun_dst);
+		md->tun_dst = NULL;
+	}
+
 	if ((vxlan->flags & VXLAN_F_LEARN) &&
 	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
 		goto drop;
 
 	skb_reset_network_header(skb);
-	skb->mark = md->gbp;
+	/* In flow-based mode, GBP is carried in dst_metadata */
+	if (!(vs->flags & VXLAN_F_FLOW_BASED))
+		skb->mark = md->gbp;
 
 	if (oip6)
 		err = IP6_ECN_decapsulate(oip6, skb);
@@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 
 	return;
 drop:
+	if (md->tun_dst)
+		dst_release((struct dst_entry *)md->tun_dst);
+
 	/* Consume bad packet */
 	kfree_skb(skb);
 }
@@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   struct vxlan_rdst *rdst, bool did_rsc)
 {
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct sock *sk = vxlan->vn_sock->sock->sk;
 	struct rtable *rt = NULL;
 	const struct iphdr *old_iph;
 	struct flowi4 fl4;
 	union vxlan_addr *dst;
-	struct vxlan_metadata md;
+	union vxlan_addr remote_ip;
+	struct vxlan_metadata _md;
+	struct vxlan_metadata *md = &_md;
 	__be16 src_port = 0, dst_port;
 	u32 vni;
 	__be16 df = 0;
 	__u8 tos, ttl;
 	int err;
+	u32 flags = vxlan->flags;
 
-	dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
-	vni = rdst->remote_vni;
-	dst = &rdst->remote_ip;
+	if (rdst) {
+		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
+		vni = rdst->remote_vni;
+		dst = &rdst->remote_ip;
+	} else {
+		if (!info) {
+			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
+				  dev->name);
+			goto drop;
+		}
+
+		dst_port = info->key.tp_dst ? : vxlan->dst_port;
+		vni = be64_to_cpu(info->key.tun_id);
+		remote_ip.sin.sin_family = AF_INET;
+		remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst;
+		dst = &remote_ip;
+	}
 
 	if (vxlan_addr_any(dst)) {
 		if (did_rsc) {
@@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				     vxlan->port_max, true);
 
 	if (dst->sa.sa_family == AF_INET) {
+		if (info) {
+			if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+				df = htons(IP_DF);
+			if (info->key.tun_flags & TUNNEL_CSUM)
+				flags |= VXLAN_F_UDP_CSUM;
+			else
+				flags &= ~VXLAN_F_UDP_CSUM;
+
+			ttl = info->key.ipv4_ttl;
+			tos = info->key.ipv4_tos;
+
+			if (info->options_len)
+				md = ip_tunnel_info_opts(info, sizeof(*md));
+		} else {
+			md->gbp = skb->mark;
+		}
+
 		memset(&fl4, 0, sizeof(fl4));
-		fl4.flowi4_oif = rdst->remote_ifindex;
+		fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
 		fl4.flowi4_tos = RT_TOS(tos);
 		fl4.flowi4_mark = skb->mark;
 		fl4.flowi4_proto = IPPROTO_UDP;
@@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-		md.vni = htonl(vni << 8);
-		md.gbp = skb->mark;
-
+		md->vni = htonl(vni << 8);
 		err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
 				     dst->sin.sin_addr.s_addr, tos, ttl, df,
-				     src_port, dst_port, &md,
+				     src_port, dst_port, md,
 				     !net_eq(vxlan->net, dev_net(vxlan->dev)),
-				     vxlan->flags);
+				     flags);
 		if (err < 0) {
 			/* skb is already freed. */
 			skb = NULL;
@@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		u32 flags;
 
 		memset(&fl6, 0, sizeof(fl6));
-		fl6.flowi6_oif = rdst->remote_ifindex;
+		fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
 		fl6.daddr = dst->sin6.sin6_addr;
 		fl6.saddr = vxlan->saddr.sin6.sin6_addr;
 		fl6.flowi6_mark = skb->mark;
@@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
-		md.vni = htonl(vni << 8);
-		md.gbp = skb->mark;
+		md->vni = htonl(vni << 8);
+		md->gbp = skb->mark;
 
 		err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
-				      0, ttl, src_port, dst_port, &md,
+				      0, ttl, src_port, dst_port, md,
 				      !net_eq(vxlan->net, dev_net(vxlan->dev)),
 				      vxlan->flags);
 #endif
@@ -2051,6 +2141,7 @@ tx_free:
 static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	struct ethhdr *eth;
 	bool did_rsc = false;
 	struct vxlan_rdst *rdst, *fdst = NULL;
@@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 #endif
 	}
 
+	if (vxlan->flags & VXLAN_F_FLOW_BASED &&
+	    info && info->mode == IP_TUNNEL_INFO_TX) {
+		vxlan_xmit_one(skb, dev, NULL, false);
+		return NETDEV_TX_OK;
+	}
+
 	f = vxlan_find_mac(vxlan, eth->h_dest);
 	did_rsc = false;
 
@@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_FLOWBASED]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
 	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
@@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (data[IFLA_VXLAN_LIMIT])
 		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
 
+	if (data[IFLA_VXLAN_FLOWBASED] &&
+	    nla_get_u8(data[IFLA_VXLAN_FLOWBASED]))
+		vxlan->flags |= VXLAN_F_FLOW_BASED;
+
 	if (data[IFLA_VXLAN_PORT_RANGE]) {
 		const struct ifla_vxlan_port_range *p
 			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
@@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_FLOWBASED */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
 		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
@@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
 			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_FLOWBASED,
+		       !!(vxlan->flags & VXLAN_F_FLOW_BASED)) ||
 	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
 	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
 	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) ||
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6bd96fe9416a..648a2c241993 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3469,5 +3469,6 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
 			       skb_network_header(skb);
 	return hdr_len + skb_gso_transport_seglen(skb);
 }
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 4f7694f3c7d0..e843937fb30a 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -8,6 +8,9 @@
 struct metadata_dst {
 	struct dst_entry		dst;
 	size_t				opts_len;
+	union {
+		struct ip_tunnel_info	tun_info;
+	} u;
 };
 
 static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
@@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
 	return NULL;
 }
 
+static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
+{
+	struct metadata_dst *md_dst = skb_metadata_dst(skb);
+
+	if (md_dst)
+		return &md_dst->u.tun_info;
+
+	return NULL;
+}
+
 static inline bool skb_valid_dst(const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 6b9d559ce5f5..d11530f1c1e2 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -38,10 +38,19 @@ struct ip_tunnel_key {
 	__be16			tp_dst;
 } __packed __aligned(4); /* Minimize padding. */
 
+/* Indicates whether the tunnel info structure represents receive
+ * or transmit tunnel parameters.
+ */
+enum {
+	IP_TUNNEL_INFO_RX,
+	IP_TUNNEL_INFO_TX,
+};
+
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
 	const void		*options;
 	u8			options_len;
+	u8			mode;
 };
 
 /* 6rd prefix/relay information */
@@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err,
 	}
 }
 
+static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n)
+{
+	return info + 1;
+}
+
 #endif /* CONFIG_INET */
 
 #endif /* __NET_IP_TUNNELS_H */
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 0082b5d33d7d..80a2da29e088 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -7,6 +7,7 @@
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/udp.h>
+#include <net/dst_metadata.h>
 
 #define VNI_HASH_BITS	10
 #define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
@@ -97,6 +98,9 @@ struct vxlanhdr {
 struct vxlan_metadata {
 	__be32		vni;
 	u32		gbp;
+
+	/* Temporary until vxlan_rcv() API is gone */
+	struct metadata_dst *tun_dst;
 };
 
 struct vxlan_sock;
@@ -130,6 +134,8 @@ struct vxlan_sock {
 #define VXLAN_F_REMCSUM_RX		0x400
 #define VXLAN_F_GBP			0x800
 #define VXLAN_F_REMCSUM_NOPARTIAL	0x1000
+#define VXLAN_F_COLLECT_METADATA	0x2000
+#define VXLAN_F_FLOW_BASED		0x4000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
@@ -137,7 +143,9 @@ struct vxlan_sock {
 #define VXLAN_F_RCV_FLAGS		(VXLAN_F_GBP |			\
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
 					 VXLAN_F_REMCSUM_RX |		\
-					 VXLAN_F_REMCSUM_NOPARTIAL)
+					 VXLAN_F_REMCSUM_NOPARTIAL |	\
+					 VXLAN_F_COLLECT_METADATA |	\
+					 VXLAN_F_FLOW_BASED)
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 24d68b797c59..9eeb5d9cf8f0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -382,6 +382,7 @@ enum {
 	IFLA_VXLAN_REMCSUM_RX,
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
+	IFLA_VXLAN_FLOWBASED,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3-70-g09d2


From 932c435caba8a2ce473a91753bad0173269ef334 Mon Sep 17 00:00:00 2001
From: Mark Rustad <mark.d.rustad@intel.com>
Date: Mon, 13 Jul 2015 11:40:02 -0700
Subject: PCI: Add dev_flags bit to access VPD through function 0

Add a dev_flags bit, PCI_DEV_FLAGS_VPD_REF_F0, to access VPD through
function 0 to provide VPD access on other functions.  This is for hardware
devices that provide copies of the same VPD capability registers in
multiple functions.  Because the kernel expects that each function has its
own registers, both the locking and the state tracking are affected by VPD
accesses to different functions.

On such devices for example, if a VPD write is performed on function 0,
*any* later attempt to read VPD from any other function of that device will
hang.  This has to do with how the kernel tracks the expected value of the
F bit per function.

Concurrent accesses to different functions of the same device can not only
hang but also corrupt both read and write VPD data.

When hangs occur, typically the error message:

  vpd r/w failed.  This is likely a firmware bug on this device.

will be seen.

Never set this bit on function 0 or there will be an infinite recursion.

Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Alexander Duyck <alexander.h.duyck@redhat.com>
CC: stable@vger.kernel.org
---
 drivers/pci/access.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/pci.h  |  2 ++
 2 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 5465b005220c..769f7e35f1a2 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -439,6 +439,56 @@ static const struct pci_vpd_ops pci_vpd_pci22_ops = {
 	.release = pci_vpd_pci22_release,
 };
 
+static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count,
+			       void *arg)
+{
+	struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn));
+	ssize_t ret;
+
+	if (!tdev)
+		return -ENODEV;
+
+	ret = pci_read_vpd(tdev, pos, count, arg);
+	pci_dev_put(tdev);
+	return ret;
+}
+
+static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count,
+				const void *arg)
+{
+	struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn));
+	ssize_t ret;
+
+	if (!tdev)
+		return -ENODEV;
+
+	ret = pci_write_vpd(tdev, pos, count, arg);
+	pci_dev_put(tdev);
+	return ret;
+}
+
+static const struct pci_vpd_ops pci_vpd_f0_ops = {
+	.read = pci_vpd_f0_read,
+	.write = pci_vpd_f0_write,
+	.release = pci_vpd_pci22_release,
+};
+
+static int pci_vpd_f0_dev_check(struct pci_dev *dev)
+{
+	struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn));
+	int ret = 0;
+
+	if (!tdev)
+		return -ENODEV;
+	if (!tdev->vpd || !tdev->multifunction ||
+	    dev->class != tdev->class || dev->vendor != tdev->vendor ||
+	    dev->device != tdev->device)
+		ret = -ENODEV;
+
+	pci_dev_put(tdev);
+	return ret;
+}
+
 int pci_vpd_pci22_init(struct pci_dev *dev)
 {
 	struct pci_vpd_pci22 *vpd;
@@ -447,12 +497,21 @@ int pci_vpd_pci22_init(struct pci_dev *dev)
 	cap = pci_find_capability(dev, PCI_CAP_ID_VPD);
 	if (!cap)
 		return -ENODEV;
+	if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) {
+		int ret = pci_vpd_f0_dev_check(dev);
+
+		if (ret)
+			return ret;
+	}
 	vpd = kzalloc(sizeof(*vpd), GFP_ATOMIC);
 	if (!vpd)
 		return -ENOMEM;
 
 	vpd->base.len = PCI_VPD_PCI22_SIZE;
-	vpd->base.ops = &pci_vpd_pci22_ops;
+	if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0)
+		vpd->base.ops = &pci_vpd_f0_ops;
+	else
+		vpd->base.ops = &pci_vpd_pci22_ops;
 	mutex_init(&vpd->lock);
 	vpd->cap = cap;
 	vpd->busy = false;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a0321a8fb59..8edb125db13a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -180,6 +180,8 @@ enum pci_dev_flags {
 	PCI_DEV_FLAGS_NO_BUS_RESET = (__force pci_dev_flags_t) (1 << 6),
 	/* Do not use PM reset even if device advertises NoSoftRst- */
 	PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7),
+	/* Get VPD from function 0 VPD */
+	PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8),
 };
 
 enum pci_irq_reroute_variant {
-- 
cgit v1.2.3-70-g09d2


From 019d8817b1b064c2bacfbcf40fc68184438ad05a Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 15 Jul 2015 14:40:06 +0200
Subject: PM / sleep: Allow devices without runtime PM to do direct-complete

Don't unset the direct_complete flag on devices that have runtime PM
disabled, if they are runtime suspended.

This is needed because otherwise ancestor devices wouldn't be able to
do direct_complete without adding runtime PM support to all its
descendants.

Also removes pm_runtime_suspended_if_enabled() because it's now unused.

Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/devices.txt    | 7 +++++++
 Documentation/power/runtime_pm.txt | 4 ----
 drivers/base/power/main.c          | 2 +-
 include/linux/pm_runtime.h         | 6 ------
 4 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index d172bce0fd49..8ba6625fdd63 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -341,6 +341,13 @@ the phases are:
 	and is entirely responsible for bringing the device back to the
 	functional state as appropriate.
 
+	Note that this direct-complete procedure applies even if the device is
+	disabled for runtime PM; only the runtime-PM status matters.  It follows
+	that if a device has system-sleep callbacks but does not support runtime
+	PM, then its prepare callback must never return a positive value.  This
+	is because all devices are initially set to runtime-suspended with
+	runtime PM disabled.
+
     2.	The suspend methods should quiesce the device to stop it from performing
 	I/O.  They also may save the device registers and put it into the
 	appropriate low-power state, depending on the bus type the device is on,
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index e76dc0ad4d2b..0784bc3a2ab5 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -445,10 +445,6 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
   bool pm_runtime_status_suspended(struct device *dev);
     - return true if the device's runtime PM status is 'suspended'
 
-  bool pm_runtime_suspended_if_enabled(struct device *dev);
-    - return true if the device's runtime PM status is 'suspended' and its
-      'power.disable_depth' field is equal to 1
-
   void pm_runtime_allow(struct device *dev);
     - set the power.runtime_auto flag for the device and decrease its usage
       counter (used by the /sys/devices/.../power/control interface to
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 30b7bbfdc558..1710c26ba097 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1377,7 +1377,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 	if (dev->power.direct_complete) {
 		if (pm_runtime_status_suspended(dev)) {
 			pm_runtime_disable(dev);
-			if (pm_runtime_suspended_if_enabled(dev))
+			if (pm_runtime_status_suspended(dev))
 				goto Complete;
 
 			pm_runtime_enable(dev);
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 30e84d48bfea..3bdbb4189780 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -98,11 +98,6 @@ static inline bool pm_runtime_status_suspended(struct device *dev)
 	return dev->power.runtime_status == RPM_SUSPENDED;
 }
 
-static inline bool pm_runtime_suspended_if_enabled(struct device *dev)
-{
-	return pm_runtime_status_suspended(dev) && dev->power.disable_depth == 1;
-}
-
 static inline bool pm_runtime_enabled(struct device *dev)
 {
 	return !dev->power.disable_depth;
@@ -164,7 +159,6 @@ static inline void device_set_run_wake(struct device *dev, bool enable) {}
 static inline bool pm_runtime_suspended(struct device *dev) { return false; }
 static inline bool pm_runtime_active(struct device *dev) { return true; }
 static inline bool pm_runtime_status_suspended(struct device *dev) { return false; }
-static inline bool pm_runtime_suspended_if_enabled(struct device *dev) { return false; }
 static inline bool pm_runtime_enabled(struct device *dev) { return false; }
 
 static inline void pm_runtime_no_callbacks(struct device *dev) {}
-- 
cgit v1.2.3-70-g09d2


From 7d0c502040a23a5924d3021651cf5326c8694a77 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Thu, 19 Feb 2015 14:44:24 +0100
Subject: cpufeature: correctly annotate the module init function

A section mismatch warning is reported if an __init annotated function is
specified for module_cpu_feature_match().
Change the module_cpu_feature_match() function and annotate the generated
cpu_feature_match_* function as __init.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 include/linux/cpufeature.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufeature.h b/include/linux/cpufeature.h
index c4d4eb8ac9fe..986c06c88d81 100644
--- a/include/linux/cpufeature.h
+++ b/include/linux/cpufeature.h
@@ -11,6 +11,7 @@
 
 #ifdef CONFIG_GENERIC_CPU_AUTOPROBE
 
+#include <linux/init.h>
 #include <linux/mod_devicetable.h>
 #include <asm/cpufeature.h>
 
@@ -43,16 +44,16 @@
  * For a list of legal values for 'feature', please consult the file
  * 'asm/cpufeature.h' of your favorite architecture.
  */
-#define module_cpu_feature_match(x, __init)			\
+#define module_cpu_feature_match(x, __initfunc)			\
 static struct cpu_feature const cpu_feature_match_ ## x[] =	\
 	{ { .feature = cpu_feature(x) }, { } };			\
 MODULE_DEVICE_TABLE(cpu, cpu_feature_match_ ## x);		\
 								\
-static int cpu_feature_match_ ## x ## _init(void)		\
+static int __init cpu_feature_match_ ## x ## _init(void)	\
 {								\
 	if (!cpu_have_feature(cpu_feature(x)))			\
 		return -ENODEV;					\
-	return __init();					\
+	return __initfunc();					\
 }								\
 module_init(cpu_feature_match_ ## x ## _init)
 
-- 
cgit v1.2.3-70-g09d2


From c179c9b978b90bdf9cb39f5b5716dede157f1eaf Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 9 Jul 2015 16:00:36 +0800
Subject: PCI: Add helper function msi_desc_to_pci_sysdata()

Add helper function msi_desc_to_pci_sysdata() to retrieve sysdata from
an MSI descriptor. To avoid pulling include/linux/pci.h into
include/linux/msi.h, msi_desc_to_pci_sysdata() is implemented as a normal
function instead of an inline function.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Reviewed-by: Yijing Wang <wangyijing@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Grant Likely <grant.likely@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Stuart Yoder <stuart.yoder@freescale.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Alexander Gordeev <agordeev@redhat.com>
Link: http://lkml.kernel.org/r/1436428847-8886-2-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c   | 8 ++++++++
 include/linux/msi.h | 7 +++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 157eb8817fb8..ab4174243962 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1137,6 +1137,14 @@ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
 }
 EXPORT_SYMBOL(pci_enable_msix_range);
 
+void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
+{
+	struct pci_dev *dev = msi_desc_to_pci_dev(desc);
+
+	return dev->bus->sysdata;
+}
+EXPORT_SYMBOL_GPL(msi_desc_to_pci_sysdata);
+
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
 /**
  * pci_msi_domain_write_msg - Helper to write MSI message to PCI config space
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 8ac4a68ffae2..cfbd2afeaf64 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -60,6 +60,13 @@ static inline struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc)
 {
 	return desc->dev;
 }
+
+void *msi_desc_to_pci_sysdata(struct msi_desc *desc);
+#else /* CONFIG_PCI_MSI */
+static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
+{
+	return NULL;
+}
 #endif /* CONFIG_PCI_MSI */
 
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
-- 
cgit v1.2.3-70-g09d2


From 4a7cc831670550e6b48ef5760e7213f89935ff0d Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 9 Jul 2015 16:00:44 +0800
Subject: genirq/MSI: Move msi_list from struct pci_dev to struct device

Move msi_list from struct pci_dev into struct device, so we can
support non-PCI-device based generic MSI interrupts.

msi_list is now conditional under CONFIG_GENERIC_MSI_IRQ, which is
selected from CONFIG_PCI_MSI, so no functional change for PCI MSI
users.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Reviewed-by: Yijing Wang <wangyijing@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Grant Likely <grant.likely@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Stuart Yoder <stuart.yoder@freescale.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Joe Perches <joe@perches.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Alexander Gordeev <agordeev@redhat.com>
Link: http://lkml.kernel.org/r/1436428847-8886-10-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/base/core.c    | 3 +++
 drivers/pci/msi.c      | 3 +--
 include/linux/device.h | 4 ++++
 include/linux/msi.h    | 2 +-
 include/linux/pci.h    | 1 -
 5 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index dafae6d2f7ac..18e2a89aa138 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -662,6 +662,9 @@ void device_initialize(struct device *dev)
 	INIT_LIST_HEAD(&dev->devres_head);
 	device_pm_init(dev);
 	set_dev_node(dev, -1);
+#ifdef CONFIG_GENERIC_MSI_IRQ
+	INIT_LIST_HEAD(&dev->msi_list);
+#endif
 }
 EXPORT_SYMBOL_GPL(device_initialize);
 
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index f0714c3fd315..4ef5021a084d 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -900,7 +900,7 @@ void pci_msi_shutdown(struct pci_dev *dev)
 		return;
 
 	BUG_ON(list_empty(dev_to_msi_list(&dev->dev)));
-	desc = first_msi_entry(dev);
+	desc = first_pci_msi_entry(dev);
 
 	pci_msi_set_enable(dev, 0);
 	pci_intx_for_msi(dev, 1);
@@ -1044,7 +1044,6 @@ EXPORT_SYMBOL(pci_msi_enabled);
 
 void pci_msi_init_pci_dev(struct pci_dev *dev)
 {
-	INIT_LIST_HEAD(&dev->msi_list);
 }
 
 /**
diff --git a/include/linux/device.h b/include/linux/device.h
index 5a31bf3a4024..22227e7fe463 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -713,6 +713,7 @@ struct device_dma_parameters {
  * 		along with subsystem-level and driver-level callbacks.
  * @pins:	For device pin management.
  *		See Documentation/pinctrl.txt for details.
+ * @msi_list:	Hosts MSI descriptors
  * @numa_node:	NUMA node this device is close to.
  * @dma_mask:	Dma mask (if dma'ble device).
  * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
@@ -776,6 +777,9 @@ struct device {
 #ifdef CONFIG_PINCTRL
 	struct dev_pin_info	*pins;
 #endif
+#ifdef CONFIG_GENERIC_MSI_IRQ
+	struct list_head	msi_list;
+#endif
 
 #ifdef CONFIG_NUMA
 	int		numa_node;	/* NUMA node this device is close to */
diff --git a/include/linux/msi.h b/include/linux/msi.h
index cfbd2afeaf64..57fe766a14bf 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -45,7 +45,7 @@ struct msi_desc {
 
 /* Helpers to hide struct msi_desc implementation details */
 #define msi_desc_to_dev(desc)		(&(desc)->dev.dev)
-#define dev_to_msi_list(dev)		(&to_pci_dev((dev))->msi_list)
+#define dev_to_msi_list(dev)		(&(dev)->msi_list)
 #define first_msi_entry(dev)		\
 	list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list)
 #define for_each_msi_entry(desc, dev)	\
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a0321a8fb59..fbf245f5eba7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -366,7 +366,6 @@ struct pci_dev {
 	struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
 	struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
 #ifdef CONFIG_PCI_MSI
-	struct list_head msi_list;
 	const struct attribute_group **msi_irq_groups;
 #endif
 	struct pci_vpd *vpd;
-- 
cgit v1.2.3-70-g09d2


From 25a98bd4ff9355a218d2e7aa4d6e3c9bc2c27d6f Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 9 Jul 2015 16:00:45 +0800
Subject: genirq/MSI: Store 'struct device' instead of 'struct pci_dev' in
 struct msi_desc

Store 'struct device *' instead of 'struct pci_dev *' in struct msi_desc,
so struct msi_desc can be reused by non PCI based MSI drivers.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Reviewed-by: Yijing Wang <wangyijing@huawei.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Grant Likely <grant.likely@linaro.org>
Cc: Stuart Yoder <stuart.yoder@freescale.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Alexander Gordeev <agordeev@redhat.com>
Link: http://lkml.kernel.org/r/1436428847-8886-11-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c   |  7 ++++++-
 include/linux/msi.h | 11 ++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4ef5021a084d..897e1a4bce06 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -413,7 +413,7 @@ static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
 		return NULL;
 
 	INIT_LIST_HEAD(&desc->list);
-	desc->dev = dev;
+	desc->dev = &dev->dev;
 
 	return desc;
 }
@@ -1140,6 +1140,11 @@ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
 }
 EXPORT_SYMBOL(pci_enable_msix_range);
 
+struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc)
+{
+	return to_pci_dev(desc->dev);
+}
+
 void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
 {
 	struct pci_dev *dev = msi_desc_to_pci_dev(desc);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 57fe766a14bf..5f77e231f515 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -14,6 +14,7 @@ extern int pci_msi_ignore_mask;
 /* Helper functions */
 struct irq_data;
 struct msi_desc;
+struct pci_dev;
 void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
 
@@ -37,14 +38,14 @@ struct msi_desc {
 		void __iomem *mask_base;
 		u8 mask_pos;
 	};
-	struct pci_dev *dev;
+	struct device *dev;
 
 	/* Last set MSI message */
 	struct msi_msg msg;
 };
 
 /* Helpers to hide struct msi_desc implementation details */
-#define msi_desc_to_dev(desc)		(&(desc)->dev.dev)
+#define msi_desc_to_dev(desc)		((desc)->dev)
 #define dev_to_msi_list(dev)		(&(dev)->msi_list)
 #define first_msi_entry(dev)		\
 	list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list)
@@ -56,11 +57,7 @@ struct msi_desc {
 #define for_each_pci_msi_entry(desc, pdev)	\
 	for_each_msi_entry((desc), &(pdev)->dev)
 
-static inline struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc)
-{
-	return desc->dev;
-}
-
+struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc);
 void *msi_desc_to_pci_sysdata(struct msi_desc *desc);
 #else /* CONFIG_PCI_MSI */
 static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
-- 
cgit v1.2.3-70-g09d2


From fc88419cfac50b05c7c1ea218b08e70c31d1b71f Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 9 Jul 2015 16:00:46 +0800
Subject: genirq/MSI: Reorginize struct msi_desc to prepare for support of
 generic MSI

Reorganize struct msi_desc so it could be reused by other MSI
drivers. We have the following layout now:

struct msi_desc {
       /* Shared device/bus independent data */
       ...
       union {
       	     /* PCI specific data */
	     struct {
	     	    ...
	     };
       };
};

We need to have anonymous union and a anonymous structure for the PCI
fields, otherwise we would have to change all instances using these
fields.

For non PCI devices we will enforce a proper namespace and a non
anonymous structure.

[ tglx: Added proper comments to the structure and massaged changelog ]

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Reviewed-by: Yijing Wang <wangyijing@huawei.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Grant Likely <grant.likely@linaro.org>
Cc: Stuart Yoder <stuart.yoder@freescale.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Alexander Gordeev <agordeev@redhat.com>
Link: http://lkml.kernel.org/r/1436428847-8886-12-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/msi.h | 70 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 5f77e231f515..518e8c4a4064 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -18,30 +18,60 @@ struct pci_dev;
 void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
 
+/**
+ * struct msi_desc - Descriptor structure for MSI based interrupts
+ * @list:	List head for management
+ * @irq:	The base interrupt number
+ * @nvec_used:	The number of vectors used
+ * @dev:	Pointer to the device which uses this descriptor
+ * @msg:	The last set MSI message cached for reuse
+ *
+ * @masked:	[PCI MSI/X] Mask bits
+ * @is_msix:	[PCI MSI/X] True if MSI-X
+ * @multiple:	[PCI MSI/X] log2 num of messages allocated
+ * @multi_cap:	[PCI MSI/X] log2 num of messages supported
+ * @maskbit:	[PCI MSI/X] Mask-Pending bit supported?
+ * @is_64:	[PCI MSI/X] Address size: 0=32bit 1=64bit
+ * @entry_nr:	[PCI MSI/X] Entry which is described by this descriptor
+ * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq
+ * @mask_pos:	[PCI MSI]   Mask register position
+ * @mask_base:	[PCI MSI-X] Mask register base address
+ */
 struct msi_desc {
-	struct {
-		__u8	is_msix	: 1;
-		__u8	multiple: 3;	/* log2 num of messages allocated */
-		__u8	multi_cap : 3;	/* log2 num of messages supported */
-		__u8	maskbit	: 1;	/* mask-pending bit supported ? */
-		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit */
-		__u16	entry_nr;	/* specific enabled entry */
-		unsigned default_irq;	/* default pre-assigned irq */
-	} msi_attrib;
-
-	u32 masked;			/* mask bits */
-	unsigned int irq;
-	unsigned int nvec_used;		/* number of messages */
-	struct list_head list;
+	/* Shared device/bus type independent data */
+	struct list_head		list;
+	unsigned int			irq;
+	unsigned int			nvec_used;
+	struct device			*dev;
+	struct msi_msg			msg;
 
 	union {
-		void __iomem *mask_base;
-		u8 mask_pos;
-	};
-	struct device *dev;
+		/* PCI MSI/X specific data */
+		struct {
+			u32 masked;
+			struct {
+				__u8	is_msix		: 1;
+				__u8	multiple	: 3;
+				__u8	multi_cap	: 3;
+				__u8	maskbit		: 1;
+				__u8	is_64		: 1;
+				__u16	entry_nr;
+				unsigned default_irq;
+			} msi_attrib;
+			union {
+				u8	mask_pos;
+				void __iomem *mask_base;
+			};
+		};
 
-	/* Last set MSI message */
-	struct msi_msg msg;
+		/*
+		 * Non PCI variants add their data structure here. New
+		 * entries need to use a named structure. We want
+		 * proper name spaces for this. The PCI part is
+		 * anonymous for now as it would require an immediate
+		 * tree wide cleanup.
+		 */
+	};
 };
 
 /* Helpers to hide struct msi_desc implementation details */
-- 
cgit v1.2.3-70-g09d2


From aa48b6f708868ab9c22ca737f27a0da832bf7f08 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 9 Jul 2015 16:00:47 +0800
Subject: genirq/MSI: Move alloc_msi_entry() from PCI into generic MSI code

Move alloc_msi_entry() from PCI MSI code into generic MSI code, so it
can be reused by other generic MSI drivers.  Also introduce
free_msi_entry() for completeness.

Suggested-by: Stuart Yoder <stuart.yoder@freescale.com>.
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Yijing Wang <wangyijing@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Grant Likely <grant.likely@linaro.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Alexander Gordeev <agordeev@redhat.com>
Link: http://lkml.kernel.org/r/1436428847-8886-13-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c   | 16 ++--------------
 include/linux/msi.h |  2 ++
 kernel/irq/msi.c    | 17 +++++++++++++++++
 3 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 897e1a4bce06..cd4c78c193de 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -406,18 +406,6 @@ static void free_msi_irqs(struct pci_dev *dev)
 	}
 }
 
-static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
-{
-	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
-	if (!desc)
-		return NULL;
-
-	INIT_LIST_HEAD(&desc->list);
-	desc->dev = &dev->dev;
-
-	return desc;
-}
-
 static void pci_intx_for_msi(struct pci_dev *dev, int enable)
 {
 	if (!(dev->dev_flags & PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG))
@@ -572,7 +560,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
 	struct msi_desc *entry;
 
 	/* MSI Entry Initialization */
-	entry = alloc_msi_entry(dev);
+	entry = alloc_msi_entry(&dev->dev);
 	if (!entry)
 		return NULL;
 
@@ -700,7 +688,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 	int i;
 
 	for (i = 0; i < nvec; i++) {
-		entry = alloc_msi_entry(dev);
+		entry = alloc_msi_entry(&dev->dev);
 		if (!entry) {
 			if (!i)
 				iounmap(base);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 518e8c4a4064..f83c87e447bc 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -96,6 +96,8 @@ static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
 }
 #endif /* CONFIG_PCI_MSI */
 
+struct msi_desc *alloc_msi_entry(struct device *dev);
+void free_msi_entry(struct msi_desc *entry);
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7bf1f1bbb7fa..7e6512b9dc1f 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,6 +18,23 @@
 /* Temparory solution for building, will be removed later */
 #include <linux/pci.h>
 
+struct msi_desc *alloc_msi_entry(struct device *dev)
+{
+	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc)
+		return NULL;
+
+	INIT_LIST_HEAD(&desc->list);
+	desc->dev = dev;
+
+	return desc;
+}
+
+void free_msi_entry(struct msi_desc *entry)
+{
+	kfree(entry);
+}
+
 void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 {
 	*msg = entry->msg;
-- 
cgit v1.2.3-70-g09d2


From 3985e8a3611a93bb36789f65db862e5700aab65e Mon Sep 17 00:00:00 2001
From: Erik Kline <ek@google.com>
Date: Wed, 22 Jul 2015 16:38:25 +0900
Subject: ipv6: sysctl to restrict candidate source addresses

Per RFC 6724, section 4, "Candidate Source Addresses":

    It is RECOMMENDED that the candidate source addresses be the set
    of unicast addresses assigned to the interface that will be used
    to send to the destination (the "outgoing" interface).

Add a sysctl to enable this behaviour.

Signed-off-by: Erik Kline <ek@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  7 +++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 22 +++++++++++++++++++---
 4 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f63aeefd2c24..1a5ab21bcca5 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1460,6 +1460,13 @@ router_solicitations - INTEGER
 	routers are present.
 	Default: 3
 
+use_oif_addrs_only - BOOLEAN
+	When enabled, the candidate source addresses for destinations
+	routed via this interface are restricted to the set of addresses
+	configured on this interface (vis. RFC 6724, section 4).
+
+	Default: false
+
 use_tempaddr - INTEGER
 	Preference for Privacy Extensions (RFC3041).
 	  <= 0 : disable Privacy Extensions
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 1319a6bb6b82..06ed637225b8 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -57,6 +57,7 @@ struct ipv6_devconf {
 		bool initialized;
 		struct in6_addr secret;
 	} stable_secret;
+	__s32		use_oif_addrs_only;
 	void		*sysctl;
 };
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 5efa54ae567c..641a146ead7d 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -171,6 +171,7 @@ enum {
 	DEVCONF_USE_OPTIMISTIC,
 	DEVCONF_ACCEPT_RA_MTU,
 	DEVCONF_STABLE_SECRET,
+	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 32153c248959..eb0c6a3a8a00 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -211,7 +211,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.accept_ra_mtu		= 1,
 	.stable_secret		= {
 		.initialized = false,
-	}
+	},
+	.use_oif_addrs_only	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -253,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.stable_secret		= {
 		.initialized = false,
 	},
+	.use_oif_addrs_only	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -1472,11 +1474,16 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
 	 *    include addresses assigned to interfaces
 	 *    belonging to the same site as the outgoing
 	 *    interface.)
+	 *  - "It is RECOMMENDED that the candidate source addresses
+	 *    be the set of unicast addresses assigned to the
+	 *    interface that will be used to send to the destination
+	 *    (the 'outgoing' interface)." (RFC 6724)
 	 */
 	if (dst_dev) {
+		idev = __in6_dev_get(dst_dev);
 		if ((dst_type & IPV6_ADDR_MULTICAST) ||
-		    dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) {
-			idev = __in6_dev_get(dst_dev);
+		    dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
+		    (idev && idev->cnf.use_oif_addrs_only)) {
 			use_oif_addr = true;
 		}
 	}
@@ -4607,6 +4614,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
 	array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
 	/* we omit DEVCONF_STABLE_SECRET for now */
+	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5605,6 +5613,14 @@ static struct addrconf_sysctl_table
 			.mode		= 0600,
 			.proc_handler	= addrconf_sysctl_stable_secret,
 		},
+		{
+			.procname       = "use_oif_addrs_only",
+			.data           = &ipv6_devconf.use_oif_addrs_only,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 		{
 			/* sentinel */
 		}
-- 
cgit v1.2.3-70-g09d2


From cd812599796f500b042f5464b6665755eca21137 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 5 Jul 2015 11:12:07 -0400
Subject: NFS: Remove the "NFS_CAP_CHANGE_ATTR" capability

Setting the change attribute has been mandatory for all NFS versions, since
commit 3a1556e8662c ("NFSv2/v3: Simulate the change attribute"). We should
therefore not have anything be conditional on it being set/unset.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c           | 2 +-
 fs/nfs/inode.c            | 4 ++--
 fs/nfs/nfs4proc.c         | 3 ---
 include/linux/nfs_fs_sb.h | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ecebb406cc1a..4a90c9bb3135 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -775,7 +775,7 @@ static int nfs_init_server(struct nfs_server *server,
 	server->options = data->options;
 	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
 		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
-		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR;
+		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 426e4f8207ef..0adc7d245b3d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -442,7 +442,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			inode->i_version = fattr->change_attr;
-		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+		else
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
 				| NFS_INO_REVAL_PAGECACHE);
 		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
@@ -1692,7 +1692,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfs_force_lookup_revalidate(inode);
 			inode->i_version = fattr->change_attr;
 		}
-	} else if (server->caps & NFS_CAP_CHANGE_ATTR)
+	} else
 		nfsi->cache_validity |= save_cache_validity;
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9264994ec9d3..c85ffe67b5f3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8591,7 +8591,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK,
 	.init_client = nfs40_init_client,
 	.shutdown_client = nfs40_shutdown_client,
@@ -8617,7 +8616,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1,
@@ -8640,7 +8638,6 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.minor_version = 2,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index a2ea1491d3df..20bc8e51b161 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -220,7 +220,7 @@ struct nfs_server {
 #define NFS_CAP_SYMLINKS	(1U << 2)
 #define NFS_CAP_ACLS		(1U << 3)
 #define NFS_CAP_ATOMIC_OPEN	(1U << 4)
-#define NFS_CAP_CHANGE_ATTR	(1U << 5)
+/* #define NFS_CAP_CHANGE_ATTR	(1U << 5) */
 #define NFS_CAP_FILEID		(1U << 6)
 #define NFS_CAP_MODE		(1U << 7)
 #define NFS_CAP_NLINK		(1U << 8)
-- 
cgit v1.2.3-70-g09d2


From 115c48d7a5351abeadd0c8a3dc87eca3d66a6475 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 5 Jul 2015 12:36:34 -0400
Subject: NFS: nfs_mark_for_revalidate should always set
 NFS_INO_REVAL_PAGECACHE

I'm not aware of any existing bugs around this, but the expectation is
that nfs_mark_for_revalidate() should always force a revalidation of
the cached metadata.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/nfs_fs.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f91b5ade30c9..874b77228fb9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -292,9 +292,12 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR |
+				NFS_INO_REVAL_PAGECACHE |
+				NFS_INO_INVALID_ACCESS |
+				NFS_INO_INVALID_ACL;
 	if (S_ISDIR(inode->i_mode))
-		nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 24560056de61d86153cecb84d04e4237437f5888 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 30 May 2015 10:11:24 -0700
Subject: rcu: Add RCU-sched flavors of get-state and cond-sync

The get_state_synchronize_rcu() and cond_synchronize_rcu() functions
allow polling for grace-period completion, with an actual wait for a
grace period occurring only when cond_synchronize_rcu() is called too
soon after the corresponding get_state_synchronize_rcu().  However,
these functions work only for vanilla RCU.  This commit adds the
get_state_synchronize_sched() and cond_synchronize_sched(), which provide
the same capability for RCU-sched.

Reported-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcutiny.h | 10 ++++++++++
 include/linux/rcutree.h |  2 ++
 kernel/rcu/rcutorture.c |  2 ++
 kernel/rcu/tree.c       | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 3df6c1ec4e25..ff968b7af3a4 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -37,6 +37,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate)
 	might_sleep();
 }
 
+static inline unsigned long get_state_synchronize_sched(void)
+{
+	return 0;
+}
+
+static inline void cond_synchronize_sched(unsigned long oldstate)
+{
+	might_sleep();
+}
+
 static inline void rcu_barrier_bh(void)
 {
 	wait_rcu_gp(call_rcu_bh);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 456879143f89..5abec82f325e 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -76,6 +76,8 @@ void rcu_barrier_bh(void);
 void rcu_barrier_sched(void);
 unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
+unsigned long get_state_synchronize_sched(void);
+void cond_synchronize_sched(unsigned long oldstate);
 
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59e32684c23b..0f2cb55f0ab3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = {
 	.deferred_free	= rcu_sched_torture_deferred_free,
 	.sync		= synchronize_sched,
 	.exp_sync	= synchronize_sched_expedited,
+	.get_state	= get_state_synchronize_sched,
+	.cond_sync	= cond_synchronize_sched,
 	.call		= call_rcu_sched,
 	.cb_barrier	= rcu_barrier_sched,
 	.fqs		= rcu_sched_force_quiescent_state,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8b5dd8ba9495..9629298eea24 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3253,6 +3253,58 @@ void cond_synchronize_rcu(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
+/**
+ * get_state_synchronize_sched - Snapshot current RCU-sched state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_sched()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_sched(void)
+{
+	/*
+	 * Any prior manipulation of RCU-protected data must happen
+	 * before the load from ->gpnum.
+	 */
+	smp_mb();  /* ^^^ */
+
+	/*
+	 * Make sure this load happens before the purportedly
+	 * time-consuming work between get_state_synchronize_sched()
+	 * and cond_synchronize_sched().
+	 */
+	return smp_load_acquire(&rcu_sched_state.gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
+
+/**
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_sched()
+ *
+ * If a full RCU-sched grace period has elapsed since the earlier call to
+ * get_state_synchronize_sched(), just return.  Otherwise, invoke
+ * synchronize_sched() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.  But
+ * counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_sched(unsigned long oldstate)
+{
+	unsigned long newstate;
+
+	/*
+	 * Ensure that this load happens before any RCU-destructive
+	 * actions the caller might carry out after we return.
+	 */
+	newstate = smp_load_acquire(&rcu_sched_state.completed);
+	if (ULONG_CMP_GE(oldstate, newstate))
+		synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_sched);
+
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
 	/*
-- 
cgit v1.2.3-70-g09d2


From 155d1d12786386d21732f9bba036343ffa43847d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 2 Jun 2015 17:26:48 +0200
Subject: rcu: Use WRITE_ONCE in RCU_INIT_POINTER

For the paranoid amongst us GCC would be in its right to use byte stores
to write our NULL value, tell it not to do that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index def6d45ad61c..c63428c1ed8a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -995,7 +995,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 #define RCU_INIT_POINTER(p, v) \
 	do { \
 		rcu_dereference_sparse(p, __rcu); \
-		p = RCU_INITIALIZER(v); \
+		WRITE_ONCE(p, RCU_INITIALIZER(v)); \
 	} while (0)
 
 /**
-- 
cgit v1.2.3-70-g09d2


From ec90a194ae2cb8b8e9fe4f6f70dd3d4dc0269b4b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 10 Jun 2015 12:53:06 -0700
Subject: rcu: Create a synchronize_rcu_mult()

There have been several requests for a primitive that waits for
grace periods for several RCU flavors concurrently, so this
commit creates it.  This is a variadic macro, and you pass in
the call_rcu() functions of the flavors of RCU that you wish to
wait for.

Note that you cannot pass in call_srcu() for two reasons: (1) This
would result in a type mismatch and (2) You need to specify which
srcu_struct you want to use.  Handle this by creating a wrapper
function for your SRCU domain, for example:

	void call_srcu_mine(struct rcu_head *head, rcu_callback_t func)
	{
		call_srcu(&ss_mine, head, func);
	}

You can then do something like this:

	synchronize_rcu_mult(call_srcu_mine, call_rcu, call_rcu_sched);

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 35 +++++++++++++++++++++++++++++++----
 include/linux/types.h    |  3 +++
 kernel/rcu/update.c      | 37 +++++++++++++++++++++++++++----------
 3 files changed, 61 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index c63428c1ed8a..33ec16b9c2ee 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -226,6 +226,37 @@ struct rcu_synchronize {
 };
 void wakeme_after_rcu(struct rcu_head *head);
 
+void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+		   struct rcu_synchronize *rs_array);
+
+#define _wait_rcu_gp(checktiny, ...) \
+do { \
+	call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \
+	const int __n = ARRAY_SIZE(__crcu_array); \
+	struct rcu_synchronize __rs_array[__n]; \
+	\
+	__wait_rcu_gp(checktiny, __n, __crcu_array, __rs_array); \
+} while (0)
+
+#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__)
+
+/**
+ * synchronize_rcu_mult - Wait concurrently for multiple grace periods
+ * @...: List of call_rcu() functions for the flavors to wait on.
+ *
+ * This macro waits concurrently for multiple flavors of RCU grace periods.
+ * For example, synchronize_rcu_mult(call_rcu, call_rcu_bh) would wait
+ * on concurrent RCU and RCU-bh grace periods.  Waiting on a give SRCU
+ * domain requires you to write a wrapper function for that SRCU domain's
+ * call_srcu() function, supplying the corresponding srcu_struct.
+ *
+ * If Tiny RCU, tell _wait_rcu_gp() not to bother waiting for RCU
+ * or RCU-bh, given that anywhere synchronize_rcu_mult() can be called
+ * is automatically a grace period.
+ */
+#define synchronize_rcu_mult(...) \
+	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
+
 /**
  * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
  * @head: structure to be used for queueing the RCU updates.
@@ -392,10 +423,6 @@ bool __rcu_is_watching(void);
  * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
  */
 
-typedef void call_rcu_func_t(struct rcu_head *head,
-			     void (*func)(struct rcu_head *head));
-void wait_rcu_gp(call_rcu_func_t crf);
-
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
 #include <linux/rcutree.h>
 #elif defined(CONFIG_TINY_RCU)
diff --git a/include/linux/types.h b/include/linux/types.h
index 8715287c3b1f..c314989d9158 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -212,6 +212,9 @@ struct callback_head {
 };
 #define rcu_head callback_head
 
+typedef void (*rcu_callback_t)(struct rcu_head *head);
+typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
+
 /* clocksource cycle base type */
 typedef u64 cycle_t;
 
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fec5f48b8860..a0a0dd03c73a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -318,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head)
 	rcu = container_of(head, struct rcu_synchronize, head);
 	complete(&rcu->completion);
 }
+EXPORT_SYMBOL_GPL(wakeme_after_rcu);
 
-void wait_rcu_gp(call_rcu_func_t crf)
+void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+		   struct rcu_synchronize *rs_array)
 {
-	struct rcu_synchronize rcu;
+	int i;
 
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	crf(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
+	/* Initialize and register callbacks for each flavor specified. */
+	for (i = 0; i < n; i++) {
+		if (checktiny &&
+		    (crcu_array[i] == call_rcu ||
+		     crcu_array[i] == call_rcu_bh)) {
+			might_sleep();
+			continue;
+		}
+		init_rcu_head_on_stack(&rs_array[i].head);
+		init_completion(&rs_array[i].completion);
+		(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+	}
+
+	/* Wait for all callbacks to be invoked. */
+	for (i = 0; i < n; i++) {
+		if (checktiny &&
+		    (crcu_array[i] == call_rcu ||
+		     crcu_array[i] == call_rcu_bh))
+			continue;
+		wait_for_completion(&rs_array[i].completion);
+		destroy_rcu_head_on_stack(&rs_array[i].head);
+	}
 }
-EXPORT_SYMBOL_GPL(wait_rcu_gp);
+EXPORT_SYMBOL_GPL(__wait_rcu_gp);
 
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
 void init_rcu_head(struct rcu_head *head)
-- 
cgit v1.2.3-70-g09d2


From f78f5b90c4ffa559e400c3919a02236101f29f3f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 18 Jun 2015 15:50:02 -0700
Subject: rcu: Rename rcu_lockdep_assert() to RCU_LOCKDEP_WARN()

This commit renames rcu_lockdep_assert() to RCU_LOCKDEP_WARN() for
consistency with the WARN() series of macros.  This also requires
inverting the sense of the conditional, which this commit also does.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/RCU/whatisRCU.txt  |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c |  6 ++--
 arch/x86/kernel/traps.c          |  2 +-
 drivers/base/power/opp.c         |  4 +--
 include/linux/fdtable.h          |  4 +--
 include/linux/rcupdate.h         | 63 ++++++++++++++++++++++++++--------------
 kernel/cgroup.c                  |  4 +--
 kernel/pid.c                     |  5 ++--
 kernel/rcu/srcu.c                | 10 +++----
 kernel/rcu/tiny.c                |  8 ++---
 kernel/rcu/tree.c                | 28 +++++++++---------
 kernel/rcu/tree_plugin.h         |  8 ++---
 kernel/rcu/update.c              |  4 +--
 kernel/sched/core.c              |  8 ++---
 kernel/workqueue.c               | 20 ++++++-------
 security/device_cgroup.c         |  6 ++--
 16 files changed, 101 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 5746b0c77f3e..adc2184009c5 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -883,7 +883,7 @@ All:  lockdep-checked RCU-protected pointer access
 
 	rcu_access_pointer
 	rcu_dereference_raw
-	rcu_lockdep_assert
+	RCU_LOCKDEP_WARN
 	rcu_sleep_check
 	RCU_NONIDLE
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index df919ff103c3..3d6b5269fb2e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -54,9 +54,9 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
 ({ \
-	rcu_lockdep_assert(rcu_read_lock_sched_held() || \
-			   lockdep_is_held(&mce_chrdev_read_mutex), \
-			   "suspicious rcu_dereference_check_mce() usage"); \
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+			 !lockdep_is_held(&mce_chrdev_read_mutex), \
+			 "suspicious rcu_dereference_check_mce() usage"); \
 	smp_load_acquire(&(p)); \
 })
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f5791927aa64..c5a5231d1d11 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -136,7 +136,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 	preempt_count_add(HARDIRQ_OFFSET);
 
 	/* This code is a bit fragile.  Test it. */
-	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
 
 	return prev_state;
 }
diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index 677fb2843553..3b188f20b43f 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -110,8 +110,8 @@ static DEFINE_MUTEX(dev_opp_list_lock);
 
 #define opp_rcu_lockdep_assert()					\
 do {									\
-	rcu_lockdep_assert(rcu_read_lock_held() ||			\
-				lockdep_is_held(&dev_opp_list_lock),	\
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
+				!lockdep_is_held(&dev_opp_list_lock),	\
 			   "Missing rcu_read_lock() or "		\
 			   "dev_opp_list_lock protection");		\
 } while (0)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index fbb88740634a..674e3e226465 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -86,8 +86,8 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
 
 static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
 {
-	rcu_lockdep_assert(rcu_read_lock_held() ||
-			   lockdep_is_held(&files->file_lock),
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
+			   !lockdep_is_held(&files->file_lock),
 			   "suspicious rcu_dereference_check() usage");
 	return __fcheck_files(files, fd);
 }
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 33ec16b9c2ee..ff476515f716 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -536,6 +536,11 @@ static inline int rcu_read_lock_sched_held(void)
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+/* Deprecate rcu_lockdep_assert():  Use RCU_LOCKDEP_WARN() instead. */
+static inline void __attribute((deprecated)) deprecate_rcu_lockdep_assert(void)
+{
+}
+
 #ifdef CONFIG_PROVE_RCU
 
 /**
@@ -546,17 +551,32 @@ static inline int rcu_read_lock_sched_held(void)
 #define rcu_lockdep_assert(c, s)					\
 	do {								\
 		static bool __section(.data.unlikely) __warned;		\
+		deprecate_rcu_lockdep_assert();				\
 		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
 			__warned = true;				\
 			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
 		}							\
 	} while (0)
 
+/**
+ * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
+ * @c: condition to check
+ * @s: informative message
+ */
+#define RCU_LOCKDEP_WARN(c, s)						\
+	do {								\
+		static bool __section(.data.unlikely) __warned;		\
+		if (debug_lockdep_rcu_enabled() && !__warned && (c)) {	\
+			__warned = true;				\
+			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
+		}							\
+	} while (0)
+
 #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU)
 static inline void rcu_preempt_sleep_check(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
-			   "Illegal context switch in RCU read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+			 "Illegal context switch in RCU read-side critical section");
 }
 #else /* #ifdef CONFIG_PROVE_RCU */
 static inline void rcu_preempt_sleep_check(void)
@@ -567,15 +587,16 @@ static inline void rcu_preempt_sleep_check(void)
 #define rcu_sleep_check()						\
 	do {								\
 		rcu_preempt_sleep_check();				\
-		rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),	\
-				   "Illegal context switch in RCU-bh read-side critical section"); \
-		rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),	\
-				   "Illegal context switch in RCU-sched read-side critical section"); \
+		RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),	\
+				 "Illegal context switch in RCU-bh read-side critical section"); \
+		RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),	\
+				 "Illegal context switch in RCU-sched read-side critical section"); \
 	} while (0)
 
 #else /* #ifdef CONFIG_PROVE_RCU */
 
-#define rcu_lockdep_assert(c, s) do { } while (0)
+#define rcu_lockdep_assert(c, s) deprecate_rcu_lockdep_assert()
+#define RCU_LOCKDEP_WARN(c, s) do { } while (0)
 #define rcu_sleep_check() do { } while (0)
 
 #endif /* #else #ifdef CONFIG_PROVE_RCU */
@@ -606,13 +627,13 @@ static inline void rcu_preempt_sleep_check(void)
 ({ \
 	/* Dependency order vs. p above. */ \
 	typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \
-	rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \
+	RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
 	rcu_dereference_sparse(p, space); \
 	((typeof(*p) __force __kernel *)(________p1)); \
 })
 #define __rcu_dereference_protected(p, c, space) \
 ({ \
-	rcu_lockdep_assert(c, "suspicious rcu_dereference_protected() usage"); \
+	RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
 	rcu_dereference_sparse(p, space); \
 	((typeof(*p) __force __kernel *)(p)); \
 })
@@ -836,8 +857,8 @@ static inline void rcu_read_lock(void)
 	__rcu_read_lock();
 	__acquire(RCU);
 	rcu_lock_acquire(&rcu_lock_map);
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_lock() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_lock() used illegally while idle");
 }
 
 /*
@@ -887,8 +908,8 @@ static inline void rcu_read_lock(void)
  */
 static inline void rcu_read_unlock(void)
 {
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_unlock() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_unlock() used illegally while idle");
 	__release(RCU);
 	__rcu_read_unlock();
 	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
@@ -916,8 +937,8 @@ static inline void rcu_read_lock_bh(void)
 	local_bh_disable();
 	__acquire(RCU_BH);
 	rcu_lock_acquire(&rcu_bh_lock_map);
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_lock_bh() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_lock_bh() used illegally while idle");
 }
 
 /*
@@ -927,8 +948,8 @@ static inline void rcu_read_lock_bh(void)
  */
 static inline void rcu_read_unlock_bh(void)
 {
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_unlock_bh() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_unlock_bh() used illegally while idle");
 	rcu_lock_release(&rcu_bh_lock_map);
 	__release(RCU_BH);
 	local_bh_enable();
@@ -952,8 +973,8 @@ static inline void rcu_read_lock_sched(void)
 	preempt_disable();
 	__acquire(RCU_SCHED);
 	rcu_lock_acquire(&rcu_sched_lock_map);
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_lock_sched() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_lock_sched() used illegally while idle");
 }
 
 /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
@@ -970,8 +991,8 @@ static inline notrace void rcu_read_lock_sched_notrace(void)
  */
 static inline void rcu_read_unlock_sched(void)
 {
-	rcu_lockdep_assert(rcu_is_watching(),
-			   "rcu_read_unlock_sched() used illegally while idle");
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_unlock_sched() used illegally while idle");
 	rcu_lock_release(&rcu_sched_lock_map);
 	__release(RCU_SCHED);
 	preempt_enable();
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..b89f3168411b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock);
 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 
 #define cgroup_assert_mutex_or_rcu_locked()				\
-	rcu_lockdep_assert(rcu_read_lock_held() ||			\
-			   lockdep_is_held(&cgroup_mutex),		\
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
+			   !lockdep_is_held(&cgroup_mutex),		\
 			   "cgroup_mutex or RCU read lock required");
 
 /*
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5b7baf..ca368793808e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task);
  */
 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
-	rcu_lockdep_assert(rcu_read_lock_held(),
-			   "find_task_by_pid_ns() needs rcu_read_lock()"
-			   " protection");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+			 "find_task_by_pid_ns() needs rcu_read_lock() protection");
 	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
 
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index de35087c92a5..d3fcb2ec8536 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -415,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 	struct rcu_head *head = &rcu.head;
 	bool done = false;
 
-	rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
-			   !lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+			 lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
 
 	might_sleep();
 	init_completion(&rcu.completion);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c291bd65d2cb..d0471056d0af 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  */
 void synchronize_sched(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_sched() in RCU read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_sched() in RCU read-side critical section");
 	cond_resched();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cb64d7e13d24..0a73d26357a2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -649,12 +649,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
 	 * It is illegal to enter an extended quiescent state while
 	 * in an RCU read-side critical section.
 	 */
-	rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
-			   "Illegal idle entry in RCU read-side critical section.");
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
-			   "Illegal idle entry in RCU-bh read-side critical section.");
-	rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
-			   "Illegal idle entry in RCU-sched read-side critical section.");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+			 "Illegal idle entry in RCU read-side critical section.");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
+			 "Illegal idle entry in RCU-bh read-side critical section.");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
+			 "Illegal idle entry in RCU-sched read-side critical section.");
 }
 
 /*
@@ -3161,10 +3161,10 @@ static inline int rcu_blocking_is_gp(void)
  */
 void synchronize_sched(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_sched() in RCU-sched read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_sched() in RCU-sched read-side critical section");
 	if (rcu_blocking_is_gp())
 		return;
 	if (rcu_gp_is_expedited())
@@ -3188,10 +3188,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
  */
 void synchronize_rcu_bh(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
 	if (rcu_blocking_is_gp())
 		return;
 	if (rcu_gp_is_expedited())
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a983bc68a146..9e922f111d63 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -538,10 +538,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
  */
 void synchronize_rcu(void)
 {
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_rcu() in RCU read-side critical section");
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_rcu() in RCU read-side critical section");
 	if (!rcu_scheduler_active)
 		return;
 	if (rcu_gp_is_expedited())
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a0a0dd03c73a..47268fb1d27b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -589,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
 void synchronize_rcu_tasks(void)
 {
 	/* Complain if the scheduler has not started.  */
-	rcu_lockdep_assert(!rcu_scheduler_active,
-			   "synchronize_rcu_tasks called too soon");
+	RCU_LOCKDEP_WARN(rcu_scheduler_active,
+			 "synchronize_rcu_tasks called too soon");
 
 	/* Wait for the grace period. */
 	wait_rcu_gp(call_rcu_tasks);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..5e73c79fadd0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2200,8 +2200,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_SMP
 inline struct dl_bw *dl_bw_of(int i)
 {
-	rcu_lockdep_assert(rcu_read_lock_sched_held(),
-			   "sched RCU must be held");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
 	return &cpu_rq(i)->rd->dl_bw;
 }
 
@@ -2210,8 +2210,8 @@ static inline int dl_bw_cpus(int i)
 	struct root_domain *rd = cpu_rq(i)->rd;
 	int cpus = 0;
 
-	rcu_lockdep_assert(rcu_read_lock_sched_held(),
-			   "sched RCU must be held");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
 	for_each_cpu_and(i, rd->span, cpu_active_mask)
 		cpus++;
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c4f06176f74..cb91c63b4f4a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 #include <trace/events/workqueue.h>
 
 #define assert_rcu_or_pool_mutex()					\
-	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
-			   lockdep_is_held(&wq_pool_mutex),		\
-			   "sched RCU or wq_pool_mutex should be held")
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
+			 !lockdep_is_held(&wq_pool_mutex),		\
+			 "sched RCU or wq_pool_mutex should be held")
 
 #define assert_rcu_or_wq_mutex(wq)					\
-	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
-			   lockdep_is_held(&wq->mutex),			\
-			   "sched RCU or wq->mutex should be held")
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
+			 !lockdep_is_held(&wq->mutex),			\
+			 "sched RCU or wq->mutex should be held")
 
 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
-	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
-			   lockdep_is_held(&wq->mutex) ||		\
-			   lockdep_is_held(&wq_pool_mutex),		\
-			   "sched RCU, wq->mutex or wq_pool_mutex should be held")
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
+			 !lockdep_is_held(&wq->mutex) &&		\
+			 !lockdep_is_held(&wq_pool_mutex),		\
+			 "sched RCU, wq->mutex or wq_pool_mutex should be held")
 
 #define for_each_cpu_worker_pool(pool, cpu)				\
 	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 188c1d26393b..73455089feef 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -400,9 +400,9 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
 {
 	bool match = false;
 
-	rcu_lockdep_assert(rcu_read_lock_held() ||
-			   lockdep_is_held(&devcgroup_mutex),
-			   "device_cgroup:verify_new_ex called without proper synchronization");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
+			 lockdep_is_held(&devcgroup_mutex),
+			 "device_cgroup:verify_new_ex called without proper synchronization");
 
 	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
 		if (behavior == DEVCG_DEFAULT_ALLOW) {
-- 
cgit v1.2.3-70-g09d2


From 38aa420096e565fe9c98f9d9475fd168114501a9 Mon Sep 17 00:00:00 2001
From: Nikhil Badola <nikhil.badola@freescale.com>
Date: Mon, 15 Jun 2015 15:46:37 +0530
Subject: drivers:usb:fsl: Replace macros with enumerated type

Replace macros with enumerated type to represent usb ip
controller version

Signed-off-by: Nikhil Badola <nikhil.badola@freescale.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/fsl-mph-dr-of.c |  8 ++++----
 include/linux/fsl_devices.h      | 16 ++++++++++------
 2 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c
index 5e0d60035216..219595637cb1 100644
--- a/drivers/usb/host/fsl-mph-dr-of.c
+++ b/drivers/usb/host/fsl-mph-dr-of.c
@@ -119,9 +119,9 @@ error:
 
 static const struct of_device_id fsl_usb2_mph_dr_of_match[];
 
-static int usb_get_ver_info(struct device_node *np)
+static enum fsl_usb2_controller_ver usb_get_ver_info(struct device_node *np)
 {
-	int ver = -1;
+	enum fsl_usb2_controller_ver ver = FSL_USB_VER_NONE;
 
 	/*
 	 * returns 1 for usb controller version 1.6
@@ -142,7 +142,7 @@ static int usb_get_ver_info(struct device_node *np)
 		else /* for previous controller versions */
 			ver = FSL_USB_VER_OLD;
 
-		if (ver > -1)
+		if (ver > FSL_USB_VER_NONE)
 			return ver;
 	}
 
@@ -215,7 +215,7 @@ static int fsl_usb2_mph_dr_of_probe(struct platform_device *ofdev)
 	pdata->controller_ver = usb_get_ver_info(np);
 
 	if (pdata->have_sysif_regs) {
-		if (pdata->controller_ver < 0) {
+		if (pdata->controller_ver == FSL_USB_VER_NONE) {
 			dev_warn(&ofdev->dev, "Could not get controller version\n");
 			return -ENODEV;
 		}
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 2a2f56b292c1..0d4855cd5330 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -20,11 +20,6 @@
 #define FSL_UTMI_PHY_DLY	10	/*As per P1010RM, delay for UTMI
 				PHY CLK to become stable - 10ms*/
 #define FSL_USB_PHY_CLK_TIMEOUT	10000	/* uSec */
-#define FSL_USB_VER_OLD		0
-#define FSL_USB_VER_1_6		1
-#define FSL_USB_VER_2_2		2
-#define FSL_USB_VER_2_4		3
-#define FSL_USB_VER_2_5		4
 
 #include <linux/types.h>
 
@@ -52,6 +47,15 @@
  *
  */
 
+enum fsl_usb2_controller_ver {
+	FSL_USB_VER_NONE = -1,
+	FSL_USB_VER_OLD = 0,
+	FSL_USB_VER_1_6 = 1,
+	FSL_USB_VER_2_2 = 2,
+	FSL_USB_VER_2_4 = 3,
+	FSL_USB_VER_2_5 = 4,
+};
+
 enum fsl_usb2_operating_modes {
 	FSL_USB2_MPH_HOST,
 	FSL_USB2_DR_HOST,
@@ -72,7 +76,7 @@ struct platform_device;
 
 struct fsl_usb2_platform_data {
 	/* board specific information */
-	int				controller_ver;
+	enum fsl_usb2_controller_ver	controller_ver;
 	enum fsl_usb2_operating_modes	operating_mode;
 	enum fsl_usb2_phy_modes		phy_mode;
 	unsigned int			port_enables;
-- 
cgit v1.2.3-70-g09d2


From 523f1dec58408b36e7683a3d61a0286eed1fc1c8 Mon Sep 17 00:00:00 2001
From: Nikhil Badola <nikhil.badola@freescale.com>
Date: Mon, 15 Jun 2015 15:47:29 +0530
Subject: drivers: usb :fsl: Implement Workaround for USB Erratum A007792

USB controller version-2.5 requires to enable internal UTMI
phy and program PTS field in PORTSC register before asserting
controller reset. This is must for successful resetting of the
controller and subsequent enumeration of usb devices

Signed-off-by: Nikhil Badola <nikhil.badola@freescale.com>
Signed-off-by: Suresh Gupta <suresh.gupta@freescale.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/ehci-fsl.c      | 9 +++++++++
 drivers/usb/host/fsl-mph-dr-of.c | 6 ++++++
 include/linux/fsl_devices.h      | 1 +
 3 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c
index 5352e74b92e2..716aa8be1d6f 100644
--- a/drivers/usb/host/ehci-fsl.c
+++ b/drivers/usb/host/ehci-fsl.c
@@ -129,6 +129,15 @@ static int fsl_ehci_drv_probe(struct platform_device *pdev)
 	if (pdata->have_sysif_regs && pdata->controller_ver < FSL_USB_VER_1_6)
 		setbits32(hcd->regs + FSL_SOC_USB_CTRL, 0x4);
 
+	/*
+	 * Enable UTMI phy and program PTS field in UTMI mode before asserting
+	 * controller reset for USB Controller version 2.5
+	 */
+	if (pdata->has_fsl_erratum_a007792) {
+		writel_be(CTRL_UTMI_PHY_EN, hcd->regs + FSL_SOC_USB_CTRL);
+		writel(PORT_PTS_UTMI, hcd->regs + FSL_SOC_USB_PORTSC1);
+	}
+
 	/* Don't need to set host mode here. It will be done by tdi_reset() */
 
 	retval = usb_add_hcd(hcd, irq, IRQF_SHARED);
diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c
index 219595637cb1..17e1e6b7a035 100644
--- a/drivers/usb/host/fsl-mph-dr-of.c
+++ b/drivers/usb/host/fsl-mph-dr-of.c
@@ -214,6 +214,12 @@ static int fsl_usb2_mph_dr_of_probe(struct platform_device *ofdev)
 	pdata->phy_mode = determine_usb_phy(prop);
 	pdata->controller_ver = usb_get_ver_info(np);
 
+	/* Activate Erratum by reading property in device tree */
+	if (of_get_property(np, "fsl,usb-erratum-a007792", NULL))
+		pdata->has_fsl_erratum_a007792 = 1;
+	else
+		pdata->has_fsl_erratum_a007792 = 0;
+
 	if (pdata->have_sysif_regs) {
 		if (pdata->controller_ver == FSL_USB_VER_NONE) {
 			dev_warn(&ofdev->dev, "Could not get controller version\n");
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 0d4855cd5330..bdb40f67180c 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -97,6 +97,7 @@ struct fsl_usb2_platform_data {
 
 	unsigned	suspended:1;
 	unsigned	already_suspended:1;
+	unsigned        has_fsl_erratum_a007792:1;
 
 	/* register save area for suspend/resume */
 	u32		pm_command;
-- 
cgit v1.2.3-70-g09d2


From 6009d95e04cf74c6f80db56fddca21fea476ad24 Mon Sep 17 00:00:00 2001
From: Nikhil Badola <nikhil.badola@freescale.com>
Date: Mon, 15 Jun 2015 15:48:22 +0530
Subject: drivers:usb:fsl: Introduce FSL_USB2_PHY_UTMI_DUAL macro

Introduce FSL_USB2_PHY_UTMI_DUAL macro for setting phy mode
in SOCs such has T4240, T1040, T2080 which have utmi dual-phy

Signed-off-by: Ramneek Mehresh <ramneek.mehresh@freescale.com>
Signed-off-by: Nikhil Badola <nikhil.badola@freescale.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/ehci-fsl.c      | 1 +
 drivers/usb/host/fsl-mph-dr-of.c | 2 ++
 include/linux/fsl_devices.h      | 1 +
 3 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c
index 716aa8be1d6f..b04c9dbd5c7d 100644
--- a/drivers/usb/host/ehci-fsl.c
+++ b/drivers/usb/host/ehci-fsl.c
@@ -213,6 +213,7 @@ static int ehci_fsl_setup_phy(struct usb_hcd *hcd,
 		portsc |= PORT_PTS_PTW;
 		/* fall through */
 	case FSL_USB2_PHY_UTMI:
+	case FSL_USB2_PHY_UTMI_DUAL:
 		if (pdata->have_sysif_regs && pdata->controller_ver) {
 			/* controller version 1.6 or above */
 			setbits32(non_ehci + FSL_SOC_USB_CTRL, UTMI_PHY_EN);
diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c
index 17e1e6b7a035..631fc504afda 100644
--- a/drivers/usb/host/fsl-mph-dr-of.c
+++ b/drivers/usb/host/fsl-mph-dr-of.c
@@ -69,6 +69,8 @@ static enum fsl_usb2_phy_modes determine_usb_phy(const char *phy_type)
 		return FSL_USB2_PHY_UTMI;
 	if (!strcasecmp(phy_type, "utmi_wide"))
 		return FSL_USB2_PHY_UTMI_WIDE;
+	if (!strcasecmp(phy_type, "utmi_dual"))
+		return FSL_USB2_PHY_UTMI_DUAL;
 	if (!strcasecmp(phy_type, "serial"))
 		return FSL_USB2_PHY_SERIAL;
 
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index bdb40f67180c..070d9aef90a7 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -69,6 +69,7 @@ enum fsl_usb2_phy_modes {
 	FSL_USB2_PHY_UTMI,
 	FSL_USB2_PHY_UTMI_WIDE,
 	FSL_USB2_PHY_SERIAL,
+	FSL_USB2_PHY_UTMI_DUAL,
 };
 
 struct clk;
-- 
cgit v1.2.3-70-g09d2


From f4fdfaa280a284be8a056d6840cdbbf42c05bf95 Mon Sep 17 00:00:00 2001
From: Nikhil Badola <nikhil.badola@freescale.com>
Date: Tue, 14 Jul 2015 17:28:10 +0530
Subject: drivers: usb: fsl: Modify phy clk valid bit checking

Phy_clk_valid bit is checked only when the boolean
property phy-clk-valid in present in usb node device tree.
This property is added to the usb node via device tree fixup.

Signed-off-by: Nikhil Badola <nikhil.badola@freescale.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/ehci-fsl.c      | 16 ++++++++--------
 drivers/usb/host/fsl-mph-dr-of.c |  9 +++++++++
 include/linux/fsl_devices.h      |  1 +
 3 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c
index b04c9dbd5c7d..05ebe3dcd618 100644
--- a/drivers/usb/host/ehci-fsl.c
+++ b/drivers/usb/host/ehci-fsl.c
@@ -230,14 +230,14 @@ static int ehci_fsl_setup_phy(struct usb_hcd *hcd,
 		break;
 	}
 
-	if (pdata->have_sysif_regs &&
-	    pdata->controller_ver > FSL_USB_VER_1_6 &&
-	    (phy_mode == FSL_USB2_PHY_ULPI)) {
-		/* check PHY_CLK_VALID to get phy clk valid */
-		if (!(spin_event_timeout(in_be32(non_ehci + FSL_SOC_USB_CTRL) &
-				PHY_CLK_VALID, FSL_USB_PHY_CLK_TIMEOUT, 0) ||
-				in_be32(non_ehci + FSL_SOC_USB_PRICTRL))) {
-			dev_warn(hcd->self.controller, "USB PHY clock invalid\n");
+	/*
+	 * check PHY_CLK_VALID to determine phy clock presence before writing
+	 * to portsc
+	 */
+	if (pdata->check_phy_clk_valid) {
+		if (!(in_be32(non_ehci + FSL_SOC_USB_CTRL) & PHY_CLK_VALID)) {
+			dev_warn(hcd->self.controller,
+				 "USB PHY clock invalid\n");
 			return -EINVAL;
 		}
 	}
diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c
index 631fc504afda..9f731413ab3e 100644
--- a/drivers/usb/host/fsl-mph-dr-of.c
+++ b/drivers/usb/host/fsl-mph-dr-of.c
@@ -222,6 +222,15 @@ static int fsl_usb2_mph_dr_of_probe(struct platform_device *ofdev)
 	else
 		pdata->has_fsl_erratum_a007792 = 0;
 
+	/*
+	 * Determine whether phy_clk_valid needs to be checked
+	 * by reading property in device tree
+	 */
+	if (of_get_property(np, "phy-clk-valid", NULL))
+		pdata->check_phy_clk_valid = 1;
+	else
+		pdata->check_phy_clk_valid = 0;
+
 	if (pdata->have_sysif_regs) {
 		if (pdata->controller_ver == FSL_USB_VER_NONE) {
 			dev_warn(&ofdev->dev, "Could not get controller version\n");
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 070d9aef90a7..cebdbbb4aa69 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -99,6 +99,7 @@ struct fsl_usb2_platform_data {
 	unsigned	suspended:1;
 	unsigned	already_suspended:1;
 	unsigned        has_fsl_erratum_a007792:1;
+	unsigned        check_phy_clk_valid:1;
 
 	/* register save area for suspend/resume */
 	u32		pm_command;
-- 
cgit v1.2.3-70-g09d2


From ee86dbc6e327062396748162b95309388c19faab Mon Sep 17 00:00:00 2001
From: Andrey Smetanin <asmetanin@virtuozzo.com>
Date: Fri, 3 Jul 2015 15:01:35 +0300
Subject: kvm: introduce vcpu_debug = kvm_debug + vcpu context

vcpu_debug is useful macro like kvm_debug but additionally
includes vcpu context inside output.

Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Peter Hornyack <peterhornyack@google.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Gleb Natapov <gleb@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 05e99b8ef465..1d917d9b7f12 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -424,6 +424,9 @@ struct kvm {
 #define vcpu_unimpl(vcpu, fmt, ...)					\
 	kvm_pr_unimpl("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
 
+#define vcpu_debug(vcpu, fmt, ...)					\
+	kvm_debug("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
+
 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 {
 	smp_rmb();
-- 
cgit v1.2.3-70-g09d2


From 2ce7918990641b07e70e1b25752d666369e2016e Mon Sep 17 00:00:00 2001
From: Andrey Smetanin <asmetanin@virtuozzo.com>
Date: Fri, 3 Jul 2015 15:01:41 +0300
Subject: kvm/x86: add sending hyper-v crash notification to user space

Sending of notification is done by exiting vcpu to user space
if KVM_REQ_HV_CRASH is enabled for vcpu. At exit to user space
the kvm_run structure contains system_event with type
KVM_SYSTEM_EVENT_CRASH to notify about guest crash occurred.

Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Peter Hornyack <peterhornyack@google.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Gleb Natapov <gleb@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 5 +++++
 arch/x86/kvm/x86.c                | 6 ++++++
 include/linux/kvm_host.h          | 1 +
 include/uapi/linux/kvm.h          | 1 +
 4 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a7926a90156f..a4ebcb712375 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3277,6 +3277,7 @@ should put the acknowledged interrupt vector into the 'epr' field.
 		struct {
 #define KVM_SYSTEM_EVENT_SHUTDOWN       1
 #define KVM_SYSTEM_EVENT_RESET          2
+#define KVM_SYSTEM_EVENT_CRASH          3
 			__u32 type;
 			__u64 flags;
 		} system_event;
@@ -3296,6 +3297,10 @@ Valid values for 'type' are:
   KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM.
    As with SHUTDOWN, userspace can choose to ignore the request, or
    to schedule the reset to occur in the future and may call KVM_RUN again.
+  KVM_SYSTEM_EVENT_CRASH -- the guest crash occurred and the guest
+   has requested a crash condition maintenance. Userspace can choose
+   to ignore the request, or to gather VM memory core dump and/or
+   reset/shutdown of the VM.
 
 		/* Fix the size of the union. */
 		char padding[256];
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cfa3e5a7d6be..28076c266a9a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6263,6 +6263,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			vcpu_scan_ioapic(vcpu);
 		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
 			kvm_vcpu_reload_apic_access_page(vcpu);
+		if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+			r = 0;
+			goto out;
+		}
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1d917d9b7f12..51103f0feb7e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -139,6 +139,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_DISABLE_IBS       24
 #define KVM_REQ_APIC_PAGE_RELOAD  25
 #define KVM_REQ_SMI               26
+#define KVM_REQ_HV_CRASH          27
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 716ad4ae4d4b..9ef19ebd9df4 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -317,6 +317,7 @@ struct kvm_run {
 		struct {
 #define KVM_SYSTEM_EVENT_SHUTDOWN       1
 #define KVM_SYSTEM_EVENT_RESET          2
+#define KVM_SYSTEM_EVENT_CRASH          3
 			__u32 type;
 			__u64 flags;
 		} system_event;
-- 
cgit v1.2.3-70-g09d2


From 3bbd14e0a2e3a988b1b5fe702a2539bd8d0ec622 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 20 Jul 2015 13:32:52 +0200
Subject: netfilter: rename local nf_hook_list to hook_list

085db2c04557 ("netfilter: Per network namespace netfilter hooks.") introduced a
new nf_hook_list that is global, so let's avoid this overlap.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/netfilter.h | 14 +++++++-------
 net/netfilter/core.c      | 28 ++++++++++++++--------------
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index e01da73ee6c4..d788ce62d826 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -140,20 +140,20 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 #ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
-static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
+static inline bool nf_hook_list_active(struct list_head *hook_list,
 				       u_int8_t pf, unsigned int hook)
 {
 	if (__builtin_constant_p(pf) &&
 	    __builtin_constant_p(hook))
 		return static_key_false(&nf_hooks_needed[pf][hook]);
 
-	return !list_empty(nf_hook_list);
+	return !list_empty(hook_list);
 }
 #else
-static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
+static inline bool nf_hook_list_active(struct list_head *hook_list,
 				       u_int8_t pf, unsigned int hook)
 {
-	return !list_empty(nf_hook_list);
+	return !list_empty(hook_list);
 }
 #endif
 
@@ -175,12 +175,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int thresh)
 {
 	struct net *net = dev_net(indev ? indev : outdev);
-	struct list_head *nf_hook_list = &net->nf.hooks[pf][hook];
+	struct list_head *hook_list = &net->nf.hooks[pf][hook];
 
-	if (nf_hook_list_active(nf_hook_list, pf, hook)) {
+	if (nf_hook_list_active(hook_list, pf, hook)) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, nf_hook_list, hook, thresh,
+		nf_hook_state_init(&state, hook_list, hook, thresh,
 				   pf, indev, outdev, sk, okfn);
 		return nf_hook_slow(skb, &state);
 	}
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0ecb2b52f276..2a5a0704245c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -62,20 +62,20 @@ EXPORT_SYMBOL(nf_hooks_needed);
 
 static DEFINE_MUTEX(nf_hook_mutex);
 
-static struct list_head *find_nf_hook_list(struct net *net,
+static struct list_head *nf_find_hook_list(struct net *net,
 					   const struct nf_hook_ops *reg)
 {
-	struct list_head *nf_hook_list = NULL;
+	struct list_head *hook_list = NULL;
 
 	if (reg->pf != NFPROTO_NETDEV)
-		nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
+		hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
 	else if (reg->hooknum == NF_NETDEV_INGRESS) {
 #ifdef CONFIG_NETFILTER_INGRESS
 		if (reg->dev && dev_net(reg->dev) == net)
-			nf_hook_list = &reg->dev->nf_hooks_ingress;
+			hook_list = &reg->dev->nf_hooks_ingress;
 #endif
 	}
-	return nf_hook_list;
+	return hook_list;
 }
 
 struct nf_hook_entry {
@@ -85,7 +85,7 @@ struct nf_hook_entry {
 
 int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
-	struct list_head *nf_hook_list;
+	struct list_head *hook_list;
 	struct nf_hook_entry *entry;
 	struct nf_hook_ops *elem;
 
@@ -96,14 +96,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 	entry->orig_ops	= reg;
 	entry->ops	= *reg;
 
-	nf_hook_list = find_nf_hook_list(net, reg);
-	if (!nf_hook_list) {
+	hook_list = nf_find_hook_list(net, reg);
+	if (!hook_list) {
 		kfree(entry);
 		return -ENOENT;
 	}
 
 	mutex_lock(&nf_hook_mutex);
-	list_for_each_entry(elem, nf_hook_list, list) {
+	list_for_each_entry(elem, hook_list, list) {
 		if (reg->priority < elem->priority)
 			break;
 	}
@@ -122,16 +122,16 @@ EXPORT_SYMBOL(nf_register_net_hook);
 
 void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
-	struct list_head *nf_hook_list;
+	struct list_head *hook_list;
 	struct nf_hook_entry *entry;
 	struct nf_hook_ops *elem;
 
-	nf_hook_list = find_nf_hook_list(net, reg);
-	if (!nf_hook_list)
+	hook_list = nf_find_hook_list(net, reg);
+	if (!hook_list)
 		return;
 
 	mutex_lock(&nf_hook_mutex);
-	list_for_each_entry(elem, nf_hook_list, list) {
+	list_for_each_entry(elem, hook_list, list) {
 		entry = container_of(elem, struct nf_hook_entry, ops);
 		if (entry->orig_ops == reg) {
 			list_del_rcu(&entry->ops.list);
@@ -139,7 +139,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 		}
 	}
 	mutex_unlock(&nf_hook_mutex);
-	if (&elem->list == nf_hook_list) {
+	if (&elem->list == hook_list) {
 		WARN(1, "nf_unregister_net_hook: hook not found!\n");
 		return;
 	}
-- 
cgit v1.2.3-70-g09d2


From 6184fc0b8dd76c6aedc7a26e93254993e14e52de Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 24 Jun 2015 18:07:02 +0200
Subject: quota: Propagate error from ->acquire_dquot()

Currently when some error happened in ->acquire_dquot(), dqget() just
returned NULL. That was indistinguishable from a case when e.g. someone
run quotaoff and so was generally silently ignored. However
->acquire_dquot() can fail because of ENOSPC or EIO in which case user
should better know. So propagate error up from ->acquire_dquot properly.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ocfs2/file.c          |  8 ++---
 fs/ocfs2/quota_local.c   |  4 +--
 fs/quota/dquot.c         | 88 ++++++++++++++++++++++++++++++++++--------------
 include/linux/quotaops.h |  5 +--
 4 files changed, 72 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 719f7f4c7a37..4d9e8275ed99 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1209,8 +1209,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
 		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
 			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
-			if (!transfer_to[USRQUOTA]) {
-				status = -ESRCH;
+			if (IS_ERR(transfer_to[USRQUOTA])) {
+				status = PTR_ERR(transfer_to[USRQUOTA]);
 				goto bail_unlock;
 			}
 		}
@@ -1218,8 +1218,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
 		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
 			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
-			if (!transfer_to[GRPQUOTA]) {
-				status = -ESRCH;
+			if (IS_ERR(transfer_to[GRPQUOTA])) {
+				status = PTR_ERR(transfer_to[GRPQUOTA]);
 				goto bail_unlock;
 			}
 		}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 3d0b63d34225..bb07004df72a 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -499,8 +499,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 			dquot = dqget(sb,
 				      make_kqid(&init_user_ns, type,
 						le64_to_cpu(dqblk->dqb_id)));
-			if (!dquot) {
-				status = -EIO;
+			if (IS_ERR(dquot)) {
+				status = PTR_ERR(dquot);
 				mlog(ML_ERROR, "Failed to get quota structure "
 				     "for id %u, type %d. Cannot finish quota "
 				     "file recovery.\n",
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 20d1f74561cf..fed66e2c9fe8 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -247,7 +247,7 @@ struct dqstats dqstats;
 EXPORT_SYMBOL(dqstats);
 
 static qsize_t inode_get_rsv_space(struct inode *inode);
-static void __dquot_initialize(struct inode *inode, int type);
+static int __dquot_initialize(struct inode *inode, int type);
 
 static inline unsigned int
 hashfn(const struct super_block *sb, struct kqid qid)
@@ -832,16 +832,17 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 struct dquot *dqget(struct super_block *sb, struct kqid qid)
 {
 	unsigned int hashent = hashfn(sb, qid);
-	struct dquot *dquot = NULL, *empty = NULL;
+	struct dquot *dquot, *empty = NULL;
 
         if (!sb_has_quota_active(sb, qid.type))
-		return NULL;
+		return ERR_PTR(-ESRCH);
 we_slept:
 	spin_lock(&dq_list_lock);
 	spin_lock(&dq_state_lock);
 	if (!sb_has_quota_active(sb, qid.type)) {
 		spin_unlock(&dq_state_lock);
 		spin_unlock(&dq_list_lock);
+		dquot = ERR_PTR(-ESRCH);
 		goto out;
 	}
 	spin_unlock(&dq_state_lock);
@@ -876,11 +877,15 @@ we_slept:
 	 * already finished or it will be canceled due to dq_count > 1 test */
 	wait_on_dquot(dquot);
 	/* Read the dquot / allocate space in quota file */
-	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) &&
-	    sb->dq_op->acquire_dquot(dquot) < 0) {
-		dqput(dquot);
-		dquot = NULL;
-		goto out;
+	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+		int err;
+
+		err = sb->dq_op->acquire_dquot(dquot);
+		if (err < 0) {
+			dqput(dquot);
+			dquot = ERR_PTR(err);
+			goto out;
+		}
 	}
 #ifdef CONFIG_QUOTA_DEBUG
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
@@ -1390,15 +1395,16 @@ static int dquot_active(const struct inode *inode)
  * It is better to call this function outside of any transaction as it
  * might need a lot of space in journal for dquot structure allocation.
  */
-static void __dquot_initialize(struct inode *inode, int type)
+static int __dquot_initialize(struct inode *inode, int type)
 {
 	int cnt, init_needed = 0;
 	struct dquot **dquots, *got[MAXQUOTAS];
 	struct super_block *sb = inode->i_sb;
 	qsize_t rsv;
+	int ret = 0;
 
 	if (!dquot_active(inode))
-		return;
+		return 0;
 
 	dquots = i_dquot(inode);
 
@@ -1407,6 +1413,7 @@ static void __dquot_initialize(struct inode *inode, int type)
 		struct kqid qid;
 		kprojid_t projid;
 		int rc;
+		struct dquot *dquot;
 
 		got[cnt] = NULL;
 		if (type != -1 && cnt != type)
@@ -1438,16 +1445,25 @@ static void __dquot_initialize(struct inode *inode, int type)
 			qid = make_kqid_projid(projid);
 			break;
 		}
-		got[cnt] = dqget(sb, qid);
+		dquot = dqget(sb, qid);
+		if (IS_ERR(dquot)) {
+			/* We raced with somebody turning quotas off... */
+			if (PTR_ERR(dquot) != -ESRCH) {
+				ret = PTR_ERR(dquot);
+				goto out_put;
+			}
+			dquot = NULL;
+		}
+		got[cnt] = dquot;
 	}
 
 	/* All required i_dquot has been initialized */
 	if (!init_needed)
-		return;
+		return 0;
 
 	spin_lock(&dq_data_lock);
 	if (IS_NOQUOTA(inode))
-		goto out_err;
+		goto out_lock;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
@@ -1469,15 +1485,18 @@ static void __dquot_initialize(struct inode *inode, int type)
 				dquot_resv_space(dquots[cnt], rsv);
 		}
 	}
-out_err:
+out_lock:
 	spin_unlock(&dq_data_lock);
+out_put:
 	/* Drop unused references */
 	dqput_all(got);
+
+	return ret;
 }
 
-void dquot_initialize(struct inode *inode)
+int dquot_initialize(struct inode *inode)
 {
-	__dquot_initialize(inode, -1);
+	return __dquot_initialize(inode, -1);
 }
 EXPORT_SYMBOL(dquot_initialize);
 
@@ -1961,18 +1980,37 @@ EXPORT_SYMBOL(__dquot_transfer);
 int dquot_transfer(struct inode *inode, struct iattr *iattr)
 {
 	struct dquot *transfer_to[MAXQUOTAS] = {};
+	struct dquot *dquot;
 	struct super_block *sb = inode->i_sb;
 	int ret;
 
 	if (!dquot_active(inode))
 		return 0;
 
-	if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid))
-		transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid));
-	if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))
-		transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid));
-
+	if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){
+		dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
+		if (IS_ERR(dquot)) {
+			if (PTR_ERR(dquot) != -ESRCH) {
+				ret = PTR_ERR(dquot);
+				goto out_put;
+			}
+			dquot = NULL;
+		}
+		transfer_to[USRQUOTA] = dquot;
+	}
+	if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){
+		dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
+		if (IS_ERR(dquot)) {
+			if (PTR_ERR(dquot) != -ESRCH) {
+				ret = PTR_ERR(dquot);
+				goto out_put;
+			}
+			dquot = NULL;
+		}
+		transfer_to[GRPQUOTA] = dquot;
+	}
 	ret = __dquot_transfer(inode, transfer_to);
+out_put:
 	dqput_all(transfer_to);
 	return ret;
 }
@@ -2518,8 +2556,8 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
 	struct dquot *dquot;
 
 	dquot = dqget(sb, qid);
-	if (!dquot)
-		return -ESRCH;
+	if (IS_ERR(dquot))
+		return PTR_ERR(dquot);
 	do_get_dqblk(dquot, di);
 	dqput(dquot);
 
@@ -2631,8 +2669,8 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
 	int rc;
 
 	dquot = dqget(sb, qid);
-	if (!dquot) {
-		rc = -ESRCH;
+	if (IS_ERR(dquot)) {
+		rc = PTR_ERR(dquot);
 		goto out;
 	}
 	rc = do_set_dqblk(dquot, di);
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 77ca6601ff25..7a57c28eb5e7 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -43,7 +43,7 @@ void inode_claim_rsv_space(struct inode *inode, qsize_t number);
 void inode_sub_rsv_space(struct inode *inode, qsize_t number);
 void inode_reclaim_rsv_space(struct inode *inode, qsize_t number);
 
-void dquot_initialize(struct inode *inode);
+int dquot_initialize(struct inode *inode);
 void dquot_drop(struct inode *inode);
 struct dquot *dqget(struct super_block *sb, struct kqid qid);
 static inline struct dquot *dqgrab(struct dquot *dquot)
@@ -200,8 +200,9 @@ static inline int sb_has_quota_active(struct super_block *sb, int type)
 	return 0;
 }
 
-static inline void dquot_initialize(struct inode *inode)
+static inline int dquot_initialize(struct inode *inode)
 {
+	return 0;
 }
 
 static inline void dquot_drop(struct inode *inode)
-- 
cgit v1.2.3-70-g09d2


From c290ea01abb7907fde602f3ba55905ef10a37477 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 18 Jun 2015 16:52:29 +0200
Subject: fs: Remove ext3 filesystem driver

The functionality of ext3 is fully supported by ext4 driver. Major
distributions (SUSE, RedHat) already use ext4 driver to handle ext3
filesystems for quite some time. There is some ugliness in mm resulting
from jbd cleaning buffers in a dirty page without cleaning page dirty
bit and also support for buffer bouncing in the block layer when stable
pages are required is there only because of jbd. So let's remove the
ext3 driver. This saves us some 28k lines of duplicated code.

Acked-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 Documentation/filesystems/ext2.txt |    4 +-
 Documentation/filesystems/ext3.txt |  209 +--
 Documentation/filesystems/vfs.txt  |    2 +-
 MAINTAINERS                        |   18 +-
 fs/Kconfig                         |    5 +-
 fs/Makefile                        |    2 -
 fs/ext3/Kconfig                    |   89 -
 fs/ext3/Makefile                   |   12 -
 fs/ext3/acl.c                      |  281 ---
 fs/ext3/acl.h                      |   72 -
 fs/ext3/balloc.c                   | 2158 ----------------------
 fs/ext3/bitmap.c                   |   20 -
 fs/ext3/dir.c                      |  537 ------
 fs/ext3/ext3.h                     | 1332 --------------
 fs/ext3/ext3_jbd.c                 |   59 -
 fs/ext3/file.c                     |   79 -
 fs/ext3/fsync.c                    |  109 --
 fs/ext3/hash.c                     |  206 ---
 fs/ext3/ialloc.c                   |  706 -------
 fs/ext3/inode.c                    | 3574 ------------------------------------
 fs/ext3/ioctl.c                    |  327 ----
 fs/ext3/namei.c                    | 2586 --------------------------
 fs/ext3/namei.h                    |   27 -
 fs/ext3/resize.c                   | 1117 -----------
 fs/ext3/super.c                    | 3165 -------------------------------
 fs/ext3/symlink.c                  |   46 -
 fs/ext3/xattr.c                    | 1330 --------------
 fs/ext3/xattr.h                    |  136 --
 fs/ext3/xattr_security.c           |   78 -
 fs/ext3/xattr_trusted.c            |   54 -
 fs/ext3/xattr_user.c               |   58 -
 fs/ext4/Kconfig                    |   41 +-
 fs/ext4/super.c                    |   14 +-
 fs/jbd/Kconfig                     |   30 -
 fs/jbd/Makefile                    |    7 -
 fs/jbd/checkpoint.c                |  782 --------
 fs/jbd/commit.c                    | 1021 ----------
 fs/jbd/journal.c                   | 2145 ----------------------
 fs/jbd/recovery.c                  |  594 ------
 fs/jbd/revoke.c                    |  733 --------
 fs/jbd/transaction.c               | 2237 ----------------------
 include/linux/jbd.h                | 1047 -----------
 include/linux/jbd2.h               |   41 +-
 include/linux/jbd_common.h         |   46 -
 include/trace/events/ext3.h        |  866 ---------
 include/trace/events/jbd.h         |  194 --
 46 files changed, 87 insertions(+), 28109 deletions(-)
 delete mode 100644 fs/ext3/Kconfig
 delete mode 100644 fs/ext3/Makefile
 delete mode 100644 fs/ext3/acl.c
 delete mode 100644 fs/ext3/acl.h
 delete mode 100644 fs/ext3/balloc.c
 delete mode 100644 fs/ext3/bitmap.c
 delete mode 100644 fs/ext3/dir.c
 delete mode 100644 fs/ext3/ext3.h
 delete mode 100644 fs/ext3/ext3_jbd.c
 delete mode 100644 fs/ext3/file.c
 delete mode 100644 fs/ext3/fsync.c
 delete mode 100644 fs/ext3/hash.c
 delete mode 100644 fs/ext3/ialloc.c
 delete mode 100644 fs/ext3/inode.c
 delete mode 100644 fs/ext3/ioctl.c
 delete mode 100644 fs/ext3/namei.c
 delete mode 100644 fs/ext3/namei.h
 delete mode 100644 fs/ext3/resize.c
 delete mode 100644 fs/ext3/super.c
 delete mode 100644 fs/ext3/symlink.c
 delete mode 100644 fs/ext3/xattr.c
 delete mode 100644 fs/ext3/xattr.h
 delete mode 100644 fs/ext3/xattr_security.c
 delete mode 100644 fs/ext3/xattr_trusted.c
 delete mode 100644 fs/ext3/xattr_user.c
 delete mode 100644 fs/jbd/Kconfig
 delete mode 100644 fs/jbd/Makefile
 delete mode 100644 fs/jbd/checkpoint.c
 delete mode 100644 fs/jbd/commit.c
 delete mode 100644 fs/jbd/journal.c
 delete mode 100644 fs/jbd/recovery.c
 delete mode 100644 fs/jbd/revoke.c
 delete mode 100644 fs/jbd/transaction.c
 delete mode 100644 include/linux/jbd.h
 delete mode 100644 include/linux/jbd_common.h
 delete mode 100644 include/trace/events/ext3.h
 delete mode 100644 include/trace/events/jbd.h

(limited to 'include/linux')

diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index b9714569e472..55755395d3dc 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -360,8 +360,8 @@ and are copied into the filesystem.  If a transaction is incomplete at
 the time of the crash, then there is no guarantee of consistency for
 the blocks in that transaction so they are discarded (which means any
 filesystem changes they represent are also lost).
-Check Documentation/filesystems/ext3.txt if you want to read more about
-ext3 and journaling.
+Check Documentation/filesystems/ext4.txt if you want to read more about
+ext4 and journaling.
 
 References
 ==========
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 7ed0d17d6721..58758fbef9e0 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -6,210 +6,7 @@ Ext3 was originally released in September 1999. Written by Stephen Tweedie
 for the 2.2 branch, and ported to 2.4 kernels by Peter Braam, Andreas Dilger,
 Andrew Morton, Alexander Viro, Ted Ts'o and Stephen Tweedie.
 
-Ext3 is the ext2 filesystem enhanced with journalling capabilities.
+Ext3 is the ext2 filesystem enhanced with journalling capabilities. The
+filesystem is a subset of ext4 filesystem so use ext4 driver for accessing
+ext3 filesystems.
 
-Options
-=======
-
-When mounting an ext3 filesystem, the following option are accepted:
-(*) == default
-
-ro			Mount filesystem read only. Note that ext3 will replay
-			the journal (and thus write to the partition) even when
-			mounted "read only". Mount options "ro,noload" can be
-			used to prevent writes to the filesystem.
-
-journal=update		Update the ext3 file system's journal to the current
-			format.
-
-journal=inum		When a journal already exists, this option is ignored.
-			Otherwise, it specifies the number of the inode which
-			will represent the ext3 file system's journal file.
-
-journal_path=path
-journal_dev=devnum	When the external journal device's major/minor numbers
-			have changed, these options allow the user to specify
-			the new journal location.  The journal device is
-			identified through either its new major/minor numbers
-			encoded in devnum, or via a path to the device.
-
-norecovery		Don't load the journal on mounting. Note that this forces
-noload			mount of inconsistent filesystem, which can lead to
-			various problems.
-
-data=journal		All data are committed into the journal prior to being
-			written into the main file system.
-
-data=ordered	(*)	All data are forced directly out to the main file
-			system prior to its metadata being committed to the
-			journal.
-
-data=writeback		Data ordering is not preserved, data may be written
-			into the main file system after its metadata has been
-			committed to the journal.
-
-commit=nrsec	(*)	Ext3 can be told to sync all its data and metadata
-			every 'nrsec' seconds. The default value is 5 seconds.
-			This means that if you lose your power, you will lose
-			as much as the latest 5 seconds of work (your
-			filesystem will not be damaged though, thanks to the
-			journaling).  This default value (or any low value)
-			will hurt performance, but it's good for data-safety.
-			Setting it to 0 will have the same effect as leaving
-			it at the default (5 seconds).
-			Setting it to very large values will improve
-			performance.
-
-barrier=<0|1(*)>	This enables/disables the use of write barriers in
-barrier	(*)		the jbd code.  barrier=0 disables, barrier=1 enables.
-nobarrier		This also requires an IO stack which can support
-			barriers, and if jbd gets an error on a barrier
-			write, it will disable again with a warning.
-			Write barriers enforce proper on-disk ordering
-			of journal commits, making volatile disk write caches
-			safe to use, at some performance penalty.  If
-			your disks are battery-backed in one way or another,
-			disabling barriers may safely improve performance.
-			The mount options "barrier" and "nobarrier" can
-			also be used to enable or disable barriers, for
-			consistency with other ext3 mount options.
-
-user_xattr		Enables Extended User Attributes.  Additionally, you
-			need to have extended attribute support enabled in the
-			kernel configuration (CONFIG_EXT3_FS_XATTR).  See the
-			attr(5) manual page and http://acl.bestbits.at/ to
-			learn more about extended attributes.
-
-nouser_xattr		Disables Extended User Attributes.
-
-acl			Enables POSIX Access Control Lists support.
-			Additionally, you need to have ACL support enabled in
-			the kernel configuration (CONFIG_EXT3_FS_POSIX_ACL).
-			See the acl(5) manual page and http://acl.bestbits.at/
-			for more information.
-
-noacl			This option disables POSIX Access Control List
-			support.
-
-reservation
-
-noreservation
-
-bsddf 		(*)	Make 'df' act like BSD.
-minixdf			Make 'df' act like Minix.
-
-check=none		Don't do extra checking of bitmaps on mount.
-nocheck
-
-debug			Extra debugging information is sent to syslog.
-
-errors=remount-ro	Remount the filesystem read-only on an error.
-errors=continue		Keep going on a filesystem error.
-errors=panic		Panic and halt the machine if an error occurs.
-			(These mount options override the errors behavior
-			specified in the superblock, which can be
-			configured using tune2fs.)
-
-data_err=ignore(*)	Just print an error message if an error occurs
-			in a file data buffer in ordered mode.
-data_err=abort		Abort the journal if an error occurs in a file
-			data buffer in ordered mode.
-
-grpid			Give objects the same group ID as their creator.
-bsdgroups
-
-nogrpid		(*)	New objects have the group ID of their creator.
-sysvgroups
-
-resgid=n		The group ID which may use the reserved blocks.
-
-resuid=n		The user ID which may use the reserved blocks.
-
-sb=n			Use alternate superblock at this location.
-
-quota			These options are ignored by the filesystem. They
-noquota			are used only by quota tools to recognize volumes
-grpquota		where quota should be turned on. See documentation
-usrquota		in the quota-tools package for more details
-			(http://sourceforge.net/projects/linuxquota).
-
-jqfmt=<quota type>	These options tell filesystem details about quota
-usrjquota=<file>	so that quota information can be properly updated
-grpjquota=<file>	during journal replay. They replace the above
-			quota options. See documentation in the quota-tools
-			package for more details
-			(http://sourceforge.net/projects/linuxquota).
-
-Specification
-=============
-Ext3 shares all disk implementation with the ext2 filesystem, and adds
-transactions capabilities to ext2.  Journaling is done by the Journaling Block
-Device layer.
-
-Journaling Block Device layer
------------------------------
-The Journaling Block Device layer (JBD) isn't ext3 specific.  It was designed
-to add journaling capabilities to a block device.  The ext3 filesystem code
-will inform the JBD of modifications it is performing (called a transaction).
-The journal supports the transactions start and stop, and in case of a crash,
-the journal can replay the transactions to quickly put the partition back into
-a consistent state.
-
-Handles represent a single atomic update to a filesystem.  JBD can handle an
-external journal on a block device.
-
-Data Mode
----------
-There are 3 different data modes:
-
-* writeback mode
-In data=writeback mode, ext3 does not journal data at all.  This mode provides
-a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
-mode - metadata journaling.  A crash+recovery can cause incorrect data to
-appear in files which were written shortly before the crash.  This mode will
-typically provide the best ext3 performance.
-
-* ordered mode
-In data=ordered mode, ext3 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction.  When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first.  In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
-
-* journal mode
-data=journal mode provides full data and metadata journaling.  All new data is
-written to the journal first, and then to its final location.
-In the event of a crash, the journal can be replayed, bringing both data and
-metadata into a consistent state.  This mode is the slowest except when data
-needs to be read from and written to disk at the same time where it
-outperforms all other modes.
-
-Compatibility
--------------
-
-Ext2 partitions can be easily convert to ext3, with `tune2fs -j <dev>`.
-Ext3 is fully compatible with Ext2.  Ext3 partitions can easily be mounted as
-Ext2.
-
-
-External Tools
-==============
-See manual pages to learn more.
-
-tune2fs: 	create a ext3 journal on a ext2 partition with the -j flag.
-mke2fs: 	create a ext3 partition with the -j flag.
-debugfs: 	ext2 and ext3 file system debugger.
-ext2online:	online (mounted) ext2 and ext3 filesystem resizer
-
-
-References
-==========
-
-kernel source:	<file:fs/ext3/>
-		<file:fs/jbd/>
-
-programs: 	http://e2fsprogs.sourceforge.net/
-		http://ext2resize.sourceforge.net
-
-useful links:	http://www.ibm.com/developerworks/library/l-fs7/index.html
-        http://www.ibm.com/developerworks/library/l-fs8/index.html
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5eb8456fc41e..8c6f07ad373a 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -769,7 +769,7 @@ struct address_space_operations {
 	to stall to allow flushers a chance to complete some IO. Ordinarily
 	it can use PageDirty and PageWriteback but some filesystems have
 	more complex state (unstable pages in NFS prevent reclaim) or
-	do not set those flags due to locking problems (jbd). This callback
+	do not set those flags due to locking problems. This callback
 	allows a filesystem to indicate to the VM if a page should be
 	treated as dirty or writeback for the purposes of stalling.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index a2264167791a..0555bdb72c0d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4059,15 +4059,6 @@ F:	Documentation/filesystems/ext2.txt
 F:	fs/ext2/
 F:	include/linux/ext2*
 
-EXT3 FILE SYSTEM
-M:	Jan Kara <jack@suse.com>
-M:	Andrew Morton <akpm@linux-foundation.org>
-M:	Andreas Dilger <adilger.kernel@dilger.ca>
-L:	linux-ext4@vger.kernel.org
-S:	Maintained
-F:	Documentation/filesystems/ext3.txt
-F:	fs/ext3/
-
 EXT4 FILE SYSTEM
 M:	"Theodore Ts'o" <tytso@mit.edu>
 M:	Andreas Dilger <adilger.kernel@dilger.ca>
@@ -5751,16 +5742,9 @@ S:	Maintained
 F:	fs/jffs2/
 F:	include/uapi/linux/jffs2.h
 
-JOURNALLING LAYER FOR BLOCK DEVICES (JBD)
-M:	Andrew Morton <akpm@linux-foundation.org>
-M:	Jan Kara <jack@suse.com>
-L:	linux-ext4@vger.kernel.org
-S:	Maintained
-F:	fs/jbd/
-F:	include/linux/jbd.h
-
 JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
 M:	"Theodore Ts'o" <tytso@mit.edu>
+M:	Jan Kara <jack@suse.com>
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
 F:	fs/jbd2/
diff --git a/fs/Kconfig b/fs/Kconfig
index 011f43365d7b..da3f32f1a4e4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -11,18 +11,15 @@ config DCACHE_WORD_ACCESS
 if BLOCK
 
 source "fs/ext2/Kconfig"
-source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
-source "fs/jbd/Kconfig"
 source "fs/jbd2/Kconfig"
 
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
 	default y if EXT2_FS=y && EXT2_FS_XATTR
-	default y if EXT3_FS=y && EXT3_FS_XATTR
 	default y if EXT4_FS=y
-	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
+	default m if EXT2_FS_XATTR || EXT4_FS
 
 source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index cb20e4bf2303..09e051fefc5b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -62,12 +62,10 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_FSCACHE)		+= fscache/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
-obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
 # unless explicitly requested by rootfstype
 obj-$(CONFIG_EXT4_FS)		+= ext4/
-obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_SQUASHFS)		+= squashfs/
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
deleted file mode 100644
index e8c6ba0e4a3e..000000000000
--- a/fs/ext3/Kconfig
+++ /dev/null
@@ -1,89 +0,0 @@
-config EXT3_FS
-	tristate "Ext3 journalling file system support"
-	select JBD
-	help
-	  This is the journalling version of the Second extended file system
-	  (often called ext3), the de facto standard Linux file system
-	  (method to organize files on a storage device) for hard disks.
-
-	  The journalling code included in this driver means you do not have
-	  to run e2fsck (file system checker) on your file systems after a
-	  crash.  The journal keeps track of any changes that were being made
-	  at the time the system crashed, and can ensure that your file system
-	  is consistent without the need for a lengthy check.
-
-	  Other than adding the journal to the file system, the on-disk format
-	  of ext3 is identical to ext2.  It is possible to freely switch
-	  between using the ext3 driver and the ext2 driver, as long as the
-	  file system has been cleanly unmounted, or e2fsck is run on the file
-	  system.
-
-	  To add a journal on an existing ext2 file system or change the
-	  behavior of ext3 file systems, you can use the tune2fs utility ("man
-	  tune2fs").  To modify attributes of files and directories on ext3
-	  file systems, use chattr ("man chattr").  You need to be using
-	  e2fsprogs version 1.20 or later in order to create ext3 journals
-	  (available at <http://sourceforge.net/projects/e2fsprogs/>).
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ext3.
-
-config EXT3_DEFAULTS_TO_ORDERED
-	bool "Default to 'data=ordered' in ext3"
-	depends on EXT3_FS
-	default y
-	help
-	  The journal mode options for ext3 have different tradeoffs
-	  between when data is guaranteed to be on disk and
-	  performance.	The use of "data=writeback" can cause
-	  unwritten data to appear in files after an system crash or
-	  power failure, which can be a security issue.	 However,
-	  "data=ordered" mode can also result in major performance
-	  problems, including seconds-long delays before an fsync()
-	  call returns.	 For details, see:
-
-	  http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
-
-	  If you have been historically happy with ext3's performance,
-	  data=ordered mode will be a safe choice and you should
-	  answer 'y' here.  If you understand the reliability and data
-	  privacy issues of data=writeback and are willing to make
-	  that trade off, answer 'n'.
-
-config EXT3_FS_XATTR
-	bool "Ext3 extended attributes"
-	depends on EXT3_FS
-	default y
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-	  You need this for POSIX ACL support on ext3.
-
-config EXT3_FS_POSIX_ACL
-	bool "Ext3 POSIX Access Control Lists"
-	depends on EXT3_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config EXT3_FS_SECURITY
-	bool "Ext3 Security Labels"
-	depends on EXT3_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ext3 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile
deleted file mode 100644
index e77766a8b3f0..000000000000
--- a/fs/ext3/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-#
-# Makefile for the linux ext3-filesystem routines.
-#
-
-obj-$(CONFIG_EXT3_FS) += ext3.o
-
-ext3-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-	   ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o
-
-ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
-ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-ext3-$(CONFIG_EXT3_FS_SECURITY)	 += xattr_security.o
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
deleted file mode 100644
index 8bbaf5bcf982..000000000000
--- a/fs/ext3/acl.c
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * linux/fs/ext3/acl.c
- *
- * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- */
-
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *
-ext3_acl_from_disk(const void *value, size_t size)
-{
-	const char *end = (char *)value + size;
-	int n, count;
-	struct posix_acl *acl;
-
-	if (!value)
-		return NULL;
-	if (size < sizeof(ext3_acl_header))
-		 return ERR_PTR(-EINVAL);
-	if (((ext3_acl_header *)value)->a_version !=
-	    cpu_to_le32(EXT3_ACL_VERSION))
-		return ERR_PTR(-EINVAL);
-	value = (char *)value + sizeof(ext3_acl_header);
-	count = ext3_acl_count(size);
-	if (count < 0)
-		return ERR_PTR(-EINVAL);
-	if (count == 0)
-		return NULL;
-	acl = posix_acl_alloc(count, GFP_NOFS);
-	if (!acl)
-		return ERR_PTR(-ENOMEM);
-	for (n=0; n < count; n++) {
-		ext3_acl_entry *entry =
-			(ext3_acl_entry *)value;
-		if ((char *)value + sizeof(ext3_acl_entry_short) > end)
-			goto fail;
-		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
-		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
-		switch(acl->a_entries[n].e_tag) {
-			case ACL_USER_OBJ:
-			case ACL_GROUP_OBJ:
-			case ACL_MASK:
-			case ACL_OTHER:
-				value = (char *)value +
-					sizeof(ext3_acl_entry_short);
-				break;
-
-			case ACL_USER:
-				value = (char *)value + sizeof(ext3_acl_entry);
-				if ((char *)value > end)
-					goto fail;
-				acl->a_entries[n].e_uid =
-					make_kuid(&init_user_ns,
-						  le32_to_cpu(entry->e_id));
-				break;
-			case ACL_GROUP:
-				value = (char *)value + sizeof(ext3_acl_entry);
-				if ((char *)value > end)
-					goto fail;
-				acl->a_entries[n].e_gid =
-					make_kgid(&init_user_ns,
-						  le32_to_cpu(entry->e_id));
-				break;
-
-			default:
-				goto fail;
-		}
-	}
-	if (value != end)
-		goto fail;
-	return acl;
-
-fail:
-	posix_acl_release(acl);
-	return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static void *
-ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
-{
-	ext3_acl_header *ext_acl;
-	char *e;
-	size_t n;
-
-	*size = ext3_acl_size(acl->a_count);
-	ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
-			sizeof(ext3_acl_entry), GFP_NOFS);
-	if (!ext_acl)
-		return ERR_PTR(-ENOMEM);
-	ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
-	e = (char *)ext_acl + sizeof(ext3_acl_header);
-	for (n=0; n < acl->a_count; n++) {
-		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
-		ext3_acl_entry *entry = (ext3_acl_entry *)e;
-		entry->e_tag  = cpu_to_le16(acl_e->e_tag);
-		entry->e_perm = cpu_to_le16(acl_e->e_perm);
-		switch(acl_e->e_tag) {
-			case ACL_USER:
-				entry->e_id = cpu_to_le32(
-					from_kuid(&init_user_ns, acl_e->e_uid));
-				e += sizeof(ext3_acl_entry);
-				break;
-			case ACL_GROUP:
-				entry->e_id = cpu_to_le32(
-					from_kgid(&init_user_ns, acl_e->e_gid));
-				e += sizeof(ext3_acl_entry);
-				break;
-
-			case ACL_USER_OBJ:
-			case ACL_GROUP_OBJ:
-			case ACL_MASK:
-			case ACL_OTHER:
-				e += sizeof(ext3_acl_entry_short);
-				break;
-
-			default:
-				goto fail;
-		}
-	}
-	return (char *)ext_acl;
-
-fail:
-	kfree(ext_acl);
-	return ERR_PTR(-EINVAL);
-}
-
-/*
- * Inode operation get_posix_acl().
- *
- * inode->i_mutex: don't care
- */
-struct posix_acl *
-ext3_get_acl(struct inode *inode, int type)
-{
-	int name_index;
-	char *value = NULL;
-	struct posix_acl *acl;
-	int retval;
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
-		break;
-	default:
-		BUG();
-	}
-
-	retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
-	if (retval > 0) {
-		value = kmalloc(retval, GFP_NOFS);
-		if (!value)
-			return ERR_PTR(-ENOMEM);
-		retval = ext3_xattr_get(inode, name_index, "", value, retval);
-	}
-	if (retval > 0)
-		acl = ext3_acl_from_disk(value, retval);
-	else if (retval == -ENODATA || retval == -ENOSYS)
-		acl = NULL;
-	else
-		acl = ERR_PTR(retval);
-	kfree(value);
-
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
-
-	return acl;
-}
-
-/*
- * Set the access or default ACL of an inode.
- *
- * inode->i_mutex: down unless called from ext3_new_inode
- */
-static int
-__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
-	     struct posix_acl *acl)
-{
-	int name_index;
-	void *value = NULL;
-	size_t size = 0;
-	int error;
-
-	switch(type) {
-		case ACL_TYPE_ACCESS:
-			name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
-			if (acl) {
-				error = posix_acl_equiv_mode(acl, &inode->i_mode);
-				if (error < 0)
-					return error;
-				else {
-					inode->i_ctime = CURRENT_TIME_SEC;
-					ext3_mark_inode_dirty(handle, inode);
-					if (error == 0)
-						acl = NULL;
-				}
-			}
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
-			if (!S_ISDIR(inode->i_mode))
-				return acl ? -EACCES : 0;
-			break;
-
-		default:
-			return -EINVAL;
-	}
-	if (acl) {
-		value = ext3_acl_to_disk(acl, &size);
-		if (IS_ERR(value))
-			return (int)PTR_ERR(value);
-	}
-
-	error = ext3_xattr_set_handle(handle, inode, name_index, "",
-				      value, size, 0);
-
-	kfree(value);
-
-	if (!error)
-		set_cached_acl(inode, type, acl);
-
-	return error;
-}
-
-int
-ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
-{
-	handle_t *handle;
-	int error, retries = 0;
-
-retry:
-	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	error = __ext3_set_acl(handle, inode, type, acl);
-	ext3_journal_stop(handle);
-	if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-	return error;
-}
-
-/*
- * Initialize the ACLs of a new inode. Called from ext3_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
-int
-ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
-{
-	struct posix_acl *default_acl, *acl;
-	int error;
-
-	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
-	if (error)
-		return error;
-
-	if (default_acl) {
-		error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
-				       default_acl);
-		posix_acl_release(default_acl);
-	}
-	if (acl) {
-		if (!error)
-			error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
-					       acl);
-		posix_acl_release(acl);
-	}
-	return error;
-}
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
deleted file mode 100644
index ea1c69edab9e..000000000000
--- a/fs/ext3/acl.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-  File: fs/ext3/acl.h
-
-  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-*/
-
-#include <linux/posix_acl_xattr.h>
-
-#define EXT3_ACL_VERSION	0x0001
-
-typedef struct {
-	__le16		e_tag;
-	__le16		e_perm;
-	__le32		e_id;
-} ext3_acl_entry;
-
-typedef struct {
-	__le16		e_tag;
-	__le16		e_perm;
-} ext3_acl_entry_short;
-
-typedef struct {
-	__le32		a_version;
-} ext3_acl_header;
-
-static inline size_t ext3_acl_size(int count)
-{
-	if (count <= 4) {
-		return sizeof(ext3_acl_header) +
-		       count * sizeof(ext3_acl_entry_short);
-	} else {
-		return sizeof(ext3_acl_header) +
-		       4 * sizeof(ext3_acl_entry_short) +
-		       (count - 4) * sizeof(ext3_acl_entry);
-	}
-}
-
-static inline int ext3_acl_count(size_t size)
-{
-	ssize_t s;
-	size -= sizeof(ext3_acl_header);
-	s = size - 4 * sizeof(ext3_acl_entry_short);
-	if (s < 0) {
-		if (size % sizeof(ext3_acl_entry_short))
-			return -1;
-		return size / sizeof(ext3_acl_entry_short);
-	} else {
-		if (s % sizeof(ext3_acl_entry))
-			return -1;
-		return s / sizeof(ext3_acl_entry) + 4;
-	}
-}
-
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-
-/* acl.c */
-extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
-extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
-
-#else  /* CONFIG_EXT3_FS_POSIX_ACL */
-#include <linux/sched.h>
-#define ext3_get_acl NULL
-#define ext3_set_acl NULL
-
-static inline int
-ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
-{
-	return 0;
-}
-#endif  /* CONFIG_EXT3_FS_POSIX_ACL */
-
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
deleted file mode 100644
index 158b5d4ce067..000000000000
--- a/fs/ext3/balloc.c
+++ /dev/null
@@ -1,2158 +0,0 @@
-/*
- *  linux/fs/ext3/balloc.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- */
-
-#include <linux/quotaops.h>
-#include <linux/blkdev.h>
-#include "ext3.h"
-
-/*
- * balloc.c contains the blocks allocation and deallocation routines
- */
-
-/*
- * The free blocks are managed by bitmaps.  A file system contains several
- * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
- * block for inodes, N blocks for the inode table and data blocks.
- *
- * The file system contains group descriptors which are located after the
- * super block.  Each descriptor contains the number of the bitmap block and
- * the free blocks count in the block.  The descriptors are loaded in memory
- * when a file system is mounted (see ext3_fill_super).
- */
-
-
-#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
-
-/*
- * Calculate the block group number and offset, given a block number
- */
-static void ext3_get_group_no_and_offset(struct super_block *sb,
-	ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
-{
-	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-
-	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
-	if (offsetp)
-		*offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
-	if (blockgrpp)
-		*blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
-}
-
-/**
- * ext3_get_group_desc() -- load group descriptor from disk
- * @sb:			super block
- * @block_group:	given block group
- * @bh:			pointer to the buffer head to store the block
- *			group descriptor
- */
-struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
-					     unsigned int block_group,
-					     struct buffer_head ** bh)
-{
-	unsigned long group_desc;
-	unsigned long offset;
-	struct ext3_group_desc * desc;
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-
-	if (block_group >= sbi->s_groups_count) {
-		ext3_error (sb, "ext3_get_group_desc",
-			    "block_group >= groups_count - "
-			    "block_group = %d, groups_count = %lu",
-			    block_group, sbi->s_groups_count);
-
-		return NULL;
-	}
-	smp_rmb();
-
-	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
-	offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
-	if (!sbi->s_group_desc[group_desc]) {
-		ext3_error (sb, "ext3_get_group_desc",
-			    "Group descriptor not loaded - "
-			    "block_group = %d, group_desc = %lu, desc = %lu",
-			     block_group, group_desc, offset);
-		return NULL;
-	}
-
-	desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
-	if (bh)
-		*bh = sbi->s_group_desc[group_desc];
-	return desc + offset;
-}
-
-static int ext3_valid_block_bitmap(struct super_block *sb,
-					struct ext3_group_desc *desc,
-					unsigned int block_group,
-					struct buffer_head *bh)
-{
-	ext3_grpblk_t offset;
-	ext3_grpblk_t next_zero_bit;
-	ext3_fsblk_t bitmap_blk;
-	ext3_fsblk_t group_first_block;
-
-	group_first_block = ext3_group_first_block_no(sb, block_group);
-
-	/* check whether block bitmap block number is set */
-	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
-	offset = bitmap_blk - group_first_block;
-	if (!ext3_test_bit(offset, bh->b_data))
-		/* bad block bitmap */
-		goto err_out;
-
-	/* check whether the inode bitmap block number is set */
-	bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
-	offset = bitmap_blk - group_first_block;
-	if (!ext3_test_bit(offset, bh->b_data))
-		/* bad block bitmap */
-		goto err_out;
-
-	/* check whether the inode table block number is set */
-	bitmap_blk = le32_to_cpu(desc->bg_inode_table);
-	offset = bitmap_blk - group_first_block;
-	next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
-				offset + EXT3_SB(sb)->s_itb_per_group,
-				offset);
-	if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
-		/* good bitmap for inode tables */
-		return 1;
-
-err_out:
-	ext3_error(sb, __func__,
-			"Invalid block bitmap - "
-			"block_group = %d, block = %lu",
-			block_group, bitmap_blk);
-	return 0;
-}
-
-/**
- * read_block_bitmap()
- * @sb:			super block
- * @block_group:	given block group
- *
- * Read the bitmap for a given block_group,and validate the
- * bits for block/inode/inode tables are set in the bitmaps
- *
- * Return buffer_head on success or NULL in case of failure.
- */
-static struct buffer_head *
-read_block_bitmap(struct super_block *sb, unsigned int block_group)
-{
-	struct ext3_group_desc * desc;
-	struct buffer_head * bh = NULL;
-	ext3_fsblk_t bitmap_blk;
-
-	desc = ext3_get_group_desc(sb, block_group, NULL);
-	if (!desc)
-		return NULL;
-	trace_ext3_read_block_bitmap(sb, block_group);
-	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
-	bh = sb_getblk(sb, bitmap_blk);
-	if (unlikely(!bh)) {
-		ext3_error(sb, __func__,
-			    "Cannot read block bitmap - "
-			    "block_group = %d, block_bitmap = %u",
-			    block_group, le32_to_cpu(desc->bg_block_bitmap));
-		return NULL;
-	}
-	if (likely(bh_uptodate_or_lock(bh)))
-		return bh;
-
-	if (bh_submit_read(bh) < 0) {
-		brelse(bh);
-		ext3_error(sb, __func__,
-			    "Cannot read block bitmap - "
-			    "block_group = %d, block_bitmap = %u",
-			    block_group, le32_to_cpu(desc->bg_block_bitmap));
-		return NULL;
-	}
-	ext3_valid_block_bitmap(sb, desc, block_group, bh);
-	/*
-	 * file system mounted not to panic on error, continue with corrupt
-	 * bitmap
-	 */
-	return bh;
-}
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root:		root of per-filesystem reservation rb tree
- * @verbose:		verbose mode
- * @fn:			function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end).	Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
-			      const char *fn)
-{
-	struct rb_node *n;
-	struct ext3_reserve_window_node *rsv, *prev;
-	int bad;
-
-restart:
-	n = rb_first(root);
-	bad = 0;
-	prev = NULL;
-
-	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
-	while (n) {
-		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
-		if (verbose)
-			printk("reservation window 0x%p "
-			       "start:  %lu, end:  %lu\n",
-			       rsv, rsv->rsv_start, rsv->rsv_end);
-		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
-			printk("Bad reservation %p (start >= end)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (prev && prev->rsv_end >= rsv->rsv_start) {
-			printk("Bad reservation %p (prev->end >= start)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (bad) {
-			if (!verbose) {
-				printk("Restarting reservation walk in verbose mode\n");
-				verbose = 1;
-				goto restart;
-			}
-		}
-		n = rb_next(n);
-		prev = rsv;
-	}
-	printk("Window map complete.\n");
-	BUG_ON(bad);
-}
-#define rsv_window_dump(root, verbose) \
-	__rsv_window_dump((root), (verbose), __func__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv:		inode's reservation window
- * @grp_goal:		given goal block relative to the allocation block group
- * @group:		the current allocation block group
- * @sb:			filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
-			unsigned int group, struct super_block * sb)
-{
-	ext3_fsblk_t group_first_block, group_last_block;
-
-	group_first_block = ext3_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
-
-	if ((rsv->_rsv_start > group_last_block) ||
-	    (rsv->_rsv_end < group_first_block))
-		return 0;
-	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
-		|| (grp_goal + group_first_block > rsv->_rsv_end)))
-		return 0;
-	return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root:		root of reservation tree
- * @goal:		target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
-{
-	struct rb_node *n = root->rb_node;
-	struct ext3_reserve_window_node *rsv;
-
-	if (!n)
-		return NULL;
-
-	do {
-		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
-
-		if (goal < rsv->rsv_start)
-			n = n->rb_left;
-		else if (goal > rsv->rsv_end)
-			n = n->rb_right;
-		else
-			return rsv;
-	} while (n);
-	/*
-	 * We've fallen off the end of the tree: the goal wasn't inside
-	 * any particular node.  OK, the previous node must be to one
-	 * side of the interval containing the goal.  If it's the RHS,
-	 * we need to back up one.
-	 */
-	if (rsv->rsv_start > goal) {
-		n = rb_prev(&rsv->rsv_node);
-		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
-	}
-	return rsv;
-}
-
-/**
- * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb:			super block
- * @rsv:		reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext3_rsv_window_add(struct super_block *sb,
-		    struct ext3_reserve_window_node *rsv)
-{
-	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
-	struct rb_node *node = &rsv->rsv_node;
-	ext3_fsblk_t start = rsv->rsv_start;
-
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
-	struct ext3_reserve_window_node *this;
-
-	trace_ext3_rsv_window_add(sb, rsv);
-	while (*p)
-	{
-		parent = *p;
-		this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
-
-		if (start < this->rsv_start)
-			p = &(*p)->rb_left;
-		else if (start > this->rsv_end)
-			p = &(*p)->rb_right;
-		else {
-			rsv_window_dump(root, 1);
-			BUG();
-		}
-	}
-
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-}
-
-/**
- * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb:			super block
- * @rsv:		reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
-			      struct ext3_reserve_window_node *rsv)
-{
-	rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_alloc_hit = 0;
-	rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv:		given reservation window to check
- *
- * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
-{
-	/* a valid reservation end block could not be 0 */
-	return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext3_init_block_alloc_info()
- * @inode:		file inode structure
- *
- * Allocate and initialize the	reservation window structure, and
- * link the window to the ext3 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext3 inode the first time the open file
- * needs a new block. So, before every ext3_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext3_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs truncate_mutex protection prior to call this function.
- */
-void ext3_init_block_alloc_info(struct inode *inode)
-{
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	struct ext3_block_alloc_info *block_i;
-	struct super_block *sb = inode->i_sb;
-
-	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
-	if (block_i) {
-		struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;
-
-		rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-		rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-
-		/*
-		 * if filesystem is mounted with NORESERVATION, the goal
-		 * reservation window size is set to zero to indicate
-		 * block reservation is off
-		 */
-		if (!test_opt(sb, RESERVATION))
-			rsv->rsv_goal_size = 0;
-		else
-			rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
-		rsv->rsv_alloc_hit = 0;
-		block_i->last_alloc_logical_block = 0;
-		block_i->last_alloc_physical_block = 0;
-	}
-	ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext3_discard_reservation()
- * @inode:		inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- *	ext3_release_file(): last writer close the file
- *	ext3_clear_inode(): last iput(), when nobody link to this file.
- *	ext3_truncate(): when the block indirect map is about to change.
- *
- */
-void ext3_discard_reservation(struct inode *inode)
-{
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct ext3_reserve_window_node *rsv;
-	spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
-
-	if (!block_i)
-		return;
-
-	rsv = &block_i->rsv_window_node;
-	if (!rsv_is_empty(&rsv->rsv_window)) {
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&rsv->rsv_window)) {
-			trace_ext3_discard_reservation(inode, rsv);
-			rsv_window_remove(inode->i_sb, rsv);
-		}
-		spin_unlock(rsv_lock);
-	}
-}
-
-/**
- * ext3_free_blocks_sb() -- Free given blocks and update quota
- * @handle:			handle to this transaction
- * @sb:				super block
- * @block:			start physical block to free
- * @count:			number of blocks to free
- * @pdquot_freed_blocks:	pointer to quota
- */
-void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
-			 ext3_fsblk_t block, unsigned long count,
-			 unsigned long *pdquot_freed_blocks)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gd_bh;
-	unsigned long block_group;
-	ext3_grpblk_t bit;
-	unsigned long i;
-	unsigned long overflow;
-	struct ext3_group_desc * desc;
-	struct ext3_super_block * es;
-	struct ext3_sb_info *sbi;
-	int err = 0, ret;
-	ext3_grpblk_t group_freed;
-
-	*pdquot_freed_blocks = 0;
-	sbi = EXT3_SB(sb);
-	es = sbi->s_es;
-	if (block < le32_to_cpu(es->s_first_data_block) ||
-	    block + count < block ||
-	    block + count > le32_to_cpu(es->s_blocks_count)) {
-		ext3_error (sb, "ext3_free_blocks",
-			    "Freeing blocks not in datazone - "
-			    "block = "E3FSBLK", count = %lu", block, count);
-		goto error_return;
-	}
-
-	ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
-
-do_more:
-	overflow = 0;
-	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
-		      EXT3_BLOCKS_PER_GROUP(sb);
-	bit = (block - le32_to_cpu(es->s_first_data_block)) %
-		      EXT3_BLOCKS_PER_GROUP(sb);
-	/*
-	 * Check to see if we are freeing blocks across a group
-	 * boundary.
-	 */
-	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
-		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
-		count -= overflow;
-	}
-	brelse(bitmap_bh);
-	bitmap_bh = read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-	desc = ext3_get_group_desc (sb, block_group, &gd_bh);
-	if (!desc)
-		goto error_return;
-
-	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
-	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
-	    in_range (block, le32_to_cpu(desc->bg_inode_table),
-		      sbi->s_itb_per_group) ||
-	    in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
-		      sbi->s_itb_per_group)) {
-		ext3_error (sb, "ext3_free_blocks",
-			    "Freeing blocks in system zones - "
-			    "Block = "E3FSBLK", count = %lu",
-			    block, count);
-		goto error_return;
-	}
-
-	/*
-	 * We are about to start releasing blocks in the bitmap,
-	 * so we need undo access.
-	 */
-	/* @@@ check errors */
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext3_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
-	/*
-	 * We are about to modify some metadata.  Call the journal APIs
-	 * to unshare ->b_data if a currently-committing transaction is
-	 * using it
-	 */
-	BUFFER_TRACE(gd_bh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, gd_bh);
-	if (err)
-		goto error_return;
-
-	jbd_lock_bh_state(bitmap_bh);
-
-	for (i = 0, group_freed = 0; i < count; i++) {
-		/*
-		 * An HJ special.  This is expensive...
-		 */
-#ifdef CONFIG_JBD_DEBUG
-		jbd_unlock_bh_state(bitmap_bh);
-		{
-			struct buffer_head *debug_bh;
-			debug_bh = sb_find_get_block(sb, block + i);
-			if (debug_bh) {
-				BUFFER_TRACE(debug_bh, "Deleted!");
-				if (!bh2jh(bitmap_bh)->b_committed_data)
-					BUFFER_TRACE(debug_bh,
-						"No committed data in bitmap");
-				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
-				__brelse(debug_bh);
-			}
-		}
-		jbd_lock_bh_state(bitmap_bh);
-#endif
-		if (need_resched()) {
-			jbd_unlock_bh_state(bitmap_bh);
-			cond_resched();
-			jbd_lock_bh_state(bitmap_bh);
-		}
-		/* @@@ This prevents newly-allocated data from being
-		 * freed and then reallocated within the same
-		 * transaction.
-		 *
-		 * Ideally we would want to allow that to happen, but to
-		 * do so requires making journal_forget() capable of
-		 * revoking the queued write of a data block, which
-		 * implies blocking on the journal lock.  *forget()
-		 * cannot block due to truncate races.
-		 *
-		 * Eventually we can fix this by making journal_forget()
-		 * return a status indicating whether or not it was able
-		 * to revoke the buffer.  On successful revoke, it is
-		 * safe not to set the allocation bit in the committed
-		 * bitmap, because we know that there is no outstanding
-		 * activity on the buffer any more and so it is safe to
-		 * reallocate it.
-		 */
-		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
-		J_ASSERT_BH(bitmap_bh,
-				bh2jh(bitmap_bh)->b_committed_data != NULL);
-		ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
-				bh2jh(bitmap_bh)->b_committed_data);
-
-		/*
-		 * We clear the bit in the bitmap after setting the committed
-		 * data bit, because this is the reverse order to that which
-		 * the allocator uses.
-		 */
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
-						bit + i, bitmap_bh->b_data)) {
-			jbd_unlock_bh_state(bitmap_bh);
-			ext3_error(sb, __func__,
-				"bit already cleared for block "E3FSBLK,
-				 block + i);
-			jbd_lock_bh_state(bitmap_bh);
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			group_freed++;
-		}
-	}
-	jbd_unlock_bh_state(bitmap_bh);
-
-	spin_lock(sb_bgl_lock(sbi, block_group));
-	le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_add(&sbi->s_freeblocks_counter, count);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext3_journal_dirty_metadata(handle, gd_bh);
-	if (!err) err = ret;
-	*pdquot_freed_blocks += group_freed;
-
-	if (overflow && !err) {
-		block += count;
-		count = overflow;
-		goto do_more;
-	}
-
-error_return:
-	brelse(bitmap_bh);
-	ext3_std_error(sb, err);
-	return;
-}
-
-/**
- * ext3_free_blocks() -- Free given blocks and update quota
- * @handle:		handle for this transaction
- * @inode:		inode
- * @block:		start physical block to free
- * @count:		number of blocks to count
- */
-void ext3_free_blocks(handle_t *handle, struct inode *inode,
-			ext3_fsblk_t block, unsigned long count)
-{
-	struct super_block *sb = inode->i_sb;
-	unsigned long dquot_freed_blocks;
-
-	trace_ext3_free_blocks(inode, block, count);
-	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-	if (dquot_freed_blocks)
-		dquot_free_block(inode, dquot_freed_blocks);
-	return;
-}
-
-/**
- * ext3_test_allocatable()
- * @nr:			given allocation block group
- * @bh:			bufferhead contains the bitmap of the given block group
- *
- * For ext3 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy.  This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
-{
-	int ret;
-	struct journal_head *jh = bh2jh(bh);
-
-	if (ext3_test_bit(nr, bh->b_data))
-		return 0;
-
-	jbd_lock_bh_state(bh);
-	if (!jh->b_committed_data)
-		ret = 1;
-	else
-		ret = !ext3_test_bit(nr, jh->b_committed_data);
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * bitmap_search_next_usable_block()
- * @start:		the starting block (group relative) of the search
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext3_grpblk_t
-bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
-					ext3_grpblk_t maxblocks)
-{
-	ext3_grpblk_t next;
-	struct journal_head *jh = bh2jh(bh);
-
-	while (start < maxblocks) {
-		next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
-		if (next >= maxblocks)
-			return -1;
-		if (ext3_test_allocatable(next, bh))
-			return next;
-		jbd_lock_bh_state(bh);
-		if (jh->b_committed_data)
-			start = ext3_find_next_zero_bit(jh->b_committed_data,
-							maxblocks, next);
-		jbd_unlock_bh_state(bh);
-	}
-	return -1;
-}
-
-/**
- * find_next_usable_block()
- * @start:		the starting block (group relative) to find next
- *			allocatable block in bitmap.
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap.  We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext3_grpblk_t
-find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
-			ext3_grpblk_t maxblocks)
-{
-	ext3_grpblk_t here, next;
-	char *p, *r;
-
-	if (start > 0) {
-		/*
-		 * The goal was occupied; search forward for a free
-		 * block within the next XX blocks.
-		 *
-		 * end_goal is more or less random, but it has to be
-		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
-		 * next 64-bit boundary is simple..
-		 */
-		ext3_grpblk_t end_goal = (start + 63) & ~63;
-		if (end_goal > maxblocks)
-			end_goal = maxblocks;
-		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext3_test_allocatable(here, bh))
-			return here;
-		ext3_debug("Bit not found near goal\n");
-	}
-
-	here = start;
-	if (here < 0)
-		here = 0;
-
-	p = bh->b_data + (here >> 3);
-	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - bh->b_data) << 3;
-
-	if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
-		return next;
-
-	/*
-	 * The bitmap search --- search forward alternately through the actual
-	 * bitmap and the last-committed copy until we find a bit free in
-	 * both
-	 */
-	here = bitmap_search_next_usable_block(here, bh, maxblocks);
-	return here;
-}
-
-/**
- * claim_block()
- * @lock:		the spin lock for this block group
- * @block:		the free block (group relative) to allocate
- * @bh:			the buffer_head contains the block group bitmap
- *
- * We think we can allocate this block in this bitmap.  Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data.  If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
-{
-	struct journal_head *jh = bh2jh(bh);
-	int ret;
-
-	if (ext3_set_bit_atomic(lock, block, bh->b_data))
-		return 0;
-	jbd_lock_bh_state(bh);
-	if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
-		ext3_clear_bit_atomic(lock, block, bh->b_data);
-		ret = 0;
-	} else {
-		ret = 1;
-	}
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * ext3_try_to_allocate()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- *	if there is a reservation window, only try to allocate block(s) from the
- *	file's own reservation window;
- *	Otherwise, the allocation range starts from the give goal block, ends at
- *	the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap.  In that case we must release write access to the old one via
- * ext3_journal_release_buffer(), else we'll run out of credits.
- */
-static ext3_grpblk_t
-ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
-			unsigned long *count, struct ext3_reserve_window *my_rsv)
-{
-	ext3_fsblk_t group_first_block;
-	ext3_grpblk_t start, end;
-	unsigned long num = 0;
-
-	/* we do allocation within the reservation window if we have a window */
-	if (my_rsv) {
-		group_first_block = ext3_group_first_block_no(sb, group);
-		if (my_rsv->_rsv_start >= group_first_block)
-			start = my_rsv->_rsv_start - group_first_block;
-		else
-			/* reservation window cross group boundary */
-			start = 0;
-		end = my_rsv->_rsv_end - group_first_block + 1;
-		if (end > EXT3_BLOCKS_PER_GROUP(sb))
-			/* reservation window crosses group boundary */
-			end = EXT3_BLOCKS_PER_GROUP(sb);
-		if ((start <= grp_goal) && (grp_goal < end))
-			start = grp_goal;
-		else
-			grp_goal = -1;
-	} else {
-		if (grp_goal > 0)
-			start = grp_goal;
-		else
-			start = 0;
-		end = EXT3_BLOCKS_PER_GROUP(sb);
-	}
-
-	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
-
-repeat:
-	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
-		grp_goal = find_next_usable_block(start, bitmap_bh, end);
-		if (grp_goal < 0)
-			goto fail_access;
-		if (!my_rsv) {
-			int i;
-
-			for (i = 0; i < 7 && grp_goal > start &&
-					ext3_test_allocatable(grp_goal - 1,
-								bitmap_bh);
-					i++, grp_goal--)
-				;
-		}
-	}
-	start = grp_goal;
-
-	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
-		grp_goal, bitmap_bh)) {
-		/*
-		 * The block was allocated by another thread, or it was
-		 * allocated and then freed by another thread
-		 */
-		start++;
-		grp_goal++;
-		if (start >= end)
-			goto fail_access;
-		goto repeat;
-	}
-	num++;
-	grp_goal++;
-	while (num < *count && grp_goal < end
-		&& ext3_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group),
-				grp_goal, bitmap_bh)) {
-		num++;
-		grp_goal++;
-	}
-	*count = num;
-	return grp_goal - num;
-fail_access:
-	*count = num;
-	return -1;
-}
-
-/**
- *	find_next_reservable_window():
- *		find a reservable space within the given range.
- *		It does not allocate the reservation window for now:
- *		alloc_new_reservation() will do the work later.
- *
- *	@search_head: the head of the searching list;
- *		This is not necessarily the list head of the whole filesystem
- *
- *		We have both head and start_block to assist the search
- *		for the reservable space. The list starts from head,
- *		but we will shift to the place where start_block is,
- *		then start from there, when looking for a reservable space.
- *
- *	@my_rsv: the reservation window
- *
- *	@sb: the super block
- *
- *	@start_block: the first block we consider to start
- *			the real search from
- *
- *	@last_block:
- *		the maximum block number that our goal reservable space
- *		could start from. This is normally the last block in this
- *		group. The search will end when we found the start of next
- *		possible reservable space is out of this boundary.
- *		This could handle the cross boundary reservation window
- *		request.
- *
- *	basically we search from the given range, rather than the whole
- *	reservation double linked list, (start_block, last_block)
- *	to find a free region that is of my size and has not
- *	been reserved.
- *
- */
-static int find_next_reservable_window(
-				struct ext3_reserve_window_node *search_head,
-				struct ext3_reserve_window_node *my_rsv,
-				struct super_block * sb,
-				ext3_fsblk_t start_block,
-				ext3_fsblk_t last_block)
-{
-	struct rb_node *next;
-	struct ext3_reserve_window_node *rsv, *prev;
-	ext3_fsblk_t cur;
-	int size = my_rsv->rsv_goal_size;
-
-	/* TODO: make the start of the reservation window byte-aligned */
-	/* cur = *start_block & ~7;*/
-	cur = start_block;
-	rsv = search_head;
-	if (!rsv)
-		return -1;
-
-	while (1) {
-		if (cur <= rsv->rsv_end)
-			cur = rsv->rsv_end + 1;
-
-		/* TODO?
-		 * in the case we could not find a reservable space
-		 * that is what is expected, during the re-search, we could
-		 * remember what's the largest reservable space we could have
-		 * and return that one.
-		 *
-		 * For now it will fail if we could not find the reservable
-		 * space with expected-size (or more)...
-		 */
-		if (cur > last_block)
-			return -1;		/* fail */
-
-		prev = rsv;
-		next = rb_next(&rsv->rsv_node);
-		rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node);
-
-		/*
-		 * Reached the last reservation, we can just append to the
-		 * previous one.
-		 */
-		if (!next)
-			break;
-
-		if (cur + size <= rsv->rsv_start) {
-			/*
-			 * Found a reserveable space big enough.  We could
-			 * have a reservation across the group boundary here
-			 */
-			break;
-		}
-	}
-	/*
-	 * we come here either :
-	 * when we reach the end of the whole list,
-	 * and there is empty reservable space after last entry in the list.
-	 * append it to the end of the list.
-	 *
-	 * or we found one reservable space in the middle of the list,
-	 * return the reservation window that we could append to.
-	 * succeed.
-	 */
-
-	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
-		rsv_window_remove(sb, my_rsv);
-
-	/*
-	 * Let's book the whole available window for now.  We will check the
-	 * disk bitmap later and then, if there are free blocks then we adjust
-	 * the window size if it's larger than requested.
-	 * Otherwise, we will remove this node from the tree next time
-	 * call find_next_reservable_window.
-	 */
-	my_rsv->rsv_start = cur;
-	my_rsv->rsv_end = cur + size - 1;
-	my_rsv->rsv_alloc_hit = 0;
-
-	if (prev != my_rsv)
-		ext3_rsv_window_add(sb, my_rsv);
-
-	return 0;
-}
-
-/**
- *	alloc_new_reservation()--allocate a new reservation window
- *
- *		To make a new reservation, we search part of the filesystem
- *		reservation list (the list that inside the group). We try to
- *		allocate a new reservation window near the allocation goal,
- *		or the beginning of the group, if there is no goal.
- *
- *		We first find a reservable space after the goal, then from
- *		there, we check the bitmap for the first free block after
- *		it. If there is no free block until the end of group, then the
- *		whole group is full, we failed. Otherwise, check if the free
- *		block is inside the expected reservable space, if so, we
- *		succeed.
- *		If the first free block is outside the reservable space, then
- *		start from the first free block, we search for next available
- *		space, and go on.
- *
- *	on succeed, a new reservation will be found and inserted into the list
- *	It contains at least one free block, and it does not overlap with other
- *	reservation windows.
- *
- *	failed: we failed to find a reservation window in this group
- *
- *	@my_rsv: the reservation window
- *
- *	@grp_goal: The goal (group-relative).  It is where the search for a
- *		free reservable space should start from.
- *		if we have a grp_goal(grp_goal >0 ), then start from there,
- *		no grp_goal(grp_goal = -1), we start from the first block
- *		of the group.
- *
- *	@sb: the super block
- *	@group: the group we are trying to allocate in
- *	@bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
-		ext3_grpblk_t grp_goal, struct super_block *sb,
-		unsigned int group, struct buffer_head *bitmap_bh)
-{
-	struct ext3_reserve_window_node *search_head;
-	ext3_fsblk_t group_first_block, group_end_block, start_block;
-	ext3_grpblk_t first_free_block;
-	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
-	unsigned long size;
-	int ret;
-	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
-
-	group_first_block = ext3_group_first_block_no(sb, group);
-	group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
-
-	if (grp_goal < 0)
-		start_block = group_first_block;
-	else
-		start_block = grp_goal + group_first_block;
-
-	trace_ext3_alloc_new_reservation(sb, start_block);
-	size = my_rsv->rsv_goal_size;
-
-	if (!rsv_is_empty(&my_rsv->rsv_window)) {
-		/*
-		 * if the old reservation is cross group boundary
-		 * and if the goal is inside the old reservation window,
-		 * we will come here when we just failed to allocate from
-		 * the first part of the window. We still have another part
-		 * that belongs to the next group. In this case, there is no
-		 * point to discard our window and try to allocate a new one
-		 * in this group(which will fail). we should
-		 * keep the reservation window, just simply move on.
-		 *
-		 * Maybe we could shift the start block of the reservation
-		 * window to the first block of next group.
-		 */
-
-		if ((my_rsv->rsv_start <= group_end_block) &&
-				(my_rsv->rsv_end > group_end_block) &&
-				(start_block >= my_rsv->rsv_start))
-			return -1;
-
-		if ((my_rsv->rsv_alloc_hit >
-		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
-			/*
-			 * if the previously allocation hit ratio is
-			 * greater than 1/2, then we double the size of
-			 * the reservation window the next time,
-			 * otherwise we keep the same size window
-			 */
-			size = size * 2;
-			if (size > EXT3_MAX_RESERVE_BLOCKS)
-				size = EXT3_MAX_RESERVE_BLOCKS;
-			my_rsv->rsv_goal_size= size;
-		}
-	}
-
-	spin_lock(rsv_lock);
-	/*
-	 * shift the search start to the window near the goal block
-	 */
-	search_head = search_reserve_window(fs_rsv_root, start_block);
-
-	/*
-	 * find_next_reservable_window() simply finds a reservable window
-	 * inside the given range(start_block, group_end_block).
-	 *
-	 * To make sure the reservation window has a free bit inside it, we
-	 * need to check the bitmap after we found a reservable window.
-	 */
-retry:
-	ret = find_next_reservable_window(search_head, my_rsv, sb,
-						start_block, group_end_block);
-
-	if (ret == -1) {
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;
-	}
-
-	/*
-	 * On success, find_next_reservable_window() returns the
-	 * reservation window where there is a reservable space after it.
-	 * Before we reserve this reservable space, we need
-	 * to make sure there is at least a free block inside this region.
-	 *
-	 * searching the first free bit on the block bitmap and copy of
-	 * last committed bitmap alternatively, until we found a allocatable
-	 * block. Search start from the start block of the reservable space
-	 * we just found.
-	 */
-	spin_unlock(rsv_lock);
-	first_free_block = bitmap_search_next_usable_block(
-			my_rsv->rsv_start - group_first_block,
-			bitmap_bh, group_end_block - group_first_block + 1);
-
-	if (first_free_block < 0) {
-		/*
-		 * no free block left on the bitmap, no point
-		 * to reserve the space. return failed.
-		 */
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;		/* failed */
-	}
-
-	start_block = first_free_block + group_first_block;
-	/*
-	 * check if the first free block is within the
-	 * free space we just reserved
-	 */
-	if (start_block >= my_rsv->rsv_start &&
-	    start_block <= my_rsv->rsv_end) {
-		trace_ext3_reserved(sb, start_block, my_rsv);
-		return 0;		/* success */
-	}
-	/*
-	 * if the first free bit we found is out of the reservable space
-	 * continue search for next reservable space,
-	 * start from where the free block is,
-	 * we also shift the list head to where we stopped last time
-	 */
-	search_head = my_rsv;
-	spin_lock(rsv_lock);
-	goto retry;
-}
-
-/**
- * try_to_extend_reservation()
- * @my_rsv:		given reservation window
- * @sb:			super block
- * @size:		the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext3_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext3_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
-			struct super_block *sb, int size)
-{
-	struct ext3_reserve_window_node *next_rsv;
-	struct rb_node *next;
-	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
-
-	if (!spin_trylock(rsv_lock))
-		return;
-
-	next = rb_next(&my_rsv->rsv_node);
-
-	if (!next)
-		my_rsv->rsv_end += size;
-	else {
-		next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
-
-		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
-			my_rsv->rsv_end += size;
-		else
-			my_rsv->rsv_end = next_rsv->rsv_start - 1;
-	}
-	spin_unlock(rsv_lock);
-}
-
-/**
- * ext3_try_to_allocate_with_rsv()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @my_rsv:		reservation window
- * @count:		target number of blocks to allocate
- * @errp:		pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation.  If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
- *
- */
-static ext3_grpblk_t
-ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-			unsigned int group, struct buffer_head *bitmap_bh,
-			ext3_grpblk_t grp_goal,
-			struct ext3_reserve_window_node * my_rsv,
-			unsigned long *count, int *errp)
-{
-	ext3_fsblk_t group_first_block, group_last_block;
-	ext3_grpblk_t ret = 0;
-	int fatal;
-	unsigned long num = *count;
-
-	*errp = 0;
-
-	/*
-	 * Make sure we use undo access for the bitmap, because it is critical
-	 * that we do the frozen_data COW on bitmap buffers in all cases even
-	 * if the buffer is in BJ_Forget state in the committing transaction.
-	 */
-	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
-	fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
-	if (fatal) {
-		*errp = fatal;
-		return -1;
-	}
-
-	/*
-	 * we don't deal with reservation when
-	 * filesystem is mounted without reservation
-	 * or the file is not a regular file
-	 * or last attempt to allocate a block with reservation turned on failed
-	 */
-	if (my_rsv == NULL ) {
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-						grp_goal, count, NULL);
-		goto out;
-	}
-	/*
-	 * grp_goal is a group relative block number (if there is a goal)
-	 * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
-	 * first block is a filesystem wide block number
-	 * first block is the block number of the first block in this group
-	 */
-	group_first_block = ext3_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
-
-	/*
-	 * Basically we will allocate a new block from inode's reservation
-	 * window.
-	 *
-	 * We need to allocate a new reservation window, if:
-	 * a) inode does not have a reservation window; or
-	 * b) last attempt to allocate a block from existing reservation
-	 *    failed; or
-	 * c) we come here with a goal and with a reservation window
-	 *
-	 * We do not need to allocate a new reservation window if we come here
-	 * at the beginning with a goal and the goal is inside the window, or
-	 * we don't have a goal but already have a reservation window.
-	 * then we could go to allocate from the reservation window directly.
-	 */
-	while (1) {
-		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window,
-						grp_goal, group, sb)) {
-			if (my_rsv->rsv_goal_size < *count)
-				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
-							group, bitmap_bh);
-			if (ret < 0)
-				break;			/* failed */
-
-			if (!goal_in_my_reservation(&my_rsv->rsv_window,
-							grp_goal, group, sb))
-				grp_goal = -1;
-		} else if (grp_goal >= 0) {
-			int curr = my_rsv->rsv_end -
-					(grp_goal + group_first_block) + 1;
-
-			if (curr < *count)
-				try_to_extend_reservation(my_rsv, sb,
-							*count - curr);
-		}
-
-		if ((my_rsv->rsv_start > group_last_block) ||
-				(my_rsv->rsv_end < group_first_block)) {
-			rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
-			BUG();
-		}
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-					   grp_goal, &num, &my_rsv->rsv_window);
-		if (ret >= 0) {
-			my_rsv->rsv_alloc_hit += num;
-			*count = num;
-			break;				/* succeed */
-		}
-		num = *count;
-	}
-out:
-	if (ret >= 0) {
-		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
-					"bitmap block");
-		fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
-		if (fatal) {
-			*errp = fatal;
-			return -1;
-		}
-		return ret;
-	}
-
-	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
-	ext3_journal_release_buffer(handle, bitmap_bh);
-	return ret;
-}
-
-/**
- * ext3_has_free_blocks()
- * @sbi:		in-core super block structure.
- *
- * Check if filesystem has at least 1 free block available for allocation.
- */
-static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
-{
-	ext3_fsblk_t free_blocks, root_blocks;
-
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
-	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
-		!use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) &&
-		(gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
-		 !in_group_p (sbi->s_resgid))) {
-		return 0;
-	}
-	return 1;
-}
-
-/**
- * ext3_should_retry_alloc()
- * @sb:			super block
- * @retries		number of attemps has been made
- *
- * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
- * it is profitable to retry the operation, this function will wait
- * for the current or committing transaction to complete, and then
- * return TRUE.
- *
- * if the total number of retries exceed three times, return FALSE.
- */
-int ext3_should_retry_alloc(struct super_block *sb, int *retries)
-{
-	if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
-		return 0;
-
-	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
-
-	return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
-}
-
-/**
- * ext3_new_blocks() -- core block(s) allocation function
- * @handle:		handle to this transaction
- * @inode:		file inode
- * @goal:		given target block(filesystem wide)
- * @count:		target number of blocks to allocate
- * @errp:		error code
- *
- * ext3_new_blocks uses a goal block to assist allocation.  It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
- *
- */
-ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, unsigned long *count, int *errp)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gdp_bh;
-	int group_no;
-	int goal_group;
-	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
-	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
-	ext3_fsblk_t ret_block;		/* filesyetem-wide allocated block */
-	int bgi;			/* blockgroup iteration index */
-	int fatal = 0, err;
-	int performed_allocation = 0;
-	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
-	struct super_block *sb;
-	struct ext3_group_desc *gdp;
-	struct ext3_super_block *es;
-	struct ext3_sb_info *sbi;
-	struct ext3_reserve_window_node *my_rsv = NULL;
-	struct ext3_block_alloc_info *block_i;
-	unsigned short windowsz = 0;
-#ifdef EXT3FS_DEBUG
-	static int goal_hits, goal_attempts;
-#endif
-	unsigned long ngroups;
-	unsigned long num = *count;
-
-	*errp = -ENOSPC;
-	sb = inode->i_sb;
-
-	/*
-	 * Check quota for allocation of this block.
-	 */
-	err = dquot_alloc_block(inode, num);
-	if (err) {
-		*errp = err;
-		return 0;
-	}
-
-	trace_ext3_request_blocks(inode, goal, num);
-
-	sbi = EXT3_SB(sb);
-	es = sbi->s_es;
-	ext3_debug("goal=%lu.\n", goal);
-	/*
-	 * Allocate a block from reservation only when
-	 * filesystem is mounted with reservation(default,-o reservation), and
-	 * it's a regular file, and
-	 * the desired window size is greater than 0 (One could use ioctl
-	 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
-	 * reservation on that particular file)
-	 */
-	block_i = EXT3_I(inode)->i_block_alloc_info;
-	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
-		my_rsv = &block_i->rsv_window_node;
-
-	if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
-		*errp = -ENOSPC;
-		goto out;
-	}
-
-	/*
-	 * First, test whether the goal block is free.
-	 */
-	if (goal < le32_to_cpu(es->s_first_data_block) ||
-	    goal >= le32_to_cpu(es->s_blocks_count))
-		goal = le32_to_cpu(es->s_first_data_block);
-	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
-			EXT3_BLOCKS_PER_GROUP(sb);
-	goal_group = group_no;
-retry_alloc:
-	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
-	if (!gdp)
-		goto io_error;
-
-	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-	/*
-	 * if there is not enough free blocks to make a new resevation
-	 * turn off reservation for this allocation
-	 */
-	if (my_rsv && (free_blocks < windowsz)
-		&& (free_blocks > 0)
-		&& (rsv_is_empty(&my_rsv->rsv_window)))
-		my_rsv = NULL;
-
-	if (free_blocks > 0) {
-		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
-				EXT3_BLOCKS_PER_GROUP(sb));
-		bitmap_bh = read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, grp_target_blk,
-					my_rsv,	&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
-
-	ngroups = EXT3_SB(sb)->s_groups_count;
-	smp_rmb();
-
-	/*
-	 * Now search the rest of the groups.  We assume that
-	 * group_no and gdp correctly point to the last group visited.
-	 */
-	for (bgi = 0; bgi < ngroups; bgi++) {
-		group_no++;
-		if (group_no >= ngroups)
-			group_no = 0;
-		gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
-		if (!gdp)
-			goto io_error;
-		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-		/*
-		 * skip this group (and avoid loading bitmap) if there
-		 * are no free blocks
-		 */
-		if (!free_blocks)
-			continue;
-		/*
-		 * skip this group if the number of
-		 * free blocks is less than half of the reservation
-		 * window size.
-		 */
-		if (my_rsv && (free_blocks <= (windowsz/2)))
-			continue;
-
-		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		/*
-		 * try to allocate block(s) from this group, without a goal(-1).
-		 */
-		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, -1, my_rsv,
-					&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
-	/*
-	 * We may end up a bogus earlier ENOSPC error due to
-	 * filesystem is "full" of reservations, but
-	 * there maybe indeed free blocks available on disk
-	 * In this case, we just forget about the reservations
-	 * just do block allocation as without reservations.
-	 */
-	if (my_rsv) {
-		my_rsv = NULL;
-		windowsz = 0;
-		group_no = goal_group;
-		goto retry_alloc;
-	}
-	/* No space left on the device */
-	*errp = -ENOSPC;
-	goto out;
-
-allocated:
-
-	ext3_debug("using block group %d(%d)\n",
-			group_no, gdp->bg_free_blocks_count);
-
-	BUFFER_TRACE(gdp_bh, "get_write_access");
-	fatal = ext3_journal_get_write_access(handle, gdp_bh);
-	if (fatal)
-		goto out;
-
-	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
-
-	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
-	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
-	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
-		      EXT3_SB(sb)->s_itb_per_group) ||
-	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
-		      EXT3_SB(sb)->s_itb_per_group)) {
-		ext3_error(sb, "ext3_new_block",
-			    "Allocating block in system zone - "
-			    "blocks from "E3FSBLK", length %lu",
-			     ret_block, num);
-		/*
-		 * claim_block() marked the blocks we allocated as in use. So we
-		 * may want to selectively mark some of the blocks as free.
-		 */
-		goto retry_alloc;
-	}
-
-	performed_allocation = 1;
-
-#ifdef CONFIG_JBD_DEBUG
-	{
-		struct buffer_head *debug_bh;
-
-		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, ret_block);
-		if (debug_bh) {
-			BUFFER_TRACE(debug_bh, "state when allocated");
-			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
-			brelse(debug_bh);
-		}
-	}
-	jbd_lock_bh_state(bitmap_bh);
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
-		int i;
-
-		for (i = 0; i < num; i++) {
-			if (ext3_test_bit(grp_alloc_blk+i,
-					bh2jh(bitmap_bh)->b_committed_data)) {
-				printk("%s: block was unexpectedly set in "
-					"b_committed_data\n", __func__);
-			}
-		}
-	}
-	ext3_debug("found bit %d\n", grp_alloc_blk);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	jbd_unlock_bh_state(bitmap_bh);
-#endif
-
-	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
-		ext3_error(sb, "ext3_new_block",
-			    "block("E3FSBLK") >= blocks count(%d) - "
-			    "block_group = %d, es == %p ", ret_block,
-			le32_to_cpu(es->s_blocks_count), group_no, es);
-		goto out;
-	}
-
-	/*
-	 * It is up to the caller to add the new buffer to a journal
-	 * list of some description.  We don't know in advance whether
-	 * the caller wants to use it as metadata or data.
-	 */
-	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
-			ret_block, goal_hits, goal_attempts);
-
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
-	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
-	fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
-	if (fatal)
-		goto out;
-
-	*errp = 0;
-	brelse(bitmap_bh);
-
-	if (num < *count) {
-		dquot_free_block(inode, *count-num);
-		*count = num;
-	}
-
-	trace_ext3_allocate_blocks(inode, goal, num,
-				   (unsigned long long)ret_block);
-
-	return ret_block;
-
-io_error:
-	*errp = -EIO;
-out:
-	if (fatal) {
-		*errp = fatal;
-		ext3_std_error(sb, fatal);
-	}
-	/*
-	 * Undo the block allocation
-	 */
-	if (!performed_allocation)
-		dquot_free_block(inode, *count);
-	brelse(bitmap_bh);
-	return 0;
-}
-
-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, int *errp)
-{
-	unsigned long count = 1;
-
-	return ext3_new_blocks(handle, inode, goal, &count, errp);
-}
-
-/**
- * ext3_count_free_blocks() -- count filesystem free blocks
- * @sb:		superblock
- *
- * Adds up the number of free blocks from each block group.
- */
-ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
-{
-	ext3_fsblk_t desc_count;
-	struct ext3_group_desc *gdp;
-	int i;
-	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
-#ifdef EXT3FS_DEBUG
-	struct ext3_super_block *es;
-	ext3_fsblk_t bitmap_count;
-	unsigned long x;
-	struct buffer_head *bitmap_bh = NULL;
-
-	es = EXT3_SB(sb)->s_es;
-	desc_count = 0;
-	bitmap_count = 0;
-	gdp = NULL;
-
-	smp_rmb();
-	for (i = 0; i < ngroups; i++) {
-		gdp = ext3_get_group_desc(sb, i, NULL);
-		if (!gdp)
-			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
-		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, i);
-		if (bitmap_bh == NULL)
-			continue;
-
-		x = ext3_count_free(bitmap_bh, sb->s_blocksize);
-		printk("group %d: stored = %d, counted = %lu\n",
-			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
-		bitmap_count += x;
-	}
-	brelse(bitmap_bh);
-	printk("ext3_count_free_blocks: stored = "E3FSBLK
-		", computed = "E3FSBLK", "E3FSBLK"\n",
-	       (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count),
-		desc_count, bitmap_count);
-	return bitmap_count;
-#else
-	desc_count = 0;
-	smp_rmb();
-	for (i = 0; i < ngroups; i++) {
-		gdp = ext3_get_group_desc(sb, i, NULL);
-		if (!gdp)
-			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
-	}
-
-	return desc_count;
-#endif
-}
-
-static inline int test_root(int a, int b)
-{
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
-}
-
-static int ext3_group_sparse(int group)
-{
-	if (group <= 1)
-		return 1;
-	if (!(group & 1))
-		return 0;
-	return (test_root(group, 7) || test_root(group, 5) ||
-		test_root(group, 3));
-}
-
-/**
- *	ext3_bg_has_super - number of blocks used by the superblock in group
- *	@sb: superblock for filesystem
- *	@group: group number to check
- *
- *	Return the number of blocks used by the superblock (primary or backup)
- *	in this group.  Currently this will be only 0 or 1.
- */
-int ext3_bg_has_super(struct super_block *sb, int group)
-{
-	if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
-				EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
-			!ext3_group_sparse(group))
-		return 0;
-	return 1;
-}
-
-static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
-{
-	unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
-	unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb);
-	unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1;
-
-	if (group == first || group == first + 1 || group == last)
-		return 1;
-	return 0;
-}
-
-static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
-{
-	return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0;
-}
-
-/**
- *	ext3_bg_num_gdb - number of blocks used by the group table in group
- *	@sb: superblock for filesystem
- *	@group: group number to check
- *
- *	Return the number of blocks used by the group descriptor table
- *	(primary or backup) in this group.  In the future there may be a
- *	different number of descriptor blocks in each group.
- */
-unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
-{
-	unsigned long first_meta_bg =
-			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
-	unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
-
-	if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) ||
-			metagroup < first_meta_bg)
-		return ext3_bg_num_gdb_nometa(sb,group);
-
-	return ext3_bg_num_gdb_meta(sb,group);
-
-}
-
-/**
- * ext3_trim_all_free -- function to trim all free space in alloc. group
- * @sb:			super block for file system
- * @group:		allocation group to trim
- * @start:		first group block to examine
- * @max:		last group block to examine
- * @gdp:		allocation group description structure
- * @minblocks:		minimum extent block count
- *
- * ext3_trim_all_free walks through group's block bitmap searching for free
- * blocks. When the free block is found, it tries to allocate this block and
- * consequent free block to get the biggest free extent possible, until it
- * reaches any used block. Then issue a TRIM command on this extent and free
- * the extent in the block bitmap. This is done until whole group is scanned.
- */
-static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
-					unsigned int group,
-					ext3_grpblk_t start, ext3_grpblk_t max,
-					ext3_grpblk_t minblocks)
-{
-	handle_t *handle;
-	ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
-	ext3_fsblk_t discard_block;
-	struct ext3_sb_info *sbi;
-	struct buffer_head *gdp_bh, *bitmap_bh = NULL;
-	struct ext3_group_desc *gdp;
-	int err = 0, ret = 0;
-
-	/*
-	 * We will update one block bitmap, and one group descriptor
-	 */
-	handle = ext3_journal_start_sb(sb, 2);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	bitmap_bh = read_block_bitmap(sb, group);
-	if (!bitmap_bh) {
-		err = -EIO;
-		goto err_out;
-	}
-
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext3_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto err_out;
-
-	gdp = ext3_get_group_desc(sb, group, &gdp_bh);
-	if (!gdp) {
-		err = -EIO;
-		goto err_out;
-	}
-
-	BUFFER_TRACE(gdp_bh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, gdp_bh);
-	if (err)
-		goto err_out;
-
-	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-	sbi = EXT3_SB(sb);
-
-	 /* Walk through the whole group */
-	while (start <= max) {
-		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
-		if (start < 0)
-			break;
-		next = start;
-
-		/*
-		 * Allocate contiguous free extents by setting bits in the
-		 * block bitmap
-		 */
-		while (next <= max
-			&& claim_block(sb_bgl_lock(sbi, group),
-					next, bitmap_bh)) {
-			next++;
-		}
-
-		 /* We did not claim any blocks */
-		if (next == start)
-			continue;
-
-		discard_block = (ext3_fsblk_t)start +
-				ext3_group_first_block_no(sb, group);
-
-		/* Update counters */
-		spin_lock(sb_bgl_lock(sbi, group));
-		le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
-		spin_unlock(sb_bgl_lock(sbi, group));
-		percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
-
-		free_blocks -= next - start;
-		/* Do not issue a TRIM on extents smaller than minblocks */
-		if ((next - start) < minblocks)
-			goto free_extent;
-
-		trace_ext3_discard_blocks(sb, discard_block, next - start);
-		 /* Send the TRIM command down to the device */
-		err = sb_issue_discard(sb, discard_block, next - start,
-				       GFP_NOFS, 0);
-		count += (next - start);
-free_extent:
-		freed = 0;
-
-		/*
-		 * Clear bits in the bitmap
-		 */
-		for (bit = start; bit < next; bit++) {
-			BUFFER_TRACE(bitmap_bh, "clear bit");
-			if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
-						bit, bitmap_bh->b_data)) {
-				ext3_error(sb, __func__,
-					"bit already cleared for block "E3FSBLK,
-					 (unsigned long)bit);
-				BUFFER_TRACE(bitmap_bh, "bit already cleared");
-			} else {
-				freed++;
-			}
-		}
-
-		/* Update couters */
-		spin_lock(sb_bgl_lock(sbi, group));
-		le16_add_cpu(&gdp->bg_free_blocks_count, freed);
-		spin_unlock(sb_bgl_lock(sbi, group));
-		percpu_counter_add(&sbi->s_freeblocks_counter, freed);
-
-		start = next;
-		if (err < 0) {
-			if (err != -EOPNOTSUPP)
-				ext3_warning(sb, __func__, "Discard command "
-					     "returned error %d\n", err);
-			break;
-		}
-
-		if (fatal_signal_pending(current)) {
-			err = -ERESTARTSYS;
-			break;
-		}
-
-		cond_resched();
-
-		/* No more suitable extents */
-		if (free_blocks < minblocks)
-			break;
-	}
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
-	if (!err)
-		err = ret;
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
-	ret = ext3_journal_dirty_metadata(handle, gdp_bh);
-	if (!err)
-		err = ret;
-
-	ext3_debug("trimmed %d blocks in the group %d\n",
-		count, group);
-
-err_out:
-	if (err)
-		count = err;
-	ext3_journal_stop(handle);
-	brelse(bitmap_bh);
-
-	return count;
-}
-
-/**
- * ext3_trim_fs() -- trim ioctl handle function
- * @sb:			superblock for filesystem
- * @start:		First Byte to trim
- * @len:		number of Bytes to trim from start
- * @minlen:		minimum extent length in Bytes
- *
- * ext3_trim_fs goes through all allocation groups containing Bytes from
- * start to start+len. For each such a group ext3_trim_all_free function
- * is invoked to trim all free space.
- */
-int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
-{
-	ext3_grpblk_t last_block, first_block;
-	unsigned long group, first_group, last_group;
-	struct ext3_group_desc *gdp;
-	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-	uint64_t start, minlen, end, trimmed = 0;
-	ext3_fsblk_t first_data_blk =
-			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
-	ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
-	int ret = 0;
-
-	start = range->start >> sb->s_blocksize_bits;
-	end = start + (range->len >> sb->s_blocksize_bits) - 1;
-	minlen = range->minlen >> sb->s_blocksize_bits;
-
-	if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
-	    start >= max_blks ||
-	    range->len < sb->s_blocksize)
-		return -EINVAL;
-	if (end >= max_blks)
-		end = max_blks - 1;
-	if (end <= first_data_blk)
-		goto out;
-	if (start < first_data_blk)
-		start = first_data_blk;
-
-	smp_rmb();
-
-	/* Determine first and last group to examine based on start and len */
-	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
-				     &first_group, &first_block);
-	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
-				     &last_group, &last_block);
-
-	/* end now represents the last block to discard in this group */
-	end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
-
-	for (group = first_group; group <= last_group; group++) {
-		gdp = ext3_get_group_desc(sb, group, NULL);
-		if (!gdp)
-			break;
-
-		/*
-		 * For all the groups except the last one, last block will
-		 * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
-		 * change it for the last group, note that last_block is
-		 * already computed earlier by ext3_get_group_no_and_offset()
-		 */
-		if (group == last_group)
-			end = last_block;
-
-		if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
-			ret = ext3_trim_all_free(sb, group, first_block,
-						 end, minlen);
-			if (ret < 0)
-				break;
-			trimmed += ret;
-		}
-
-		/*
-		 * For every group except the first one, we are sure
-		 * that the first block to discard will be block #0.
-		 */
-		first_block = 0;
-	}
-
-	if (ret > 0)
-		ret = 0;
-
-out:
-	range->len = trimmed * sb->s_blocksize;
-	return ret;
-}
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
deleted file mode 100644
index ef9c643e8e9d..000000000000
--- a/fs/ext3/bitmap.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  linux/fs/ext3/bitmap.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-
-#include "ext3.h"
-
-#ifdef EXT3FS_DEBUG
-
-unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
-{
-	return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars);
-}
-
-#endif  /*  EXT3FS_DEBUG  */
-
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
deleted file mode 100644
index 17742eed2c16..000000000000
--- a/fs/ext3/dir.c
+++ /dev/null
@@ -1,537 +0,0 @@
-/*
- *  linux/fs/ext3/dir.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/fs/minix/dir.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  ext3 directory handling functions
- *
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- *
- * Hash Tree Directory indexing (c) 2001  Daniel Phillips
- *
- */
-
-#include <linux/compat.h>
-#include "ext3.h"
-
-static unsigned char ext3_filetype_table[] = {
-	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
-static int ext3_dx_readdir(struct file *, struct dir_context *);
-
-static unsigned char get_dtype(struct super_block *sb, int filetype)
-{
-	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
-	    (filetype >= EXT3_FT_MAX))
-		return DT_UNKNOWN;
-
-	return (ext3_filetype_table[filetype]);
-}
-
-/**
- * Check if the given dir-inode refers to an htree-indexed directory
- * (or a directory which could potentially get converted to use htree
- * indexing).
- *
- * Return 1 if it is a dx dir, 0 if not
- */
-static int is_dx_dir(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-
-	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
-		     EXT3_FEATURE_COMPAT_DIR_INDEX) &&
-	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
-	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
-		return 1;
-
-	return 0;
-}
-
-int ext3_check_dir_entry (const char * function, struct inode * dir,
-			  struct ext3_dir_entry_2 * de,
-			  struct buffer_head * bh,
-			  unsigned long offset)
-{
-	const char * error_msg = NULL;
-	const int rlen = ext3_rec_len_from_disk(de->rec_len);
-
-	if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
-		error_msg = "rec_len is smaller than minimal";
-	else if (unlikely(rlen % 4 != 0))
-		error_msg = "rec_len % 4 != 0";
-	else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
-		error_msg = "rec_len is too small for name_len";
-	else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
-		error_msg = "directory entry across blocks";
-	else if (unlikely(le32_to_cpu(de->inode) >
-			le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
-		error_msg = "inode out of bounds";
-
-	if (unlikely(error_msg != NULL))
-		ext3_error (dir->i_sb, function,
-			"bad entry in directory #%lu: %s - "
-			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
-			dir->i_ino, error_msg, offset,
-			(unsigned long) le32_to_cpu(de->inode),
-			rlen, de->name_len);
-
-	return error_msg == NULL ? 1 : 0;
-}
-
-static int ext3_readdir(struct file *file, struct dir_context *ctx)
-{
-	unsigned long offset;
-	int i;
-	struct ext3_dir_entry_2 *de;
-	int err;
-	struct inode *inode = file_inode(file);
-	struct super_block *sb = inode->i_sb;
-	int dir_has_error = 0;
-
-	if (is_dx_dir(inode)) {
-		err = ext3_dx_readdir(file, ctx);
-		if (err != ERR_BAD_DX_DIR)
-			return err;
-		/*
-		 * We don't set the inode dirty flag since it's not
-		 * critical that it get flushed back to the disk.
-		 */
-		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
-	}
-	offset = ctx->pos & (sb->s_blocksize - 1);
-
-	while (ctx->pos < inode->i_size) {
-		unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
-		struct buffer_head map_bh;
-		struct buffer_head *bh = NULL;
-
-		map_bh.b_state = 0;
-		err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
-		if (err > 0) {
-			pgoff_t index = map_bh.b_blocknr >>
-					(PAGE_CACHE_SHIFT - inode->i_blkbits);
-			if (!ra_has_index(&file->f_ra, index))
-				page_cache_sync_readahead(
-					sb->s_bdev->bd_inode->i_mapping,
-					&file->f_ra, file,
-					index, 1);
-			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
-			bh = ext3_bread(NULL, inode, blk, 0, &err);
-		}
-
-		/*
-		 * We ignore I/O errors on directories so users have a chance
-		 * of recovering data when there's a bad sector
-		 */
-		if (!bh) {
-			if (!dir_has_error) {
-				ext3_error(sb, __func__, "directory #%lu "
-					"contains a hole at offset %lld",
-					inode->i_ino, ctx->pos);
-				dir_has_error = 1;
-			}
-			/* corrupt size?  Maybe no more blocks to read */
-			if (ctx->pos > inode->i_blocks << 9)
-				break;
-			ctx->pos += sb->s_blocksize - offset;
-			continue;
-		}
-
-		/* If the dir block has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now.  Scan from the start of the block
-		 * to make sure. */
-		if (offset && file->f_version != inode->i_version) {
-			for (i = 0; i < sb->s_blocksize && i < offset; ) {
-				de = (struct ext3_dir_entry_2 *)
-					(bh->b_data + i);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero.  A
-				 * failure will be detected in the
-				 * dirent test below. */
-				if (ext3_rec_len_from_disk(de->rec_len) <
-						EXT3_DIR_REC_LEN(1))
-					break;
-				i += ext3_rec_len_from_disk(de->rec_len);
-			}
-			offset = i;
-			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
-				| offset;
-			file->f_version = inode->i_version;
-		}
-
-		while (ctx->pos < inode->i_size
-		       && offset < sb->s_blocksize) {
-			de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
-			if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
-						   bh, offset)) {
-				/* On error, skip the to the
-                                   next block. */
-				ctx->pos = (ctx->pos |
-						(sb->s_blocksize - 1)) + 1;
-				break;
-			}
-			offset += ext3_rec_len_from_disk(de->rec_len);
-			if (le32_to_cpu(de->inode)) {
-				if (!dir_emit(ctx, de->name, de->name_len,
-					      le32_to_cpu(de->inode),
-					      get_dtype(sb, de->file_type))) {
-					brelse(bh);
-					return 0;
-				}
-			}
-			ctx->pos += ext3_rec_len_from_disk(de->rec_len);
-		}
-		offset = 0;
-		brelse (bh);
-		if (ctx->pos < inode->i_size)
-			if (!dir_relax(inode))
-				return 0;
-	}
-	return 0;
-}
-
-static inline int is_32bit_api(void)
-{
-#ifdef CONFIG_COMPAT
-	return is_compat_task();
-#else
-	return (BITS_PER_LONG == 32);
-#endif
-}
-
-/*
- * These functions convert from the major/minor hash to an f_pos
- * value for dx directories
- *
- * Upper layer (for example NFS) should specify FMODE_32BITHASH or
- * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
- * directly on both 32-bit and 64-bit nodes, under such case, neither
- * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
- */
-static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
-{
-	if ((filp->f_mode & FMODE_32BITHASH) ||
-	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
-		return major >> 1;
-	else
-		return ((__u64)(major >> 1) << 32) | (__u64)minor;
-}
-
-static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
-{
-	if ((filp->f_mode & FMODE_32BITHASH) ||
-	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
-		return (pos << 1) & 0xffffffff;
-	else
-		return ((pos >> 32) << 1) & 0xffffffff;
-}
-
-static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
-{
-	if ((filp->f_mode & FMODE_32BITHASH) ||
-	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
-		return 0;
-	else
-		return pos & 0xffffffff;
-}
-
-/*
- * Return 32- or 64-bit end-of-file for dx directories
- */
-static inline loff_t ext3_get_htree_eof(struct file *filp)
-{
-	if ((filp->f_mode & FMODE_32BITHASH) ||
-	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
-		return EXT3_HTREE_EOF_32BIT;
-	else
-		return EXT3_HTREE_EOF_64BIT;
-}
-
-
-/*
- * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
- * non-htree and htree directories, where the "offset" is in terms
- * of the filename hash value instead of the byte offset.
- *
- * Because we may return a 64-bit hash that is well beyond s_maxbytes,
- * we need to pass the max hash as the maximum allowable offset in
- * the htree directory case.
- *
- * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
- *       will be invalid once the directory was converted into a dx directory
- */
-static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
-{
-	struct inode *inode = file->f_mapping->host;
-	int dx_dir = is_dx_dir(inode);
-	loff_t htree_max = ext3_get_htree_eof(file);
-
-	if (likely(dx_dir))
-		return generic_file_llseek_size(file, offset, whence,
-					        htree_max, htree_max);
-	else
-		return generic_file_llseek(file, offset, whence);
-}
-
-/*
- * This structure holds the nodes of the red-black tree used to store
- * the directory entry in hash order.
- */
-struct fname {
-	__u32		hash;
-	__u32		minor_hash;
-	struct rb_node	rb_hash;
-	struct fname	*next;
-	__u32		inode;
-	__u8		name_len;
-	__u8		file_type;
-	char		name[0];
-};
-
-/*
- * This functoin implements a non-recursive way of freeing all of the
- * nodes in the red-black tree.
- */
-static void free_rb_tree_fname(struct rb_root *root)
-{
-	struct fname *fname, *next;
-
-	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
-		do {
-			struct fname *old = fname;
-			fname = fname->next;
-			kfree(old);
-		} while (fname);
-
-	*root = RB_ROOT;
-}
-
-static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
-							   loff_t pos)
-{
-	struct dir_private_info *p;
-
-	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
-	if (!p)
-		return NULL;
-	p->curr_hash = pos2maj_hash(filp, pos);
-	p->curr_minor_hash = pos2min_hash(filp, pos);
-	return p;
-}
-
-void ext3_htree_free_dir_info(struct dir_private_info *p)
-{
-	free_rb_tree_fname(&p->root);
-	kfree(p);
-}
-
-/*
- * Given a directory entry, enter it into the fname rb tree.
- */
-int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
-			     __u32 minor_hash,
-			     struct ext3_dir_entry_2 *dirent)
-{
-	struct rb_node **p, *parent = NULL;
-	struct fname * fname, *new_fn;
-	struct dir_private_info *info;
-	int len;
-
-	info = (struct dir_private_info *) dir_file->private_data;
-	p = &info->root.rb_node;
-
-	/* Create and allocate the fname structure */
-	len = sizeof(struct fname) + dirent->name_len + 1;
-	new_fn = kzalloc(len, GFP_KERNEL);
-	if (!new_fn)
-		return -ENOMEM;
-	new_fn->hash = hash;
-	new_fn->minor_hash = minor_hash;
-	new_fn->inode = le32_to_cpu(dirent->inode);
-	new_fn->name_len = dirent->name_len;
-	new_fn->file_type = dirent->file_type;
-	memcpy(new_fn->name, dirent->name, dirent->name_len);
-	new_fn->name[dirent->name_len] = 0;
-
-	while (*p) {
-		parent = *p;
-		fname = rb_entry(parent, struct fname, rb_hash);
-
-		/*
-		 * If the hash and minor hash match up, then we put
-		 * them on a linked list.  This rarely happens...
-		 */
-		if ((new_fn->hash == fname->hash) &&
-		    (new_fn->minor_hash == fname->minor_hash)) {
-			new_fn->next = fname->next;
-			fname->next = new_fn;
-			return 0;
-		}
-
-		if (new_fn->hash < fname->hash)
-			p = &(*p)->rb_left;
-		else if (new_fn->hash > fname->hash)
-			p = &(*p)->rb_right;
-		else if (new_fn->minor_hash < fname->minor_hash)
-			p = &(*p)->rb_left;
-		else /* if (new_fn->minor_hash > fname->minor_hash) */
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&new_fn->rb_hash, parent, p);
-	rb_insert_color(&new_fn->rb_hash, &info->root);
-	return 0;
-}
-
-
-
-/*
- * This is a helper function for ext3_dx_readdir.  It calls filldir
- * for all entres on the fname linked list.  (Normally there is only
- * one entry on the linked list, unless there are 62 bit hash collisions.)
- */
-static bool call_filldir(struct file *file, struct dir_context *ctx,
-			struct fname *fname)
-{
-	struct dir_private_info *info = file->private_data;
-	struct inode *inode = file_inode(file);
-	struct super_block *sb = inode->i_sb;
-
-	if (!fname) {
-		printk("call_filldir: called with null fname?!?\n");
-		return true;
-	}
-	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
-	while (fname) {
-		if (!dir_emit(ctx, fname->name, fname->name_len,
-				fname->inode,
-				get_dtype(sb, fname->file_type))) {
-			info->extra_fname = fname;
-			return false;
-		}
-		fname = fname->next;
-	}
-	return true;
-}
-
-static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
-{
-	struct dir_private_info *info = file->private_data;
-	struct inode *inode = file_inode(file);
-	struct fname *fname;
-	int	ret;
-
-	if (!info) {
-		info = ext3_htree_create_dir_info(file, ctx->pos);
-		if (!info)
-			return -ENOMEM;
-		file->private_data = info;
-	}
-
-	if (ctx->pos == ext3_get_htree_eof(file))
-		return 0;	/* EOF */
-
-	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != ctx->pos) {
-		free_rb_tree_fname(&info->root);
-		info->curr_node = NULL;
-		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(file, ctx->pos);
-		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
-	}
-
-	/*
-	 * If there are any leftover names on the hash collision
-	 * chain, return them first.
-	 */
-	if (info->extra_fname) {
-		if (!call_filldir(file, ctx, info->extra_fname))
-			goto finished;
-		info->extra_fname = NULL;
-		goto next_node;
-	} else if (!info->curr_node)
-		info->curr_node = rb_first(&info->root);
-
-	while (1) {
-		/*
-		 * Fill the rbtree if we have no more entries,
-		 * or the inode has changed since we last read in the
-		 * cached entries.
-		 */
-		if ((!info->curr_node) ||
-		    (file->f_version != inode->i_version)) {
-			info->curr_node = NULL;
-			free_rb_tree_fname(&info->root);
-			file->f_version = inode->i_version;
-			ret = ext3_htree_fill_tree(file, info->curr_hash,
-						   info->curr_minor_hash,
-						   &info->next_hash);
-			if (ret < 0)
-				return ret;
-			if (ret == 0) {
-				ctx->pos = ext3_get_htree_eof(file);
-				break;
-			}
-			info->curr_node = rb_first(&info->root);
-		}
-
-		fname = rb_entry(info->curr_node, struct fname, rb_hash);
-		info->curr_hash = fname->hash;
-		info->curr_minor_hash = fname->minor_hash;
-		if (!call_filldir(file, ctx, fname))
-			break;
-	next_node:
-		info->curr_node = rb_next(info->curr_node);
-		if (info->curr_node) {
-			fname = rb_entry(info->curr_node, struct fname,
-					 rb_hash);
-			info->curr_hash = fname->hash;
-			info->curr_minor_hash = fname->minor_hash;
-		} else {
-			if (info->next_hash == ~0) {
-				ctx->pos = ext3_get_htree_eof(file);
-				break;
-			}
-			info->curr_hash = info->next_hash;
-			info->curr_minor_hash = 0;
-		}
-	}
-finished:
-	info->last_pos = ctx->pos;
-	return 0;
-}
-
-static int ext3_release_dir (struct inode * inode, struct file * filp)
-{
-       if (filp->private_data)
-		ext3_htree_free_dir_info(filp->private_data);
-
-	return 0;
-}
-
-const struct file_operations ext3_dir_operations = {
-	.llseek		= ext3_dir_llseek,
-	.read		= generic_read_dir,
-	.iterate	= ext3_readdir,
-	.unlocked_ioctl = ext3_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= ext3_compat_ioctl,
-#endif
-	.fsync		= ext3_sync_file,
-	.release	= ext3_release_dir,
-};
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
deleted file mode 100644
index f483a80b3fe7..000000000000
--- a/fs/ext3/ext3.h
+++ /dev/null
@@ -1,1332 +0,0 @@
-/*
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
- *
- * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/magic.h>
-#include <linux/bug.h>
-#include <linux/blockgroup_lock.h>
-
-/*
- * The second extended filesystem constants/structures
- */
-
-/*
- * Define EXT3FS_DEBUG to produce debug messages
- */
-#undef EXT3FS_DEBUG
-
-/*
- * Define EXT3_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT3_DEFAULT_RESERVE_BLOCKS     8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT3_MAX_RESERVE_BLOCKS         1027
-#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
- * Debug code
- */
-#ifdef EXT3FS_DEBUG
-#define ext3_debug(f, a...)						\
-	do {								\
-		printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:",	\
-			__FILE__, __LINE__, __func__);		\
-		printk (KERN_DEBUG f, ## a);				\
-	} while (0)
-#else
-#define ext3_debug(f, a...)	do {} while (0)
-#endif
-
-/*
- * Special inodes numbers
- */
-#define	EXT3_BAD_INO		 1	/* Bad blocks inode */
-#define EXT3_ROOT_INO		 2	/* Root inode */
-#define EXT3_BOOT_LOADER_INO	 5	/* Boot loader inode */
-#define EXT3_UNDEL_DIR_INO	 6	/* Undelete directory inode */
-#define EXT3_RESIZE_INO		 7	/* Reserved group descriptors inode */
-#define EXT3_JOURNAL_INO	 8	/* Journal inode */
-
-/* First non-reserved inode for old ext3 filesystems */
-#define EXT3_GOOD_OLD_FIRST_INO	11
-
-/*
- * Maximal count of links to a file
- */
-#define EXT3_LINK_MAX		32000
-
-/*
- * Macro-instructions used to manage several block sizes
- */
-#define EXT3_MIN_BLOCK_SIZE		1024
-#define	EXT3_MAX_BLOCK_SIZE		65536
-#define EXT3_MIN_BLOCK_LOG_SIZE		10
-#define EXT3_BLOCK_SIZE(s)		((s)->s_blocksize)
-#define	EXT3_ADDR_PER_BLOCK(s)		(EXT3_BLOCK_SIZE(s) / sizeof (__u32))
-#define EXT3_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
-#define	EXT3_ADDR_PER_BLOCK_BITS(s)	(EXT3_SB(s)->s_addr_per_block_bits)
-#define EXT3_INODE_SIZE(s)		(EXT3_SB(s)->s_inode_size)
-#define EXT3_FIRST_INO(s)		(EXT3_SB(s)->s_first_ino)
-
-/*
- * Macro-instructions used to manage fragments
- */
-#define EXT3_MIN_FRAG_SIZE		1024
-#define	EXT3_MAX_FRAG_SIZE		4096
-#define EXT3_MIN_FRAG_LOG_SIZE		  10
-#define EXT3_FRAG_SIZE(s)		(EXT3_SB(s)->s_frag_size)
-#define EXT3_FRAGS_PER_BLOCK(s)		(EXT3_SB(s)->s_frags_per_block)
-
-/*
- * Structure of a blocks group descriptor
- */
-struct ext3_group_desc
-{
-	__le32	bg_block_bitmap;		/* Blocks bitmap block */
-	__le32	bg_inode_bitmap;		/* Inodes bitmap block */
-	__le32	bg_inode_table;		/* Inodes table block */
-	__le16	bg_free_blocks_count;	/* Free blocks count */
-	__le16	bg_free_inodes_count;	/* Free inodes count */
-	__le16	bg_used_dirs_count;	/* Directories count */
-	__u16	bg_pad;
-	__le32	bg_reserved[3];
-};
-
-/*
- * Macro-instructions used to manage group descriptors
- */
-#define EXT3_BLOCKS_PER_GROUP(s)	(EXT3_SB(s)->s_blocks_per_group)
-#define EXT3_DESC_PER_BLOCK(s)		(EXT3_SB(s)->s_desc_per_block)
-#define EXT3_INODES_PER_GROUP(s)	(EXT3_SB(s)->s_inodes_per_group)
-#define EXT3_DESC_PER_BLOCK_BITS(s)	(EXT3_SB(s)->s_desc_per_block_bits)
-
-/*
- * Constants relative to the data blocks
- */
-#define	EXT3_NDIR_BLOCKS		12
-#define	EXT3_IND_BLOCK			EXT3_NDIR_BLOCKS
-#define	EXT3_DIND_BLOCK			(EXT3_IND_BLOCK + 1)
-#define	EXT3_TIND_BLOCK			(EXT3_DIND_BLOCK + 1)
-#define	EXT3_N_BLOCKS			(EXT3_TIND_BLOCK + 1)
-
-/*
- * Inode flags
- */
-#define	EXT3_SECRM_FL			0x00000001 /* Secure deletion */
-#define	EXT3_UNRM_FL			0x00000002 /* Undelete */
-#define	EXT3_COMPR_FL			0x00000004 /* Compress file */
-#define EXT3_SYNC_FL			0x00000008 /* Synchronous updates */
-#define EXT3_IMMUTABLE_FL		0x00000010 /* Immutable file */
-#define EXT3_APPEND_FL			0x00000020 /* writes to file may only append */
-#define EXT3_NODUMP_FL			0x00000040 /* do not dump file */
-#define EXT3_NOATIME_FL			0x00000080 /* do not update atime */
-/* Reserved for compression usage... */
-#define EXT3_DIRTY_FL			0x00000100
-#define EXT3_COMPRBLK_FL		0x00000200 /* One or more compressed clusters */
-#define EXT3_NOCOMPR_FL			0x00000400 /* Don't compress */
-#define EXT3_ECOMPR_FL			0x00000800 /* Compression error */
-/* End compression flags --- maybe not all used */
-#define EXT3_INDEX_FL			0x00001000 /* hash-indexed directory */
-#define EXT3_IMAGIC_FL			0x00002000 /* AFS directory */
-#define EXT3_JOURNAL_DATA_FL		0x00004000 /* file data should be journaled */
-#define EXT3_NOTAIL_FL			0x00008000 /* file tail should not be merged */
-#define EXT3_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
-#define EXT3_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
-#define EXT3_RESERVED_FL		0x80000000 /* reserved for ext3 lib */
-
-#define EXT3_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
-#define EXT3_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
-
-/* Flags that should be inherited by new inodes from their parent. */
-#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
-			   EXT3_SYNC_FL | EXT3_NODUMP_FL |\
-			   EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
-			   EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
-			   EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
-
-/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
-
-/* Flags that are appropriate for non-directories/regular files. */
-#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
-
-/* Mask out flags that are inappropriate for the given type of inode. */
-static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
-{
-	if (S_ISDIR(mode))
-		return flags;
-	else if (S_ISREG(mode))
-		return flags & EXT3_REG_FLMASK;
-	else
-		return flags & EXT3_OTHER_FLMASK;
-}
-
-/* Used to pass group descriptor data when online resize is done */
-struct ext3_new_group_input {
-	__u32 group;            /* Group number for this data */
-	__u32 block_bitmap;     /* Absolute block number of block bitmap */
-	__u32 inode_bitmap;     /* Absolute block number of inode bitmap */
-	__u32 inode_table;      /* Absolute block number of inode table start */
-	__u32 blocks_count;     /* Total number of blocks in this group */
-	__u16 reserved_blocks;  /* Number of reserved blocks in this group */
-	__u16 unused;
-};
-
-/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
-struct ext3_new_group_data {
-	__u32 group;
-	__u32 block_bitmap;
-	__u32 inode_bitmap;
-	__u32 inode_table;
-	__u32 blocks_count;
-	__u16 reserved_blocks;
-	__u16 unused;
-	__u32 free_blocks_count;
-};
-
-
-/*
- * ioctl commands
- */
-#define	EXT3_IOC_GETFLAGS		FS_IOC_GETFLAGS
-#define	EXT3_IOC_SETFLAGS		FS_IOC_SETFLAGS
-#define	EXT3_IOC_GETVERSION		_IOR('f', 3, long)
-#define	EXT3_IOC_SETVERSION		_IOW('f', 4, long)
-#define EXT3_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
-#define EXT3_IOC_GROUP_ADD		_IOW('f', 8,struct ext3_new_group_input)
-#define	EXT3_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
-#define	EXT3_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
-#ifdef CONFIG_JBD_DEBUG
-#define EXT3_IOC_WAIT_FOR_READONLY	_IOR('f', 99, long)
-#endif
-#define EXT3_IOC_GETRSVSZ		_IOR('f', 5, long)
-#define EXT3_IOC_SETRSVSZ		_IOW('f', 6, long)
-
-/*
- * ioctl commands in 32 bit emulation
- */
-#define EXT3_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
-#define EXT3_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
-#define EXT3_IOC32_GETVERSION		_IOR('f', 3, int)
-#define EXT3_IOC32_SETVERSION		_IOW('f', 4, int)
-#define EXT3_IOC32_GETRSVSZ		_IOR('f', 5, int)
-#define EXT3_IOC32_SETRSVSZ		_IOW('f', 6, int)
-#define EXT3_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
-#ifdef CONFIG_JBD_DEBUG
-#define EXT3_IOC32_WAIT_FOR_READONLY	_IOR('f', 99, int)
-#endif
-#define EXT3_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
-#define EXT3_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
-
-/* Number of supported quota types */
-#define EXT3_MAXQUOTAS 2
-
-/*
- *  Mount options
- */
-struct ext3_mount_options {
-	unsigned long s_mount_opt;
-	kuid_t s_resuid;
-	kgid_t s_resgid;
-	unsigned long s_commit_interval;
-#ifdef CONFIG_QUOTA
-	int s_jquota_fmt;
-	char *s_qf_names[EXT3_MAXQUOTAS];
-#endif
-};
-
-/*
- * Structure of an inode on the disk
- */
-struct ext3_inode {
-	__le16	i_mode;		/* File mode */
-	__le16	i_uid;		/* Low 16 bits of Owner Uid */
-	__le32	i_size;		/* Size in bytes */
-	__le32	i_atime;	/* Access time */
-	__le32	i_ctime;	/* Creation time */
-	__le32	i_mtime;	/* Modification time */
-	__le32	i_dtime;	/* Deletion Time */
-	__le16	i_gid;		/* Low 16 bits of Group Id */
-	__le16	i_links_count;	/* Links count */
-	__le32	i_blocks;	/* Blocks count */
-	__le32	i_flags;	/* File flags */
-	union {
-		struct {
-			__u32  l_i_reserved1;
-		} linux1;
-		struct {
-			__u32  h_i_translator;
-		} hurd1;
-		struct {
-			__u32  m_i_reserved1;
-		} masix1;
-	} osd1;				/* OS dependent 1 */
-	__le32	i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
-	__le32	i_generation;	/* File version (for NFS) */
-	__le32	i_file_acl;	/* File ACL */
-	__le32	i_dir_acl;	/* Directory ACL */
-	__le32	i_faddr;	/* Fragment address */
-	union {
-		struct {
-			__u8	l_i_frag;	/* Fragment number */
-			__u8	l_i_fsize;	/* Fragment size */
-			__u16	i_pad1;
-			__le16	l_i_uid_high;	/* these 2 fields    */
-			__le16	l_i_gid_high;	/* were reserved2[0] */
-			__u32	l_i_reserved2;
-		} linux2;
-		struct {
-			__u8	h_i_frag;	/* Fragment number */
-			__u8	h_i_fsize;	/* Fragment size */
-			__u16	h_i_mode_high;
-			__u16	h_i_uid_high;
-			__u16	h_i_gid_high;
-			__u32	h_i_author;
-		} hurd2;
-		struct {
-			__u8	m_i_frag;	/* Fragment number */
-			__u8	m_i_fsize;	/* Fragment size */
-			__u16	m_pad1;
-			__u32	m_i_reserved2[2];
-		} masix2;
-	} osd2;				/* OS dependent 2 */
-	__le16	i_extra_isize;
-	__le16	i_pad1;
-};
-
-#define i_size_high	i_dir_acl
-
-#define i_reserved1	osd1.linux1.l_i_reserved1
-#define i_frag		osd2.linux2.l_i_frag
-#define i_fsize		osd2.linux2.l_i_fsize
-#define i_uid_low	i_uid
-#define i_gid_low	i_gid
-#define i_uid_high	osd2.linux2.l_i_uid_high
-#define i_gid_high	osd2.linux2.l_i_gid_high
-#define i_reserved2	osd2.linux2.l_i_reserved2
-
-/*
- * File system states
- */
-#define	EXT3_VALID_FS			0x0001	/* Unmounted cleanly */
-#define	EXT3_ERROR_FS			0x0002	/* Errors detected */
-#define	EXT3_ORPHAN_FS			0x0004	/* Orphans being recovered */
-
-/*
- * Misc. filesystem flags
- */
-#define EXT2_FLAGS_SIGNED_HASH		0x0001  /* Signed dirhash in use */
-#define EXT2_FLAGS_UNSIGNED_HASH	0x0002  /* Unsigned dirhash in use */
-#define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
-
-/*
- * Mount flags
- */
-#define EXT3_MOUNT_CHECK		0x00001	/* Do mount-time checks */
-/* EXT3_MOUNT_OLDALLOC was there */
-#define EXT3_MOUNT_GRPID		0x00004	/* Create files with directory's group */
-#define EXT3_MOUNT_DEBUG		0x00008	/* Some debugging messages */
-#define EXT3_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
-#define EXT3_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
-#define EXT3_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
-#define EXT3_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
-#define EXT3_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
-#define EXT3_MOUNT_ABORT		0x00200	/* Fatal error detected */
-#define EXT3_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
-#define EXT3_MOUNT_JOURNAL_DATA		0x00400	/* Write data to journal */
-#define EXT3_MOUNT_ORDERED_DATA		0x00800	/* Flush data before commit */
-#define EXT3_MOUNT_WRITEBACK_DATA	0x00C00	/* No data ordering */
-#define EXT3_MOUNT_UPDATE_JOURNAL	0x01000	/* Update the journal format */
-#define EXT3_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
-#define EXT3_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
-#define EXT3_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
-#define EXT3_MOUNT_RESERVATION		0x10000	/* Preallocation */
-#define EXT3_MOUNT_BARRIER		0x20000 /* Use block barriers */
-#define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
-#define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
-#define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
-#define EXT3_MOUNT_DATA_ERR_ABORT	0x400000 /* Abort on file data write
-						  * error in ordered mode */
-
-/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
-#ifndef _LINUX_EXT2_FS_H
-#define clear_opt(o, opt)		o &= ~EXT3_MOUNT_##opt
-#define set_opt(o, opt)			o |= EXT3_MOUNT_##opt
-#define test_opt(sb, opt)		(EXT3_SB(sb)->s_mount_opt & \
-					 EXT3_MOUNT_##opt)
-#else
-#define EXT2_MOUNT_NOLOAD		EXT3_MOUNT_NOLOAD
-#define EXT2_MOUNT_ABORT		EXT3_MOUNT_ABORT
-#define EXT2_MOUNT_DATA_FLAGS		EXT3_MOUNT_DATA_FLAGS
-#endif
-
-#define ext3_set_bit			__set_bit_le
-#define ext3_set_bit_atomic		ext2_set_bit_atomic
-#define ext3_clear_bit			__clear_bit_le
-#define ext3_clear_bit_atomic		ext2_clear_bit_atomic
-#define ext3_test_bit			test_bit_le
-#define ext3_find_next_zero_bit		find_next_zero_bit_le
-
-/*
- * Maximal mount counts between two filesystem checks
- */
-#define EXT3_DFL_MAX_MNT_COUNT		20	/* Allow 20 mounts */
-#define EXT3_DFL_CHECKINTERVAL		0	/* Don't use interval check */
-
-/*
- * Behaviour when detecting errors
- */
-#define EXT3_ERRORS_CONTINUE		1	/* Continue execution */
-#define EXT3_ERRORS_RO			2	/* Remount fs read-only */
-#define EXT3_ERRORS_PANIC		3	/* Panic */
-#define EXT3_ERRORS_DEFAULT		EXT3_ERRORS_CONTINUE
-
-/*
- * Structure of the super block
- */
-struct ext3_super_block {
-/*00*/	__le32	s_inodes_count;		/* Inodes count */
-	__le32	s_blocks_count;		/* Blocks count */
-	__le32	s_r_blocks_count;	/* Reserved blocks count */
-	__le32	s_free_blocks_count;	/* Free blocks count */
-/*10*/	__le32	s_free_inodes_count;	/* Free inodes count */
-	__le32	s_first_data_block;	/* First Data Block */
-	__le32	s_log_block_size;	/* Block size */
-	__le32	s_log_frag_size;	/* Fragment size */
-/*20*/	__le32	s_blocks_per_group;	/* # Blocks per group */
-	__le32	s_frags_per_group;	/* # Fragments per group */
-	__le32	s_inodes_per_group;	/* # Inodes per group */
-	__le32	s_mtime;		/* Mount time */
-/*30*/	__le32	s_wtime;		/* Write time */
-	__le16	s_mnt_count;		/* Mount count */
-	__le16	s_max_mnt_count;	/* Maximal mount count */
-	__le16	s_magic;		/* Magic signature */
-	__le16	s_state;		/* File system state */
-	__le16	s_errors;		/* Behaviour when detecting errors */
-	__le16	s_minor_rev_level;	/* minor revision level */
-/*40*/	__le32	s_lastcheck;		/* time of last check */
-	__le32	s_checkinterval;	/* max. time between checks */
-	__le32	s_creator_os;		/* OS */
-	__le32	s_rev_level;		/* Revision level */
-/*50*/	__le16	s_def_resuid;		/* Default uid for reserved blocks */
-	__le16	s_def_resgid;		/* Default gid for reserved blocks */
-	/*
-	 * These fields are for EXT3_DYNAMIC_REV superblocks only.
-	 *
-	 * Note: the difference between the compatible feature set and
-	 * the incompatible feature set is that if there is a bit set
-	 * in the incompatible feature set that the kernel doesn't
-	 * know about, it should refuse to mount the filesystem.
-	 *
-	 * e2fsck's requirements are more strict; if it doesn't know
-	 * about a feature in either the compatible or incompatible
-	 * feature set, it must abort and not try to meddle with
-	 * things it doesn't understand...
-	 */
-	__le32	s_first_ino;		/* First non-reserved inode */
-	__le16   s_inode_size;		/* size of inode structure */
-	__le16	s_block_group_nr;	/* block group # of this superblock */
-	__le32	s_feature_compat;	/* compatible feature set */
-/*60*/	__le32	s_feature_incompat;	/* incompatible feature set */
-	__le32	s_feature_ro_compat;	/* readonly-compatible feature set */
-/*68*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
-/*78*/	char	s_volume_name[16];	/* volume name */
-/*88*/	char	s_last_mounted[64];	/* directory where last mounted */
-/*C8*/	__le32	s_algorithm_usage_bitmap; /* For compression */
-	/*
-	 * Performance hints.  Directory preallocation should only
-	 * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
-	 */
-	__u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
-	__u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
-	__le16	s_reserved_gdt_blocks;	/* Per group desc for online growth */
-	/*
-	 * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
-	 */
-/*D0*/	__u8	s_journal_uuid[16];	/* uuid of journal superblock */
-/*E0*/	__le32	s_journal_inum;		/* inode number of journal file */
-	__le32	s_journal_dev;		/* device number of journal file */
-	__le32	s_last_orphan;		/* start of list of inodes to delete */
-	__le32	s_hash_seed[4];		/* HTREE hash seed */
-	__u8	s_def_hash_version;	/* Default hash version to use */
-	__u8	s_reserved_char_pad;
-	__u16	s_reserved_word_pad;
-	__le32	s_default_mount_opts;
-	__le32	s_first_meta_bg;	/* First metablock block group */
-	__le32	s_mkfs_time;		/* When the filesystem was created */
-	__le32	s_jnl_blocks[17];	/* Backup of the journal inode */
-	/* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
-/*150*/	__le32	s_blocks_count_hi;	/* Blocks count */
-	__le32	s_r_blocks_count_hi;	/* Reserved blocks count */
-	__le32	s_free_blocks_count_hi;	/* Free blocks count */
-	__le16	s_min_extra_isize;	/* All inodes have at least # bytes */
-	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
-	__le32	s_flags;		/* Miscellaneous flags */
-	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
-	__le64  s_mmp_block;            /* Block for multi-mount protection */
-	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
-	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
-	__u8	s_reserved_char_pad2;
-	__le16  s_reserved_pad;
-	__u32   s_reserved[162];        /* Padding to the end of the block */
-};
-
-/* data type for block offset of block group */
-typedef int ext3_grpblk_t;
-
-/* data type for filesystem-wide blocks number */
-typedef unsigned long ext3_fsblk_t;
-
-#define E3FSBLK "%lu"
-
-struct ext3_reserve_window {
-	ext3_fsblk_t	_rsv_start;	/* First byte reserved */
-	ext3_fsblk_t	_rsv_end;	/* Last byte reserved or 0 */
-};
-
-struct ext3_reserve_window_node {
-	struct rb_node		rsv_node;
-	__u32			rsv_goal_size;
-	__u32			rsv_alloc_hit;
-	struct ext3_reserve_window	rsv_window;
-};
-
-struct ext3_block_alloc_info {
-	/* information about reservation window */
-	struct ext3_reserve_window_node	rsv_window_node;
-	/*
-	 * was i_next_alloc_block in ext3_inode_info
-	 * is the logical (file-relative) number of the
-	 * most-recently-allocated block in this file.
-	 * We use this for detecting linearly ascending allocation requests.
-	 */
-	__u32                   last_alloc_logical_block;
-	/*
-	 * Was i_next_alloc_goal in ext3_inode_info
-	 * is the *physical* companion to i_next_alloc_block.
-	 * it the physical block number of the block which was most-recentl
-	 * allocated to this file.  This give us the goal (target) for the next
-	 * allocation when we detect linearly ascending requests.
-	 */
-	ext3_fsblk_t		last_alloc_physical_block;
-};
-
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
-/*
- * third extended file system inode data in memory
- */
-struct ext3_inode_info {
-	__le32	i_data[15];	/* unconverted */
-	__u32	i_flags;
-#ifdef EXT3_FRAGMENTS
-	__u32	i_faddr;
-	__u8	i_frag_no;
-	__u8	i_frag_size;
-#endif
-	ext3_fsblk_t	i_file_acl;
-	__u32	i_dir_acl;
-	__u32	i_dtime;
-
-	/*
-	 * i_block_group is the number of the block group which contains
-	 * this file's inode.  Constant across the lifetime of the inode,
-	 * it is ued for making block allocation decisions - we try to
-	 * place a file's data blocks near its inode block, and new inodes
-	 * near to their parent directory's inode.
-	 */
-	__u32	i_block_group;
-	unsigned long	i_state_flags;	/* Dynamic state flags for ext3 */
-
-	/* block reservation info */
-	struct ext3_block_alloc_info *i_block_alloc_info;
-
-	__u32	i_dir_start_lookup;
-#ifdef CONFIG_EXT3_FS_XATTR
-	/*
-	 * Extended attributes can be read independently of the main file
-	 * data. Taking i_mutex even when reading would cause contention
-	 * between readers of EAs and writers of regular file data, so
-	 * instead we synchronize on xattr_sem when reading or changing
-	 * EAs.
-	 */
-	struct rw_semaphore xattr_sem;
-#endif
-
-	struct list_head i_orphan;	/* unlinked but open inodes */
-
-	/*
-	 * i_disksize keeps track of what the inode size is ON DISK, not
-	 * in memory.  During truncate, i_size is set to the new size by
-	 * the VFS prior to calling ext3_truncate(), but the filesystem won't
-	 * set i_disksize to 0 until the truncate is actually under way.
-	 *
-	 * The intent is that i_disksize always represents the blocks which
-	 * are used by this file.  This allows recovery to restart truncate
-	 * on orphans if we crash during truncate.  We actually write i_disksize
-	 * into the on-disk inode when writing inodes out, instead of i_size.
-	 *
-	 * The only time when i_disksize and i_size may be different is when
-	 * a truncate is in progress.  The only things which change i_disksize
-	 * are ext3_get_block (growth) and ext3_truncate (shrinkth).
-	 */
-	loff_t	i_disksize;
-
-	/* on-disk additional length */
-	__u16 i_extra_isize;
-
-	/*
-	 * truncate_mutex is for serialising ext3_truncate() against
-	 * ext3_getblock().  In the 2.4 ext2 design, great chunks of inode's
-	 * data tree are chopped off during truncate. We can't do that in
-	 * ext3 because whenever we perform intermediate commits during
-	 * truncate, the inode and all the metadata blocks *must* be in a
-	 * consistent state which allows truncation of the orphans to restart
-	 * during recovery.  Hence we must fix the get_block-vs-truncate race
-	 * by other means, so we have truncate_mutex.
-	 */
-	struct mutex truncate_mutex;
-
-	/*
-	 * Transactions that contain inode's metadata needed to complete
-	 * fsync and fdatasync, respectively.
-	 */
-	atomic_t i_sync_tid;
-	atomic_t i_datasync_tid;
-
-#ifdef CONFIG_QUOTA
-	struct dquot *i_dquot[MAXQUOTAS];
-#endif
-
-	struct inode vfs_inode;
-};
-
-/*
- * third extended-fs super-block data in memory
- */
-struct ext3_sb_info {
-	unsigned long s_frag_size;	/* Size of a fragment in bytes */
-	unsigned long s_frags_per_block;/* Number of fragments per block */
-	unsigned long s_inodes_per_block;/* Number of inodes per block */
-	unsigned long s_frags_per_group;/* Number of fragments in a group */
-	unsigned long s_blocks_per_group;/* Number of blocks in a group */
-	unsigned long s_inodes_per_group;/* Number of inodes in a group */
-	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
-	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
-	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
-	unsigned long s_groups_count;	/* Number of groups in the fs */
-	unsigned long s_overhead_last;  /* Last calculated overhead */
-	unsigned long s_blocks_last;    /* Last seen block count */
-	struct buffer_head * s_sbh;	/* Buffer containing the super block */
-	struct ext3_super_block * s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head ** s_group_desc;
-	unsigned long  s_mount_opt;
-	ext3_fsblk_t s_sb_block;
-	kuid_t s_resuid;
-	kgid_t s_resgid;
-	unsigned short s_mount_state;
-	unsigned short s_pad;
-	int s_addr_per_block_bits;
-	int s_desc_per_block_bits;
-	int s_inode_size;
-	int s_first_ino;
-	spinlock_t s_next_gen_lock;
-	u32 s_next_generation;
-	u32 s_hash_seed[4];
-	int s_def_hash_version;
-	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
-	struct percpu_counter s_freeblocks_counter;
-	struct percpu_counter s_freeinodes_counter;
-	struct percpu_counter s_dirs_counter;
-	struct blockgroup_lock *s_blockgroup_lock;
-
-	/* root of the per fs reservation window tree */
-	spinlock_t s_rsv_window_lock;
-	struct rb_root s_rsv_window_root;
-	struct ext3_reserve_window_node s_rsv_window_head;
-
-	/* Journaling */
-	struct inode * s_journal_inode;
-	struct journal_s * s_journal;
-	struct list_head s_orphan;
-	struct mutex s_orphan_lock;
-	struct mutex s_resize_lock;
-	unsigned long s_commit_interval;
-	struct block_device *journal_bdev;
-#ifdef CONFIG_QUOTA
-	char *s_qf_names[EXT3_MAXQUOTAS];	/* Names of quota files with journalled quota */
-	int s_jquota_fmt;			/* Format of quota to use */
-#endif
-};
-
-static inline spinlock_t *
-sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
-{
-	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
-
-static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
-{
-	return container_of(inode, struct ext3_inode_info, vfs_inode);
-}
-
-static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
-{
-	return ino == EXT3_ROOT_INO ||
-		ino == EXT3_JOURNAL_INO ||
-		ino == EXT3_RESIZE_INO ||
-		(ino >= EXT3_FIRST_INO(sb) &&
-		 ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
-}
-
-/*
- * Inode dynamic state flags
- */
-enum {
-	EXT3_STATE_JDATA,		/* journaled data exists */
-	EXT3_STATE_NEW,			/* inode is newly created */
-	EXT3_STATE_XATTR,		/* has in-inode xattrs */
-	EXT3_STATE_FLUSH_ON_CLOSE,	/* flush dirty pages on close */
-};
-
-static inline int ext3_test_inode_state(struct inode *inode, int bit)
-{
-	return test_bit(bit, &EXT3_I(inode)->i_state_flags);
-}
-
-static inline void ext3_set_inode_state(struct inode *inode, int bit)
-{
-	set_bit(bit, &EXT3_I(inode)->i_state_flags);
-}
-
-static inline void ext3_clear_inode_state(struct inode *inode, int bit)
-{
-	clear_bit(bit, &EXT3_I(inode)->i_state_flags);
-}
-
-#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
-
-/*
- * Codes for operating systems
- */
-#define EXT3_OS_LINUX		0
-#define EXT3_OS_HURD		1
-#define EXT3_OS_MASIX		2
-#define EXT3_OS_FREEBSD		3
-#define EXT3_OS_LITES		4
-
-/*
- * Revision levels
- */
-#define EXT3_GOOD_OLD_REV	0	/* The good old (original) format */
-#define EXT3_DYNAMIC_REV	1	/* V2 format w/ dynamic inode sizes */
-
-#define EXT3_CURRENT_REV	EXT3_GOOD_OLD_REV
-#define EXT3_MAX_SUPP_REV	EXT3_DYNAMIC_REV
-
-#define EXT3_GOOD_OLD_INODE_SIZE 128
-
-/*
- * Feature set definitions
- */
-
-#define EXT3_HAS_COMPAT_FEATURE(sb,mask)			\
-	( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
-#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask)			\
-	( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
-#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask)			\
-	( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
-#define EXT3_SET_COMPAT_FEATURE(sb,mask)			\
-	EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
-#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask)			\
-	EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
-#define EXT3_SET_INCOMPAT_FEATURE(sb,mask)			\
-	EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
-#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask)			\
-	EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
-#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
-	EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
-#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
-	EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
-
-#define EXT3_FEATURE_COMPAT_DIR_PREALLOC	0x0001
-#define EXT3_FEATURE_COMPAT_IMAGIC_INODES	0x0002
-#define EXT3_FEATURE_COMPAT_HAS_JOURNAL		0x0004
-#define EXT3_FEATURE_COMPAT_EXT_ATTR		0x0008
-#define EXT3_FEATURE_COMPAT_RESIZE_INODE	0x0010
-#define EXT3_FEATURE_COMPAT_DIR_INDEX		0x0020
-
-#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
-#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
-#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
-
-#define EXT3_FEATURE_INCOMPAT_COMPRESSION	0x0001
-#define EXT3_FEATURE_INCOMPAT_FILETYPE		0x0002
-#define EXT3_FEATURE_INCOMPAT_RECOVER		0x0004 /* Needs recovery */
-#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008 /* Journal device */
-#define EXT3_FEATURE_INCOMPAT_META_BG		0x0010
-
-#define EXT3_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
-#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT3_FEATURE_INCOMPAT_FILETYPE| \
-					 EXT3_FEATURE_INCOMPAT_RECOVER| \
-					 EXT3_FEATURE_INCOMPAT_META_BG)
-#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
-					 EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
-					 EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
-
-/*
- * Default values for user and/or group using reserved blocks
- */
-#define	EXT3_DEF_RESUID		0
-#define	EXT3_DEF_RESGID		0
-
-/*
- * Default mount options
- */
-#define EXT3_DEFM_DEBUG		0x0001
-#define EXT3_DEFM_BSDGROUPS	0x0002
-#define EXT3_DEFM_XATTR_USER	0x0004
-#define EXT3_DEFM_ACL		0x0008
-#define EXT3_DEFM_UID16		0x0010
-#define EXT3_DEFM_JMODE		0x0060
-#define EXT3_DEFM_JMODE_DATA	0x0020
-#define EXT3_DEFM_JMODE_ORDERED	0x0040
-#define EXT3_DEFM_JMODE_WBACK	0x0060
-
-/*
- * Structure of a directory entry
- */
-#define EXT3_NAME_LEN 255
-
-struct ext3_dir_entry {
-	__le32	inode;			/* Inode number */
-	__le16	rec_len;		/* Directory entry length */
-	__le16	name_len;		/* Name length */
-	char	name[EXT3_NAME_LEN];	/* File name */
-};
-
-/*
- * The new version of the directory entry.  Since EXT3 structures are
- * stored in intel byte order, and the name_len field could never be
- * bigger than 255 chars, it's safe to reclaim the extra byte for the
- * file_type field.
- */
-struct ext3_dir_entry_2 {
-	__le32	inode;			/* Inode number */
-	__le16	rec_len;		/* Directory entry length */
-	__u8	name_len;		/* Name length */
-	__u8	file_type;
-	char	name[EXT3_NAME_LEN];	/* File name */
-};
-
-/*
- * Ext3 directory file types.  Only the low 3 bits are used.  The
- * other bits are reserved for now.
- */
-#define EXT3_FT_UNKNOWN		0
-#define EXT3_FT_REG_FILE	1
-#define EXT3_FT_DIR		2
-#define EXT3_FT_CHRDEV		3
-#define EXT3_FT_BLKDEV		4
-#define EXT3_FT_FIFO		5
-#define EXT3_FT_SOCK		6
-#define EXT3_FT_SYMLINK		7
-
-#define EXT3_FT_MAX		8
-
-/*
- * EXT3_DIR_PAD defines the directory entries boundaries
- *
- * NOTE: It must be a multiple of 4
- */
-#define EXT3_DIR_PAD			4
-#define EXT3_DIR_ROUND			(EXT3_DIR_PAD - 1)
-#define EXT3_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT3_DIR_ROUND) & \
-					 ~EXT3_DIR_ROUND)
-#define EXT3_MAX_REC_LEN		((1<<16)-1)
-
-/*
- * Tests against MAX_REC_LEN etc were put in place for 64k block
- * sizes; if that is not possible on this arch, we can skip
- * those tests and speed things up.
- */
-static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
-{
-	unsigned len = le16_to_cpu(dlen);
-
-#if (PAGE_CACHE_SIZE >= 65536)
-	if (len == EXT3_MAX_REC_LEN)
-		return 1 << 16;
-#endif
-	return len;
-}
-
-static inline __le16 ext3_rec_len_to_disk(unsigned len)
-{
-#if (PAGE_CACHE_SIZE >= 65536)
-	if (len == (1 << 16))
-		return cpu_to_le16(EXT3_MAX_REC_LEN);
-	else if (len > (1 << 16))
-		BUG();
-#endif
-	return cpu_to_le16(len);
-}
-
-/*
- * Hash Tree Directory indexing
- * (c) Daniel Phillips, 2001
- */
-
-#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-				      EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-		      (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-
-/* Legal values for the dx_root hash_version field: */
-
-#define DX_HASH_LEGACY		0
-#define DX_HASH_HALF_MD4	1
-#define DX_HASH_TEA		2
-#define DX_HASH_LEGACY_UNSIGNED	3
-#define DX_HASH_HALF_MD4_UNSIGNED	4
-#define DX_HASH_TEA_UNSIGNED		5
-
-/* hash info structure used by the directory hash */
-struct dx_hash_info
-{
-	u32		hash;
-	u32		minor_hash;
-	int		hash_version;
-	u32		*seed;
-};
-
-
-/* 32 and 64 bit signed EOF for dx directories */
-#define EXT3_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
-#define EXT3_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)
-
-
-/*
- * Control parameters used by ext3_htree_next_block
- */
-#define HASH_NB_ALWAYS		1
-
-
-/*
- * Describe an inode's exact location on disk and in memory
- */
-struct ext3_iloc
-{
-	struct buffer_head *bh;
-	unsigned long offset;
-	unsigned long block_group;
-};
-
-static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
-{
-	return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
-}
-
-/*
- * This structure is stuffed into the struct file's private_data field
- * for directories.  It is where we put information so that we can do
- * readdir operations in hash tree order.
- */
-struct dir_private_info {
-	struct rb_root	root;
-	struct rb_node	*curr_node;
-	struct fname	*extra_fname;
-	loff_t		last_pos;
-	__u32		curr_hash;
-	__u32		curr_minor_hash;
-	__u32		next_hash;
-};
-
-/* calculate the first block number of the group */
-static inline ext3_fsblk_t
-ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
-{
-	return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
-		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
-}
-
-/*
- * Special error return code only used by dx_probe() and its callers.
- */
-#define ERR_BAD_DX_DIR	-75000
-
-/*
- * Function prototypes
- */
-
-/*
- * Ok, these declarations are also in <linux/kernel.h> but none of the
- * ext3 source programs needs to include it so they are duplicated here.
- */
-# define NORET_TYPE    /**/
-# define ATTRIB_NORET  __attribute__((noreturn))
-# define NORET_AND     noreturn,
-
-/* balloc.c */
-extern int ext3_bg_has_super(struct super_block *sb, int group);
-extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, int *errp);
-extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, unsigned long *count, int *errp);
-extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
-			ext3_fsblk_t block, unsigned long count);
-extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
-				 ext3_fsblk_t block, unsigned long count,
-				unsigned long *pdquot_freed_blocks);
-extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
-extern void ext3_check_blocks_bitmap (struct super_block *);
-extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
-						    unsigned int block_group,
-						    struct buffer_head ** bh);
-extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
-extern void ext3_init_block_alloc_info(struct inode *);
-extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
-extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
-
-/* dir.c */
-extern int ext3_check_dir_entry(const char *, struct inode *,
-				struct ext3_dir_entry_2 *,
-				struct buffer_head *, unsigned long);
-extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
-				    __u32 minor_hash,
-				    struct ext3_dir_entry_2 *dirent);
-extern void ext3_htree_free_dir_info(struct dir_private_info *p);
-
-/* fsync.c */
-extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
-
-/* hash.c */
-extern int ext3fs_dirhash(const char *name, int len, struct
-			  dx_hash_info *hinfo);
-
-/* ialloc.c */
-extern struct inode * ext3_new_inode (handle_t *, struct inode *,
-				      const struct qstr *, umode_t);
-extern void ext3_free_inode (handle_t *, struct inode *);
-extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
-extern unsigned long ext3_count_free_inodes (struct super_block *);
-extern unsigned long ext3_count_dirs (struct super_block *);
-extern void ext3_check_inodes_bitmap (struct super_block *);
-extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
-
-
-/* inode.c */
-int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-		struct buffer_head *bh, ext3_fsblk_t blocknr);
-struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
-struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
-int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
-	sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
-	int create);
-
-extern struct inode *ext3_iget(struct super_block *, unsigned long);
-extern int  ext3_write_inode (struct inode *, struct writeback_control *);
-extern int  ext3_setattr (struct dentry *, struct iattr *);
-extern void ext3_evict_inode (struct inode *);
-extern int  ext3_sync_inode (handle_t *, struct inode *);
-extern void ext3_discard_reservation (struct inode *);
-extern void ext3_dirty_inode(struct inode *, int);
-extern int ext3_change_inode_journal_flag(struct inode *, int);
-extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
-extern int ext3_can_truncate(struct inode *inode);
-extern void ext3_truncate(struct inode *inode);
-extern void ext3_set_inode_flags(struct inode *);
-extern void ext3_get_inode_flags(struct ext3_inode_info *);
-extern void ext3_set_aops(struct inode *inode);
-extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		       u64 start, u64 len);
-
-/* ioctl.c */
-extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
-extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
-
-/* namei.c */
-extern int ext3_orphan_add(handle_t *, struct inode *);
-extern int ext3_orphan_del(handle_t *, struct inode *);
-extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
-				__u32 start_minor_hash, __u32 *next_hash);
-
-/* resize.c */
-extern int ext3_group_add(struct super_block *sb,
-				struct ext3_new_group_data *input);
-extern int ext3_group_extend(struct super_block *sb,
-				struct ext3_super_block *es,
-				ext3_fsblk_t n_blocks_count);
-
-/* super.c */
-extern __printf(3, 4)
-void ext3_error(struct super_block *, const char *, const char *, ...);
-extern void __ext3_std_error (struct super_block *, const char *, int);
-extern __printf(3, 4)
-void ext3_abort(struct super_block *, const char *, const char *, ...);
-extern __printf(3, 4)
-void ext3_warning(struct super_block *, const char *, const char *, ...);
-extern __printf(3, 4)
-void ext3_msg(struct super_block *, const char *, const char *, ...);
-extern void ext3_update_dynamic_rev (struct super_block *sb);
-
-#define ext3_std_error(sb, errno)				\
-do {								\
-	if ((errno))						\
-		__ext3_std_error((sb), __func__, (errno));	\
-} while (0)
-
-/*
- * Inodes and files operations
- */
-
-/* dir.c */
-extern const struct file_operations ext3_dir_operations;
-
-/* file.c */
-extern const struct inode_operations ext3_file_inode_operations;
-extern const struct file_operations ext3_file_operations;
-
-/* namei.c */
-extern const struct inode_operations ext3_dir_inode_operations;
-extern const struct inode_operations ext3_special_inode_operations;
-
-/* symlink.c */
-extern const struct inode_operations ext3_symlink_inode_operations;
-extern const struct inode_operations ext3_fast_symlink_inode_operations;
-
-#define EXT3_JOURNAL(inode)	(EXT3_SB((inode)->i_sb)->s_journal)
-
-/* Define the number of blocks we need to account to a transaction to
- * modify one block of data.
- *
- * We may have to touch one inode, one bitmap buffer, up to three
- * indirection blocks, the group and superblock summaries, and the data
- * block to complete the transaction.  */
-
-#define EXT3_SINGLEDATA_TRANS_BLOCKS	8U
-
-/* Extended attribute operations touch at most two data buffers,
- * two bitmap buffers, and two group summaries, in addition to the inode
- * and the superblock, which are already accounted for. */
-
-#define EXT3_XATTR_TRANS_BLOCKS		6U
-
-/* Define the minimum size for a transaction which modifies data.  This
- * needs to take into account the fact that we may end up modifying two
- * quota files too (one for the group, one for the user quota).  The
- * superblock only gets updated once, of course, so don't bother
- * counting that again for the quota updates. */
-
-#define EXT3_DATA_TRANS_BLOCKS(sb)	(EXT3_SINGLEDATA_TRANS_BLOCKS + \
-					 EXT3_XATTR_TRANS_BLOCKS - 2 + \
-					 EXT3_MAXQUOTAS_TRANS_BLOCKS(sb))
-
-/* Delete operations potentially hit one directory's namespace plus an
- * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
- * generous.  We can grow the delete transaction later if necessary. */
-
-#define EXT3_DELETE_TRANS_BLOCKS(sb)   (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64)
-
-/* Define an arbitrary limit for the amount of data we will anticipate
- * writing to any given transaction.  For unbounded transactions such as
- * write(2) and truncate(2) we can write more than this, but we always
- * start off at the maximum transaction size and grow the transaction
- * optimistically as we go. */
-
-#define EXT3_MAX_TRANS_DATA		64U
-
-/* We break up a large truncate or write transaction once the handle's
- * buffer credits gets this low, we need either to extend the
- * transaction or to start a new one.  Reserve enough space here for
- * inode, bitmap, superblock, group and indirection updates for at least
- * one block, plus two quota updates.  Quota allocations are not
- * needed. */
-
-#define EXT3_RESERVE_TRANS_BLOCKS	12U
-
-#define EXT3_INDEX_EXTRA_TRANS_BLOCKS	8
-
-#ifdef CONFIG_QUOTA
-/* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
-#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
-/* Amount of blocks needed for quota insert/delete - we do some block writes
- * but inode, sb and group updates are done only once */
-#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
-		(EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
-#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
-		(EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
-#else
-#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
-#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
-#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
-#endif
-#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
-#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
-#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
-
-int
-ext3_mark_iloc_dirty(handle_t *handle,
-		     struct inode *inode,
-		     struct ext3_iloc *iloc);
-
-/*
- * On success, We end up with an outstanding reference count against
- * iloc->bh.  This _must_ be cleaned up later.
- */
-
-int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
-			struct ext3_iloc *iloc);
-
-int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
-
-/*
- * Wrapper functions with which ext3 calls into JBD.  The intent here is
- * to allow these to be turned into appropriate stubs so ext3 can control
- * ext2 filesystems, so ext2+ext3 systems only nee one fs.  This work hasn't
- * been done yet.
- */
-
-static inline void ext3_journal_release_buffer(handle_t *handle,
-						struct buffer_head *bh)
-{
-	journal_release_buffer(handle, bh);
-}
-
-void ext3_journal_abort_handle(const char *caller, const char *err_fn,
-		struct buffer_head *bh, handle_t *handle, int err);
-
-int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
-				struct buffer_head *bh);
-
-int __ext3_journal_get_write_access(const char *where, handle_t *handle,
-				struct buffer_head *bh);
-
-int __ext3_journal_forget(const char *where, handle_t *handle,
-				struct buffer_head *bh);
-
-int __ext3_journal_revoke(const char *where, handle_t *handle,
-				unsigned long blocknr, struct buffer_head *bh);
-
-int __ext3_journal_get_create_access(const char *where,
-				handle_t *handle, struct buffer_head *bh);
-
-int __ext3_journal_dirty_metadata(const char *where,
-				handle_t *handle, struct buffer_head *bh);
-
-#define ext3_journal_get_undo_access(handle, bh) \
-	__ext3_journal_get_undo_access(__func__, (handle), (bh))
-#define ext3_journal_get_write_access(handle, bh) \
-	__ext3_journal_get_write_access(__func__, (handle), (bh))
-#define ext3_journal_revoke(handle, blocknr, bh) \
-	__ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
-#define ext3_journal_get_create_access(handle, bh) \
-	__ext3_journal_get_create_access(__func__, (handle), (bh))
-#define ext3_journal_dirty_metadata(handle, bh) \
-	__ext3_journal_dirty_metadata(__func__, (handle), (bh))
-#define ext3_journal_forget(handle, bh) \
-	__ext3_journal_forget(__func__, (handle), (bh))
-
-int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
-
-handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks);
-int __ext3_journal_stop(const char *where, handle_t *handle);
-
-static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
-{
-	return ext3_journal_start_sb(inode->i_sb, nblocks);
-}
-
-#define ext3_journal_stop(handle) \
-	__ext3_journal_stop(__func__, (handle))
-
-static inline handle_t *ext3_journal_current_handle(void)
-{
-	return journal_current_handle();
-}
-
-static inline int ext3_journal_extend(handle_t *handle, int nblocks)
-{
-	return journal_extend(handle, nblocks);
-}
-
-static inline int ext3_journal_restart(handle_t *handle, int nblocks)
-{
-	return journal_restart(handle, nblocks);
-}
-
-static inline int ext3_journal_blocks_per_page(struct inode *inode)
-{
-	return journal_blocks_per_page(inode);
-}
-
-static inline int ext3_journal_force_commit(journal_t *journal)
-{
-	return journal_force_commit(journal);
-}
-
-/* super.c */
-int ext3_force_commit(struct super_block *sb);
-
-static inline int ext3_should_journal_data(struct inode *inode)
-{
-	if (!S_ISREG(inode->i_mode))
-		return 1;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
-		return 1;
-	if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
-		return 1;
-	return 0;
-}
-
-static inline int ext3_should_order_data(struct inode *inode)
-{
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
-		return 1;
-	return 0;
-}
-
-static inline int ext3_should_writeback_data(struct inode *inode)
-{
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
-		return 1;
-	return 0;
-}
-
-#include <trace/events/ext3.h>
diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c
deleted file mode 100644
index 785a3261a26c..000000000000
--- a/fs/ext3/ext3_jbd.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Interface between ext3 and JBD
- */
-
-#include "ext3.h"
-
-int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
-				struct buffer_head *bh)
-{
-	int err = journal_get_undo_access(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __func__, bh, handle,err);
-	return err;
-}
-
-int __ext3_journal_get_write_access(const char *where, handle_t *handle,
-				struct buffer_head *bh)
-{
-	int err = journal_get_write_access(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __func__, bh, handle,err);
-	return err;
-}
-
-int __ext3_journal_forget(const char *where, handle_t *handle,
-				struct buffer_head *bh)
-{
-	int err = journal_forget(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __func__, bh, handle,err);
-	return err;
-}
-
-int __ext3_journal_revoke(const char *where, handle_t *handle,
-				unsigned long blocknr, struct buffer_head *bh)
-{
-	int err = journal_revoke(handle, blocknr, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __func__, bh, handle,err);
-	return err;
-}
-
-int __ext3_journal_get_create_access(const char *where,
-				handle_t *handle, struct buffer_head *bh)
-{
-	int err = journal_get_create_access(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __func__, bh, handle,err);
-	return err;
-}
-
-int __ext3_journal_dirty_metadata(const char *where,
-				handle_t *handle, struct buffer_head *bh)
-{
-	int err = journal_dirty_metadata(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __func__, bh, handle,err);
-	return err;
-}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
deleted file mode 100644
index 3b8f650de22c..000000000000
--- a/fs/ext3/file.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- *  linux/fs/ext3/file.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/fs/minix/file.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  ext3 fs regular file handling primitives
- *
- *  64-bit file support on 64-bit platforms by Jakub Jelinek
- *	(jj@sunsite.ms.mff.cuni.cz)
- */
-
-#include <linux/quotaops.h>
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * Called when an inode is released. Note that this is different
- * from ext3_file_open: open gets called at every open, but release
- * gets called only when /all/ the files are closed.
- */
-static int ext3_release_file (struct inode * inode, struct file * filp)
-{
-	if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
-		filemap_flush(inode->i_mapping);
-		ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
-	}
-	/* if we are the last writer on the inode, drop the block reservation */
-	if ((filp->f_mode & FMODE_WRITE) &&
-			(atomic_read(&inode->i_writecount) == 1))
-	{
-		mutex_lock(&EXT3_I(inode)->truncate_mutex);
-		ext3_discard_reservation(inode);
-		mutex_unlock(&EXT3_I(inode)->truncate_mutex);
-	}
-	if (is_dx(inode) && filp->private_data)
-		ext3_htree_free_dir_info(filp->private_data);
-
-	return 0;
-}
-
-const struct file_operations ext3_file_operations = {
-	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-	.write_iter	= generic_file_write_iter,
-	.unlocked_ioctl	= ext3_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= ext3_compat_ioctl,
-#endif
-	.mmap		= generic_file_mmap,
-	.open		= dquot_file_open,
-	.release	= ext3_release_file,
-	.fsync		= ext3_sync_file,
-	.splice_read	= generic_file_splice_read,
-	.splice_write	= iter_file_splice_write,
-};
-
-const struct inode_operations ext3_file_inode_operations = {
-	.setattr	= ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
-	.listxattr	= ext3_listxattr,
-	.removexattr	= generic_removexattr,
-#endif
-	.get_acl	= ext3_get_acl,
-	.set_acl	= ext3_set_acl,
-	.fiemap		= ext3_fiemap,
-};
-
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
deleted file mode 100644
index 1cb9c7e10c6f..000000000000
--- a/fs/ext3/fsync.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- *  linux/fs/ext3/fsync.c
- *
- *  Copyright (C) 1993  Stephen Tweedie (sct@redhat.com)
- *  from
- *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
- *                      Laboratoire MASI - Institut Blaise Pascal
- *                      Universite Pierre et Marie Curie (Paris VI)
- *  from
- *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  ext3fs fsync primitive
- *
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- *
- *  Removed unnecessary code duplication for little endian machines
- *  and excessive __inline__s.
- *        Andi Kleen, 1997
- *
- * Major simplications and cleanup - we only need to do the metadata, because
- * we can depend on generic_block_fdatasync() to sync the data blocks.
- */
-
-#include <linux/blkdev.h>
-#include <linux/writeback.h>
-#include "ext3.h"
-
-/*
- * akpm: A new design for ext3_sync_file().
- *
- * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
- * There cannot be a transaction open by this task.
- * Another task could have dirtied this inode.  Its data can be in any
- * state in the journalling system.
- *
- * What we do is just kick off a commit and wait on it.  This will snapshot the
- * inode to disk.
- */
-
-int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
-{
-	struct inode *inode = file->f_mapping->host;
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
-	int ret, needs_barrier = 0;
-	tid_t commit_tid;
-
-	trace_ext3_sync_file_enter(file, datasync);
-
-	if (inode->i_sb->s_flags & MS_RDONLY) {
-		/* Make sure that we read updated state */
-		smp_rmb();
-		if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
-			return -EROFS;
-		return 0;
-	}
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		goto out;
-
-	J_ASSERT(ext3_journal_current_handle() == NULL);
-
-	/*
-	 * data=writeback,ordered:
-	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-	 *  Metadata is in the journal, we wait for a proper transaction
-	 *  to commit here.
-	 *
-	 * data=journal:
-	 *  filemap_fdatawrite won't do anything (the buffers are clean).
-	 *  ext3_force_commit will write the file data into the journal and
-	 *  will wait on that.
-	 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
-	 *  (they were dirtied by commit).  But that's OK - the blocks are
-	 *  safe in-journal, which is all fsync() needs to ensure.
-	 */
-	if (ext3_should_journal_data(inode)) {
-		ret = ext3_force_commit(inode->i_sb);
-		goto out;
-	}
-
-	if (datasync)
-		commit_tid = atomic_read(&ei->i_datasync_tid);
-	else
-		commit_tid = atomic_read(&ei->i_sync_tid);
-
-	if (test_opt(inode->i_sb, BARRIER) &&
-	    !journal_trans_will_send_data_barrier(journal, commit_tid))
-		needs_barrier = 1;
-	log_start_commit(journal, commit_tid);
-	ret = log_wait_commit(journal, commit_tid);
-
-	/*
-	 * In case we didn't commit a transaction, we have to flush
-	 * disk caches manually so that data really is on persistent
-	 * storage
-	 */
-	if (needs_barrier) {
-		int err;
-
-		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-		if (!ret)
-			ret = err;
-	}
-out:
-	trace_ext3_sync_file_exit(inode, ret);
-	return ret;
-}
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
deleted file mode 100644
index ede315cdf126..000000000000
--- a/fs/ext3/hash.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- *  linux/fs/ext3/hash.c
- *
- * Copyright (C) 2002 by Theodore Ts'o
- *
- * This file is released under the GPL v2.
- *
- * This file may be redistributed under the terms of the GNU Public
- * License.
- */
-
-#include "ext3.h"
-#include <linux/cryptohash.h>
-
-#define DELTA 0x9E3779B9
-
-static void TEA_transform(__u32 buf[4], __u32 const in[])
-{
-	__u32	sum = 0;
-	__u32	b0 = buf[0], b1 = buf[1];
-	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
-	int	n = 16;
-
-	do {
-		sum += DELTA;
-		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
-		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
-	} while(--n);
-
-	buf[0] += b0;
-	buf[1] += b1;
-}
-
-
-/* The old legacy hash */
-static __u32 dx_hack_hash_unsigned(const char *name, int len)
-{
-	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
-	const unsigned char *ucp = (const unsigned char *) name;
-
-	while (len--) {
-		hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
-
-		if (hash & 0x80000000)
-			hash -= 0x7fffffff;
-		hash1 = hash0;
-		hash0 = hash;
-	}
-	return hash0 << 1;
-}
-
-static __u32 dx_hack_hash_signed(const char *name, int len)
-{
-	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
-	const signed char *scp = (const signed char *) name;
-
-	while (len--) {
-		hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
-
-		if (hash & 0x80000000)
-			hash -= 0x7fffffff;
-		hash1 = hash0;
-		hash0 = hash;
-	}
-	return hash0 << 1;
-}
-
-static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
-{
-	__u32	pad, val;
-	int	i;
-	const signed char *scp = (const signed char *) msg;
-
-	pad = (__u32)len | ((__u32)len << 8);
-	pad |= pad << 16;
-
-	val = pad;
-	if (len > num*4)
-		len = num * 4;
-	for (i = 0; i < len; i++) {
-		if ((i % 4) == 0)
-			val = pad;
-		val = ((int) scp[i]) + (val << 8);
-		if ((i % 4) == 3) {
-			*buf++ = val;
-			val = pad;
-			num--;
-		}
-	}
-	if (--num >= 0)
-		*buf++ = val;
-	while (--num >= 0)
-		*buf++ = pad;
-}
-
-static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
-{
-	__u32	pad, val;
-	int	i;
-	const unsigned char *ucp = (const unsigned char *) msg;
-
-	pad = (__u32)len | ((__u32)len << 8);
-	pad |= pad << 16;
-
-	val = pad;
-	if (len > num*4)
-		len = num * 4;
-	for (i=0; i < len; i++) {
-		if ((i % 4) == 0)
-			val = pad;
-		val = ((int) ucp[i]) + (val << 8);
-		if ((i % 4) == 3) {
-			*buf++ = val;
-			val = pad;
-			num--;
-		}
-	}
-	if (--num >= 0)
-		*buf++ = val;
-	while (--num >= 0)
-		*buf++ = pad;
-}
-
-/*
- * Returns the hash of a filename.  If len is 0 and name is NULL, then
- * this function can be used to test whether or not a hash version is
- * supported.
- *
- * The seed is an 4 longword (32 bits) "secret" which can be used to
- * uniquify a hash.  If the seed is all zero's, then some default seed
- * may be used.
- *
- * A particular hash version specifies whether or not the seed is
- * represented, and whether or not the returned hash is 32 bits or 64
- * bits.  32 bit hashes will return 0 for the minor hash.
- */
-int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
-{
-	__u32	hash;
-	__u32	minor_hash = 0;
-	const char	*p;
-	int		i;
-	__u32		in[8], buf[4];
-	void		(*str2hashbuf)(const char *, int, __u32 *, int) =
-				str2hashbuf_signed;
-
-	/* Initialize the default seed for the hash checksum functions */
-	buf[0] = 0x67452301;
-	buf[1] = 0xefcdab89;
-	buf[2] = 0x98badcfe;
-	buf[3] = 0x10325476;
-
-	/* Check to see if the seed is all zero's */
-	if (hinfo->seed) {
-		for (i=0; i < 4; i++) {
-			if (hinfo->seed[i])
-				break;
-		}
-		if (i < 4)
-			memcpy(buf, hinfo->seed, sizeof(buf));
-	}
-
-	switch (hinfo->hash_version) {
-	case DX_HASH_LEGACY_UNSIGNED:
-		hash = dx_hack_hash_unsigned(name, len);
-		break;
-	case DX_HASH_LEGACY:
-		hash = dx_hack_hash_signed(name, len);
-		break;
-	case DX_HASH_HALF_MD4_UNSIGNED:
-		str2hashbuf = str2hashbuf_unsigned;
-	case DX_HASH_HALF_MD4:
-		p = name;
-		while (len > 0) {
-			(*str2hashbuf)(p, len, in, 8);
-			half_md4_transform(buf, in);
-			len -= 32;
-			p += 32;
-		}
-		minor_hash = buf[2];
-		hash = buf[1];
-		break;
-	case DX_HASH_TEA_UNSIGNED:
-		str2hashbuf = str2hashbuf_unsigned;
-	case DX_HASH_TEA:
-		p = name;
-		while (len > 0) {
-			(*str2hashbuf)(p, len, in, 4);
-			TEA_transform(buf, in);
-			len -= 16;
-			p += 16;
-		}
-		hash = buf[0];
-		minor_hash = buf[1];
-		break;
-	default:
-		hinfo->hash = 0;
-		return -1;
-	}
-	hash = hash & ~1;
-	if (hash == (EXT3_HTREE_EOF_32BIT << 1))
-		hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
-	hinfo->hash = hash;
-	hinfo->minor_hash = minor_hash;
-	return 0;
-}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
deleted file mode 100644
index 3ad242e5840e..000000000000
--- a/fs/ext3/ialloc.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- *  linux/fs/ext3/ialloc.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  BSD ufs-inspired inode and directory allocation by
- *  Stephen Tweedie (sct@redhat.com), 1993
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- */
-
-#include <linux/quotaops.h>
-#include <linux/random.h>
-
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * ialloc.c contains the inodes allocation and deallocation routines
- */
-
-/*
- * The free inodes are managed by bitmaps.  A file system contains several
- * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
- * block for inodes, N blocks for the inode table and data blocks.
- *
- * The file system contains group descriptors which are located after the
- * super block.  Each descriptor contains the number of the bitmap block and
- * the free blocks count in the block.
- */
-
-
-/*
- * Read the inode allocation bitmap for a given block_group, reading
- * into the specified slot in the superblock's bitmap cache.
- *
- * Return buffer_head of bitmap on success or NULL.
- */
-static struct buffer_head *
-read_inode_bitmap(struct super_block * sb, unsigned long block_group)
-{
-	struct ext3_group_desc *desc;
-	struct buffer_head *bh = NULL;
-
-	desc = ext3_get_group_desc(sb, block_group, NULL);
-	if (!desc)
-		goto error_out;
-
-	bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
-	if (!bh)
-		ext3_error(sb, "read_inode_bitmap",
-			    "Cannot read inode bitmap - "
-			    "block_group = %lu, inode_bitmap = %u",
-			    block_group, le32_to_cpu(desc->bg_inode_bitmap));
-error_out:
-	return bh;
-}
-
-/*
- * NOTE! When we get the inode, we're the only people
- * that have access to it, and as such there are no
- * race conditions we have to worry about. The inode
- * is not on the hash-lists, and it cannot be reached
- * through the filesystem because the directory entry
- * has been deleted earlier.
- *
- * HOWEVER: we must make sure that we get no aliases,
- * which means that we have to call "clear_inode()"
- * _before_ we mark the inode not in use in the inode
- * bitmaps. Otherwise a newly created file might use
- * the same inode number (not actually the same pointer
- * though), and then we'd have two inodes sharing the
- * same inode number and space on the harddisk.
- */
-void ext3_free_inode (handle_t *handle, struct inode * inode)
-{
-	struct super_block * sb = inode->i_sb;
-	int is_directory;
-	unsigned long ino;
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *bh2;
-	unsigned long block_group;
-	unsigned long bit;
-	struct ext3_group_desc * gdp;
-	struct ext3_super_block * es;
-	struct ext3_sb_info *sbi;
-	int fatal = 0, err;
-
-	if (atomic_read(&inode->i_count) > 1) {
-		printk ("ext3_free_inode: inode has count=%d\n",
-					atomic_read(&inode->i_count));
-		return;
-	}
-	if (inode->i_nlink) {
-		printk ("ext3_free_inode: inode has nlink=%d\n",
-			inode->i_nlink);
-		return;
-	}
-	if (!sb) {
-		printk("ext3_free_inode: inode on nonexistent device\n");
-		return;
-	}
-	sbi = EXT3_SB(sb);
-
-	ino = inode->i_ino;
-	ext3_debug ("freeing inode %lu\n", ino);
-	trace_ext3_free_inode(inode);
-
-	is_directory = S_ISDIR(inode->i_mode);
-
-	es = EXT3_SB(sb)->s_es;
-	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext3_error (sb, "ext3_free_inode",
-			    "reserved or nonexistent inode %lu", ino);
-		goto error_return;
-	}
-	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
-	bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
-	bitmap_bh = read_inode_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-
-	BUFFER_TRACE(bitmap_bh, "get_write_access");
-	fatal = ext3_journal_get_write_access(handle, bitmap_bh);
-	if (fatal)
-		goto error_return;
-
-	/* Ok, now we can actually update the inode bitmaps.. */
-	if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
-					bit, bitmap_bh->b_data))
-		ext3_error (sb, "ext3_free_inode",
-			      "bit already cleared for inode %lu", ino);
-	else {
-		gdp = ext3_get_group_desc (sb, block_group, &bh2);
-
-		BUFFER_TRACE(bh2, "get_write_access");
-		fatal = ext3_journal_get_write_access(handle, bh2);
-		if (fatal) goto error_return;
-
-		if (gdp) {
-			spin_lock(sb_bgl_lock(sbi, block_group));
-			le16_add_cpu(&gdp->bg_free_inodes_count, 1);
-			if (is_directory)
-				le16_add_cpu(&gdp->bg_used_dirs_count, -1);
-			spin_unlock(sb_bgl_lock(sbi, block_group));
-			percpu_counter_inc(&sbi->s_freeinodes_counter);
-			if (is_directory)
-				percpu_counter_dec(&sbi->s_dirs_counter);
-
-		}
-		BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
-		err = ext3_journal_dirty_metadata(handle, bh2);
-		if (!fatal) fatal = err;
-	}
-	BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
-	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
-	if (!fatal)
-		fatal = err;
-
-error_return:
-	brelse(bitmap_bh);
-	ext3_std_error(sb, fatal);
-}
-
-/*
- * Orlov's allocator for directories.
- *
- * We always try to spread first-level directories.
- *
- * If there are blockgroups with both free inodes and free blocks counts
- * not worse than average we return one with smallest directory count.
- * Otherwise we simply return a random group.
- *
- * For the rest rules look so:
- *
- * It's OK to put directory into a group unless
- * it has too many directories already (max_dirs) or
- * it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks).
- * Parent's group is preferred, if it doesn't satisfy these
- * conditions we search cyclically through the rest. If none
- * of the groups look good we just look for a group with more
- * free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
- */
-
-static int find_group_orlov(struct super_block *sb, struct inode *parent)
-{
-	int parent_group = EXT3_I(parent)->i_block_group;
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	int ngroups = sbi->s_groups_count;
-	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
-	unsigned int freei, avefreei;
-	ext3_fsblk_t freeb, avefreeb;
-	unsigned int ndirs;
-	int max_dirs, min_inodes;
-	ext3_grpblk_t min_blocks;
-	int group = -1, i;
-	struct ext3_group_desc *desc;
-
-	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
-	avefreei = freei / ngroups;
-	freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	avefreeb = freeb / ngroups;
-	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
-
-	if ((parent == d_inode(sb->s_root)) ||
-	    (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
-		int best_ndir = inodes_per_group;
-		int best_group = -1;
-
-		group = prandom_u32();
-		parent_group = (unsigned)group % ngroups;
-		for (i = 0; i < ngroups; i++) {
-			group = (parent_group + i) % ngroups;
-			desc = ext3_get_group_desc (sb, group, NULL);
-			if (!desc || !desc->bg_free_inodes_count)
-				continue;
-			if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
-				continue;
-			if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
-				continue;
-			if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
-				continue;
-			best_group = group;
-			best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
-		}
-		if (best_group >= 0)
-			return best_group;
-		goto fallback;
-	}
-
-	max_dirs = ndirs / ngroups + inodes_per_group / 16;
-	min_inodes = avefreei - inodes_per_group / 4;
-	min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
-
-	for (i = 0; i < ngroups; i++) {
-		group = (parent_group + i) % ngroups;
-		desc = ext3_get_group_desc (sb, group, NULL);
-		if (!desc || !desc->bg_free_inodes_count)
-			continue;
-		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
-			continue;
-		if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
-			continue;
-		if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
-			continue;
-		return group;
-	}
-
-fallback:
-	for (i = 0; i < ngroups; i++) {
-		group = (parent_group + i) % ngroups;
-		desc = ext3_get_group_desc (sb, group, NULL);
-		if (!desc || !desc->bg_free_inodes_count)
-			continue;
-		if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
-			return group;
-	}
-
-	if (avefreei) {
-		/*
-		 * The free-inodes counter is approximate, and for really small
-		 * filesystems the above test can fail to find any blockgroups
-		 */
-		avefreei = 0;
-		goto fallback;
-	}
-
-	return -1;
-}
-
-static int find_group_other(struct super_block *sb, struct inode *parent)
-{
-	int parent_group = EXT3_I(parent)->i_block_group;
-	int ngroups = EXT3_SB(sb)->s_groups_count;
-	struct ext3_group_desc *desc;
-	int group, i;
-
-	/*
-	 * Try to place the inode in its parent directory
-	 */
-	group = parent_group;
-	desc = ext3_get_group_desc (sb, group, NULL);
-	if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
-			le16_to_cpu(desc->bg_free_blocks_count))
-		return group;
-
-	/*
-	 * We're going to place this inode in a different blockgroup from its
-	 * parent.  We want to cause files in a common directory to all land in
-	 * the same blockgroup.  But we want files which are in a different
-	 * directory which shares a blockgroup with our parent to land in a
-	 * different blockgroup.
-	 *
-	 * So add our directory's i_ino into the starting point for the hash.
-	 */
-	group = (group + parent->i_ino) % ngroups;
-
-	/*
-	 * Use a quadratic hash to find a group with a free inode and some free
-	 * blocks.
-	 */
-	for (i = 1; i < ngroups; i <<= 1) {
-		group += i;
-		if (group >= ngroups)
-			group -= ngroups;
-		desc = ext3_get_group_desc (sb, group, NULL);
-		if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
-				le16_to_cpu(desc->bg_free_blocks_count))
-			return group;
-	}
-
-	/*
-	 * That failed: try linear search for a free inode, even if that group
-	 * has no free blocks.
-	 */
-	group = parent_group;
-	for (i = 0; i < ngroups; i++) {
-		if (++group >= ngroups)
-			group = 0;
-		desc = ext3_get_group_desc (sb, group, NULL);
-		if (desc && le16_to_cpu(desc->bg_free_inodes_count))
-			return group;
-	}
-
-	return -1;
-}
-
-/*
- * There are two policies for allocating an inode.  If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory's block
- * group to find a free inode.
- */
-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
-			     const struct qstr *qstr, umode_t mode)
-{
-	struct super_block *sb;
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *bh2;
-	int group;
-	unsigned long ino = 0;
-	struct inode * inode;
-	struct ext3_group_desc * gdp = NULL;
-	struct ext3_super_block * es;
-	struct ext3_inode_info *ei;
-	struct ext3_sb_info *sbi;
-	int err = 0;
-	struct inode *ret;
-	int i;
-
-	/* Cannot create files in a deleted directory */
-	if (!dir || !dir->i_nlink)
-		return ERR_PTR(-EPERM);
-
-	sb = dir->i_sb;
-	trace_ext3_request_inode(dir, mode);
-	inode = new_inode(sb);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-	ei = EXT3_I(inode);
-
-	sbi = EXT3_SB(sb);
-	es = sbi->s_es;
-	if (S_ISDIR(mode))
-		group = find_group_orlov(sb, dir);
-	else
-		group = find_group_other(sb, dir);
-
-	err = -ENOSPC;
-	if (group == -1)
-		goto out;
-
-	for (i = 0; i < sbi->s_groups_count; i++) {
-		err = -EIO;
-
-		gdp = ext3_get_group_desc(sb, group, &bh2);
-		if (!gdp)
-			goto fail;
-
-		brelse(bitmap_bh);
-		bitmap_bh = read_inode_bitmap(sb, group);
-		if (!bitmap_bh)
-			goto fail;
-
-		ino = 0;
-
-repeat_in_this_group:
-		ino = ext3_find_next_zero_bit((unsigned long *)
-				bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
-		if (ino < EXT3_INODES_PER_GROUP(sb)) {
-
-			BUFFER_TRACE(bitmap_bh, "get_write_access");
-			err = ext3_journal_get_write_access(handle, bitmap_bh);
-			if (err)
-				goto fail;
-
-			if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
-						ino, bitmap_bh->b_data)) {
-				/* we won it */
-				BUFFER_TRACE(bitmap_bh,
-					"call ext3_journal_dirty_metadata");
-				err = ext3_journal_dirty_metadata(handle,
-								bitmap_bh);
-				if (err)
-					goto fail;
-				goto got;
-			}
-			/* we lost it */
-			journal_release_buffer(handle, bitmap_bh);
-
-			if (++ino < EXT3_INODES_PER_GROUP(sb))
-				goto repeat_in_this_group;
-		}
-
-		/*
-		 * This case is possible in concurrent environment.  It is very
-		 * rare.  We cannot repeat the find_group_xxx() call because
-		 * that will simply return the same blockgroup, because the
-		 * group descriptor metadata has not yet been updated.
-		 * So we just go onto the next blockgroup.
-		 */
-		if (++group == sbi->s_groups_count)
-			group = 0;
-	}
-	err = -ENOSPC;
-	goto out;
-
-got:
-	ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
-	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext3_error (sb, "ext3_new_inode",
-			    "reserved inode or inode > inodes count - "
-			    "block_group = %d, inode=%lu", group, ino);
-		err = -EIO;
-		goto fail;
-	}
-
-	BUFFER_TRACE(bh2, "get_write_access");
-	err = ext3_journal_get_write_access(handle, bh2);
-	if (err) goto fail;
-	spin_lock(sb_bgl_lock(sbi, group));
-	le16_add_cpu(&gdp->bg_free_inodes_count, -1);
-	if (S_ISDIR(mode)) {
-		le16_add_cpu(&gdp->bg_used_dirs_count, 1);
-	}
-	spin_unlock(sb_bgl_lock(sbi, group));
-	BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
-	err = ext3_journal_dirty_metadata(handle, bh2);
-	if (err) goto fail;
-
-	percpu_counter_dec(&sbi->s_freeinodes_counter);
-	if (S_ISDIR(mode))
-		percpu_counter_inc(&sbi->s_dirs_counter);
-
-
-	if (test_opt(sb, GRPID)) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = dir->i_gid;
-	} else
-		inode_init_owner(inode, dir, mode);
-
-	inode->i_ino = ino;
-	/* This is the optimal IO size (for stat), not the fs block size */
-	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-
-	memset(ei->i_data, 0, sizeof(ei->i_data));
-	ei->i_dir_start_lookup = 0;
-	ei->i_disksize = 0;
-
-	ei->i_flags =
-		ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
-#ifdef EXT3_FRAGMENTS
-	ei->i_faddr = 0;
-	ei->i_frag_no = 0;
-	ei->i_frag_size = 0;
-#endif
-	ei->i_file_acl = 0;
-	ei->i_dir_acl = 0;
-	ei->i_dtime = 0;
-	ei->i_block_alloc_info = NULL;
-	ei->i_block_group = group;
-
-	ext3_set_inode_flags(inode);
-	if (IS_DIRSYNC(inode))
-		handle->h_sync = 1;
-	if (insert_inode_locked(inode) < 0) {
-		/*
-		 * Likely a bitmap corruption causing inode to be allocated
-		 * twice.
-		 */
-		err = -EIO;
-		goto fail;
-	}
-	spin_lock(&sbi->s_next_gen_lock);
-	inode->i_generation = sbi->s_next_generation++;
-	spin_unlock(&sbi->s_next_gen_lock);
-
-	ei->i_state_flags = 0;
-	ext3_set_inode_state(inode, EXT3_STATE_NEW);
-
-	/* See comment in ext3_iget for explanation */
-	if (ino >= EXT3_FIRST_INO(sb) + 1 &&
-	    EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
-		ei->i_extra_isize =
-			sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
-	} else {
-		ei->i_extra_isize = 0;
-	}
-
-	ret = inode;
-	dquot_initialize(inode);
-	err = dquot_alloc_inode(inode);
-	if (err)
-		goto fail_drop;
-
-	err = ext3_init_acl(handle, inode, dir);
-	if (err)
-		goto fail_free_drop;
-
-	err = ext3_init_security(handle, inode, dir, qstr);
-	if (err)
-		goto fail_free_drop;
-
-	err = ext3_mark_inode_dirty(handle, inode);
-	if (err) {
-		ext3_std_error(sb, err);
-		goto fail_free_drop;
-	}
-
-	ext3_debug("allocating inode %lu\n", inode->i_ino);
-	trace_ext3_allocate_inode(inode, dir, mode);
-	goto really_out;
-fail:
-	ext3_std_error(sb, err);
-out:
-	iput(inode);
-	ret = ERR_PTR(err);
-really_out:
-	brelse(bitmap_bh);
-	return ret;
-
-fail_free_drop:
-	dquot_free_inode(inode);
-
-fail_drop:
-	dquot_drop(inode);
-	inode->i_flags |= S_NOQUOTA;
-	clear_nlink(inode);
-	unlock_new_inode(inode);
-	iput(inode);
-	brelse(bitmap_bh);
-	return ERR_PTR(err);
-}
-
-/* Verify that we are loading a valid orphan from disk */
-struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
-{
-	unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
-	unsigned long block_group;
-	int bit;
-	struct buffer_head *bitmap_bh;
-	struct inode *inode = NULL;
-	long err = -EIO;
-
-	/* Error cases - e2fsck has already cleaned up for us */
-	if (ino > max_ino) {
-		ext3_warning(sb, __func__,
-			     "bad orphan ino %lu!  e2fsck was run?", ino);
-		goto error;
-	}
-
-	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
-	bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
-	bitmap_bh = read_inode_bitmap(sb, block_group);
-	if (!bitmap_bh) {
-		ext3_warning(sb, __func__,
-			     "inode bitmap error for orphan %lu", ino);
-		goto error;
-	}
-
-	/* Having the inode bit set should be a 100% indicator that this
-	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
-	 * inodes that were being truncated, so we can't check i_nlink==0.
-	 */
-	if (!ext3_test_bit(bit, bitmap_bh->b_data))
-		goto bad_orphan;
-
-	inode = ext3_iget(sb, ino);
-	if (IS_ERR(inode))
-		goto iget_failed;
-
-	/*
-	 * If the orphans has i_nlinks > 0 then it should be able to be
-	 * truncated, otherwise it won't be removed from the orphan list
-	 * during processing and an infinite loop will result.
-	 */
-	if (inode->i_nlink && !ext3_can_truncate(inode))
-		goto bad_orphan;
-
-	if (NEXT_ORPHAN(inode) > max_ino)
-		goto bad_orphan;
-	brelse(bitmap_bh);
-	return inode;
-
-iget_failed:
-	err = PTR_ERR(inode);
-	inode = NULL;
-bad_orphan:
-	ext3_warning(sb, __func__,
-		     "bad orphan inode %lu!  e2fsck was run?", ino);
-	printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
-	       bit, (unsigned long long)bitmap_bh->b_blocknr,
-	       ext3_test_bit(bit, bitmap_bh->b_data));
-	printk(KERN_NOTICE "inode=%p\n", inode);
-	if (inode) {
-		printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
-		       is_bad_inode(inode));
-		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
-		       NEXT_ORPHAN(inode));
-		printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
-		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
-		/* Avoid freeing blocks if we got a bad deleted inode */
-		if (inode->i_nlink == 0)
-			inode->i_blocks = 0;
-		iput(inode);
-	}
-	brelse(bitmap_bh);
-error:
-	return ERR_PTR(err);
-}
-
-unsigned long ext3_count_free_inodes (struct super_block * sb)
-{
-	unsigned long desc_count;
-	struct ext3_group_desc *gdp;
-	int i;
-#ifdef EXT3FS_DEBUG
-	struct ext3_super_block *es;
-	unsigned long bitmap_count, x;
-	struct buffer_head *bitmap_bh = NULL;
-
-	es = EXT3_SB(sb)->s_es;
-	desc_count = 0;
-	bitmap_count = 0;
-	gdp = NULL;
-	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
-		gdp = ext3_get_group_desc (sb, i, NULL);
-		if (!gdp)
-			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
-		brelse(bitmap_bh);
-		bitmap_bh = read_inode_bitmap(sb, i);
-		if (!bitmap_bh)
-			continue;
-
-		x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8);
-		printk("group %d: stored = %d, counted = %lu\n",
-			i, le16_to_cpu(gdp->bg_free_inodes_count), x);
-		bitmap_count += x;
-	}
-	brelse(bitmap_bh);
-	printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
-		le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
-	return desc_count;
-#else
-	desc_count = 0;
-	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
-		gdp = ext3_get_group_desc (sb, i, NULL);
-		if (!gdp)
-			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
-		cond_resched();
-	}
-	return desc_count;
-#endif
-}
-
-/* Called at mount-time, super-block is locked */
-unsigned long ext3_count_dirs (struct super_block * sb)
-{
-	unsigned long count = 0;
-	int i;
-
-	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
-		struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL);
-		if (!gdp)
-			continue;
-		count += le16_to_cpu(gdp->bg_used_dirs_count);
-	}
-	return count;
-}
-
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
deleted file mode 100644
index 6c7e5468a2f8..000000000000
--- a/fs/ext3/inode.c
+++ /dev/null
@@ -1,3574 +0,0 @@
-/*
- *  linux/fs/ext3/inode.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/fs/minix/inode.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  Goal-directed block allocation by Stephen Tweedie
- *	(sct@redhat.com), 1993, 1998
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- *  64-bit file support on 64-bit platforms by Jakub Jelinek
- *	(jj@sunsite.ms.mff.cuni.cz)
- *
- *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
- */
-
-#include <linux/highuid.h>
-#include <linux/quotaops.h>
-#include <linux/writeback.h>
-#include <linux/mpage.h>
-#include <linux/namei.h>
-#include <linux/uio.h>
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-
-static int ext3_writepage_trans_blocks(struct inode *inode);
-static int ext3_block_truncate_page(struct inode *inode, loff_t from);
-
-/*
- * Test whether an inode is a fast symlink.
- */
-static int ext3_inode_is_fast_symlink(struct inode *inode)
-{
-	int ea_blocks = EXT3_I(inode)->i_file_acl ?
-		(inode->i_sb->s_blocksize >> 9) : 0;
-
-	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
-}
-
-/*
- * The ext3 forget function must perform a revoke if we are freeing data
- * which has been journaled.  Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- */
-int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-			struct buffer_head *bh, ext3_fsblk_t blocknr)
-{
-	int err;
-
-	might_sleep();
-
-	trace_ext3_forget(inode, is_metadata, blocknr);
-	BUFFER_TRACE(bh, "enter");
-
-	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-		  "data mode %lx\n",
-		  bh, is_metadata, inode->i_mode,
-		  test_opt(inode->i_sb, DATA_FLAGS));
-
-	/* Never use the revoke function if we are doing full data
-	 * journaling: there is no need to, and a V1 superblock won't
-	 * support it.  Otherwise, only skip the revoke on un-journaled
-	 * data blocks. */
-
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
-	    (!is_metadata && !ext3_should_journal_data(inode))) {
-		if (bh) {
-			BUFFER_TRACE(bh, "call journal_forget");
-			return ext3_journal_forget(handle, bh);
-		}
-		return 0;
-	}
-
-	/*
-	 * data!=journal && (is_metadata || should_journal_data(inode))
-	 */
-	BUFFER_TRACE(bh, "call ext3_journal_revoke");
-	err = ext3_journal_revoke(handle, blocknr, bh);
-	if (err)
-		ext3_abort(inode->i_sb, __func__,
-			   "error %d when attempting revoke", err);
-	BUFFER_TRACE(bh, "exit");
-	return err;
-}
-
-/*
- * Work out how many blocks we need to proceed with the next chunk of a
- * truncate transaction.
- */
-static unsigned long blocks_for_truncate(struct inode *inode)
-{
-	unsigned long needed;
-
-	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
-
-	/* Give ourselves just enough room to cope with inodes in which
-	 * i_blocks is corrupt: we've seen disk corruptions in the past
-	 * which resulted in random data in an inode which looked enough
-	 * like a regular file for ext3 to try to delete it.  Things
-	 * will go a bit crazy if that happens, but at least we should
-	 * try not to panic the whole kernel. */
-	if (needed < 2)
-		needed = 2;
-
-	/* But we need to bound the transaction so we don't overflow the
-	 * journal. */
-	if (needed > EXT3_MAX_TRANS_DATA)
-		needed = EXT3_MAX_TRANS_DATA;
-
-	return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
-}
-
-/*
- * Truncate transactions can be complex and absolutely huge.  So we need to
- * be able to restart the transaction at a conventient checkpoint to make
- * sure we don't overflow the journal.
- *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit.  If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
-{
-	handle_t *result;
-
-	result = ext3_journal_start(inode, blocks_for_truncate(inode));
-	if (!IS_ERR(result))
-		return result;
-
-	ext3_std_error(inode->i_sb, PTR_ERR(result));
-	return result;
-}
-
-/*
- * Try to extend this transaction for the purposes of truncation.
- *
- * Returns 0 if we managed to create more room.  If we can't create more
- * room, and the transaction must be restarted we return 1.
- */
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
-{
-	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
-		return 0;
-	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
-		return 0;
-	return 1;
-}
-
-/*
- * Restart the transaction associated with *handle.  This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
-{
-	int ret;
-
-	jbd_debug(2, "restarting handle %p\n", handle);
-	/*
-	 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
-	 * At this moment, get_block can be called only for blocks inside
-	 * i_size since page cache has been already dropped and writes are
-	 * blocked by i_mutex. So we can safely drop the truncate_mutex.
-	 */
-	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
-	ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
-	mutex_lock(&EXT3_I(inode)->truncate_mutex);
-	return ret;
-}
-
-/*
- * Called at inode eviction from icache
- */
-void ext3_evict_inode (struct inode *inode)
-{
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	struct ext3_block_alloc_info *rsv;
-	handle_t *handle;
-	int want_delete = 0;
-
-	trace_ext3_evict_inode(inode);
-	if (!inode->i_nlink && !is_bad_inode(inode)) {
-		dquot_initialize(inode);
-		want_delete = 1;
-	}
-
-	/*
-	 * When journalling data dirty buffers are tracked only in the journal.
-	 * So although mm thinks everything is clean and ready for reaping the
-	 * inode might still have some pages to write in the running
-	 * transaction or waiting to be checkpointed. Thus calling
-	 * journal_invalidatepage() (via truncate_inode_pages()) to discard
-	 * these buffers can cause data loss. Also even if we did not discard
-	 * these buffers, we would have no way to find them after the inode
-	 * is reaped and thus user could see stale data if he tries to read
-	 * them before the transaction is checkpointed. So be careful and
-	 * force everything to disk here... We use ei->i_datasync_tid to
-	 * store the newest transaction containing inode's data.
-	 *
-	 * Note that directories do not have this problem because they don't
-	 * use page cache.
-	 *
-	 * The s_journal check handles the case when ext3_get_journal() fails
-	 * and puts the journal inode.
-	 */
-	if (inode->i_nlink && ext3_should_journal_data(inode) &&
-	    EXT3_SB(inode->i_sb)->s_journal &&
-	    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
-	    inode->i_ino != EXT3_JOURNAL_INO) {
-		tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
-		journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
-
-		log_start_commit(journal, commit_tid);
-		log_wait_commit(journal, commit_tid);
-		filemap_write_and_wait(&inode->i_data);
-	}
-	truncate_inode_pages_final(&inode->i_data);
-
-	ext3_discard_reservation(inode);
-	rsv = ei->i_block_alloc_info;
-	ei->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
-
-	if (!want_delete)
-		goto no_delete;
-
-	handle = start_transaction(inode);
-	if (IS_ERR(handle)) {
-		/*
-		 * If we're going to skip the normal cleanup, we still need to
-		 * make sure that the in-core orphan linked list is properly
-		 * cleaned up.
-		 */
-		ext3_orphan_del(NULL, inode);
-		goto no_delete;
-	}
-
-	if (IS_SYNC(inode))
-		handle->h_sync = 1;
-	inode->i_size = 0;
-	if (inode->i_blocks)
-		ext3_truncate(inode);
-	/*
-	 * Kill off the orphan record created when the inode lost the last
-	 * link.  Note that ext3_orphan_del() has to be able to cope with the
-	 * deletion of a non-existent orphan - ext3_truncate() could
-	 * have removed the record.
-	 */
-	ext3_orphan_del(handle, inode);
-	ei->i_dtime = get_seconds();
-
-	/*
-	 * One subtle ordering requirement: if anything has gone wrong
-	 * (transaction abort, IO errors, whatever), then we can still
-	 * do these next steps (the fs will already have been marked as
-	 * having errors), but we can't free the inode if the mark_dirty
-	 * fails.
-	 */
-	if (ext3_mark_inode_dirty(handle, inode)) {
-		/* If that failed, just dquot_drop() and be done with that */
-		dquot_drop(inode);
-		clear_inode(inode);
-	} else {
-		ext3_xattr_delete_inode(handle, inode);
-		dquot_free_inode(inode);
-		dquot_drop(inode);
-		clear_inode(inode);
-		ext3_free_inode(handle, inode);
-	}
-	ext3_journal_stop(handle);
-	return;
-no_delete:
-	clear_inode(inode);
-	dquot_drop(inode);
-}
-
-typedef struct {
-	__le32	*p;
-	__le32	key;
-	struct buffer_head *bh;
-} Indirect;
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
-{
-	p->key = *(p->p = v);
-	p->bh = bh;
-}
-
-static int verify_chain(Indirect *from, Indirect *to)
-{
-	while (from <= to && from->key == *from->p)
-		from++;
-	return (from > to);
-}
-
-/**
- *	ext3_block_to_path - parse the block number into array of offsets
- *	@inode: inode in question (we are only interested in its superblock)
- *	@i_block: block number to be parsed
- *	@offsets: array to store the offsets in
- *      @boundary: set this non-zero if the referred-to block is likely to be
- *             followed (on disk) by an indirect block.
- *
- *	To store the locations of file's data ext3 uses a data structure common
- *	for UNIX filesystems - tree of pointers anchored in the inode, with
- *	data blocks at leaves and indirect blocks in intermediate nodes.
- *	This function translates the block number into path in that tree -
- *	return value is the path length and @offsets[n] is the offset of
- *	pointer to (n+1)th node in the nth one. If @block is out of range
- *	(negative or too large) warning is printed and zero returned.
- *
- *	Note: function doesn't find node addresses, so no IO is needed. All
- *	we need to know is the capacity of indirect blocks (taken from the
- *	inode->i_sb).
- */
-
-/*
- * Portability note: the last comparison (check that we fit into triple
- * indirect block) is spelled differently, because otherwise on an
- * architecture with 32-bit longs and 8Kb pages we might get into trouble
- * if our filesystem had 8Kb blocks. We might use long long, but that would
- * kill us on x86. Oh, well, at least the sign propagation does not matter -
- * i_block would have to be negative in the very beginning, so we would not
- * get there at all.
- */
-
-static int ext3_block_to_path(struct inode *inode,
-			long i_block, int offsets[4], int *boundary)
-{
-	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
-	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
-	const long direct_blocks = EXT3_NDIR_BLOCKS,
-		indirect_blocks = ptrs,
-		double_blocks = (1 << (ptrs_bits * 2));
-	int n = 0;
-	int final = 0;
-
-	if (i_block < 0) {
-		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
-	} else if (i_block < direct_blocks) {
-		offsets[n++] = i_block;
-		final = direct_blocks;
-	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
-		offsets[n++] = EXT3_IND_BLOCK;
-		offsets[n++] = i_block;
-		final = ptrs;
-	} else if ((i_block -= indirect_blocks) < double_blocks) {
-		offsets[n++] = EXT3_DIND_BLOCK;
-		offsets[n++] = i_block >> ptrs_bits;
-		offsets[n++] = i_block & (ptrs - 1);
-		final = ptrs;
-	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
-		offsets[n++] = EXT3_TIND_BLOCK;
-		offsets[n++] = i_block >> (ptrs_bits * 2);
-		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
-		offsets[n++] = i_block & (ptrs - 1);
-		final = ptrs;
-	} else {
-		ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
-	}
-	if (boundary)
-		*boundary = final - 1 - (i_block & (ptrs - 1));
-	return n;
-}
-
-/**
- *	ext3_get_branch - read the chain of indirect blocks leading to data
- *	@inode: inode in question
- *	@depth: depth of the chain (1 - direct pointer, etc.)
- *	@offsets: offsets of pointers in inode/indirect blocks
- *	@chain: place to store the result
- *	@err: here we store the error value
- *
- *	Function fills the array of triples <key, p, bh> and returns %NULL
- *	if everything went OK or the pointer to the last filled triple
- *	(incomplete one) otherwise. Upon the return chain[i].key contains
- *	the number of (i+1)-th block in the chain (as it is stored in memory,
- *	i.e. little-endian 32-bit), chain[i].p contains the address of that
- *	number (it points into struct inode for i==0 and into the bh->b_data
- *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
- *	block for i>0 and NULL for i==0. In other words, it holds the block
- *	numbers of the chain, addresses they were taken from (and where we can
- *	verify that chain did not change) and buffer_heads hosting these
- *	numbers.
- *
- *	Function stops when it stumbles upon zero pointer (absent block)
- *		(pointer to last triple returned, *@err == 0)
- *	or when it gets an IO error reading an indirect block
- *		(ditto, *@err == -EIO)
- *	or when it notices that chain had been changed while it was reading
- *		(ditto, *@err == -EAGAIN)
- *	or when it reads all @depth-1 indirect blocks successfully and finds
- *	the whole chain, all way to the data (returns %NULL, *err == 0).
- */
-static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
-				 Indirect chain[4], int *err)
-{
-	struct super_block *sb = inode->i_sb;
-	Indirect *p = chain;
-	struct buffer_head *bh;
-
-	*err = 0;
-	/* i_data is not going away, no lock needed */
-	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
-	if (!p->key)
-		goto no_block;
-	while (--depth) {
-		bh = sb_bread(sb, le32_to_cpu(p->key));
-		if (!bh)
-			goto failure;
-		/* Reader: pointers */
-		if (!verify_chain(chain, p))
-			goto changed;
-		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
-		/* Reader: end */
-		if (!p->key)
-			goto no_block;
-	}
-	return NULL;
-
-changed:
-	brelse(bh);
-	*err = -EAGAIN;
-	goto no_block;
-failure:
-	*err = -EIO;
-no_block:
-	return p;
-}
-
-/**
- *	ext3_find_near - find a place for allocation with sufficient locality
- *	@inode: owner
- *	@ind: descriptor of indirect block.
- *
- *	This function returns the preferred place for block allocation.
- *	It is used when heuristic for sequential allocation fails.
- *	Rules are:
- *	  + if there is a block to the left of our position - allocate near it.
- *	  + if pointer will live in indirect block - allocate near that block.
- *	  + if pointer will live in inode - allocate in the same
- *	    cylinder group.
- *
- * In the latter case we colour the starting block by the callers PID to
- * prevent it from clashing with concurrent allocations for a different inode
- * in the same block group.   The PID is used here so that functionally related
- * files will be close-by on-disk.
- *
- *	Caller must make sure that @ind is valid and will stay that way.
- */
-static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
-{
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
-	__le32 *p;
-	ext3_fsblk_t bg_start;
-	ext3_grpblk_t colour;
-
-	/* Try to find previous block */
-	for (p = ind->p - 1; p >= start; p--) {
-		if (*p)
-			return le32_to_cpu(*p);
-	}
-
-	/* No such thing, so let's try location of indirect block */
-	if (ind->bh)
-		return ind->bh->b_blocknr;
-
-	/*
-	 * It is going to be referred to from the inode itself? OK, just put it
-	 * into the same cylinder group then.
-	 */
-	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
-	colour = (current->pid % 16) *
-			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
-	return bg_start + colour;
-}
-
-/**
- *	ext3_find_goal - find a preferred place for allocation.
- *	@inode: owner
- *	@block:  block we want
- *	@partial: pointer to the last triple within a chain
- *
- *	Normally this function find the preferred place for block allocation,
- *	returns it.
- */
-
-static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
-				   Indirect *partial)
-{
-	struct ext3_block_alloc_info *block_i;
-
-	block_i =  EXT3_I(inode)->i_block_alloc_info;
-
-	/*
-	 * try the heuristic for sequential allocation,
-	 * failing that at least try to get decent locality.
-	 */
-	if (block_i && (block == block_i->last_alloc_logical_block + 1)
-		&& (block_i->last_alloc_physical_block != 0)) {
-		return block_i->last_alloc_physical_block + 1;
-	}
-
-	return ext3_find_near(inode, partial);
-}
-
-/**
- *	ext3_blks_to_allocate - Look up the block map and count the number
- *	of direct blocks need to be allocated for the given branch.
- *
- *	@branch: chain of indirect blocks
- *	@k: number of blocks need for indirect blocks
- *	@blks: number of data blocks to be mapped.
- *	@blocks_to_boundary:  the offset in the indirect block
- *
- *	return the total number of blocks to be allocate, including the
- *	direct and indirect blocks.
- */
-static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
-		int blocks_to_boundary)
-{
-	unsigned long count = 0;
-
-	/*
-	 * Simple case, [t,d]Indirect block(s) has not allocated yet
-	 * then it's clear blocks on that path have not allocated
-	 */
-	if (k > 0) {
-		/* right now we don't handle cross boundary allocation */
-		if (blks < blocks_to_boundary + 1)
-			count += blks;
-		else
-			count += blocks_to_boundary + 1;
-		return count;
-	}
-
-	count++;
-	while (count < blks && count <= blocks_to_boundary &&
-		le32_to_cpu(*(branch[0].p + count)) == 0) {
-		count++;
-	}
-	return count;
-}
-
-/**
- *	ext3_alloc_blocks - multiple allocate blocks needed for a branch
- *	@handle: handle for this transaction
- *	@inode: owner
- *	@goal: preferred place for allocation
- *	@indirect_blks: the number of blocks need to allocate for indirect
- *			blocks
- *	@blks:	number of blocks need to allocated for direct blocks
- *	@new_blocks: on return it will store the new block numbers for
- *	the indirect blocks(if needed) and the first direct block,
- *	@err: here we store the error value
- *
- *	return the number of direct blocks allocated
- */
-static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, int indirect_blks, int blks,
-			ext3_fsblk_t new_blocks[4], int *err)
-{
-	int target, i;
-	unsigned long count = 0;
-	int index = 0;
-	ext3_fsblk_t current_block = 0;
-	int ret = 0;
-
-	/*
-	 * Here we try to allocate the requested multiple blocks at once,
-	 * on a best-effort basis.
-	 * To build a branch, we should allocate blocks for
-	 * the indirect blocks(if not allocated yet), and at least
-	 * the first direct block of this branch.  That's the
-	 * minimum number of blocks need to allocate(required)
-	 */
-	target = blks + indirect_blks;
-
-	while (1) {
-		count = target;
-		/* allocating blocks for indirect blocks and direct blocks */
-		current_block = ext3_new_blocks(handle,inode,goal,&count,err);
-		if (*err)
-			goto failed_out;
-
-		target -= count;
-		/* allocate blocks for indirect blocks */
-		while (index < indirect_blks && count) {
-			new_blocks[index++] = current_block++;
-			count--;
-		}
-
-		if (count > 0)
-			break;
-	}
-
-	/* save the new block number for the first direct block */
-	new_blocks[index] = current_block;
-
-	/* total number of blocks allocated for direct blocks */
-	ret = count;
-	*err = 0;
-	return ret;
-failed_out:
-	for (i = 0; i <index; i++)
-		ext3_free_blocks(handle, inode, new_blocks[i], 1);
-	return ret;
-}
-
-/**
- *	ext3_alloc_branch - allocate and set up a chain of blocks.
- *	@handle: handle for this transaction
- *	@inode: owner
- *	@indirect_blks: number of allocated indirect blocks
- *	@blks: number of allocated direct blocks
- *	@goal: preferred place for allocation
- *	@offsets: offsets (in the blocks) to store the pointers to next.
- *	@branch: place to store the chain in.
- *
- *	This function allocates blocks, zeroes out all but the last one,
- *	links them into chain and (if we are synchronous) writes them to disk.
- *	In other words, it prepares a branch that can be spliced onto the
- *	inode. It stores the information about that chain in the branch[], in
- *	the same format as ext3_get_branch() would do. We are calling it after
- *	we had read the existing part of chain and partial points to the last
- *	triple of that (one with zero ->key). Upon the exit we have the same
- *	picture as after the successful ext3_get_block(), except that in one
- *	place chain is disconnected - *branch->p is still zero (we did not
- *	set the last link), but branch->key contains the number that should
- *	be placed into *branch->p to fill that gap.
- *
- *	If allocation fails we free all blocks we've allocated (and forget
- *	their buffer_heads) and return the error value the from failed
- *	ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
- *	as described above and return 0.
- */
-static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-			int indirect_blks, int *blks, ext3_fsblk_t goal,
-			int *offsets, Indirect *branch)
-{
-	int blocksize = inode->i_sb->s_blocksize;
-	int i, n = 0;
-	int err = 0;
-	struct buffer_head *bh;
-	int num;
-	ext3_fsblk_t new_blocks[4];
-	ext3_fsblk_t current_block;
-
-	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
-				*blks, new_blocks, &err);
-	if (err)
-		return err;
-
-	branch[0].key = cpu_to_le32(new_blocks[0]);
-	/*
-	 * metadata blocks and data blocks are allocated.
-	 */
-	for (n = 1; n <= indirect_blks;  n++) {
-		/*
-		 * Get buffer_head for parent block, zero it out
-		 * and set the pointer to new one, then send
-		 * parent to disk.
-		 */
-		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
-		if (unlikely(!bh)) {
-			err = -ENOMEM;
-			goto failed;
-		}
-		branch[n].bh = bh;
-		lock_buffer(bh);
-		BUFFER_TRACE(bh, "call get_create_access");
-		err = ext3_journal_get_create_access(handle, bh);
-		if (err) {
-			unlock_buffer(bh);
-			brelse(bh);
-			goto failed;
-		}
-
-		memset(bh->b_data, 0, blocksize);
-		branch[n].p = (__le32 *) bh->b_data + offsets[n];
-		branch[n].key = cpu_to_le32(new_blocks[n]);
-		*branch[n].p = branch[n].key;
-		if ( n == indirect_blks) {
-			current_block = new_blocks[n];
-			/*
-			 * End of chain, update the last new metablock of
-			 * the chain to point to the new allocated
-			 * data blocks numbers
-			 */
-			for (i=1; i < num; i++)
-				*(branch[n].p + i) = cpu_to_le32(++current_block);
-		}
-		BUFFER_TRACE(bh, "marking uptodate");
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-
-		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-		err = ext3_journal_dirty_metadata(handle, bh);
-		if (err)
-			goto failed;
-	}
-	*blks = num;
-	return err;
-failed:
-	/* Allocation failed, free what we already allocated */
-	for (i = 1; i <= n ; i++) {
-		BUFFER_TRACE(branch[i].bh, "call journal_forget");
-		ext3_journal_forget(handle, branch[i].bh);
-	}
-	for (i = 0; i < indirect_blks; i++)
-		ext3_free_blocks(handle, inode, new_blocks[i], 1);
-
-	ext3_free_blocks(handle, inode, new_blocks[i], num);
-
-	return err;
-}
-
-/**
- * ext3_splice_branch - splice the allocated branch onto inode.
- * @handle: handle for this transaction
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @where: location of missing link
- * @num:   number of indirect blocks we are adding
- * @blks:  number of direct blocks we are adding
- *
- * This function fills the missing link and does all housekeeping needed in
- * inode (->i_blocks, etc.). In case of success we end up with the full
- * chain to new block and return 0.
- */
-static int ext3_splice_branch(handle_t *handle, struct inode *inode,
-			long block, Indirect *where, int num, int blks)
-{
-	int i;
-	int err = 0;
-	struct ext3_block_alloc_info *block_i;
-	ext3_fsblk_t current_block;
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	struct timespec now;
-
-	block_i = ei->i_block_alloc_info;
-	/*
-	 * If we're splicing into a [td]indirect block (as opposed to the
-	 * inode) then we need to get write access to the [td]indirect block
-	 * before the splice.
-	 */
-	if (where->bh) {
-		BUFFER_TRACE(where->bh, "get_write_access");
-		err = ext3_journal_get_write_access(handle, where->bh);
-		if (err)
-			goto err_out;
-	}
-	/* That's it */
-
-	*where->p = where->key;
-
-	/*
-	 * Update the host buffer_head or inode to point to more just allocated
-	 * direct blocks blocks
-	 */
-	if (num == 0 && blks > 1) {
-		current_block = le32_to_cpu(where->key) + 1;
-		for (i = 1; i < blks; i++)
-			*(where->p + i ) = cpu_to_le32(current_block++);
-	}
-
-	/*
-	 * update the most recently allocated logical & physical block
-	 * in i_block_alloc_info, to assist find the proper goal block for next
-	 * allocation
-	 */
-	if (block_i) {
-		block_i->last_alloc_logical_block = block + blks - 1;
-		block_i->last_alloc_physical_block =
-				le32_to_cpu(where[num].key) + blks - 1;
-	}
-
-	/* We are done with atomic stuff, now do the rest of housekeeping */
-	now = CURRENT_TIME_SEC;
-	if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
-		inode->i_ctime = now;
-		ext3_mark_inode_dirty(handle, inode);
-	}
-	/* ext3_mark_inode_dirty already updated i_sync_tid */
-	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
-
-	/* had we spliced it onto indirect block? */
-	if (where->bh) {
-		/*
-		 * If we spliced it onto an indirect block, we haven't
-		 * altered the inode.  Note however that if it is being spliced
-		 * onto an indirect block at the very end of the file (the
-		 * file is growing) then we *will* alter the inode to reflect
-		 * the new i_size.  But that is not done here - it is done in
-		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
-		 */
-		jbd_debug(5, "splicing indirect only\n");
-		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
-		err = ext3_journal_dirty_metadata(handle, where->bh);
-		if (err)
-			goto err_out;
-	} else {
-		/*
-		 * OK, we spliced it into the inode itself on a direct block.
-		 * Inode was dirtied above.
-		 */
-		jbd_debug(5, "splicing direct\n");
-	}
-	return err;
-
-err_out:
-	for (i = 1; i <= num; i++) {
-		BUFFER_TRACE(where[i].bh, "call journal_forget");
-		ext3_journal_forget(handle, where[i].bh);
-		ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
-	}
-	ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
-
-	return err;
-}
-
-/*
- * Allocation strategy is simple: if we have to allocate something, we will
- * have to go the whole way to leaf. So let's do it before attaching anything
- * to tree, set linkage between the newborn blocks, write them if sync is
- * required, recheck the path, free and repeat if check fails, otherwise
- * set the last missing link (that will protect us from any truncate-generated
- * removals - all blocks on the path are immune now) and possibly force the
- * write on the parent block.
- * That has a nice additional property: no special recovery from the failed
- * allocations is needed - we simply release blocks and do not touch anything
- * reachable from inode.
- *
- * `handle' can be NULL if create == 0.
- *
- * The BKL may not be held on entry here.  Be sure to take it early.
- * return > 0, # of blocks mapped or allocated.
- * return = 0, if plain lookup failed.
- * return < 0, error case.
- */
-int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
-		sector_t iblock, unsigned long maxblocks,
-		struct buffer_head *bh_result,
-		int create)
-{
-	int err = -EIO;
-	int offsets[4];
-	Indirect chain[4];
-	Indirect *partial;
-	ext3_fsblk_t goal;
-	int indirect_blks;
-	int blocks_to_boundary = 0;
-	int depth;
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	int count = 0;
-	ext3_fsblk_t first_block = 0;
-
-
-	trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
-	J_ASSERT(handle != NULL || create == 0);
-	depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
-
-	if (depth == 0)
-		goto out;
-
-	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
-
-	/* Simplest case - block found, no allocation needed */
-	if (!partial) {
-		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result);
-		count++;
-		/*map more blocks*/
-		while (count < maxblocks && count <= blocks_to_boundary) {
-			ext3_fsblk_t blk;
-
-			if (!verify_chain(chain, chain + depth - 1)) {
-				/*
-				 * Indirect block might be removed by
-				 * truncate while we were reading it.
-				 * Handling of that case: forget what we've
-				 * got now. Flag the err as EAGAIN, so it
-				 * will reread.
-				 */
-				err = -EAGAIN;
-				count = 0;
-				break;
-			}
-			blk = le32_to_cpu(*(chain[depth-1].p + count));
-
-			if (blk == first_block + count)
-				count++;
-			else
-				break;
-		}
-		if (err != -EAGAIN)
-			goto got_it;
-	}
-
-	/* Next simple case - plain lookup or failed read of indirect block */
-	if (!create || err == -EIO)
-		goto cleanup;
-
-	/*
-	 * Block out ext3_truncate while we alter the tree
-	 */
-	mutex_lock(&ei->truncate_mutex);
-
-	/*
-	 * If the indirect block is missing while we are reading
-	 * the chain(ext3_get_branch() returns -EAGAIN err), or
-	 * if the chain has been changed after we grab the semaphore,
-	 * (either because another process truncated this branch, or
-	 * another get_block allocated this branch) re-grab the chain to see if
-	 * the request block has been allocated or not.
-	 *
-	 * Since we already block the truncate/other get_block
-	 * at this point, we will have the current copy of the chain when we
-	 * splice the branch into the tree.
-	 */
-	if (err == -EAGAIN || !verify_chain(chain, partial)) {
-		while (partial > chain) {
-			brelse(partial->bh);
-			partial--;
-		}
-		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
-		if (!partial) {
-			count++;
-			mutex_unlock(&ei->truncate_mutex);
-			if (err)
-				goto cleanup;
-			clear_buffer_new(bh_result);
-			goto got_it;
-		}
-	}
-
-	/*
-	 * Okay, we need to do block allocation.  Lazily initialize the block
-	 * allocation info here if necessary
-	*/
-	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
-		ext3_init_block_alloc_info(inode);
-
-	goal = ext3_find_goal(inode, iblock, partial);
-
-	/* the number of blocks need to allocate for [d,t]indirect blocks */
-	indirect_blks = (chain + depth) - partial - 1;
-
-	/*
-	 * Next look up the indirect map to count the totoal number of
-	 * direct blocks to allocate for this branch.
-	 */
-	count = ext3_blks_to_allocate(partial, indirect_blks,
-					maxblocks, blocks_to_boundary);
-	err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
-				offsets + (partial - chain), partial);
-
-	/*
-	 * The ext3_splice_branch call will free and forget any buffers
-	 * on the new chain if there is a failure, but that risks using
-	 * up transaction credits, especially for bitmaps where the
-	 * credits cannot be returned.  Can we handle this somehow?  We
-	 * may need to return -EAGAIN upwards in the worst case.  --sct
-	 */
-	if (!err)
-		err = ext3_splice_branch(handle, inode, iblock,
-					partial, indirect_blks, count);
-	mutex_unlock(&ei->truncate_mutex);
-	if (err)
-		goto cleanup;
-
-	set_buffer_new(bh_result);
-got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
-	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
-	err = count;
-	/* Clean up and exit */
-	partial = chain + depth - 1;	/* the whole chain */
-cleanup:
-	while (partial > chain) {
-		BUFFER_TRACE(partial->bh, "call brelse");
-		brelse(partial->bh);
-		partial--;
-	}
-	BUFFER_TRACE(bh_result, "returned");
-out:
-	trace_ext3_get_blocks_exit(inode, iblock,
-				   depth ? le32_to_cpu(chain[depth-1].key) : 0,
-				   count, err);
-	return err;
-}
-
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
-/*
- * Number of credits we need for writing DIO_MAX_BLOCKS:
- * We need sb + group descriptor + bitmap + inode -> 4
- * For B blocks with A block pointers per block we need:
- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
- */
-#define DIO_CREDITS 25
-
-static int ext3_get_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
-{
-	handle_t *handle = ext3_journal_current_handle();
-	int ret = 0, started = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-
-	if (create && !handle) {	/* Direct IO write... */
-		if (max_blocks > DIO_MAX_BLOCKS)
-			max_blocks = DIO_MAX_BLOCKS;
-		handle = ext3_journal_start(inode, DIO_CREDITS +
-				EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-		started = 1;
-	}
-
-	ret = ext3_get_blocks_handle(handle, inode, iblock,
-					max_blocks, bh_result, create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
-	}
-	if (started)
-		ext3_journal_stop(handle);
-out:
-	return ret;
-}
-
-int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		u64 start, u64 len)
-{
-	return generic_block_fiemap(inode, fieinfo, start, len,
-				    ext3_get_block);
-}
-
-/*
- * `handle' can be NULL if create is zero
- */
-struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
-				long block, int create, int *errp)
-{
-	struct buffer_head dummy;
-	int fatal = 0, err;
-
-	J_ASSERT(handle != NULL || create == 0);
-
-	dummy.b_state = 0;
-	dummy.b_blocknr = -1000;
-	buffer_trace_init(&dummy.b_history);
-	err = ext3_get_blocks_handle(handle, inode, block, 1,
-					&dummy, create);
-	/*
-	 * ext3_get_blocks_handle() returns number of blocks
-	 * mapped. 0 in case of a HOLE.
-	 */
-	if (err > 0) {
-		WARN_ON(err > 1);
-		err = 0;
-	}
-	*errp = err;
-	if (!err && buffer_mapped(&dummy)) {
-		struct buffer_head *bh;
-		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-		if (unlikely(!bh)) {
-			*errp = -ENOMEM;
-			goto err;
-		}
-		if (buffer_new(&dummy)) {
-			J_ASSERT(create != 0);
-			J_ASSERT(handle != NULL);
-
-			/*
-			 * Now that we do not always journal data, we should
-			 * keep in mind whether this should always journal the
-			 * new buffer as metadata.  For now, regular file
-			 * writes use ext3_get_block instead, so it's not a
-			 * problem.
-			 */
-			lock_buffer(bh);
-			BUFFER_TRACE(bh, "call get_create_access");
-			fatal = ext3_journal_get_create_access(handle, bh);
-			if (!fatal && !buffer_uptodate(bh)) {
-				memset(bh->b_data,0,inode->i_sb->s_blocksize);
-				set_buffer_uptodate(bh);
-			}
-			unlock_buffer(bh);
-			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-			err = ext3_journal_dirty_metadata(handle, bh);
-			if (!fatal)
-				fatal = err;
-		} else {
-			BUFFER_TRACE(bh, "not a new buffer");
-		}
-		if (fatal) {
-			*errp = fatal;
-			brelse(bh);
-			bh = NULL;
-		}
-		return bh;
-	}
-err:
-	return NULL;
-}
-
-struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
-			       int block, int create, int *err)
-{
-	struct buffer_head * bh;
-
-	bh = ext3_getblk(handle, inode, block, create, err);
-	if (!bh)
-		return bh;
-	if (bh_uptodate_or_lock(bh))
-		return bh;
-	get_bh(bh);
-	bh->b_end_io = end_buffer_read_sync;
-	submit_bh(READ | REQ_META | REQ_PRIO, bh);
-	wait_on_buffer(bh);
-	if (buffer_uptodate(bh))
-		return bh;
-	put_bh(bh);
-	*err = -EIO;
-	return NULL;
-}
-
-static int walk_page_buffers(	handle_t *handle,
-				struct buffer_head *head,
-				unsigned from,
-				unsigned to,
-				int *partial,
-				int (*fn)(	handle_t *handle,
-						struct buffer_head *bh))
-{
-	struct buffer_head *bh;
-	unsigned block_start, block_end;
-	unsigned blocksize = head->b_size;
-	int err, ret = 0;
-	struct buffer_head *next;
-
-	for (	bh = head, block_start = 0;
-		ret == 0 && (bh != head || !block_start);
-		block_start = block_end, bh = next)
-	{
-		next = bh->b_this_page;
-		block_end = block_start + blocksize;
-		if (block_end <= from || block_start >= to) {
-			if (partial && !buffer_uptodate(bh))
-				*partial = 1;
-			continue;
-		}
-		err = (*fn)(handle, bh);
-		if (!ret)
-			ret = err;
-	}
-	return ret;
-}
-
-/*
- * To preserve ordering, it is essential that the hole instantiation and
- * the data write be encapsulated in a single transaction.  We cannot
- * close off a transaction and start a new one between the ext3_get_block()
- * and the commit_write().  So doing the journal_start at the start of
- * prepare_write() is the right place.
- *
- * Also, this function can nest inside ext3_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext3_writepage()
- * has generated enough buffer credits to do the whole page.  So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
- *
- * By accident, ext3 can be reentered when a transaction is open via
- * quota file writes.  If we were to commit the transaction while thus
- * reentered, there can be a deadlock - we would be holding a quota
- * lock, and the commit would never complete if another thread had a
- * transaction open and was blocking on the quota lock - a ranking
- * violation.
- *
- * So what we do is to rely on the fact that journal_stop/journal_start
- * will _not_ run commit under these circumstances because handle->h_ref
- * is elevated.  We'll still have enough credits for the tiny quotafile
- * write.
- */
-static int do_journal_get_write_access(handle_t *handle,
-					struct buffer_head *bh)
-{
-	int dirty = buffer_dirty(bh);
-	int ret;
-
-	if (!buffer_mapped(bh) || buffer_freed(bh))
-		return 0;
-	/*
-	 * __block_prepare_write() could have dirtied some buffers. Clean
-	 * the dirty bit as jbd2_journal_get_write_access() could complain
-	 * otherwise about fs integrity issues. Setting of the dirty bit
-	 * by __block_prepare_write() isn't a real problem here as we clear
-	 * the bit before releasing a page lock and thus writeback cannot
-	 * ever write the buffer.
-	 */
-	if (dirty)
-		clear_buffer_dirty(bh);
-	ret = ext3_journal_get_write_access(handle, bh);
-	if (!ret && dirty)
-		ret = ext3_journal_dirty_metadata(handle, bh);
-	return ret;
-}
-
-/*
- * Truncate blocks that were not used by write. We have to truncate the
- * pagecache as well so that corresponding buffers get properly unmapped.
- */
-static void ext3_truncate_failed_write(struct inode *inode)
-{
-	truncate_inode_pages(inode->i_mapping, inode->i_size);
-	ext3_truncate(inode);
-}
-
-/*
- * Truncate blocks that were not used by direct IO write. We have to zero out
- * the last file block as well because direct IO might have written to it.
- */
-static void ext3_truncate_failed_direct_write(struct inode *inode)
-{
-	ext3_block_truncate_page(inode, inode->i_size);
-	ext3_truncate(inode);
-}
-
-static int ext3_write_begin(struct file *file, struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned flags,
-				struct page **pagep, void **fsdata)
-{
-	struct inode *inode = mapping->host;
-	int ret;
-	handle_t *handle;
-	int retries = 0;
-	struct page *page;
-	pgoff_t index;
-	unsigned from, to;
-	/* Reserve one block more for addition to orphan list in case
-	 * we allocate blocks but write fails for some reason */
-	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
-
-	trace_ext3_write_begin(inode, pos, len, flags);
-
-	index = pos >> PAGE_CACHE_SHIFT;
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
-
-retry:
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page)
-		return -ENOMEM;
-	*pagep = page;
-
-	handle = ext3_journal_start(inode, needed_blocks);
-	if (IS_ERR(handle)) {
-		unlock_page(page);
-		page_cache_release(page);
-		ret = PTR_ERR(handle);
-		goto out;
-	}
-	ret = __block_write_begin(page, pos, len, ext3_get_block);
-	if (ret)
-		goto write_begin_failed;
-
-	if (ext3_should_journal_data(inode)) {
-		ret = walk_page_buffers(handle, page_buffers(page),
-				from, to, NULL, do_journal_get_write_access);
-	}
-write_begin_failed:
-	if (ret) {
-		/*
-		 * block_write_begin may have instantiated a few blocks
-		 * outside i_size.  Trim these off again. Don't need
-		 * i_size_read because we hold i_mutex.
-		 *
-		 * Add inode to orphan list in case we crash before truncate
-		 * finishes. Do this only if ext3_can_truncate() agrees so
-		 * that orphan processing code is happy.
-		 */
-		if (pos + len > inode->i_size && ext3_can_truncate(inode))
-			ext3_orphan_add(handle, inode);
-		ext3_journal_stop(handle);
-		unlock_page(page);
-		page_cache_release(page);
-		if (pos + len > inode->i_size)
-			ext3_truncate_failed_write(inode);
-	}
-	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
-	return ret;
-}
-
-
-int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-	int err = journal_dirty_data(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(__func__, __func__,
-						bh, handle, err);
-	return err;
-}
-
-/* For ordered writepage and write_end functions */
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-	/*
-	 * Write could have mapped the buffer but it didn't copy the data in
-	 * yet. So avoid filing such buffer into a transaction.
-	 */
-	if (buffer_mapped(bh) && buffer_uptodate(bh))
-		return ext3_journal_dirty_data(handle, bh);
-	return 0;
-}
-
-/* For write_end() in data=journal mode */
-static int write_end_fn(handle_t *handle, struct buffer_head *bh)
-{
-	if (!buffer_mapped(bh) || buffer_freed(bh))
-		return 0;
-	set_buffer_uptodate(bh);
-	return ext3_journal_dirty_metadata(handle, bh);
-}
-
-/*
- * This is nasty and subtle: ext3_write_begin() could have allocated blocks
- * for the whole page but later we failed to copy the data in. Update inode
- * size according to what we managed to copy. The rest is going to be
- * truncated in write_end function.
- */
-static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
-{
-	/* What matters to us is i_disksize. We don't write i_size anywhere */
-	if (pos + copied > inode->i_size)
-		i_size_write(inode, pos + copied);
-	if (pos + copied > EXT3_I(inode)->i_disksize) {
-		EXT3_I(inode)->i_disksize = pos + copied;
-		mark_inode_dirty(inode);
-	}
-}
-
-/*
- * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
- *
- * ext3 never places buffers on inode->i_mapping->private_list.  metadata
- * buffers are managed internally.
- */
-static int ext3_ordered_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
-{
-	handle_t *handle = ext3_journal_current_handle();
-	struct inode *inode = file->f_mapping->host;
-	unsigned from, to;
-	int ret = 0, ret2;
-
-	trace_ext3_ordered_write_end(inode, pos, len, copied);
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + copied;
-	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, journal_dirty_data_fn);
-
-	if (ret == 0)
-		update_file_sizes(inode, pos, copied);
-	/*
-	 * There may be allocated blocks outside of i_size because
-	 * we failed to copy some data. Prepare for truncate.
-	 */
-	if (pos + len > inode->i_size && ext3_can_truncate(inode))
-		ext3_orphan_add(handle, inode);
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
-	unlock_page(page);
-	page_cache_release(page);
-
-	if (pos + len > inode->i_size)
-		ext3_truncate_failed_write(inode);
-	return ret ? ret : copied;
-}
-
-static int ext3_writeback_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
-{
-	handle_t *handle = ext3_journal_current_handle();
-	struct inode *inode = file->f_mapping->host;
-	int ret;
-
-	trace_ext3_writeback_write_end(inode, pos, len, copied);
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-	update_file_sizes(inode, pos, copied);
-	/*
-	 * There may be allocated blocks outside of i_size because
-	 * we failed to copy some data. Prepare for truncate.
-	 */
-	if (pos + len > inode->i_size && ext3_can_truncate(inode))
-		ext3_orphan_add(handle, inode);
-	ret = ext3_journal_stop(handle);
-	unlock_page(page);
-	page_cache_release(page);
-
-	if (pos + len > inode->i_size)
-		ext3_truncate_failed_write(inode);
-	return ret ? ret : copied;
-}
-
-static int ext3_journalled_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
-{
-	handle_t *handle = ext3_journal_current_handle();
-	struct inode *inode = mapping->host;
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	int ret = 0, ret2;
-	int partial = 0;
-	unsigned from, to;
-
-	trace_ext3_journalled_write_end(inode, pos, len, copied);
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
-
-	if (copied < len) {
-		if (!PageUptodate(page))
-			copied = 0;
-		page_zero_new_buffers(page, from + copied, to);
-		to = from + copied;
-	}
-
-	ret = walk_page_buffers(handle, page_buffers(page), from,
-				to, &partial, write_end_fn);
-	if (!partial)
-		SetPageUptodate(page);
-
-	if (pos + copied > inode->i_size)
-		i_size_write(inode, pos + copied);
-	/*
-	 * There may be allocated blocks outside of i_size because
-	 * we failed to copy some data. Prepare for truncate.
-	 */
-	if (pos + len > inode->i_size && ext3_can_truncate(inode))
-		ext3_orphan_add(handle, inode);
-	ext3_set_inode_state(inode, EXT3_STATE_JDATA);
-	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
-	if (inode->i_size > ei->i_disksize) {
-		ei->i_disksize = inode->i_size;
-		ret2 = ext3_mark_inode_dirty(handle, inode);
-		if (!ret)
-			ret = ret2;
-	}
-
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
-	unlock_page(page);
-	page_cache_release(page);
-
-	if (pos + len > inode->i_size)
-		ext3_truncate_failed_write(inode);
-	return ret ? ret : copied;
-}
-
-/*
- * bmap() is special.  It gets used by applications such as lilo and by
- * the swapper to find the on-disk block of a specific piece of data.
- *
- * Naturally, this is dangerous if the block concerned is still in the
- * journal.  If somebody makes a swapfile on an ext3 data-journaling
- * filesystem and enables swap, then they may get a nasty shock when the
- * data getting swapped to that swapfile suddenly gets overwritten by
- * the original zero's written out previously to the journal and
- * awaiting writeback in the kernel's buffer cache.
- *
- * So, if we see any bmap calls here on a modified, data-journaled file,
- * take extra steps to flush any blocks which might be in the cache.
- */
-static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
-{
-	struct inode *inode = mapping->host;
-	journal_t *journal;
-	int err;
-
-	if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
-		/*
-		 * This is a REALLY heavyweight approach, but the use of
-		 * bmap on dirty files is expected to be extremely rare:
-		 * only if we run lilo or swapon on a freshly made file
-		 * do we expect this to happen.
-		 *
-		 * (bmap requires CAP_SYS_RAWIO so this does not
-		 * represent an unprivileged user DOS attack --- we'd be
-		 * in trouble if mortal users could trigger this path at
-		 * will.)
-		 *
-		 * NB. EXT3_STATE_JDATA is not set on files other than
-		 * regular files.  If somebody wants to bmap a directory
-		 * or symlink and gets confused because the buffer
-		 * hasn't yet been flushed to disk, they deserve
-		 * everything they get.
-		 */
-
-		ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
-		journal = EXT3_JOURNAL(inode);
-		journal_lock_updates(journal);
-		err = journal_flush(journal);
-		journal_unlock_updates(journal);
-
-		if (err)
-			return 0;
-	}
-
-	return generic_block_bmap(mapping,block,ext3_get_block);
-}
-
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
-	get_bh(bh);
-	return 0;
-}
-
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
-	put_bh(bh);
-	return 0;
-}
-
-static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
-{
-	return !buffer_mapped(bh);
-}
-
-/*
- * Note that whenever we need to map blocks we start a transaction even if
- * we're not journalling data.  This is to preserve ordering: any hole
- * instantiation within __block_write_full_page -> ext3_get_block() should be
- * journalled along with the data so we don't crash and then get metadata which
- * refers to old data.
- *
- * In all journalling modes block_write_full_page() will start the I/O.
- *
- * We don't honour synchronous mounts for writepage().  That would be
- * disastrous.  Any write() or metadata operation will sync the fs for
- * us.
- */
-static int ext3_ordered_writepage(struct page *page,
-				struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	struct buffer_head *page_bufs;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
-
-	J_ASSERT(PageLocked(page));
-	/*
-	 * We don't want to warn for emergency remount. The condition is
-	 * ordered to avoid dereferencing inode->i_sb in non-error case to
-	 * avoid slow-downs.
-	 */
-	WARN_ON_ONCE(IS_RDONLY(inode) &&
-		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
-
-	/*
-	 * We give up here if we're reentered, because it might be for a
-	 * different filesystem.
-	 */
-	if (ext3_journal_current_handle())
-		goto out_fail;
-
-	trace_ext3_ordered_writepage(page);
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				(1 << BH_Dirty)|(1 << BH_Uptodate));
-		page_bufs = page_buffers(page);
-	} else {
-		page_bufs = page_buffers(page);
-		if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
-				       NULL, buffer_unmapped)) {
-			/* Provide NULL get_block() to catch bugs if buffers
-			 * weren't really mapped */
-			return block_write_full_page(page, NULL, wbc);
-		}
-	}
-	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
-
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out_fail;
-	}
-
-	walk_page_buffers(handle, page_bufs, 0,
-			PAGE_CACHE_SIZE, NULL, bget_one);
-
-	ret = block_write_full_page(page, ext3_get_block, wbc);
-
-	/*
-	 * The page can become unlocked at any point now, and
-	 * truncate can then come in and change things.  So we
-	 * can't touch *page from now on.  But *page_bufs is
-	 * safe due to elevated refcount.
-	 */
-
-	/*
-	 * And attach them to the current transaction.  But only if
-	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
-	 * and generally junk.
-	 */
-	if (ret == 0)
-		ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
-					NULL, journal_dirty_data_fn);
-	walk_page_buffers(handle, page_bufs, 0,
-			PAGE_CACHE_SIZE, NULL, bput_one);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-
-out_fail:
-	redirty_page_for_writepage(wbc, page);
-	unlock_page(page);
-	return ret;
-}
-
-static int ext3_writeback_writepage(struct page *page,
-				struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
-
-	J_ASSERT(PageLocked(page));
-	/*
-	 * We don't want to warn for emergency remount. The condition is
-	 * ordered to avoid dereferencing inode->i_sb in non-error case to
-	 * avoid slow-downs.
-	 */
-	WARN_ON_ONCE(IS_RDONLY(inode) &&
-		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
-
-	if (ext3_journal_current_handle())
-		goto out_fail;
-
-	trace_ext3_writeback_writepage(page);
-	if (page_has_buffers(page)) {
-		if (!walk_page_buffers(NULL, page_buffers(page), 0,
-				      PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
-			/* Provide NULL get_block() to catch bugs if buffers
-			 * weren't really mapped */
-			return block_write_full_page(page, NULL, wbc);
-		}
-	}
-
-	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out_fail;
-	}
-
-	ret = block_write_full_page(page, ext3_get_block, wbc);
-
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-
-out_fail:
-	redirty_page_for_writepage(wbc, page);
-	unlock_page(page);
-	return ret;
-}
-
-static int ext3_journalled_writepage(struct page *page,
-				struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
-
-	J_ASSERT(PageLocked(page));
-	/*
-	 * We don't want to warn for emergency remount. The condition is
-	 * ordered to avoid dereferencing inode->i_sb in non-error case to
-	 * avoid slow-downs.
-	 */
-	WARN_ON_ONCE(IS_RDONLY(inode) &&
-		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
-
-	trace_ext3_journalled_writepage(page);
-	if (!page_has_buffers(page) || PageChecked(page)) {
-		if (ext3_journal_current_handle())
-			goto no_write;
-
-		handle = ext3_journal_start(inode,
-					    ext3_writepage_trans_blocks(inode));
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto no_write;
-		}
-		/*
-		 * It's mmapped pagecache.  Add buffers and journal it.  There
-		 * doesn't seem much point in redirtying the page here.
-		 */
-		ClearPageChecked(page);
-		ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
-					  ext3_get_block);
-		if (ret != 0) {
-			ext3_journal_stop(handle);
-			goto out_unlock;
-		}
-		ret = walk_page_buffers(handle, page_buffers(page), 0,
-			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
-		err = walk_page_buffers(handle, page_buffers(page), 0,
-				PAGE_CACHE_SIZE, NULL, write_end_fn);
-		if (ret == 0)
-			ret = err;
-		ext3_set_inode_state(inode, EXT3_STATE_JDATA);
-		atomic_set(&EXT3_I(inode)->i_datasync_tid,
-			   handle->h_transaction->t_tid);
-		unlock_page(page);
-		err = ext3_journal_stop(handle);
-		if (!ret)
-			ret = err;
-	} else {
-		/*
-		 * It is a page full of checkpoint-mode buffers. Go and write
-		 * them. They should have been already mapped when they went
-		 * to the journal so provide NULL get_block function to catch
-		 * errors.
-		 */
-		ret = block_write_full_page(page, NULL, wbc);
-	}
-out:
-	return ret;
-
-no_write:
-	redirty_page_for_writepage(wbc, page);
-out_unlock:
-	unlock_page(page);
-	goto out;
-}
-
-static int ext3_readpage(struct file *file, struct page *page)
-{
-	trace_ext3_readpage(page);
-	return mpage_readpage(page, ext3_get_block);
-}
-
-static int
-ext3_readpages(struct file *file, struct address_space *mapping,
-		struct list_head *pages, unsigned nr_pages)
-{
-	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
-}
-
-static void ext3_invalidatepage(struct page *page, unsigned int offset,
-				unsigned int length)
-{
-	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
-
-	trace_ext3_invalidatepage(page, offset, length);
-
-	/*
-	 * If it's a full truncate we just forget about the pending dirtying
-	 */
-	if (offset == 0 && length == PAGE_CACHE_SIZE)
-		ClearPageChecked(page);
-
-	journal_invalidatepage(journal, page, offset, length);
-}
-
-static int ext3_releasepage(struct page *page, gfp_t wait)
-{
-	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
-
-	trace_ext3_releasepage(page);
-	WARN_ON(PageChecked(page));
-	if (!page_has_buffers(page))
-		return 0;
-	return journal_try_to_free_buffers(journal, page, wait);
-}
-
-/*
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list.  So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	handle_t *handle;
-	ssize_t ret;
-	int orphan = 0;
-	size_t count = iov_iter_count(iter);
-	int retries = 0;
-
-	trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-
-	if (iov_iter_rw(iter) == WRITE) {
-		loff_t final_size = offset + count;
-
-		if (final_size > inode->i_size) {
-			/* Credits for sb + inode write */
-			handle = ext3_journal_start(inode, 2);
-			if (IS_ERR(handle)) {
-				ret = PTR_ERR(handle);
-				goto out;
-			}
-			ret = ext3_orphan_add(handle, inode);
-			if (ret) {
-				ext3_journal_stop(handle);
-				goto out;
-			}
-			orphan = 1;
-			ei->i_disksize = inode->i_size;
-			ext3_journal_stop(handle);
-		}
-	}
-
-retry:
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block);
-	/*
-	 * In case of error extending write may have instantiated a few
-	 * blocks outside i_size. Trim these off again.
-	 */
-	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
-		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
-
-		if (end > isize)
-			ext3_truncate_failed_direct_write(inode);
-	}
-	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-
-	if (orphan) {
-		int err;
-
-		/* Credits for sb + inode write */
-		handle = ext3_journal_start(inode, 2);
-		if (IS_ERR(handle)) {
-			/* This is really bad luck. We've written the data
-			 * but cannot extend i_size. Truncate allocated blocks
-			 * and pretend the write failed... */
-			ext3_truncate_failed_direct_write(inode);
-			ret = PTR_ERR(handle);
-			if (inode->i_nlink)
-				ext3_orphan_del(NULL, inode);
-			goto out;
-		}
-		if (inode->i_nlink)
-			ext3_orphan_del(handle, inode);
-		if (ret > 0) {
-			loff_t end = offset + ret;
-			if (end > inode->i_size) {
-				ei->i_disksize = end;
-				i_size_write(inode, end);
-				/*
-				 * We're going to return a positive `ret'
-				 * here due to non-zero-length I/O, so there's
-				 * no way of reporting error returns from
-				 * ext3_mark_inode_dirty() to userspace.  So
-				 * ignore it.
-				 */
-				ext3_mark_inode_dirty(handle, inode);
-			}
-		}
-		err = ext3_journal_stop(handle);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-	return ret;
-}
-
-/*
- * Pages can be marked dirty completely asynchronously from ext3's journalling
- * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
- * much here because ->set_page_dirty is called under VFS locks.  The page is
- * not necessarily locked.
- *
- * We cannot just dirty the page and leave attached buffers clean, because the
- * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
- * or jbddirty because all the journalling code will explode.
- *
- * So what we do is to mark the page "pending dirty" and next time writepage
- * is called, propagate that into the buffers appropriately.
- */
-static int ext3_journalled_set_page_dirty(struct page *page)
-{
-	SetPageChecked(page);
-	return __set_page_dirty_nobuffers(page);
-}
-
-static const struct address_space_operations ext3_ordered_aops = {
-	.readpage		= ext3_readpage,
-	.readpages		= ext3_readpages,
-	.writepage		= ext3_ordered_writepage,
-	.write_begin		= ext3_write_begin,
-	.write_end		= ext3_ordered_write_end,
-	.bmap			= ext3_bmap,
-	.invalidatepage		= ext3_invalidatepage,
-	.releasepage		= ext3_releasepage,
-	.direct_IO		= ext3_direct_IO,
-	.migratepage		= buffer_migrate_page,
-	.is_partially_uptodate  = block_is_partially_uptodate,
-	.is_dirty_writeback	= buffer_check_dirty_writeback,
-	.error_remove_page	= generic_error_remove_page,
-};
-
-static const struct address_space_operations ext3_writeback_aops = {
-	.readpage		= ext3_readpage,
-	.readpages		= ext3_readpages,
-	.writepage		= ext3_writeback_writepage,
-	.write_begin		= ext3_write_begin,
-	.write_end		= ext3_writeback_write_end,
-	.bmap			= ext3_bmap,
-	.invalidatepage		= ext3_invalidatepage,
-	.releasepage		= ext3_releasepage,
-	.direct_IO		= ext3_direct_IO,
-	.migratepage		= buffer_migrate_page,
-	.is_partially_uptodate  = block_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
-};
-
-static const struct address_space_operations ext3_journalled_aops = {
-	.readpage		= ext3_readpage,
-	.readpages		= ext3_readpages,
-	.writepage		= ext3_journalled_writepage,
-	.write_begin		= ext3_write_begin,
-	.write_end		= ext3_journalled_write_end,
-	.set_page_dirty		= ext3_journalled_set_page_dirty,
-	.bmap			= ext3_bmap,
-	.invalidatepage		= ext3_invalidatepage,
-	.releasepage		= ext3_releasepage,
-	.is_partially_uptodate  = block_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
-};
-
-void ext3_set_aops(struct inode *inode)
-{
-	if (ext3_should_order_data(inode))
-		inode->i_mapping->a_ops = &ext3_ordered_aops;
-	else if (ext3_should_writeback_data(inode))
-		inode->i_mapping->a_ops = &ext3_writeback_aops;
-	else
-		inode->i_mapping->a_ops = &ext3_journalled_aops;
-}
-
-/*
- * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-static int ext3_block_truncate_page(struct inode *inode, loff_t from)
-{
-	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
-	unsigned blocksize, iblock, length, pos;
-	struct page *page;
-	handle_t *handle = NULL;
-	struct buffer_head *bh;
-	int err = 0;
-
-	/* Truncated on block boundary - nothing to do */
-	blocksize = inode->i_sb->s_blocksize;
-	if ((from & (blocksize - 1)) == 0)
-		return 0;
-
-	page = grab_cache_page(inode->i_mapping, index);
-	if (!page)
-		return -ENOMEM;
-	length = blocksize - (offset & (blocksize - 1));
-	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-
-	/* Find the buffer that contains "offset" */
-	bh = page_buffers(page);
-	pos = blocksize;
-	while (offset >= pos) {
-		bh = bh->b_this_page;
-		iblock++;
-		pos += blocksize;
-	}
-
-	err = 0;
-	if (buffer_freed(bh)) {
-		BUFFER_TRACE(bh, "freed: skip");
-		goto unlock;
-	}
-
-	if (!buffer_mapped(bh)) {
-		BUFFER_TRACE(bh, "unmapped");
-		ext3_get_block(inode, iblock, bh, 0);
-		/* unmapped? It's a hole - nothing to do */
-		if (!buffer_mapped(bh)) {
-			BUFFER_TRACE(bh, "still unmapped");
-			goto unlock;
-		}
-	}
-
-	/* Ok, it's mapped. Make sure it's up-to-date */
-	if (PageUptodate(page))
-		set_buffer_uptodate(bh);
-
-	if (!bh_uptodate_or_lock(bh)) {
-		err = bh_submit_read(bh);
-		/* Uhhuh. Read error. Complain and punt. */
-		if (err)
-			goto unlock;
-	}
-
-	/* data=writeback mode doesn't need transaction to zero-out data */
-	if (!ext3_should_writeback_data(inode)) {
-		/* We journal at most one block */
-		handle = ext3_journal_start(inode, 1);
-		if (IS_ERR(handle)) {
-			clear_highpage(page);
-			flush_dcache_page(page);
-			err = PTR_ERR(handle);
-			goto unlock;
-		}
-	}
-
-	if (ext3_should_journal_data(inode)) {
-		BUFFER_TRACE(bh, "get write access");
-		err = ext3_journal_get_write_access(handle, bh);
-		if (err)
-			goto stop;
-	}
-
-	zero_user(page, offset, length);
-	BUFFER_TRACE(bh, "zeroed end of block");
-
-	err = 0;
-	if (ext3_should_journal_data(inode)) {
-		err = ext3_journal_dirty_metadata(handle, bh);
-	} else {
-		if (ext3_should_order_data(inode))
-			err = ext3_journal_dirty_data(handle, bh);
-		mark_buffer_dirty(bh);
-	}
-stop:
-	if (handle)
-		ext3_journal_stop(handle);
-
-unlock:
-	unlock_page(page);
-	page_cache_release(page);
-	return err;
-}
-
-/*
- * Probably it should be a library function... search for first non-zero word
- * or memcmp with zero_page, whatever is better for particular architecture.
- * Linus?
- */
-static inline int all_zeroes(__le32 *p, __le32 *q)
-{
-	while (p < q)
-		if (*p++)
-			return 0;
-	return 1;
-}
-
-/**
- *	ext3_find_shared - find the indirect blocks for partial truncation.
- *	@inode:	  inode in question
- *	@depth:	  depth of the affected branch
- *	@offsets: offsets of pointers in that branch (see ext3_block_to_path)
- *	@chain:	  place to store the pointers to partial indirect blocks
- *	@top:	  place to the (detached) top of branch
- *
- *	This is a helper function used by ext3_truncate().
- *
- *	When we do truncate() we may have to clean the ends of several
- *	indirect blocks but leave the blocks themselves alive. Block is
- *	partially truncated if some data below the new i_size is referred
- *	from it (and it is on the path to the first completely truncated
- *	data block, indeed).  We have to free the top of that path along
- *	with everything to the right of the path. Since no allocation
- *	past the truncation point is possible until ext3_truncate()
- *	finishes, we may safely do the latter, but top of branch may
- *	require special attention - pageout below the truncation point
- *	might try to populate it.
- *
- *	We atomically detach the top of branch from the tree, store the
- *	block number of its root in *@top, pointers to buffer_heads of
- *	partially truncated blocks - in @chain[].bh and pointers to
- *	their last elements that should not be removed - in
- *	@chain[].p. Return value is the pointer to last filled element
- *	of @chain.
- *
- *	The work left to caller to do the actual freeing of subtrees:
- *		a) free the subtree starting from *@top
- *		b) free the subtrees whose roots are stored in
- *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
- *		c) free the subtrees growing from the inode past the @chain[0].
- *			(no partially truncated stuff there).  */
-
-static Indirect *ext3_find_shared(struct inode *inode, int depth,
-			int offsets[4], Indirect chain[4], __le32 *top)
-{
-	Indirect *partial, *p;
-	int k, err;
-
-	*top = 0;
-	/* Make k index the deepest non-null offset + 1 */
-	for (k = depth; k > 1 && !offsets[k-1]; k--)
-		;
-	partial = ext3_get_branch(inode, k, offsets, chain, &err);
-	/* Writer: pointers */
-	if (!partial)
-		partial = chain + k-1;
-	/*
-	 * If the branch acquired continuation since we've looked at it -
-	 * fine, it should all survive and (new) top doesn't belong to us.
-	 */
-	if (!partial->key && *partial->p)
-		/* Writer: end */
-		goto no_top;
-	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
-		;
-	/*
-	 * OK, we've found the last block that must survive. The rest of our
-	 * branch should be detached before unlocking. However, if that rest
-	 * of branch is all ours and does not grow immediately from the inode
-	 * it's easier to cheat and just decrement partial->p.
-	 */
-	if (p == chain + k - 1 && p > chain) {
-		p->p--;
-	} else {
-		*top = *p->p;
-		/* Nope, don't do this in ext3.  Must leave the tree intact */
-#if 0
-		*p->p = 0;
-#endif
-	}
-	/* Writer: end */
-
-	while(partial > p) {
-		brelse(partial->bh);
-		partial--;
-	}
-no_top:
-	return partial;
-}
-
-/*
- * Zero a number of block pointers in either an inode or an indirect block.
- * If we restart the transaction we must again get write access to the
- * indirect block for further modification.
- *
- * We release `count' blocks on disk, but (last - first) may be greater
- * than `count' because there can be holes in there.
- */
-static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
-		struct buffer_head *bh, ext3_fsblk_t block_to_free,
-		unsigned long count, __le32 *first, __le32 *last)
-{
-	__le32 *p;
-	if (try_to_extend_transaction(handle, inode)) {
-		if (bh) {
-			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-			if (ext3_journal_dirty_metadata(handle, bh))
-				return;
-		}
-		ext3_mark_inode_dirty(handle, inode);
-		truncate_restart_transaction(handle, inode);
-		if (bh) {
-			BUFFER_TRACE(bh, "retaking write access");
-			if (ext3_journal_get_write_access(handle, bh))
-				return;
-		}
-	}
-
-	/*
-	 * Any buffers which are on the journal will be in memory. We find
-	 * them on the hash table so journal_revoke() will run journal_forget()
-	 * on them.  We've already detached each block from the file, so
-	 * bforget() in journal_forget() should be safe.
-	 *
-	 * AKPM: turn on bforget in journal_forget()!!!
-	 */
-	for (p = first; p < last; p++) {
-		u32 nr = le32_to_cpu(*p);
-		if (nr) {
-			struct buffer_head *bh;
-
-			*p = 0;
-			bh = sb_find_get_block(inode->i_sb, nr);
-			ext3_forget(handle, 0, inode, bh, nr);
-		}
-	}
-
-	ext3_free_blocks(handle, inode, block_to_free, count);
-}
-
-/**
- * ext3_free_data - free a list of data blocks
- * @handle:	handle for this transaction
- * @inode:	inode we are dealing with
- * @this_bh:	indirect buffer_head which contains *@first and *@last
- * @first:	array of block numbers
- * @last:	points immediately past the end of array
- *
- * We are freeing all blocks referred from that array (numbers are stored as
- * little-endian 32-bit) and updating @inode->i_blocks appropriately.
- *
- * We accumulate contiguous runs of blocks to free.  Conveniently, if these
- * blocks are contiguous then releasing them at one time will only affect one
- * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
- * actually use a lot of journal space.
- *
- * @this_bh will be %NULL if @first and @last point into the inode's direct
- * block pointers.
- */
-static void ext3_free_data(handle_t *handle, struct inode *inode,
-			   struct buffer_head *this_bh,
-			   __le32 *first, __le32 *last)
-{
-	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
-	unsigned long count = 0;	    /* Number of blocks in the run */
-	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
-					       corresponding to
-					       block_to_free */
-	ext3_fsblk_t nr;		    /* Current block # */
-	__le32 *p;			    /* Pointer into inode/ind
-					       for current block */
-	int err;
-
-	if (this_bh) {				/* For indirect block */
-		BUFFER_TRACE(this_bh, "get_write_access");
-		err = ext3_journal_get_write_access(handle, this_bh);
-		/* Important: if we can't update the indirect pointers
-		 * to the blocks, we can't free them. */
-		if (err)
-			return;
-	}
-
-	for (p = first; p < last; p++) {
-		nr = le32_to_cpu(*p);
-		if (nr) {
-			/* accumulate blocks to free if they're contiguous */
-			if (count == 0) {
-				block_to_free = nr;
-				block_to_free_p = p;
-				count = 1;
-			} else if (nr == block_to_free + count) {
-				count++;
-			} else {
-				ext3_clear_blocks(handle, inode, this_bh,
-						  block_to_free,
-						  count, block_to_free_p, p);
-				block_to_free = nr;
-				block_to_free_p = p;
-				count = 1;
-			}
-		}
-	}
-
-	if (count > 0)
-		ext3_clear_blocks(handle, inode, this_bh, block_to_free,
-				  count, block_to_free_p, p);
-
-	if (this_bh) {
-		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
-
-		/*
-		 * The buffer head should have an attached journal head at this
-		 * point. However, if the data is corrupted and an indirect
-		 * block pointed to itself, it would have been detached when
-		 * the block was cleared. Check for this instead of OOPSing.
-		 */
-		if (bh2jh(this_bh))
-			ext3_journal_dirty_metadata(handle, this_bh);
-		else
-			ext3_error(inode->i_sb, "ext3_free_data",
-				   "circular indirect block detected, "
-				   "inode=%lu, block=%llu",
-				   inode->i_ino,
-				   (unsigned long long)this_bh->b_blocknr);
-	}
-}
-
-/**
- *	ext3_free_branches - free an array of branches
- *	@handle: JBD handle for this transaction
- *	@inode:	inode we are dealing with
- *	@parent_bh: the buffer_head which contains *@first and *@last
- *	@first:	array of block numbers
- *	@last:	pointer immediately past the end of array
- *	@depth:	depth of the branches to free
- *
- *	We are freeing all blocks referred from these branches (numbers are
- *	stored as little-endian 32-bit) and updating @inode->i_blocks
- *	appropriately.
- */
-static void ext3_free_branches(handle_t *handle, struct inode *inode,
-			       struct buffer_head *parent_bh,
-			       __le32 *first, __le32 *last, int depth)
-{
-	ext3_fsblk_t nr;
-	__le32 *p;
-
-	if (is_handle_aborted(handle))
-		return;
-
-	if (depth--) {
-		struct buffer_head *bh;
-		int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
-		p = last;
-		while (--p >= first) {
-			nr = le32_to_cpu(*p);
-			if (!nr)
-				continue;		/* A hole */
-
-			/* Go read the buffer for the next level down */
-			bh = sb_bread(inode->i_sb, nr);
-
-			/*
-			 * A read failure? Report error and clear slot
-			 * (should be rare).
-			 */
-			if (!bh) {
-				ext3_error(inode->i_sb, "ext3_free_branches",
-					   "Read failure, inode=%lu, block="E3FSBLK,
-					   inode->i_ino, nr);
-				continue;
-			}
-
-			/* This zaps the entire block.  Bottom up. */
-			BUFFER_TRACE(bh, "free child branches");
-			ext3_free_branches(handle, inode, bh,
-					   (__le32*)bh->b_data,
-					   (__le32*)bh->b_data + addr_per_block,
-					   depth);
-
-			/*
-			 * Everything below this this pointer has been
-			 * released.  Now let this top-of-subtree go.
-			 *
-			 * We want the freeing of this indirect block to be
-			 * atomic in the journal with the updating of the
-			 * bitmap block which owns it.  So make some room in
-			 * the journal.
-			 *
-			 * We zero the parent pointer *after* freeing its
-			 * pointee in the bitmaps, so if extend_transaction()
-			 * for some reason fails to put the bitmap changes and
-			 * the release into the same transaction, recovery
-			 * will merely complain about releasing a free block,
-			 * rather than leaking blocks.
-			 */
-			if (is_handle_aborted(handle))
-				return;
-			if (try_to_extend_transaction(handle, inode)) {
-				ext3_mark_inode_dirty(handle, inode);
-				truncate_restart_transaction(handle, inode);
-			}
-
-			/*
-			 * We've probably journalled the indirect block several
-			 * times during the truncate.  But it's no longer
-			 * needed and we now drop it from the transaction via
-			 * journal_revoke().
-			 *
-			 * That's easy if it's exclusively part of this
-			 * transaction.  But if it's part of the committing
-			 * transaction then journal_forget() will simply
-			 * brelse() it.  That means that if the underlying
-			 * block is reallocated in ext3_get_block(),
-			 * unmap_underlying_metadata() will find this block
-			 * and will try to get rid of it.  damn, damn. Thus
-			 * we don't allow a block to be reallocated until
-			 * a transaction freeing it has fully committed.
-			 *
-			 * We also have to make sure journal replay after a
-			 * crash does not overwrite non-journaled data blocks
-			 * with old metadata when the block got reallocated for
-			 * data.  Thus we have to store a revoke record for a
-			 * block in the same transaction in which we free the
-			 * block.
-			 */
-			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
-
-			ext3_free_blocks(handle, inode, nr, 1);
-
-			if (parent_bh) {
-				/*
-				 * The block which we have just freed is
-				 * pointed to by an indirect block: journal it
-				 */
-				BUFFER_TRACE(parent_bh, "get_write_access");
-				if (!ext3_journal_get_write_access(handle,
-								   parent_bh)){
-					*p = 0;
-					BUFFER_TRACE(parent_bh,
-					"call ext3_journal_dirty_metadata");
-					ext3_journal_dirty_metadata(handle,
-								    parent_bh);
-				}
-			}
-		}
-	} else {
-		/* We have reached the bottom of the tree. */
-		BUFFER_TRACE(parent_bh, "free data blocks");
-		ext3_free_data(handle, inode, parent_bh, first, last);
-	}
-}
-
-int ext3_can_truncate(struct inode *inode)
-{
-	if (S_ISREG(inode->i_mode))
-		return 1;
-	if (S_ISDIR(inode->i_mode))
-		return 1;
-	if (S_ISLNK(inode->i_mode))
-		return !ext3_inode_is_fast_symlink(inode);
-	return 0;
-}
-
-/*
- * ext3_truncate()
- *
- * We block out ext3_get_block() block instantiations across the entire
- * transaction, and VFS/VM ensures that ext3_truncate() cannot run
- * simultaneously on behalf of the same inode.
- *
- * As we work through the truncate and commit bits of it to the journal there
- * is one core, guiding principle: the file's tree must always be consistent on
- * disk.  We must be able to restart the truncate after a crash.
- *
- * The file's tree may be transiently inconsistent in memory (although it
- * probably isn't), but whenever we close off and commit a journal transaction,
- * the contents of (the filesystem + the journal) must be consistent and
- * restartable.  It's pretty simple, really: bottom up, right to left (although
- * left-to-right works OK too).
- *
- * Note that at recovery time, journal replay occurs *before* the restart of
- * truncate against the orphan inode list.
- *
- * The committed inode has the new, desired i_size (which is the same as
- * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
- * that this inode's truncate did not complete and it will again call
- * ext3_truncate() to have another go.  So there will be instantiated blocks
- * to the right of the truncation point in a crashed ext3 filesystem.  But
- * that's fine - as long as they are linked from the inode, the post-crash
- * ext3_truncate() run will find them and release them.
- */
-void ext3_truncate(struct inode *inode)
-{
-	handle_t *handle;
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	__le32 *i_data = ei->i_data;
-	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
-	int offsets[4];
-	Indirect chain[4];
-	Indirect *partial;
-	__le32 nr = 0;
-	int n;
-	long last_block;
-	unsigned blocksize = inode->i_sb->s_blocksize;
-
-	trace_ext3_truncate_enter(inode);
-
-	if (!ext3_can_truncate(inode))
-		goto out_notrans;
-
-	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
-		ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
-
-	handle = start_transaction(inode);
-	if (IS_ERR(handle))
-		goto out_notrans;
-
-	last_block = (inode->i_size + blocksize-1)
-					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
-	n = ext3_block_to_path(inode, last_block, offsets, NULL);
-	if (n == 0)
-		goto out_stop;	/* error */
-
-	/*
-	 * OK.  This truncate is going to happen.  We add the inode to the
-	 * orphan list, so that if this truncate spans multiple transactions,
-	 * and we crash, we will resume the truncate when the filesystem
-	 * recovers.  It also marks the inode dirty, to catch the new size.
-	 *
-	 * Implication: the file must always be in a sane, consistent
-	 * truncatable state while each transaction commits.
-	 */
-	if (ext3_orphan_add(handle, inode))
-		goto out_stop;
-
-	/*
-	 * The orphan list entry will now protect us from any crash which
-	 * occurs before the truncate completes, so it is now safe to propagate
-	 * the new, shorter inode size (held for now in i_size) into the
-	 * on-disk inode. We do this via i_disksize, which is the value which
-	 * ext3 *really* writes onto the disk inode.
-	 */
-	ei->i_disksize = inode->i_size;
-
-	/*
-	 * From here we block out all ext3_get_block() callers who want to
-	 * modify the block allocation tree.
-	 */
-	mutex_lock(&ei->truncate_mutex);
-
-	if (n == 1) {		/* direct blocks */
-		ext3_free_data(handle, inode, NULL, i_data+offsets[0],
-			       i_data + EXT3_NDIR_BLOCKS);
-		goto do_indirects;
-	}
-
-	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
-	/* Kill the top of shared branch (not detached) */
-	if (nr) {
-		if (partial == chain) {
-			/* Shared branch grows from the inode */
-			ext3_free_branches(handle, inode, NULL,
-					   &nr, &nr+1, (chain+n-1) - partial);
-			*partial->p = 0;
-			/*
-			 * We mark the inode dirty prior to restart,
-			 * and prior to stop.  No need for it here.
-			 */
-		} else {
-			/* Shared branch grows from an indirect block */
-			ext3_free_branches(handle, inode, partial->bh,
-					partial->p,
-					partial->p+1, (chain+n-1) - partial);
-		}
-	}
-	/* Clear the ends of indirect blocks on the shared branch */
-	while (partial > chain) {
-		ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
-				   (__le32*)partial->bh->b_data+addr_per_block,
-				   (chain+n-1) - partial);
-		BUFFER_TRACE(partial->bh, "call brelse");
-		brelse (partial->bh);
-		partial--;
-	}
-do_indirects:
-	/* Kill the remaining (whole) subtrees */
-	switch (offsets[0]) {
-	default:
-		nr = i_data[EXT3_IND_BLOCK];
-		if (nr) {
-			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
-			i_data[EXT3_IND_BLOCK] = 0;
-		}
-	case EXT3_IND_BLOCK:
-		nr = i_data[EXT3_DIND_BLOCK];
-		if (nr) {
-			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
-			i_data[EXT3_DIND_BLOCK] = 0;
-		}
-	case EXT3_DIND_BLOCK:
-		nr = i_data[EXT3_TIND_BLOCK];
-		if (nr) {
-			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
-			i_data[EXT3_TIND_BLOCK] = 0;
-		}
-	case EXT3_TIND_BLOCK:
-		;
-	}
-
-	ext3_discard_reservation(inode);
-
-	mutex_unlock(&ei->truncate_mutex);
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-	ext3_mark_inode_dirty(handle, inode);
-
-	/*
-	 * In a multi-transaction truncate, we only make the final transaction
-	 * synchronous
-	 */
-	if (IS_SYNC(inode))
-		handle->h_sync = 1;
-out_stop:
-	/*
-	 * If this was a simple ftruncate(), and the file will remain alive
-	 * then we need to clear up the orphan record which we created above.
-	 * However, if this was a real unlink then we were called by
-	 * ext3_evict_inode(), and we allow that function to clean up the
-	 * orphan info for us.
-	 */
-	if (inode->i_nlink)
-		ext3_orphan_del(handle, inode);
-
-	ext3_journal_stop(handle);
-	trace_ext3_truncate_exit(inode);
-	return;
-out_notrans:
-	/*
-	 * Delete the inode from orphan list so that it doesn't stay there
-	 * forever and trigger assertion on umount.
-	 */
-	if (inode->i_nlink)
-		ext3_orphan_del(NULL, inode);
-	trace_ext3_truncate_exit(inode);
-}
-
-static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
-		unsigned long ino, struct ext3_iloc *iloc)
-{
-	unsigned long block_group;
-	unsigned long offset;
-	ext3_fsblk_t block;
-	struct ext3_group_desc *gdp;
-
-	if (!ext3_valid_inum(sb, ino)) {
-		/*
-		 * This error is already checked for in namei.c unless we are
-		 * looking at an NFS filehandle, in which case no error
-		 * report is needed
-		 */
-		return 0;
-	}
-
-	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
-	gdp = ext3_get_group_desc(sb, block_group, NULL);
-	if (!gdp)
-		return 0;
-	/*
-	 * Figure out the offset within the block group inode table
-	 */
-	offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
-		EXT3_INODE_SIZE(sb);
-	block = le32_to_cpu(gdp->bg_inode_table) +
-		(offset >> EXT3_BLOCK_SIZE_BITS(sb));
-
-	iloc->block_group = block_group;
-	iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
-	return block;
-}
-
-/*
- * ext3_get_inode_loc returns with an extra refcount against the inode's
- * underlying buffer_head on success. If 'in_mem' is true, we have all
- * data in memory that is needed to recreate the on-disk version of this
- * inode.
- */
-static int __ext3_get_inode_loc(struct inode *inode,
-				struct ext3_iloc *iloc, int in_mem)
-{
-	ext3_fsblk_t block;
-	struct buffer_head *bh;
-
-	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
-	if (!block)
-		return -EIO;
-
-	bh = sb_getblk(inode->i_sb, block);
-	if (unlikely(!bh)) {
-		ext3_error (inode->i_sb, "ext3_get_inode_loc",
-				"unable to read inode block - "
-				"inode=%lu, block="E3FSBLK,
-				 inode->i_ino, block);
-		return -ENOMEM;
-	}
-	if (!buffer_uptodate(bh)) {
-		lock_buffer(bh);
-
-		/*
-		 * If the buffer has the write error flag, we have failed
-		 * to write out another inode in the same block.  In this
-		 * case, we don't have to read the block because we may
-		 * read the old inode data successfully.
-		 */
-		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
-			set_buffer_uptodate(bh);
-
-		if (buffer_uptodate(bh)) {
-			/* someone brought it uptodate while we waited */
-			unlock_buffer(bh);
-			goto has_buffer;
-		}
-
-		/*
-		 * If we have all information of the inode in memory and this
-		 * is the only valid inode in the block, we need not read the
-		 * block.
-		 */
-		if (in_mem) {
-			struct buffer_head *bitmap_bh;
-			struct ext3_group_desc *desc;
-			int inodes_per_buffer;
-			int inode_offset, i;
-			int block_group;
-			int start;
-
-			block_group = (inode->i_ino - 1) /
-					EXT3_INODES_PER_GROUP(inode->i_sb);
-			inodes_per_buffer = bh->b_size /
-				EXT3_INODE_SIZE(inode->i_sb);
-			inode_offset = ((inode->i_ino - 1) %
-					EXT3_INODES_PER_GROUP(inode->i_sb));
-			start = inode_offset & ~(inodes_per_buffer - 1);
-
-			/* Is the inode bitmap in cache? */
-			desc = ext3_get_group_desc(inode->i_sb,
-						block_group, NULL);
-			if (!desc)
-				goto make_io;
-
-			bitmap_bh = sb_getblk(inode->i_sb,
-					le32_to_cpu(desc->bg_inode_bitmap));
-			if (unlikely(!bitmap_bh))
-				goto make_io;
-
-			/*
-			 * If the inode bitmap isn't in cache then the
-			 * optimisation may end up performing two reads instead
-			 * of one, so skip it.
-			 */
-			if (!buffer_uptodate(bitmap_bh)) {
-				brelse(bitmap_bh);
-				goto make_io;
-			}
-			for (i = start; i < start + inodes_per_buffer; i++) {
-				if (i == inode_offset)
-					continue;
-				if (ext3_test_bit(i, bitmap_bh->b_data))
-					break;
-			}
-			brelse(bitmap_bh);
-			if (i == start + inodes_per_buffer) {
-				/* all other inodes are free, so skip I/O */
-				memset(bh->b_data, 0, bh->b_size);
-				set_buffer_uptodate(bh);
-				unlock_buffer(bh);
-				goto has_buffer;
-			}
-		}
-
-make_io:
-		/*
-		 * There are other valid inodes in the buffer, this inode
-		 * has in-inode xattrs, or we don't have this inode in memory.
-		 * Read the block from disk.
-		 */
-		trace_ext3_load_inode(inode);
-		get_bh(bh);
-		bh->b_end_io = end_buffer_read_sync;
-		submit_bh(READ | REQ_META | REQ_PRIO, bh);
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
-			ext3_error(inode->i_sb, "ext3_get_inode_loc",
-					"unable to read inode block - "
-					"inode=%lu, block="E3FSBLK,
-					inode->i_ino, block);
-			brelse(bh);
-			return -EIO;
-		}
-	}
-has_buffer:
-	iloc->bh = bh;
-	return 0;
-}
-
-int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
-{
-	/* We have all inode data except xattrs in memory here. */
-	return __ext3_get_inode_loc(inode, iloc,
-		!ext3_test_inode_state(inode, EXT3_STATE_XATTR));
-}
-
-void ext3_set_inode_flags(struct inode *inode)
-{
-	unsigned int flags = EXT3_I(inode)->i_flags;
-
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
-	if (flags & EXT3_SYNC_FL)
-		inode->i_flags |= S_SYNC;
-	if (flags & EXT3_APPEND_FL)
-		inode->i_flags |= S_APPEND;
-	if (flags & EXT3_IMMUTABLE_FL)
-		inode->i_flags |= S_IMMUTABLE;
-	if (flags & EXT3_NOATIME_FL)
-		inode->i_flags |= S_NOATIME;
-	if (flags & EXT3_DIRSYNC_FL)
-		inode->i_flags |= S_DIRSYNC;
-}
-
-/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
-void ext3_get_inode_flags(struct ext3_inode_info *ei)
-{
-	unsigned int flags = ei->vfs_inode.i_flags;
-
-	ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
-			EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
-	if (flags & S_SYNC)
-		ei->i_flags |= EXT3_SYNC_FL;
-	if (flags & S_APPEND)
-		ei->i_flags |= EXT3_APPEND_FL;
-	if (flags & S_IMMUTABLE)
-		ei->i_flags |= EXT3_IMMUTABLE_FL;
-	if (flags & S_NOATIME)
-		ei->i_flags |= EXT3_NOATIME_FL;
-	if (flags & S_DIRSYNC)
-		ei->i_flags |= EXT3_DIRSYNC_FL;
-}
-
-struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
-{
-	struct ext3_iloc iloc;
-	struct ext3_inode *raw_inode;
-	struct ext3_inode_info *ei;
-	struct buffer_head *bh;
-	struct inode *inode;
-	journal_t *journal = EXT3_SB(sb)->s_journal;
-	transaction_t *transaction;
-	long ret;
-	int block;
-	uid_t i_uid;
-	gid_t i_gid;
-
-	inode = iget_locked(sb, ino);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
-		return inode;
-
-	ei = EXT3_I(inode);
-	ei->i_block_alloc_info = NULL;
-
-	ret = __ext3_get_inode_loc(inode, &iloc, 0);
-	if (ret < 0)
-		goto bad_inode;
-	bh = iloc.bh;
-	raw_inode = ext3_raw_inode(&iloc);
-	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
-	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
-	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
-	if(!(test_opt (inode->i_sb, NO_UID32))) {
-		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
-		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
-	}
-	i_uid_write(inode, i_uid);
-	i_gid_write(inode, i_gid);
-	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
-	inode->i_size = le32_to_cpu(raw_inode->i_size);
-	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
-	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
-	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
-	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
-
-	ei->i_state_flags = 0;
-	ei->i_dir_start_lookup = 0;
-	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
-	/* We now have enough fields to check if the inode was active or not.
-	 * This is needed because nfsd might try to access dead inodes
-	 * the test is that same one that e2fsck uses
-	 * NeilBrown 1999oct15
-	 */
-	if (inode->i_nlink == 0) {
-		if (inode->i_mode == 0 ||
-		    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
-			/* this inode is deleted */
-			brelse (bh);
-			ret = -ESTALE;
-			goto bad_inode;
-		}
-		/* The only unlinked inodes we let through here have
-		 * valid i_mode and are being read by the orphan
-		 * recovery code: that's fine, we're about to complete
-		 * the process of deleting those. */
-	}
-	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
-	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-#ifdef EXT3_FRAGMENTS
-	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
-	ei->i_frag_no = raw_inode->i_frag;
-	ei->i_frag_size = raw_inode->i_fsize;
-#endif
-	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
-	if (!S_ISREG(inode->i_mode)) {
-		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
-	} else {
-		inode->i_size |=
-			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
-	}
-	ei->i_disksize = inode->i_size;
-	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
-	ei->i_block_group = iloc.block_group;
-	/*
-	 * NOTE! The in-memory inode i_data array is in little-endian order
-	 * even on big-endian machines: we do NOT byteswap the block numbers!
-	 */
-	for (block = 0; block < EXT3_N_BLOCKS; block++)
-		ei->i_data[block] = raw_inode->i_block[block];
-	INIT_LIST_HEAD(&ei->i_orphan);
-
-	/*
-	 * Set transaction id's of transactions that have to be committed
-	 * to finish f[data]sync. We set them to currently running transaction
-	 * as we cannot be sure that the inode or some of its metadata isn't
-	 * part of the transaction - the inode could have been reclaimed and
-	 * now it is reread from disk.
-	 */
-	if (journal) {
-		tid_t tid;
-
-		spin_lock(&journal->j_state_lock);
-		if (journal->j_running_transaction)
-			transaction = journal->j_running_transaction;
-		else
-			transaction = journal->j_committing_transaction;
-		if (transaction)
-			tid = transaction->t_tid;
-		else
-			tid = journal->j_commit_sequence;
-		spin_unlock(&journal->j_state_lock);
-		atomic_set(&ei->i_sync_tid, tid);
-		atomic_set(&ei->i_datasync_tid, tid);
-	}
-
-	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
-	    EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
-		/*
-		 * When mke2fs creates big inodes it does not zero out
-		 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
-		 * so ignore those first few inodes.
-		 */
-		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
-		if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
-		    EXT3_INODE_SIZE(inode->i_sb)) {
-			brelse (bh);
-			ret = -EIO;
-			goto bad_inode;
-		}
-		if (ei->i_extra_isize == 0) {
-			/* The extra space is currently unused. Use it. */
-			ei->i_extra_isize = sizeof(struct ext3_inode) -
-					    EXT3_GOOD_OLD_INODE_SIZE;
-		} else {
-			__le32 *magic = (void *)raw_inode +
-					EXT3_GOOD_OLD_INODE_SIZE +
-					ei->i_extra_isize;
-			if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
-				 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
-		}
-	} else
-		ei->i_extra_isize = 0;
-
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &ext3_file_inode_operations;
-		inode->i_fop = &ext3_file_operations;
-		ext3_set_aops(inode);
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &ext3_dir_inode_operations;
-		inode->i_fop = &ext3_dir_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		if (ext3_inode_is_fast_symlink(inode)) {
-			inode->i_op = &ext3_fast_symlink_inode_operations;
-			nd_terminate_link(ei->i_data, inode->i_size,
-				sizeof(ei->i_data) - 1);
-			inode->i_link = (char *)ei->i_data;
-		} else {
-			inode->i_op = &ext3_symlink_inode_operations;
-			ext3_set_aops(inode);
-		}
-	} else {
-		inode->i_op = &ext3_special_inode_operations;
-		if (raw_inode->i_block[0])
-			init_special_inode(inode, inode->i_mode,
-			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
-		else
-			init_special_inode(inode, inode->i_mode,
-			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
-	}
-	brelse (iloc.bh);
-	ext3_set_inode_flags(inode);
-	unlock_new_inode(inode);
-	return inode;
-
-bad_inode:
-	iget_failed(inode);
-	return ERR_PTR(ret);
-}
-
-/*
- * Post the struct inode info into an on-disk inode location in the
- * buffer-cache.  This gobbles the caller's reference to the
- * buffer_head in the inode location struct.
- *
- * The caller must have write access to iloc->bh.
- */
-static int ext3_do_update_inode(handle_t *handle,
-				struct inode *inode,
-				struct ext3_iloc *iloc)
-{
-	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	struct buffer_head *bh = iloc->bh;
-	int err = 0, rc, block;
-	int need_datasync = 0;
-	__le32 disksize;
-	uid_t i_uid;
-	gid_t i_gid;
-
-again:
-	/* we can't allow multiple procs in here at once, its a bit racey */
-	lock_buffer(bh);
-
-	/* For fields not not tracking in the in-memory inode,
-	 * initialise them to zero for new inodes. */
-	if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
-		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
-
-	ext3_get_inode_flags(ei);
-	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
-	i_uid = i_uid_read(inode);
-	i_gid = i_gid_read(inode);
-	if(!(test_opt(inode->i_sb, NO_UID32))) {
-		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
-		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
-/*
- * Fix up interoperability with old kernels. Otherwise, old inodes get
- * re-used with the upper 16 bits of the uid/gid intact
- */
-		if(!ei->i_dtime) {
-			raw_inode->i_uid_high =
-				cpu_to_le16(high_16_bits(i_uid));
-			raw_inode->i_gid_high =
-				cpu_to_le16(high_16_bits(i_gid));
-		} else {
-			raw_inode->i_uid_high = 0;
-			raw_inode->i_gid_high = 0;
-		}
-	} else {
-		raw_inode->i_uid_low =
-			cpu_to_le16(fs_high2lowuid(i_uid));
-		raw_inode->i_gid_low =
-			cpu_to_le16(fs_high2lowgid(i_gid));
-		raw_inode->i_uid_high = 0;
-		raw_inode->i_gid_high = 0;
-	}
-	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-	disksize = cpu_to_le32(ei->i_disksize);
-	if (disksize != raw_inode->i_size) {
-		need_datasync = 1;
-		raw_inode->i_size = disksize;
-	}
-	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
-	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
-	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
-	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
-	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
-#ifdef EXT3_FRAGMENTS
-	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
-	raw_inode->i_frag = ei->i_frag_no;
-	raw_inode->i_fsize = ei->i_frag_size;
-#endif
-	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
-	if (!S_ISREG(inode->i_mode)) {
-		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
-	} else {
-		disksize = cpu_to_le32(ei->i_disksize >> 32);
-		if (disksize != raw_inode->i_size_high) {
-			raw_inode->i_size_high = disksize;
-			need_datasync = 1;
-		}
-		if (ei->i_disksize > 0x7fffffffULL) {
-			struct super_block *sb = inode->i_sb;
-			if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
-					EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
-			    EXT3_SB(sb)->s_es->s_rev_level ==
-					cpu_to_le32(EXT3_GOOD_OLD_REV)) {
-			       /* If this is the first large file
-				* created, add a flag to the superblock.
-				*/
-				unlock_buffer(bh);
-				err = ext3_journal_get_write_access(handle,
-						EXT3_SB(sb)->s_sbh);
-				if (err)
-					goto out_brelse;
-
-				ext3_update_dynamic_rev(sb);
-				EXT3_SET_RO_COMPAT_FEATURE(sb,
-					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
-				handle->h_sync = 1;
-				err = ext3_journal_dirty_metadata(handle,
-						EXT3_SB(sb)->s_sbh);
-				/* get our lock and start over */
-				goto again;
-			}
-		}
-	}
-	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
-	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		if (old_valid_dev(inode->i_rdev)) {
-			raw_inode->i_block[0] =
-				cpu_to_le32(old_encode_dev(inode->i_rdev));
-			raw_inode->i_block[1] = 0;
-		} else {
-			raw_inode->i_block[0] = 0;
-			raw_inode->i_block[1] =
-				cpu_to_le32(new_encode_dev(inode->i_rdev));
-			raw_inode->i_block[2] = 0;
-		}
-	} else for (block = 0; block < EXT3_N_BLOCKS; block++)
-		raw_inode->i_block[block] = ei->i_data[block];
-
-	if (ei->i_extra_isize)
-		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
-
-	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-	unlock_buffer(bh);
-	rc = ext3_journal_dirty_metadata(handle, bh);
-	if (!err)
-		err = rc;
-	ext3_clear_inode_state(inode, EXT3_STATE_NEW);
-
-	atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
-	if (need_datasync)
-		atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
-out_brelse:
-	brelse (bh);
-	ext3_std_error(inode->i_sb, err);
-	return err;
-}
-
-/*
- * ext3_write_inode()
- *
- * We are called from a few places:
- *
- * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
- *   Here, there will be no transaction running. We wait for any running
- *   transaction to commit.
- *
- * - Within flush work (for sys_sync(), kupdate and such).
- *   We wait on commit, if told to.
- *
- * - Within iput_final() -> write_inode_now()
- *   We wait on commit, if told to.
- *
- * In all cases it is actually safe for us to return without doing anything,
- * because the inode has been copied into a raw inode buffer in
- * ext3_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
- * writeback.
- *
- * Note that we are absolutely dependent upon all inode dirtiers doing the
- * right thing: they *must* call mark_inode_dirty() after dirtying info in
- * which we are interested.
- *
- * It would be a bug for them to not do this.  The code:
- *
- *	mark_inode_dirty(inode)
- *	stuff();
- *	inode->i_size = expr;
- *
- * is in error because write_inode() could occur while `stuff()' is running,
- * and the new i_size will be lost.  Plus the inode will no longer be on the
- * superblock's dirty inode list.
- */
-int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
-		return 0;
-
-	if (ext3_journal_current_handle()) {
-		jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
-		dump_stack();
-		return -EIO;
-	}
-
-	/*
-	 * No need to force transaction in WB_SYNC_NONE mode. Also
-	 * ext3_sync_fs() will force the commit after everything is
-	 * written.
-	 */
-	if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
-		return 0;
-
-	return ext3_force_commit(inode->i_sb);
-}
-
-/*
- * ext3_setattr()
- *
- * Called from notify_change.
- *
- * We want to trap VFS attempts to truncate the file as soon as
- * possible.  In particular, we want to make sure that when the VFS
- * shrinks i_size, we put the inode on the orphan list and modify
- * i_disksize immediately, so that during the subsequent flushing of
- * dirty pages and freeing of disk blocks, we can guarantee that any
- * commit will leave the blocks being flushed in an unused state on
- * disk.  (On recovery, the inode will get truncated and the blocks will
- * be freed, so we have a strong guarantee that no future commit will
- * leave these blocks visible to the user.)
- *
- * Called with inode->sem down.
- */
-int ext3_setattr(struct dentry *dentry, struct iattr *attr)
-{
-	struct inode *inode = d_inode(dentry);
-	int error, rc = 0;
-	const unsigned int ia_valid = attr->ia_valid;
-
-	error = inode_change_ok(inode, attr);
-	if (error)
-		return error;
-
-	if (is_quota_modification(inode, attr))
-		dquot_initialize(inode);
-	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
-	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
-		handle_t *handle;
-
-		/* (user+group)*(old+new) structure, inode write (sb,
-		 * inode block, ? - but truncate inode update has it) */
-		handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-					EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
-		if (IS_ERR(handle)) {
-			error = PTR_ERR(handle);
-			goto err_out;
-		}
-		error = dquot_transfer(inode, attr);
-		if (error) {
-			ext3_journal_stop(handle);
-			return error;
-		}
-		/* Update corresponding info in inode so that everything is in
-		 * one transaction */
-		if (attr->ia_valid & ATTR_UID)
-			inode->i_uid = attr->ia_uid;
-		if (attr->ia_valid & ATTR_GID)
-			inode->i_gid = attr->ia_gid;
-		error = ext3_mark_inode_dirty(handle, inode);
-		ext3_journal_stop(handle);
-	}
-
-	if (attr->ia_valid & ATTR_SIZE)
-		inode_dio_wait(inode);
-
-	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
-		handle_t *handle;
-
-		handle = ext3_journal_start(inode, 3);
-		if (IS_ERR(handle)) {
-			error = PTR_ERR(handle);
-			goto err_out;
-		}
-
-		error = ext3_orphan_add(handle, inode);
-		if (error) {
-			ext3_journal_stop(handle);
-			goto err_out;
-		}
-		EXT3_I(inode)->i_disksize = attr->ia_size;
-		error = ext3_mark_inode_dirty(handle, inode);
-		ext3_journal_stop(handle);
-		if (error) {
-			/* Some hard fs error must have happened. Bail out. */
-			ext3_orphan_del(NULL, inode);
-			goto err_out;
-		}
-		rc = ext3_block_truncate_page(inode, attr->ia_size);
-		if (rc) {
-			/* Cleanup orphan list and exit */
-			handle = ext3_journal_start(inode, 3);
-			if (IS_ERR(handle)) {
-				ext3_orphan_del(NULL, inode);
-				goto err_out;
-			}
-			ext3_orphan_del(handle, inode);
-			ext3_journal_stop(handle);
-			goto err_out;
-		}
-	}
-
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		truncate_setsize(inode, attr->ia_size);
-		ext3_truncate(inode);
-	}
-
-	setattr_copy(inode, attr);
-	mark_inode_dirty(inode);
-
-	if (ia_valid & ATTR_MODE)
-		rc = posix_acl_chmod(inode, inode->i_mode);
-
-err_out:
-	ext3_std_error(inode->i_sb, error);
-	if (!error)
-		error = rc;
-	return error;
-}
-
-
-/*
- * How many blocks doth make a writepage()?
- *
- * With N blocks per page, it may be:
- * N data blocks
- * 2 indirect block
- * 2 dindirect
- * 1 tindirect
- * N+5 bitmap blocks (from the above)
- * N+5 group descriptor summary blocks
- * 1 inode block
- * 1 superblock.
- * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
- *
- * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
- *
- * With ordered or writeback data it's the same, less the N data blocks.
- *
- * If the inode's direct blocks can hold an integral number of pages then a
- * page cannot straddle two indirect blocks, and we can only touch one indirect
- * and dindirect block, and the "5" above becomes "3".
- *
- * This still overestimates under most circumstances.  If we were to pass the
- * start and end offsets in here as well we could do block_to_path() on each
- * block and work out the exact number of indirects which are touched.  Pah.
- */
-
-static int ext3_writepage_trans_blocks(struct inode *inode)
-{
-	int bpp = ext3_journal_blocks_per_page(inode);
-	int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
-	int ret;
-
-	if (ext3_should_journal_data(inode))
-		ret = 3 * (bpp + indirects) + 2;
-	else
-		ret = 2 * (bpp + indirects) + indirects + 2;
-
-#ifdef CONFIG_QUOTA
-	/* We know that structure was already allocated during dquot_initialize so
-	 * we will be updating only the data blocks + inodes */
-	ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-#endif
-
-	return ret;
-}
-
-/*
- * The caller must have previously called ext3_reserve_inode_write().
- * Give this, we know that the caller already has write access to iloc->bh.
- */
-int ext3_mark_iloc_dirty(handle_t *handle,
-		struct inode *inode, struct ext3_iloc *iloc)
-{
-	int err = 0;
-
-	/* the do_update_inode consumes one bh->b_count */
-	get_bh(iloc->bh);
-
-	/* ext3_do_update_inode() does journal_dirty_metadata */
-	err = ext3_do_update_inode(handle, inode, iloc);
-	put_bh(iloc->bh);
-	return err;
-}
-
-/*
- * On success, We end up with an outstanding reference count against
- * iloc->bh.  This _must_ be cleaned up later.
- */
-
-int
-ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
-			 struct ext3_iloc *iloc)
-{
-	int err = 0;
-	if (handle) {
-		err = ext3_get_inode_loc(inode, iloc);
-		if (!err) {
-			BUFFER_TRACE(iloc->bh, "get_write_access");
-			err = ext3_journal_get_write_access(handle, iloc->bh);
-			if (err) {
-				brelse(iloc->bh);
-				iloc->bh = NULL;
-			}
-		}
-	}
-	ext3_std_error(inode->i_sb, err);
-	return err;
-}
-
-/*
- * What we do here is to mark the in-core inode as clean with respect to inode
- * dirtiness (it may still be data-dirty).
- * This means that the in-core inode may be reaped by prune_icache
- * without having to perform any I/O.  This is a very good thing,
- * because *any* task may call prune_icache - even ones which
- * have a transaction open against a different journal.
- *
- * Is this cheating?  Not really.  Sure, we haven't written the
- * inode out, but prune_icache isn't a user-visible syncing function.
- * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
- * we start and wait on commits.
- */
-int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
-{
-	struct ext3_iloc iloc;
-	int err;
-
-	might_sleep();
-	trace_ext3_mark_inode_dirty(inode, _RET_IP_);
-	err = ext3_reserve_inode_write(handle, inode, &iloc);
-	if (!err)
-		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-	return err;
-}
-
-/*
- * ext3_dirty_inode() is called from __mark_inode_dirty()
- *
- * We're really interested in the case where a file is being extended.
- * i_size has been changed by generic_commit_write() and we thus need
- * to include the updated inode in the current transaction.
- *
- * Also, dquot_alloc_space() will always dirty the inode when blocks
- * are allocated to the file.
- *
- * If the inode is marked synchronous, we don't honour that here - doing
- * so would cause a commit on atime updates, which we don't bother doing.
- * We handle synchronous inodes at the highest possible level.
- */
-void ext3_dirty_inode(struct inode *inode, int flags)
-{
-	handle_t *current_handle = ext3_journal_current_handle();
-	handle_t *handle;
-
-	handle = ext3_journal_start(inode, 2);
-	if (IS_ERR(handle))
-		goto out;
-	if (current_handle &&
-		current_handle->h_transaction != handle->h_transaction) {
-		/* This task has a transaction open against a different fs */
-		printk(KERN_EMERG "%s: transactions do not match!\n",
-		       __func__);
-	} else {
-		jbd_debug(5, "marking dirty.  outer handle=%p\n",
-				current_handle);
-		ext3_mark_inode_dirty(handle, inode);
-	}
-	ext3_journal_stop(handle);
-out:
-	return;
-}
-
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early.  Unlike
- * ext3_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext3_pin_inode(handle_t *handle, struct inode *inode)
-{
-	struct ext3_iloc iloc;
-
-	int err = 0;
-	if (handle) {
-		err = ext3_get_inode_loc(inode, &iloc);
-		if (!err) {
-			BUFFER_TRACE(iloc.bh, "get_write_access");
-			err = journal_get_write_access(handle, iloc.bh);
-			if (!err)
-				err = ext3_journal_dirty_metadata(handle,
-								  iloc.bh);
-			brelse(iloc.bh);
-		}
-	}
-	ext3_std_error(inode->i_sb, err);
-	return err;
-}
-#endif
-
-int ext3_change_inode_journal_flag(struct inode *inode, int val)
-{
-	journal_t *journal;
-	handle_t *handle;
-	int err;
-
-	/*
-	 * We have to be very careful here: changing a data block's
-	 * journaling status dynamically is dangerous.  If we write a
-	 * data block to the journal, change the status and then delete
-	 * that block, we risk forgetting to revoke the old log record
-	 * from the journal and so a subsequent replay can corrupt data.
-	 * So, first we make sure that the journal is empty and that
-	 * nobody is changing anything.
-	 */
-
-	journal = EXT3_JOURNAL(inode);
-	if (is_journal_aborted(journal))
-		return -EROFS;
-
-	journal_lock_updates(journal);
-	journal_flush(journal);
-
-	/*
-	 * OK, there are no updates running now, and all cached data is
-	 * synced to disk.  We are now in a completely consistent state
-	 * which doesn't have anything in the journal, and we know that
-	 * no filesystem updates are running, so it is safe to modify
-	 * the inode's in-core data-journaling state flag now.
-	 */
-
-	if (val)
-		EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
-	else
-		EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
-	ext3_set_aops(inode);
-
-	journal_unlock_updates(journal);
-
-	/* Finally we can mark the inode as dirty. */
-
-	handle = ext3_journal_start(inode, 1);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	err = ext3_mark_inode_dirty(handle, inode);
-	handle->h_sync = 1;
-	ext3_journal_stop(handle);
-	ext3_std_error(inode->i_sb, err);
-
-	return err;
-}
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
deleted file mode 100644
index 4d96e9a64532..000000000000
--- a/fs/ext3/ioctl.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * linux/fs/ext3/ioctl.c
- *
- * Copyright (C) 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-
-#include <linux/mount.h>
-#include <linux/compat.h>
-#include <asm/uaccess.h>
-#include "ext3.h"
-
-long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	struct inode *inode = file_inode(filp);
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	unsigned int flags;
-	unsigned short rsv_window_size;
-
-	ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
-
-	switch (cmd) {
-	case EXT3_IOC_GETFLAGS:
-		ext3_get_inode_flags(ei);
-		flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
-		return put_user(flags, (int __user *) arg);
-	case EXT3_IOC_SETFLAGS: {
-		handle_t *handle = NULL;
-		int err;
-		struct ext3_iloc iloc;
-		unsigned int oldflags;
-		unsigned int jflag;
-
-		if (!inode_owner_or_capable(inode))
-			return -EACCES;
-
-		if (get_user(flags, (int __user *) arg))
-			return -EFAULT;
-
-		err = mnt_want_write_file(filp);
-		if (err)
-			return err;
-
-		flags = ext3_mask_flags(inode->i_mode, flags);
-
-		mutex_lock(&inode->i_mutex);
-
-		/* Is it quota file? Do not allow user to mess with it */
-		err = -EPERM;
-		if (IS_NOQUOTA(inode))
-			goto flags_out;
-
-		oldflags = ei->i_flags;
-
-		/* The JOURNAL_DATA flag is modifiable only by root */
-		jflag = flags & EXT3_JOURNAL_DATA_FL;
-
-		/*
-		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-		 * the relevant capability.
-		 *
-		 * This test looks nicer. Thanks to Pauline Middelink
-		 */
-		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
-				goto flags_out;
-		}
-
-		/*
-		 * The JOURNAL_DATA flag can only be changed by
-		 * the relevant capability.
-		 */
-		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
-				goto flags_out;
-		}
-
-		handle = ext3_journal_start(inode, 1);
-		if (IS_ERR(handle)) {
-			err = PTR_ERR(handle);
-			goto flags_out;
-		}
-		if (IS_SYNC(inode))
-			handle->h_sync = 1;
-		err = ext3_reserve_inode_write(handle, inode, &iloc);
-		if (err)
-			goto flags_err;
-
-		flags = flags & EXT3_FL_USER_MODIFIABLE;
-		flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
-		ei->i_flags = flags;
-
-		ext3_set_inode_flags(inode);
-		inode->i_ctime = CURRENT_TIME_SEC;
-
-		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-flags_err:
-		ext3_journal_stop(handle);
-		if (err)
-			goto flags_out;
-
-		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
-			err = ext3_change_inode_journal_flag(inode, jflag);
-flags_out:
-		mutex_unlock(&inode->i_mutex);
-		mnt_drop_write_file(filp);
-		return err;
-	}
-	case EXT3_IOC_GETVERSION:
-	case EXT3_IOC_GETVERSION_OLD:
-		return put_user(inode->i_generation, (int __user *) arg);
-	case EXT3_IOC_SETVERSION:
-	case EXT3_IOC_SETVERSION_OLD: {
-		handle_t *handle;
-		struct ext3_iloc iloc;
-		__u32 generation;
-		int err;
-
-		if (!inode_owner_or_capable(inode))
-			return -EPERM;
-
-		err = mnt_want_write_file(filp);
-		if (err)
-			return err;
-		if (get_user(generation, (int __user *) arg)) {
-			err = -EFAULT;
-			goto setversion_out;
-		}
-
-		mutex_lock(&inode->i_mutex);
-		handle = ext3_journal_start(inode, 1);
-		if (IS_ERR(handle)) {
-			err = PTR_ERR(handle);
-			goto unlock_out;
-		}
-		err = ext3_reserve_inode_write(handle, inode, &iloc);
-		if (err == 0) {
-			inode->i_ctime = CURRENT_TIME_SEC;
-			inode->i_generation = generation;
-			err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-		}
-		ext3_journal_stop(handle);
-
-unlock_out:
-		mutex_unlock(&inode->i_mutex);
-setversion_out:
-		mnt_drop_write_file(filp);
-		return err;
-	}
-	case EXT3_IOC_GETRSVSZ:
-		if (test_opt(inode->i_sb, RESERVATION)
-			&& S_ISREG(inode->i_mode)
-			&& ei->i_block_alloc_info) {
-			rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
-			return put_user(rsv_window_size, (int __user *)arg);
-		}
-		return -ENOTTY;
-	case EXT3_IOC_SETRSVSZ: {
-		int err;
-
-		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
-			return -ENOTTY;
-
-		err = mnt_want_write_file(filp);
-		if (err)
-			return err;
-
-		if (!inode_owner_or_capable(inode)) {
-			err = -EACCES;
-			goto setrsvsz_out;
-		}
-
-		if (get_user(rsv_window_size, (int __user *)arg)) {
-			err = -EFAULT;
-			goto setrsvsz_out;
-		}
-
-		if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
-			rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
-
-		/*
-		 * need to allocate reservation structure for this inode
-		 * before set the window size
-		 */
-		mutex_lock(&ei->truncate_mutex);
-		if (!ei->i_block_alloc_info)
-			ext3_init_block_alloc_info(inode);
-
-		if (ei->i_block_alloc_info){
-			struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
-			rsv->rsv_goal_size = rsv_window_size;
-		}
-		mutex_unlock(&ei->truncate_mutex);
-setrsvsz_out:
-		mnt_drop_write_file(filp);
-		return err;
-	}
-	case EXT3_IOC_GROUP_EXTEND: {
-		ext3_fsblk_t n_blocks_count;
-		struct super_block *sb = inode->i_sb;
-		int err, err2;
-
-		if (!capable(CAP_SYS_RESOURCE))
-			return -EPERM;
-
-		err = mnt_want_write_file(filp);
-		if (err)
-			return err;
-
-		if (get_user(n_blocks_count, (__u32 __user *)arg)) {
-			err = -EFAULT;
-			goto group_extend_out;
-		}
-		err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
-		journal_lock_updates(EXT3_SB(sb)->s_journal);
-		err2 = journal_flush(EXT3_SB(sb)->s_journal);
-		journal_unlock_updates(EXT3_SB(sb)->s_journal);
-		if (err == 0)
-			err = err2;
-group_extend_out:
-		mnt_drop_write_file(filp);
-		return err;
-	}
-	case EXT3_IOC_GROUP_ADD: {
-		struct ext3_new_group_data input;
-		struct super_block *sb = inode->i_sb;
-		int err, err2;
-
-		if (!capable(CAP_SYS_RESOURCE))
-			return -EPERM;
-
-		err = mnt_want_write_file(filp);
-		if (err)
-			return err;
-
-		if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
-				sizeof(input))) {
-			err = -EFAULT;
-			goto group_add_out;
-		}
-
-		err = ext3_group_add(sb, &input);
-		journal_lock_updates(EXT3_SB(sb)->s_journal);
-		err2 = journal_flush(EXT3_SB(sb)->s_journal);
-		journal_unlock_updates(EXT3_SB(sb)->s_journal);
-		if (err == 0)
-			err = err2;
-group_add_out:
-		mnt_drop_write_file(filp);
-		return err;
-	}
-	case FITRIM: {
-
-		struct super_block *sb = inode->i_sb;
-		struct fstrim_range range;
-		int ret = 0;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&range, (struct fstrim_range __user *)arg,
-				   sizeof(range)))
-			return -EFAULT;
-
-		ret = ext3_trim_fs(sb, &range);
-		if (ret < 0)
-			return ret;
-
-		if (copy_to_user((struct fstrim_range __user *)arg, &range,
-				 sizeof(range)))
-			return -EFAULT;
-
-		return 0;
-	}
-
-	default:
-		return -ENOTTY;
-	}
-}
-
-#ifdef CONFIG_COMPAT
-long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	/* These are just misnamed, they actually get/put from/to user an int */
-	switch (cmd) {
-	case EXT3_IOC32_GETFLAGS:
-		cmd = EXT3_IOC_GETFLAGS;
-		break;
-	case EXT3_IOC32_SETFLAGS:
-		cmd = EXT3_IOC_SETFLAGS;
-		break;
-	case EXT3_IOC32_GETVERSION:
-		cmd = EXT3_IOC_GETVERSION;
-		break;
-	case EXT3_IOC32_SETVERSION:
-		cmd = EXT3_IOC_SETVERSION;
-		break;
-	case EXT3_IOC32_GROUP_EXTEND:
-		cmd = EXT3_IOC_GROUP_EXTEND;
-		break;
-	case EXT3_IOC32_GETVERSION_OLD:
-		cmd = EXT3_IOC_GETVERSION_OLD;
-		break;
-	case EXT3_IOC32_SETVERSION_OLD:
-		cmd = EXT3_IOC_SETVERSION_OLD;
-		break;
-#ifdef CONFIG_JBD_DEBUG
-	case EXT3_IOC32_WAIT_FOR_READONLY:
-		cmd = EXT3_IOC_WAIT_FOR_READONLY;
-		break;
-#endif
-	case EXT3_IOC32_GETRSVSZ:
-		cmd = EXT3_IOC_GETRSVSZ;
-		break;
-	case EXT3_IOC32_SETRSVSZ:
-		cmd = EXT3_IOC_SETRSVSZ;
-		break;
-	case EXT3_IOC_GROUP_ADD:
-		break;
-	default:
-		return -ENOIOCTLCMD;
-	}
-	return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
deleted file mode 100644
index c9e767cd4b67..000000000000
--- a/fs/ext3/namei.c
+++ /dev/null
@@ -1,2586 +0,0 @@
-/*
- *  linux/fs/ext3/namei.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/fs/minix/namei.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- *  Directory entry file type support and forward compatibility hooks
- *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
- *  Hash Tree Directory indexing (c)
- *	Daniel Phillips, 2001
- *  Hash Tree Directory indexing porting
- *	Christopher Li, 2002
- *  Hash Tree Directory indexing cleanup
- *	Theodore Ts'o, 2002
- */
-
-#include <linux/quotaops.h>
-#include "ext3.h"
-#include "namei.h"
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * define how far ahead to read directories while searching them.
- */
-#define NAMEI_RA_CHUNKS  2
-#define NAMEI_RA_BLOCKS  4
-#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-
-static struct buffer_head *ext3_append(handle_t *handle,
-					struct inode *inode,
-					u32 *block, int *err)
-{
-	struct buffer_head *bh;
-
-	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
-
-	if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
-		inode->i_size += inode->i_sb->s_blocksize;
-		EXT3_I(inode)->i_disksize = inode->i_size;
-		*err = ext3_journal_get_write_access(handle, bh);
-		if (*err) {
-			brelse(bh);
-			bh = NULL;
-		}
-	}
-	return bh;
-}
-
-#ifndef assert
-#define assert(test) J_ASSERT(test)
-#endif
-
-#ifdef DX_DEBUG
-#define dxtrace(command) command
-#else
-#define dxtrace(command)
-#endif
-
-struct fake_dirent
-{
-	__le32 inode;
-	__le16 rec_len;
-	u8 name_len;
-	u8 file_type;
-};
-
-struct dx_countlimit
-{
-	__le16 limit;
-	__le16 count;
-};
-
-struct dx_entry
-{
-	__le32 hash;
-	__le32 block;
-};
-
-/*
- * dx_root_info is laid out so that if it should somehow get overlaid by a
- * dirent the two low bits of the hash version will be zero.  Therefore, the
- * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
- */
-
-struct dx_root
-{
-	struct fake_dirent dot;
-	char dot_name[4];
-	struct fake_dirent dotdot;
-	char dotdot_name[4];
-	struct dx_root_info
-	{
-		__le32 reserved_zero;
-		u8 hash_version;
-		u8 info_length; /* 8 */
-		u8 indirect_levels;
-		u8 unused_flags;
-	}
-	info;
-	struct dx_entry	entries[0];
-};
-
-struct dx_node
-{
-	struct fake_dirent fake;
-	struct dx_entry	entries[0];
-};
-
-
-struct dx_frame
-{
-	struct buffer_head *bh;
-	struct dx_entry *entries;
-	struct dx_entry *at;
-};
-
-struct dx_map_entry
-{
-	u32 hash;
-	u16 offs;
-	u16 size;
-};
-
-static inline unsigned dx_get_block (struct dx_entry *entry);
-static void dx_set_block (struct dx_entry *entry, unsigned value);
-static inline unsigned dx_get_hash (struct dx_entry *entry);
-static void dx_set_hash (struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count (struct dx_entry *entries);
-static unsigned dx_get_limit (struct dx_entry *entries);
-static void dx_set_count (struct dx_entry *entries, unsigned value);
-static void dx_set_limit (struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct qstr *entry,
-				 struct inode *dir,
-				 struct dx_hash_info *hinfo,
-				 struct dx_frame *frame,
-				 int *err);
-static void dx_release (struct dx_frame *frames);
-static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
-			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
-static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
-		struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
-				 struct dx_frame *frame,
-				 struct dx_frame *frames,
-				 __u32 *start_hash);
-static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
-			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
-			int *err);
-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
-			     struct inode *inode);
-
-/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext3_dir_entry_2 *
-ext3_next_entry(struct ext3_dir_entry_2 *p)
-{
-	return (struct ext3_dir_entry_2 *)((char *)p +
-		ext3_rec_len_from_disk(p->rec_len));
-}
-
-/*
- * Future: use high four bits of block for coalesce-on-delete flags
- * Mask them off for now.
- */
-
-static inline unsigned dx_get_block (struct dx_entry *entry)
-{
-	return le32_to_cpu(entry->block) & 0x00ffffff;
-}
-
-static inline void dx_set_block (struct dx_entry *entry, unsigned value)
-{
-	entry->block = cpu_to_le32(value);
-}
-
-static inline unsigned dx_get_hash (struct dx_entry *entry)
-{
-	return le32_to_cpu(entry->hash);
-}
-
-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
-{
-	entry->hash = cpu_to_le32(value);
-}
-
-static inline unsigned dx_get_count (struct dx_entry *entries)
-{
-	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
-}
-
-static inline unsigned dx_get_limit (struct dx_entry *entries)
-{
-	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
-}
-
-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
-{
-	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
-}
-
-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
-{
-	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
-}
-
-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
-{
-	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
-		EXT3_DIR_REC_LEN(2) - infosize;
-	return entry_space / sizeof(struct dx_entry);
-}
-
-static inline unsigned dx_node_limit (struct inode *dir)
-{
-	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
-	return entry_space / sizeof(struct dx_entry);
-}
-
-/*
- * Debug
- */
-#ifdef DX_DEBUG
-static void dx_show_index (char * label, struct dx_entry *entries)
-{
-        int i, n = dx_get_count (entries);
-        printk("%s index ", label);
-        for (i = 0; i < n; i++)
-        {
-                printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
-        }
-        printk("\n");
-}
-
-struct stats
-{
-	unsigned names;
-	unsigned space;
-	unsigned bcount;
-};
-
-static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
-				 int size, int show_names)
-{
-	unsigned names = 0, space = 0;
-	char *base = (char *) de;
-	struct dx_hash_info h = *hinfo;
-
-	printk("names: ");
-	while ((char *) de < base + size)
-	{
-		if (de->inode)
-		{
-			if (show_names)
-			{
-				int len = de->name_len;
-				char *name = de->name;
-				while (len--) printk("%c", *name++);
-				ext3fs_dirhash(de->name, de->name_len, &h);
-				printk(":%x.%u ", h.hash,
-				       (unsigned) ((char *) de - base));
-			}
-			space += EXT3_DIR_REC_LEN(de->name_len);
-			names++;
-		}
-		de = ext3_next_entry(de);
-	}
-	printk("(%i)\n", names);
-	return (struct stats) { names, space, 1 };
-}
-
-struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
-			     struct dx_entry *entries, int levels)
-{
-	unsigned blocksize = dir->i_sb->s_blocksize;
-	unsigned count = dx_get_count (entries), names = 0, space = 0, i;
-	unsigned bcount = 0;
-	struct buffer_head *bh;
-	int err;
-	printk("%i indexed blocks...\n", count);
-	for (i = 0; i < count; i++, entries++)
-	{
-		u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
-		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
-		struct stats stats;
-		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
-		if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
-		stats = levels?
-		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
-		   dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
-		names += stats.names;
-		space += stats.space;
-		bcount += stats.bcount;
-		brelse (bh);
-	}
-	if (bcount)
-		printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
-			names, space/bcount,(space/bcount)*100/blocksize);
-	return (struct stats) { names, space, bcount};
-}
-#endif /* DX_DEBUG */
-
-/*
- * Probe for a directory leaf block to search.
- *
- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
- * error in the directory index, and the caller should fall back to
- * searching the directory normally.  The callers of dx_probe **MUST**
- * check for this error code, and make sure it never gets reflected
- * back to userspace.
- */
-static struct dx_frame *
-dx_probe(struct qstr *entry, struct inode *dir,
-	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
-{
-	unsigned count, indirect;
-	struct dx_entry *at, *entries, *p, *q, *m;
-	struct dx_root *root;
-	struct buffer_head *bh;
-	struct dx_frame *frame = frame_in;
-	u32 hash;
-
-	frame->bh = NULL;
-	if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
-		*err = ERR_BAD_DX_DIR;
-		goto fail;
-	}
-	root = (struct dx_root *) bh->b_data;
-	if (root->info.hash_version != DX_HASH_TEA &&
-	    root->info.hash_version != DX_HASH_HALF_MD4 &&
-	    root->info.hash_version != DX_HASH_LEGACY) {
-		ext3_warning(dir->i_sb, __func__,
-			     "Unrecognised inode hash code %d",
-			     root->info.hash_version);
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
-		goto fail;
-	}
-	hinfo->hash_version = root->info.hash_version;
-	if (hinfo->hash_version <= DX_HASH_TEA)
-		hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
-	hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
-	if (entry)
-		ext3fs_dirhash(entry->name, entry->len, hinfo);
-	hash = hinfo->hash;
-
-	if (root->info.unused_flags & 1) {
-		ext3_warning(dir->i_sb, __func__,
-			     "Unimplemented inode hash flags: %#06x",
-			     root->info.unused_flags);
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
-		goto fail;
-	}
-
-	if ((indirect = root->info.indirect_levels) > 1) {
-		ext3_warning(dir->i_sb, __func__,
-			     "Unimplemented inode hash depth: %#06x",
-			     root->info.indirect_levels);
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
-		goto fail;
-	}
-
-	entries = (struct dx_entry *) (((char *)&root->info) +
-				       root->info.info_length);
-
-	if (dx_get_limit(entries) != dx_root_limit(dir,
-						   root->info.info_length)) {
-		ext3_warning(dir->i_sb, __func__,
-			     "dx entry: limit != root limit");
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
-		goto fail;
-	}
-
-	dxtrace (printk("Look up %x", hash));
-	while (1)
-	{
-		count = dx_get_count(entries);
-		if (!count || count > dx_get_limit(entries)) {
-			ext3_warning(dir->i_sb, __func__,
-				     "dx entry: no count or count > limit");
-			brelse(bh);
-			*err = ERR_BAD_DX_DIR;
-			goto fail2;
-		}
-
-		p = entries + 1;
-		q = entries + count - 1;
-		while (p <= q)
-		{
-			m = p + (q - p)/2;
-			dxtrace(printk("."));
-			if (dx_get_hash(m) > hash)
-				q = m - 1;
-			else
-				p = m + 1;
-		}
-
-		if (0) // linear search cross check
-		{
-			unsigned n = count - 1;
-			at = entries;
-			while (n--)
-			{
-				dxtrace(printk(","));
-				if (dx_get_hash(++at) > hash)
-				{
-					at--;
-					break;
-				}
-			}
-			assert (at == p - 1);
-		}
-
-		at = p - 1;
-		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
-		frame->bh = bh;
-		frame->entries = entries;
-		frame->at = at;
-		if (!indirect--) return frame;
-		if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
-			*err = ERR_BAD_DX_DIR;
-			goto fail2;
-		}
-		at = entries = ((struct dx_node *) bh->b_data)->entries;
-		if (dx_get_limit(entries) != dx_node_limit (dir)) {
-			ext3_warning(dir->i_sb, __func__,
-				     "dx entry: limit != node limit");
-			brelse(bh);
-			*err = ERR_BAD_DX_DIR;
-			goto fail2;
-		}
-		frame++;
-		frame->bh = NULL;
-	}
-fail2:
-	while (frame >= frame_in) {
-		brelse(frame->bh);
-		frame--;
-	}
-fail:
-	if (*err == ERR_BAD_DX_DIR)
-		ext3_warning(dir->i_sb, __func__,
-			     "Corrupt dir inode %ld, running e2fsck is "
-			     "recommended.", dir->i_ino);
-	return NULL;
-}
-
-static void dx_release (struct dx_frame *frames)
-{
-	if (frames[0].bh == NULL)
-		return;
-
-	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
-		brelse(frames[1].bh);
-	brelse(frames[0].bh);
-}
-
-/*
- * This function increments the frame pointer to search the next leaf
- * block, and reads in the necessary intervening nodes if the search
- * should be necessary.  Whether or not the search is necessary is
- * controlled by the hash parameter.  If the hash value is even, then
- * the search is only continued if the next block starts with that
- * hash value.  This is used if we are searching for a specific file.
- *
- * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
- *
- * This function returns 1 if the caller should continue to search,
- * or 0 if it should not.  If there is an error reading one of the
- * index blocks, it will a negative error code.
- *
- * If start_hash is non-null, it will be filled in with the starting
- * hash of the next page.
- */
-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
-				 struct dx_frame *frame,
-				 struct dx_frame *frames,
-				 __u32 *start_hash)
-{
-	struct dx_frame *p;
-	struct buffer_head *bh;
-	int err, num_frames = 0;
-	__u32 bhash;
-
-	p = frame;
-	/*
-	 * Find the next leaf page by incrementing the frame pointer.
-	 * If we run out of entries in the interior node, loop around and
-	 * increment pointer in the parent node.  When we break out of
-	 * this loop, num_frames indicates the number of interior
-	 * nodes need to be read.
-	 */
-	while (1) {
-		if (++(p->at) < p->entries + dx_get_count(p->entries))
-			break;
-		if (p == frames)
-			return 0;
-		num_frames++;
-		p--;
-	}
-
-	/*
-	 * If the hash is 1, then continue only if the next page has a
-	 * continuation hash of any value.  This is used for readdir
-	 * handling.  Otherwise, check to see if the hash matches the
-	 * desired contiuation hash.  If it doesn't, return since
-	 * there's no point to read in the successive index pages.
-	 */
-	bhash = dx_get_hash(p->at);
-	if (start_hash)
-		*start_hash = bhash;
-	if ((hash & 1) == 0) {
-		if ((bhash & ~1) != hash)
-			return 0;
-	}
-	/*
-	 * If the hash is HASH_NB_ALWAYS, we always go to the next
-	 * block so no check is necessary
-	 */
-	while (num_frames--) {
-		if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
-					  0, &err)))
-			return err; /* Failure */
-		p++;
-		brelse (p->bh);
-		p->bh = bh;
-		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
-	}
-	return 1;
-}
-
-
-/*
- * This function fills a red-black tree with information from a
- * directory block.  It returns the number directory entries loaded
- * into the tree.  If there is an error it is returned in err.
- */
-static int htree_dirblock_to_tree(struct file *dir_file,
-				  struct inode *dir, int block,
-				  struct dx_hash_info *hinfo,
-				  __u32 start_hash, __u32 start_minor_hash)
-{
-	struct buffer_head *bh;
-	struct ext3_dir_entry_2 *de, *top;
-	int err = 0, count = 0;
-
-	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
-
-	if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
-		return err;
-
-	de = (struct ext3_dir_entry_2 *) bh->b_data;
-	top = (struct ext3_dir_entry_2 *) ((char *) de +
-					   dir->i_sb->s_blocksize -
-					   EXT3_DIR_REC_LEN(0));
-	for (; de < top; de = ext3_next_entry(de)) {
-		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
-					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
-						+((char *)de - bh->b_data))) {
-			/* silently ignore the rest of the block */
-			break;
-		}
-		ext3fs_dirhash(de->name, de->name_len, hinfo);
-		if ((hinfo->hash < start_hash) ||
-		    ((hinfo->hash == start_hash) &&
-		     (hinfo->minor_hash < start_minor_hash)))
-			continue;
-		if (de->inode == 0)
-			continue;
-		if ((err = ext3_htree_store_dirent(dir_file,
-				   hinfo->hash, hinfo->minor_hash, de)) != 0) {
-			brelse(bh);
-			return err;
-		}
-		count++;
-	}
-	brelse(bh);
-	return count;
-}
-
-
-/*
- * This function fills a red-black tree with information from a
- * directory.  We start scanning the directory in hash order, starting
- * at start_hash and start_minor_hash.
- *
- * This function returns the number of entries inserted into the tree,
- * or a negative error code.
- */
-int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
-			 __u32 start_minor_hash, __u32 *next_hash)
-{
-	struct dx_hash_info hinfo;
-	struct ext3_dir_entry_2 *de;
-	struct dx_frame frames[2], *frame;
-	struct inode *dir;
-	int block, err;
-	int count = 0;
-	int ret;
-	__u32 hashval;
-
-	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
-		       start_minor_hash));
-	dir = file_inode(dir_file);
-	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
-		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
-		if (hinfo.hash_version <= DX_HASH_TEA)
-			hinfo.hash_version +=
-				EXT3_SB(dir->i_sb)->s_hash_unsigned;
-		hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
-		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
-					       start_hash, start_minor_hash);
-		*next_hash = ~0;
-		return count;
-	}
-	hinfo.hash = start_hash;
-	hinfo.minor_hash = 0;
-	frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
-	if (!frame)
-		return err;
-
-	/* Add '.' and '..' from the htree header */
-	if (!start_hash && !start_minor_hash) {
-		de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
-		if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
-			goto errout;
-		count++;
-	}
-	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
-		de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
-		de = ext3_next_entry(de);
-		if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
-			goto errout;
-		count++;
-	}
-
-	while (1) {
-		block = dx_get_block(frame->at);
-		ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
-					     start_hash, start_minor_hash);
-		if (ret < 0) {
-			err = ret;
-			goto errout;
-		}
-		count += ret;
-		hashval = ~0;
-		ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
-					    frame, frames, &hashval);
-		*next_hash = hashval;
-		if (ret < 0) {
-			err = ret;
-			goto errout;
-		}
-		/*
-		 * Stop if:  (a) there are no more entries, or
-		 * (b) we have inserted at least one entry and the
-		 * next hash value is not a continuation
-		 */
-		if ((ret == 0) ||
-		    (count && ((hashval & 1) == 0)))
-			break;
-	}
-	dx_release(frames);
-	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
-		       count, *next_hash));
-	return count;
-errout:
-	dx_release(frames);
-	return (err);
-}
-
-
-/*
- * Directory block splitting, compacting
- */
-
-/*
- * Create map of hash values, offsets, and sizes, stored at end of block.
- * Returns number of entries mapped.
- */
-static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
-		struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
-{
-	int count = 0;
-	char *base = (char *) de;
-	struct dx_hash_info h = *hinfo;
-
-	while ((char *) de < base + blocksize)
-	{
-		if (de->name_len && de->inode) {
-			ext3fs_dirhash(de->name, de->name_len, &h);
-			map_tail--;
-			map_tail->hash = h.hash;
-			map_tail->offs = (u16) ((char *) de - base);
-			map_tail->size = le16_to_cpu(de->rec_len);
-			count++;
-			cond_resched();
-		}
-		/* XXX: do we need to check rec_len == 0 case? -Chris */
-		de = ext3_next_entry(de);
-	}
-	return count;
-}
-
-/* Sort map by hash value */
-static void dx_sort_map (struct dx_map_entry *map, unsigned count)
-{
-        struct dx_map_entry *p, *q, *top = map + count - 1;
-        int more;
-        /* Combsort until bubble sort doesn't suck */
-        while (count > 2)
-	{
-                count = count*10/13;
-                if (count - 9 < 2) /* 9, 10 -> 11 */
-                        count = 11;
-                for (p = top, q = p - count; q >= map; p--, q--)
-                        if (p->hash < q->hash)
-                                swap(*p, *q);
-        }
-        /* Garden variety bubble sort */
-        do {
-                more = 0;
-                q = top;
-                while (q-- > map)
-		{
-                        if (q[1].hash >= q[0].hash)
-				continue;
-                        swap(*(q+1), *q);
-                        more = 1;
-		}
-	} while(more);
-}
-
-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
-{
-	struct dx_entry *entries = frame->entries;
-	struct dx_entry *old = frame->at, *new = old + 1;
-	int count = dx_get_count(entries);
-
-	assert(count < dx_get_limit(entries));
-	assert(old < entries + count);
-	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
-	dx_set_hash(new, hash);
-	dx_set_block(new, block);
-	dx_set_count(entries, count + 1);
-}
-
-static void ext3_update_dx_flag(struct inode *inode)
-{
-	if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
-				     EXT3_FEATURE_COMPAT_DIR_INDEX))
-		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
-}
-
-/*
- * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
- *
- * `len <= EXT3_NAME_LEN' is guaranteed by caller.
- * `de != NULL' is guaranteed by caller.
- */
-static inline int ext3_match (int len, const char * const name,
-			      struct ext3_dir_entry_2 * de)
-{
-	if (len != de->name_len)
-		return 0;
-	if (!de->inode)
-		return 0;
-	return !memcmp(name, de->name, len);
-}
-
-/*
- * Returns 0 if not found, -1 on failure, and 1 on success
- */
-static inline int search_dirblock(struct buffer_head * bh,
-				  struct inode *dir,
-				  struct qstr *child,
-				  unsigned long offset,
-				  struct ext3_dir_entry_2 ** res_dir)
-{
-	struct ext3_dir_entry_2 * de;
-	char * dlimit;
-	int de_len;
-	const char *name = child->name;
-	int namelen = child->len;
-
-	de = (struct ext3_dir_entry_2 *) bh->b_data;
-	dlimit = bh->b_data + dir->i_sb->s_blocksize;
-	while ((char *) de < dlimit) {
-		/* this code is executed quadratically often */
-		/* do minimal checking `by hand' */
-
-		if ((char *) de + namelen <= dlimit &&
-		    ext3_match (namelen, name, de)) {
-			/* found a match - just to be sure, do a full check */
-			if (!ext3_check_dir_entry("ext3_find_entry",
-						  dir, de, bh, offset))
-				return -1;
-			*res_dir = de;
-			return 1;
-		}
-		/* prevent looping on a bad block */
-		de_len = ext3_rec_len_from_disk(de->rec_len);
-		if (de_len <= 0)
-			return -1;
-		offset += de_len;
-		de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
-	}
-	return 0;
-}
-
-
-/*
- *	ext3_find_entry()
- *
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_dir). It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- *
- * The returned buffer_head has ->b_count elevated.  The caller is expected
- * to brelse() it when appropriate.
- */
-static struct buffer_head *ext3_find_entry(struct inode *dir,
-					struct qstr *entry,
-					struct ext3_dir_entry_2 **res_dir)
-{
-	struct super_block * sb;
-	struct buffer_head * bh_use[NAMEI_RA_SIZE];
-	struct buffer_head * bh, *ret = NULL;
-	unsigned long start, block, b;
-	const u8 *name = entry->name;
-	int ra_max = 0;		/* Number of bh's in the readahead
-				   buffer, bh_use[] */
-	int ra_ptr = 0;		/* Current index into readahead
-				   buffer */
-	int num = 0;
-	int nblocks, i, err;
-	int namelen;
-
-	*res_dir = NULL;
-	sb = dir->i_sb;
-	namelen = entry->len;
-	if (namelen > EXT3_NAME_LEN)
-		return NULL;
-	if ((namelen <= 2) && (name[0] == '.') &&
-	    (name[1] == '.' || name[1] == 0)) {
-		/*
-		 * "." or ".." will only be in the first block
-		 * NFS may look up ".."; "." should be handled by the VFS
-		 */
-		block = start = 0;
-		nblocks = 1;
-		goto restart;
-	}
-	if (is_dx(dir)) {
-		bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
-		/*
-		 * On success, or if the error was file not found,
-		 * return.  Otherwise, fall back to doing a search the
-		 * old fashioned way.
-		 */
-		if (bh || (err != ERR_BAD_DX_DIR))
-			return bh;
-		dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
-	}
-	nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
-	start = EXT3_I(dir)->i_dir_start_lookup;
-	if (start >= nblocks)
-		start = 0;
-	block = start;
-restart:
-	do {
-		/*
-		 * We deal with the read-ahead logic here.
-		 */
-		if (ra_ptr >= ra_max) {
-			/* Refill the readahead buffer */
-			ra_ptr = 0;
-			b = block;
-			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
-				/*
-				 * Terminate if we reach the end of the
-				 * directory and must wrap, or if our
-				 * search has finished at this block.
-				 */
-				if (b >= nblocks || (num && block == start)) {
-					bh_use[ra_max] = NULL;
-					break;
-				}
-				num++;
-				bh = ext3_getblk(NULL, dir, b++, 0, &err);
-				bh_use[ra_max] = bh;
-				if (bh && !bh_uptodate_or_lock(bh)) {
-					get_bh(bh);
-					bh->b_end_io = end_buffer_read_sync;
-					submit_bh(READ | REQ_META | REQ_PRIO,
-						  bh);
-				}
-			}
-		}
-		if ((bh = bh_use[ra_ptr++]) == NULL)
-			goto next;
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
-			/* read error, skip block & hope for the best */
-			ext3_error(sb, __func__, "reading directory #%lu "
-				   "offset %lu", dir->i_ino, block);
-			brelse(bh);
-			goto next;
-		}
-		i = search_dirblock(bh, dir, entry,
-			    block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
-		if (i == 1) {
-			EXT3_I(dir)->i_dir_start_lookup = block;
-			ret = bh;
-			goto cleanup_and_exit;
-		} else {
-			brelse(bh);
-			if (i < 0)
-				goto cleanup_and_exit;
-		}
-	next:
-		if (++block >= nblocks)
-			block = 0;
-	} while (block != start);
-
-	/*
-	 * If the directory has grown while we were searching, then
-	 * search the last part of the directory before giving up.
-	 */
-	block = nblocks;
-	nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
-	if (block < nblocks) {
-		start = 0;
-		goto restart;
-	}
-
-cleanup_and_exit:
-	/* Clean up the read-ahead blocks */
-	for (; ra_ptr < ra_max; ra_ptr++)
-		brelse (bh_use[ra_ptr]);
-	return ret;
-}
-
-static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
-			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
-			int *err)
-{
-	struct super_block *sb = dir->i_sb;
-	struct dx_hash_info	hinfo;
-	struct dx_frame frames[2], *frame;
-	struct buffer_head *bh;
-	unsigned long block;
-	int retval;
-
-	if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
-		return NULL;
-	do {
-		block = dx_get_block(frame->at);
-		if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
-			goto errout;
-
-		retval = search_dirblock(bh, dir, entry,
-					 block << EXT3_BLOCK_SIZE_BITS(sb),
-					 res_dir);
-		if (retval == 1) {
-			dx_release(frames);
-			return bh;
-		}
-		brelse(bh);
-		if (retval == -1) {
-			*err = ERR_BAD_DX_DIR;
-			goto errout;
-		}
-
-		/* Check to see if we should continue to search */
-		retval = ext3_htree_next_block(dir, hinfo.hash, frame,
-					       frames, NULL);
-		if (retval < 0) {
-			ext3_warning(sb, __func__,
-			     "error reading index page in directory #%lu",
-			     dir->i_ino);
-			*err = retval;
-			goto errout;
-		}
-	} while (retval == 1);
-
-	*err = -ENOENT;
-errout:
-	dxtrace(printk("%s not found\n", entry->name));
-	dx_release (frames);
-	return NULL;
-}
-
-static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
-{
-	struct inode * inode;
-	struct ext3_dir_entry_2 * de;
-	struct buffer_head * bh;
-
-	if (dentry->d_name.len > EXT3_NAME_LEN)
-		return ERR_PTR(-ENAMETOOLONG);
-
-	bh = ext3_find_entry(dir, &dentry->d_name, &de);
-	inode = NULL;
-	if (bh) {
-		unsigned long ino = le32_to_cpu(de->inode);
-		brelse (bh);
-		if (!ext3_valid_inum(dir->i_sb, ino)) {
-			ext3_error(dir->i_sb, "ext3_lookup",
-				   "bad inode number: %lu", ino);
-			return ERR_PTR(-EIO);
-		}
-		inode = ext3_iget(dir->i_sb, ino);
-		if (inode == ERR_PTR(-ESTALE)) {
-			ext3_error(dir->i_sb, __func__,
-					"deleted inode referenced: %lu",
-					ino);
-			return ERR_PTR(-EIO);
-		}
-	}
-	return d_splice_alias(inode, dentry);
-}
-
-
-struct dentry *ext3_get_parent(struct dentry *child)
-{
-	unsigned long ino;
-	struct qstr dotdot = QSTR_INIT("..", 2);
-	struct ext3_dir_entry_2 * de;
-	struct buffer_head *bh;
-
-	bh = ext3_find_entry(d_inode(child), &dotdot, &de);
-	if (!bh)
-		return ERR_PTR(-ENOENT);
-	ino = le32_to_cpu(de->inode);
-	brelse(bh);
-
-	if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) {
-		ext3_error(d_inode(child)->i_sb, "ext3_get_parent",
-			   "bad inode number: %lu", ino);
-		return ERR_PTR(-EIO);
-	}
-
-	return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino));
-}
-
-#define S_SHIFT 12
-static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
-	[S_IFREG >> S_SHIFT]	= EXT3_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]	= EXT3_FT_DIR,
-	[S_IFCHR >> S_SHIFT]	= EXT3_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]	= EXT3_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]	= EXT3_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT]	= EXT3_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]	= EXT3_FT_SYMLINK,
-};
-
-static inline void ext3_set_de_type(struct super_block *sb,
-				struct ext3_dir_entry_2 *de,
-				umode_t mode) {
-	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
-		de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-}
-
-/*
- * Move count entries from end of map between two memory locations.
- * Returns pointer to last entry moved.
- */
-static struct ext3_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
-{
-	unsigned rec_len = 0;
-
-	while (count--) {
-		struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
-		rec_len = EXT3_DIR_REC_LEN(de->name_len);
-		memcpy (to, de, rec_len);
-		((struct ext3_dir_entry_2 *) to)->rec_len =
-				ext3_rec_len_to_disk(rec_len);
-		de->inode = 0;
-		map++;
-		to += rec_len;
-	}
-	return (struct ext3_dir_entry_2 *) (to - rec_len);
-}
-
-/*
- * Compact each dir entry in the range to the minimal rec_len.
- * Returns pointer to last entry in range.
- */
-static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
-{
-	struct ext3_dir_entry_2 *next, *to, *prev;
-	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
-	unsigned rec_len = 0;
-
-	prev = to = de;
-	while ((char *)de < base + blocksize) {
-		next = ext3_next_entry(de);
-		if (de->inode && de->name_len) {
-			rec_len = EXT3_DIR_REC_LEN(de->name_len);
-			if (de > to)
-				memmove(to, de, rec_len);
-			to->rec_len = ext3_rec_len_to_disk(rec_len);
-			prev = to;
-			to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
-		}
-		de = next;
-	}
-	return prev;
-}
-
-/*
- * Split a full leaf block to make room for a new dir entry.
- * Allocate a new block, and move entries so that they are approx. equally full.
- * Returns pointer to de in block into which the new entry will be inserted.
- */
-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
-			struct buffer_head **bh,struct dx_frame *frame,
-			struct dx_hash_info *hinfo, int *error)
-{
-	unsigned blocksize = dir->i_sb->s_blocksize;
-	unsigned count, continued;
-	struct buffer_head *bh2;
-	u32 newblock;
-	u32 hash2;
-	struct dx_map_entry *map;
-	char *data1 = (*bh)->b_data, *data2;
-	unsigned split, move, size;
-	struct ext3_dir_entry_2 *de = NULL, *de2;
-	int	err = 0, i;
-
-	bh2 = ext3_append (handle, dir, &newblock, &err);
-	if (!(bh2)) {
-		brelse(*bh);
-		*bh = NULL;
-		goto errout;
-	}
-
-	BUFFER_TRACE(*bh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, *bh);
-	if (err)
-		goto journal_error;
-
-	BUFFER_TRACE(frame->bh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, frame->bh);
-	if (err)
-		goto journal_error;
-
-	data2 = bh2->b_data;
-
-	/* create map in the end of data2 block */
-	map = (struct dx_map_entry *) (data2 + blocksize);
-	count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
-			     blocksize, hinfo, map);
-	map -= count;
-	dx_sort_map (map, count);
-	/* Split the existing block in the middle, size-wise */
-	size = 0;
-	move = 0;
-	for (i = count-1; i >= 0; i--) {
-		/* is more than half of this entry in 2nd half of the block? */
-		if (size + map[i].size/2 > blocksize/2)
-			break;
-		size += map[i].size;
-		move++;
-	}
-	/* map index at which we will split */
-	split = count - move;
-	hash2 = map[split].hash;
-	continued = hash2 == map[split - 1].hash;
-	dxtrace(printk("Split block %i at %x, %i/%i\n",
-		dx_get_block(frame->at), hash2, split, count-split));
-
-	/* Fancy dance to stay within two buffers */
-	de2 = dx_move_dirents(data1, data2, map + split, count - split);
-	de = dx_pack_dirents(data1,blocksize);
-	de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
-	de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
-	dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
-	dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
-
-	/* Which block gets the new entry? */
-	if (hinfo->hash >= hash2)
-	{
-		swap(*bh, bh2);
-		de = de2;
-	}
-	dx_insert_block (frame, hash2 + continued, newblock);
-	err = ext3_journal_dirty_metadata (handle, bh2);
-	if (err)
-		goto journal_error;
-	err = ext3_journal_dirty_metadata (handle, frame->bh);
-	if (err)
-		goto journal_error;
-	brelse (bh2);
-	dxtrace(dx_show_index ("frame", frame->entries));
-	return de;
-
-journal_error:
-	brelse(*bh);
-	brelse(bh2);
-	*bh = NULL;
-	ext3_std_error(dir->i_sb, err);
-errout:
-	*error = err;
-	return NULL;
-}
-
-
-/*
- * Add a new entry into a directory (leaf) block.  If de is non-NULL,
- * it points to a directory entry which is guaranteed to be large
- * enough for new directory entry.  If de is NULL, then
- * add_dirent_to_buf will attempt search the directory block for
- * space.  It will return -ENOSPC if no space is available, and -EIO
- * and -EEXIST if directory entry already exists.
- *
- * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
- * all other cases bh is released.
- */
-static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
-			     struct inode *inode, struct ext3_dir_entry_2 *de,
-			     struct buffer_head * bh)
-{
-	struct inode	*dir = d_inode(dentry->d_parent);
-	const char	*name = dentry->d_name.name;
-	int		namelen = dentry->d_name.len;
-	unsigned long	offset = 0;
-	unsigned short	reclen;
-	int		nlen, rlen, err;
-	char		*top;
-
-	reclen = EXT3_DIR_REC_LEN(namelen);
-	if (!de) {
-		de = (struct ext3_dir_entry_2 *)bh->b_data;
-		top = bh->b_data + dir->i_sb->s_blocksize - reclen;
-		while ((char *) de <= top) {
-			if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
-						  bh, offset)) {
-				brelse (bh);
-				return -EIO;
-			}
-			if (ext3_match (namelen, name, de)) {
-				brelse (bh);
-				return -EEXIST;
-			}
-			nlen = EXT3_DIR_REC_LEN(de->name_len);
-			rlen = ext3_rec_len_from_disk(de->rec_len);
-			if ((de->inode? rlen - nlen: rlen) >= reclen)
-				break;
-			de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
-			offset += rlen;
-		}
-		if ((char *) de > top)
-			return -ENOSPC;
-	}
-	BUFFER_TRACE(bh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, bh);
-	if (err) {
-		ext3_std_error(dir->i_sb, err);
-		brelse(bh);
-		return err;
-	}
-
-	/* By now the buffer is marked for journaling */
-	nlen = EXT3_DIR_REC_LEN(de->name_len);
-	rlen = ext3_rec_len_from_disk(de->rec_len);
-	if (de->inode) {
-		struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
-		de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
-		de->rec_len = ext3_rec_len_to_disk(nlen);
-		de = de1;
-	}
-	de->file_type = EXT3_FT_UNKNOWN;
-	if (inode) {
-		de->inode = cpu_to_le32(inode->i_ino);
-		ext3_set_de_type(dir->i_sb, de, inode->i_mode);
-	} else
-		de->inode = 0;
-	de->name_len = namelen;
-	memcpy (de->name, name, namelen);
-	/*
-	 * XXX shouldn't update any times until successful
-	 * completion of syscall, but too many callers depend
-	 * on this.
-	 *
-	 * XXX similarly, too many callers depend on
-	 * ext3_new_inode() setting the times, but error
-	 * recovery deletes the inode, so the worst that can
-	 * happen is that the times are slightly out of date
-	 * and/or different from the directory change time.
-	 */
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-	ext3_update_dx_flag(dir);
-	dir->i_version++;
-	ext3_mark_inode_dirty(handle, dir);
-	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-	err = ext3_journal_dirty_metadata(handle, bh);
-	if (err)
-		ext3_std_error(dir->i_sb, err);
-	brelse(bh);
-	return 0;
-}
-
-/*
- * This converts a one block unindexed directory to a 3 block indexed
- * directory, and adds the dentry to the indexed directory.
- */
-static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
-			    struct inode *inode, struct buffer_head *bh)
-{
-	struct inode	*dir = d_inode(dentry->d_parent);
-	const char	*name = dentry->d_name.name;
-	int		namelen = dentry->d_name.len;
-	struct buffer_head *bh2;
-	struct dx_root	*root;
-	struct dx_frame	frames[2], *frame;
-	struct dx_entry *entries;
-	struct ext3_dir_entry_2	*de, *de2;
-	char		*data1, *top;
-	unsigned	len;
-	int		retval;
-	unsigned	blocksize;
-	struct dx_hash_info hinfo;
-	u32		block;
-	struct fake_dirent *fde;
-
-	blocksize =  dir->i_sb->s_blocksize;
-	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
-	retval = ext3_journal_get_write_access(handle, bh);
-	if (retval) {
-		ext3_std_error(dir->i_sb, retval);
-		brelse(bh);
-		return retval;
-	}
-	root = (struct dx_root *) bh->b_data;
-
-	/* The 0th block becomes the root, move the dirents out */
-	fde = &root->dotdot;
-	de = (struct ext3_dir_entry_2 *)((char *)fde +
-			ext3_rec_len_from_disk(fde->rec_len));
-	if ((char *) de >= (((char *) root) + blocksize)) {
-		ext3_error(dir->i_sb, __func__,
-			   "invalid rec_len for '..' in inode %lu",
-			   dir->i_ino);
-		brelse(bh);
-		return -EIO;
-	}
-	len = ((char *) root) + blocksize - (char *) de;
-
-	bh2 = ext3_append (handle, dir, &block, &retval);
-	if (!(bh2)) {
-		brelse(bh);
-		return retval;
-	}
-	EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
-	data1 = bh2->b_data;
-
-	memcpy (data1, de, len);
-	de = (struct ext3_dir_entry_2 *) data1;
-	top = data1 + len;
-	while ((char *)(de2 = ext3_next_entry(de)) < top)
-		de = de2;
-	de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
-	/* Initialize the root; the dot dirents already exist */
-	de = (struct ext3_dir_entry_2 *) (&root->dotdot);
-	de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
-	memset (&root->info, 0, sizeof(root->info));
-	root->info.info_length = sizeof(root->info);
-	root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
-	entries = root->entries;
-	dx_set_block (entries, 1);
-	dx_set_count (entries, 1);
-	dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
-
-	/* Initialize as for dx_probe */
-	hinfo.hash_version = root->info.hash_version;
-	if (hinfo.hash_version <= DX_HASH_TEA)
-		hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
-	hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
-	ext3fs_dirhash(name, namelen, &hinfo);
-	frame = frames;
-	frame->entries = entries;
-	frame->at = entries;
-	frame->bh = bh;
-	bh = bh2;
-	/*
-	 * Mark buffers dirty here so that if do_split() fails we write a
-	 * consistent set of buffers to disk.
-	 */
-	ext3_journal_dirty_metadata(handle, frame->bh);
-	ext3_journal_dirty_metadata(handle, bh);
-	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-	if (!de) {
-		ext3_mark_inode_dirty(handle, dir);
-		dx_release(frames);
-		return retval;
-	}
-	dx_release(frames);
-
-	return add_dirent_to_buf(handle, dentry, inode, de, bh);
-}
-
-/*
- *	ext3_add_entry()
- *
- * adds a file entry to the specified directory, using the same
- * semantics as ext3_find_entry(). It returns NULL if it failed.
- *
- * NOTE!! The inode part of 'de' is left at 0 - which means you
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
-static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
-	struct inode *inode)
-{
-	struct inode *dir = d_inode(dentry->d_parent);
-	struct buffer_head * bh;
-	struct ext3_dir_entry_2 *de;
-	struct super_block * sb;
-	int	retval;
-	int	dx_fallback=0;
-	unsigned blocksize;
-	u32 block, blocks;
-
-	sb = dir->i_sb;
-	blocksize = sb->s_blocksize;
-	if (!dentry->d_name.len)
-		return -EINVAL;
-	if (is_dx(dir)) {
-		retval = ext3_dx_add_entry(handle, dentry, inode);
-		if (!retval || (retval != ERR_BAD_DX_DIR))
-			return retval;
-		EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
-		dx_fallback++;
-		ext3_mark_inode_dirty(handle, dir);
-	}
-	blocks = dir->i_size >> sb->s_blocksize_bits;
-	for (block = 0; block < blocks; block++) {
-		if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
-			return retval;
-
-		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-		if (retval != -ENOSPC)
-			return retval;
-
-		if (blocks == 1 && !dx_fallback &&
-		    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
-			return make_indexed_dir(handle, dentry, inode, bh);
-		brelse(bh);
-	}
-	bh = ext3_append(handle, dir, &block, &retval);
-	if (!bh)
-		return retval;
-	de = (struct ext3_dir_entry_2 *) bh->b_data;
-	de->inode = 0;
-	de->rec_len = ext3_rec_len_to_disk(blocksize);
-	return add_dirent_to_buf(handle, dentry, inode, de, bh);
-}
-
-/*
- * Returns 0 for success, or a negative error value
- */
-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
-			     struct inode *inode)
-{
-	struct dx_frame frames[2], *frame;
-	struct dx_entry *entries, *at;
-	struct dx_hash_info hinfo;
-	struct buffer_head * bh;
-	struct inode *dir = d_inode(dentry->d_parent);
-	struct super_block * sb = dir->i_sb;
-	struct ext3_dir_entry_2 *de;
-	int err;
-
-	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
-	if (!frame)
-		return err;
-	entries = frame->entries;
-	at = frame->at;
-
-	if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
-		goto cleanup;
-
-	BUFFER_TRACE(bh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, bh);
-	if (err)
-		goto journal_error;
-
-	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-	if (err != -ENOSPC) {
-		bh = NULL;
-		goto cleanup;
-	}
-
-	/* Block full, should compress but for now just split */
-	dxtrace(printk("using %u of %u node entries\n",
-		       dx_get_count(entries), dx_get_limit(entries)));
-	/* Need to split index? */
-	if (dx_get_count(entries) == dx_get_limit(entries)) {
-		u32 newblock;
-		unsigned icount = dx_get_count(entries);
-		int levels = frame - frames;
-		struct dx_entry *entries2;
-		struct dx_node *node2;
-		struct buffer_head *bh2;
-
-		if (levels && (dx_get_count(frames->entries) ==
-			       dx_get_limit(frames->entries))) {
-			ext3_warning(sb, __func__,
-				     "Directory index full!");
-			err = -ENOSPC;
-			goto cleanup;
-		}
-		bh2 = ext3_append (handle, dir, &newblock, &err);
-		if (!(bh2))
-			goto cleanup;
-		node2 = (struct dx_node *)(bh2->b_data);
-		entries2 = node2->entries;
-		memset(&node2->fake, 0, sizeof(struct fake_dirent));
-		node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
-		BUFFER_TRACE(frame->bh, "get_write_access");
-		err = ext3_journal_get_write_access(handle, frame->bh);
-		if (err)
-			goto journal_error;
-		if (levels) {
-			unsigned icount1 = icount/2, icount2 = icount - icount1;
-			unsigned hash2 = dx_get_hash(entries + icount1);
-			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
-
-			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
-			err = ext3_journal_get_write_access(handle,
-							     frames[0].bh);
-			if (err)
-				goto journal_error;
-
-			memcpy ((char *) entries2, (char *) (entries + icount1),
-				icount2 * sizeof(struct dx_entry));
-			dx_set_count (entries, icount1);
-			dx_set_count (entries2, icount2);
-			dx_set_limit (entries2, dx_node_limit(dir));
-
-			/* Which index block gets the new entry? */
-			if (at - entries >= icount1) {
-				frame->at = at = at - entries - icount1 + entries2;
-				frame->entries = entries = entries2;
-				swap(frame->bh, bh2);
-			}
-			dx_insert_block (frames + 0, hash2, newblock);
-			dxtrace(dx_show_index ("node", frames[1].entries));
-			dxtrace(dx_show_index ("node",
-			       ((struct dx_node *) bh2->b_data)->entries));
-			err = ext3_journal_dirty_metadata(handle, bh2);
-			if (err)
-				goto journal_error;
-			brelse (bh2);
-		} else {
-			dxtrace(printk("Creating second level index...\n"));
-			memcpy((char *) entries2, (char *) entries,
-			       icount * sizeof(struct dx_entry));
-			dx_set_limit(entries2, dx_node_limit(dir));
-
-			/* Set up root */
-			dx_set_count(entries, 1);
-			dx_set_block(entries + 0, newblock);
-			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-
-			/* Add new access path frame */
-			frame = frames + 1;
-			frame->at = at = at - entries + entries2;
-			frame->entries = entries = entries2;
-			frame->bh = bh2;
-			err = ext3_journal_get_write_access(handle,
-							     frame->bh);
-			if (err)
-				goto journal_error;
-		}
-		err = ext3_journal_dirty_metadata(handle, frames[0].bh);
-		if (err)
-			goto journal_error;
-	}
-	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
-	if (!de)
-		goto cleanup;
-	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-	bh = NULL;
-	goto cleanup;
-
-journal_error:
-	ext3_std_error(dir->i_sb, err);
-cleanup:
-	if (bh)
-		brelse(bh);
-	dx_release(frames);
-	return err;
-}
-
-/*
- * ext3_delete_entry deletes a directory entry by merging it with the
- * previous entry
- */
-static int ext3_delete_entry (handle_t *handle,
-			      struct inode * dir,
-			      struct ext3_dir_entry_2 * de_del,
-			      struct buffer_head * bh)
-{
-	struct ext3_dir_entry_2 * de, * pde;
-	int i;
-
-	i = 0;
-	pde = NULL;
-	de = (struct ext3_dir_entry_2 *) bh->b_data;
-	while (i < bh->b_size) {
-		if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
-			return -EIO;
-		if (de == de_del)  {
-			int err;
-
-			BUFFER_TRACE(bh, "get_write_access");
-			err = ext3_journal_get_write_access(handle, bh);
-			if (err)
-				goto journal_error;
-
-			if (pde)
-				pde->rec_len = ext3_rec_len_to_disk(
-					ext3_rec_len_from_disk(pde->rec_len) +
-					ext3_rec_len_from_disk(de->rec_len));
-			else
-				de->inode = 0;
-			dir->i_version++;
-			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-			err = ext3_journal_dirty_metadata(handle, bh);
-			if (err) {
-journal_error:
-				ext3_std_error(dir->i_sb, err);
-				return err;
-			}
-			return 0;
-		}
-		i += ext3_rec_len_from_disk(de->rec_len);
-		pde = de;
-		de = ext3_next_entry(de);
-	}
-	return -ENOENT;
-}
-
-static int ext3_add_nondir(handle_t *handle,
-		struct dentry *dentry, struct inode *inode)
-{
-	int err = ext3_add_entry(handle, dentry, inode);
-	if (!err) {
-		ext3_mark_inode_dirty(handle, inode);
-		unlock_new_inode(inode);
-		d_instantiate(dentry, inode);
-		return 0;
-	}
-	drop_nlink(inode);
-	unlock_new_inode(inode);
-	iput(inode);
-	return err;
-}
-
-/*
- * By the time this is called, we already have created
- * the directory cache entry for the new file, but it
- * is so far negative - it has no inode.
- *
- * If the create succeeds, we fill in the inode information
- * with d_instantiate().
- */
-static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
-		bool excl)
-{
-	handle_t *handle;
-	struct inode * inode;
-	int err, retries = 0;
-
-	dquot_initialize(dir);
-
-retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
-
-	inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
-	err = PTR_ERR(inode);
-	if (!IS_ERR(inode)) {
-		inode->i_op = &ext3_file_inode_operations;
-		inode->i_fop = &ext3_file_operations;
-		ext3_set_aops(inode);
-		err = ext3_add_nondir(handle, dentry, inode);
-	}
-	ext3_journal_stop(handle);
-	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
-		goto retry;
-	return err;
-}
-
-static int ext3_mknod (struct inode * dir, struct dentry *dentry,
-			umode_t mode, dev_t rdev)
-{
-	handle_t *handle;
-	struct inode *inode;
-	int err, retries = 0;
-
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
-	dquot_initialize(dir);
-
-retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
-
-	inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
-	err = PTR_ERR(inode);
-	if (!IS_ERR(inode)) {
-		init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT3_FS_XATTR
-		inode->i_op = &ext3_special_inode_operations;
-#endif
-		err = ext3_add_nondir(handle, dentry, inode);
-	}
-	ext3_journal_stop(handle);
-	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
-		goto retry;
-	return err;
-}
-
-static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	handle_t *handle;
-	struct inode *inode;
-	int err, retries = 0;
-
-	dquot_initialize(dir);
-
-retry:
-	handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
-			  4 + EXT3_XATTR_TRANS_BLOCKS);
-
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	inode = ext3_new_inode (handle, dir, NULL, mode);
-	err = PTR_ERR(inode);
-	if (!IS_ERR(inode)) {
-		inode->i_op = &ext3_file_inode_operations;
-		inode->i_fop = &ext3_file_operations;
-		ext3_set_aops(inode);
-		d_tmpfile(dentry, inode);
-		err = ext3_orphan_add(handle, inode);
-		if (err)
-			goto err_unlock_inode;
-		mark_inode_dirty(inode);
-		unlock_new_inode(inode);
-	}
-	ext3_journal_stop(handle);
-	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
-		goto retry;
-	return err;
-err_unlock_inode:
-	ext3_journal_stop(handle);
-	unlock_new_inode(inode);
-	return err;
-}
-
-static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
-{
-	handle_t *handle;
-	struct inode * inode;
-	struct buffer_head * dir_block = NULL;
-	struct ext3_dir_entry_2 * de;
-	int err, retries = 0;
-
-	if (dir->i_nlink >= EXT3_LINK_MAX)
-		return -EMLINK;
-
-	dquot_initialize(dir);
-
-retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
-
-	inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out_stop;
-
-	inode->i_op = &ext3_dir_inode_operations;
-	inode->i_fop = &ext3_dir_operations;
-	inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-	if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
-		goto out_clear_inode;
-
-	BUFFER_TRACE(dir_block, "get_write_access");
-	err = ext3_journal_get_write_access(handle, dir_block);
-	if (err)
-		goto out_clear_inode;
-
-	de = (struct ext3_dir_entry_2 *) dir_block->b_data;
-	de->inode = cpu_to_le32(inode->i_ino);
-	de->name_len = 1;
-	de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
-	strcpy (de->name, ".");
-	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
-	de = ext3_next_entry(de);
-	de->inode = cpu_to_le32(dir->i_ino);
-	de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
-					EXT3_DIR_REC_LEN(1));
-	de->name_len = 2;
-	strcpy (de->name, "..");
-	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
-	set_nlink(inode, 2);
-	BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
-	err = ext3_journal_dirty_metadata(handle, dir_block);
-	if (err)
-		goto out_clear_inode;
-
-	err = ext3_mark_inode_dirty(handle, inode);
-	if (!err)
-		err = ext3_add_entry (handle, dentry, inode);
-
-	if (err) {
-out_clear_inode:
-		clear_nlink(inode);
-		unlock_new_inode(inode);
-		ext3_mark_inode_dirty(handle, inode);
-		iput (inode);
-		goto out_stop;
-	}
-	inc_nlink(dir);
-	ext3_update_dx_flag(dir);
-	err = ext3_mark_inode_dirty(handle, dir);
-	if (err)
-		goto out_clear_inode;
-
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
-out_stop:
-	brelse(dir_block);
-	ext3_journal_stop(handle);
-	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
-		goto retry;
-	return err;
-}
-
-/*
- * routine to check that the specified directory is empty (for rmdir)
- */
-static int empty_dir (struct inode * inode)
-{
-	unsigned long offset;
-	struct buffer_head * bh;
-	struct ext3_dir_entry_2 * de, * de1;
-	struct super_block * sb;
-	int err = 0;
-
-	sb = inode->i_sb;
-	if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
-	    !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
-		if (err)
-			ext3_error(inode->i_sb, __func__,
-				   "error %d reading directory #%lu offset 0",
-				   err, inode->i_ino);
-		else
-			ext3_warning(inode->i_sb, __func__,
-				     "bad directory (dir #%lu) - no data block",
-				     inode->i_ino);
-		return 1;
-	}
-	de = (struct ext3_dir_entry_2 *) bh->b_data;
-	de1 = ext3_next_entry(de);
-	if (le32_to_cpu(de->inode) != inode->i_ino ||
-			!le32_to_cpu(de1->inode) ||
-			strcmp (".", de->name) ||
-			strcmp ("..", de1->name)) {
-		ext3_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no `.' or `..'",
-			      inode->i_ino);
-		brelse (bh);
-		return 1;
-	}
-	offset = ext3_rec_len_from_disk(de->rec_len) +
-			ext3_rec_len_from_disk(de1->rec_len);
-	de = ext3_next_entry(de1);
-	while (offset < inode->i_size ) {
-		if (!bh ||
-			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
-			err = 0;
-			brelse (bh);
-			if (!(bh = ext3_dir_bread (NULL, inode,
-				offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
-				if (err)
-					ext3_error(sb, __func__,
-						   "error %d reading directory"
-						   " #%lu offset %lu",
-						   err, inode->i_ino, offset);
-				offset += sb->s_blocksize;
-				continue;
-			}
-			de = (struct ext3_dir_entry_2 *) bh->b_data;
-		}
-		if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
-			de = (struct ext3_dir_entry_2 *)(bh->b_data +
-							 sb->s_blocksize);
-			offset = (offset | (sb->s_blocksize - 1)) + 1;
-			continue;
-		}
-		if (le32_to_cpu(de->inode)) {
-			brelse (bh);
-			return 0;
-		}
-		offset += ext3_rec_len_from_disk(de->rec_len);
-		de = ext3_next_entry(de);
-	}
-	brelse (bh);
-	return 1;
-}
-
-/* ext3_orphan_add() links an unlinked or truncated inode into a list of
- * such inodes, starting at the superblock, in case we crash before the
- * file is closed/deleted, or in case the inode truncate spans multiple
- * transactions and the last transaction is not recovered after a crash.
- *
- * At filesystem recovery time, we walk this list deleting unlinked
- * inodes and truncating linked inodes in ext3_orphan_cleanup().
- */
-int ext3_orphan_add(handle_t *handle, struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct ext3_iloc iloc;
-	int err = 0, rc;
-
-	mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
-	if (!list_empty(&EXT3_I(inode)->i_orphan))
-		goto out_unlock;
-
-	/* Orphan handling is only valid for files with data blocks
-	 * being truncated, or files being unlinked. */
-
-	/* @@@ FIXME: Observation from aviro:
-	 * I think I can trigger J_ASSERT in ext3_orphan_add().  We block
-	 * here (on s_orphan_lock), so race with ext3_link() which might bump
-	 * ->i_nlink. For, say it, character device. Not a regular file,
-	 * not a directory, not a symlink and ->i_nlink > 0.
-	 *
-	 * tytso, 4/25/2009: I'm not sure how that could happen;
-	 * shouldn't the fs core protect us from these sort of
-	 * unlink()/link() races?
-	 */
-	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
-
-	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-	if (err)
-		goto out_unlock;
-
-	err = ext3_reserve_inode_write(handle, inode, &iloc);
-	if (err)
-		goto out_unlock;
-
-	/* Insert this inode at the head of the on-disk orphan list... */
-	NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
-	EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
-	if (!err)
-		err = rc;
-
-	/* Only add to the head of the in-memory list if all the
-	 * previous operations succeeded.  If the orphan_add is going to
-	 * fail (possibly taking the journal offline), we can't risk
-	 * leaving the inode on the orphan list: stray orphan-list
-	 * entries can cause panics at unmount time.
-	 *
-	 * This is safe: on error we're going to ignore the orphan list
-	 * anyway on the next recovery. */
-	if (!err)
-		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-
-	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
-	jbd_debug(4, "orphan inode %lu will point to %d\n",
-			inode->i_ino, NEXT_ORPHAN(inode));
-out_unlock:
-	mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
-	ext3_std_error(inode->i_sb, err);
-	return err;
-}
-
-/*
- * ext3_orphan_del() removes an unlinked or truncated inode from the list
- * of such inodes stored on disk, because it is finally being cleaned up.
- */
-int ext3_orphan_del(handle_t *handle, struct inode *inode)
-{
-	struct list_head *prev;
-	struct ext3_inode_info *ei = EXT3_I(inode);
-	struct ext3_sb_info *sbi;
-	unsigned long ino_next;
-	struct ext3_iloc iloc;
-	int err = 0;
-
-	mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
-	if (list_empty(&ei->i_orphan))
-		goto out;
-
-	ino_next = NEXT_ORPHAN(inode);
-	prev = ei->i_orphan.prev;
-	sbi = EXT3_SB(inode->i_sb);
-
-	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
-
-	list_del_init(&ei->i_orphan);
-
-	/* If we're on an error path, we may not have a valid
-	 * transaction handle with which to update the orphan list on
-	 * disk, but we still need to remove the inode from the linked
-	 * list in memory. */
-	if (!handle)
-		goto out;
-
-	err = ext3_reserve_inode_write(handle, inode, &iloc);
-	if (err)
-		goto out_err;
-
-	if (prev == &sbi->s_orphan) {
-		jbd_debug(4, "superblock will point to %lu\n", ino_next);
-		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
-		err = ext3_journal_get_write_access(handle, sbi->s_sbh);
-		if (err)
-			goto out_brelse;
-		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-		err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-	} else {
-		struct ext3_iloc iloc2;
-		struct inode *i_prev =
-			&list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
-
-		jbd_debug(4, "orphan inode %lu will point to %lu\n",
-			  i_prev->i_ino, ino_next);
-		err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
-		if (err)
-			goto out_brelse;
-		NEXT_ORPHAN(i_prev) = ino_next;
-		err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
-	}
-	if (err)
-		goto out_brelse;
-	NEXT_ORPHAN(inode) = 0;
-	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-
-out_err:
-	ext3_std_error(inode->i_sb, err);
-out:
-	mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
-	return err;
-
-out_brelse:
-	brelse(iloc.bh);
-	goto out_err;
-}
-
-static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
-{
-	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext3_dir_entry_2 * de;
-	handle_t *handle;
-
-	/* Initialize quotas before so that eventual writes go in
-	 * separate transaction */
-	dquot_initialize(dir);
-	dquot_initialize(d_inode(dentry));
-
-	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	retval = -ENOENT;
-	bh = ext3_find_entry(dir, &dentry->d_name, &de);
-	if (!bh)
-		goto end_rmdir;
-
-	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
-
-	inode = d_inode(dentry);
-
-	retval = -EIO;
-	if (le32_to_cpu(de->inode) != inode->i_ino)
-		goto end_rmdir;
-
-	retval = -ENOTEMPTY;
-	if (!empty_dir (inode))
-		goto end_rmdir;
-
-	retval = ext3_delete_entry(handle, dir, de, bh);
-	if (retval)
-		goto end_rmdir;
-	if (inode->i_nlink != 2)
-		ext3_warning (inode->i_sb, "ext3_rmdir",
-			      "empty directory has nlink!=2 (%d)",
-			      inode->i_nlink);
-	inode->i_version++;
-	clear_nlink(inode);
-	/* There's no need to set i_disksize: the fact that i_nlink is
-	 * zero will ensure that the right thing happens during any
-	 * recovery. */
-	inode->i_size = 0;
-	ext3_orphan_add(handle, inode);
-	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-	ext3_mark_inode_dirty(handle, inode);
-	drop_nlink(dir);
-	ext3_update_dx_flag(dir);
-	ext3_mark_inode_dirty(handle, dir);
-
-end_rmdir:
-	ext3_journal_stop(handle);
-	brelse (bh);
-	return retval;
-}
-
-static int ext3_unlink(struct inode * dir, struct dentry *dentry)
-{
-	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext3_dir_entry_2 * de;
-	handle_t *handle;
-
-	trace_ext3_unlink_enter(dir, dentry);
-	/* Initialize quotas before so that eventual writes go
-	 * in separate transaction */
-	dquot_initialize(dir);
-	dquot_initialize(d_inode(dentry));
-
-	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
-
-	retval = -ENOENT;
-	bh = ext3_find_entry(dir, &dentry->d_name, &de);
-	if (!bh)
-		goto end_unlink;
-
-	inode = d_inode(dentry);
-
-	retval = -EIO;
-	if (le32_to_cpu(de->inode) != inode->i_ino)
-		goto end_unlink;
-
-	if (!inode->i_nlink) {
-		ext3_warning (inode->i_sb, "ext3_unlink",
-			      "Deleting nonexistent file (%lu), %d",
-			      inode->i_ino, inode->i_nlink);
-		set_nlink(inode, 1);
-	}
-	retval = ext3_delete_entry(handle, dir, de, bh);
-	if (retval)
-		goto end_unlink;
-	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-	ext3_update_dx_flag(dir);
-	ext3_mark_inode_dirty(handle, dir);
-	drop_nlink(inode);
-	if (!inode->i_nlink)
-		ext3_orphan_add(handle, inode);
-	inode->i_ctime = dir->i_ctime;
-	ext3_mark_inode_dirty(handle, inode);
-	retval = 0;
-
-end_unlink:
-	ext3_journal_stop(handle);
-	brelse (bh);
-	trace_ext3_unlink_exit(dentry, retval);
-	return retval;
-}
-
-static int ext3_symlink (struct inode * dir,
-		struct dentry *dentry, const char * symname)
-{
-	handle_t *handle;
-	struct inode * inode;
-	int l, err, retries = 0;
-	int credits;
-
-	l = strlen(symname)+1;
-	if (l > dir->i_sb->s_blocksize)
-		return -ENAMETOOLONG;
-
-	dquot_initialize(dir);
-
-	if (l > EXT3_N_BLOCKS * 4) {
-		/*
-		 * For non-fast symlinks, we just allocate inode and put it on
-		 * orphan list in the first transaction => we need bitmap,
-		 * group descriptor, sb, inode block, quota blocks, and
-		 * possibly selinux xattr blocks.
-		 */
-		credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
-			  EXT3_XATTR_TRANS_BLOCKS;
-	} else {
-		/*
-		 * Fast symlink. We have to add entry to directory
-		 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
-		 * allocate new inode (bitmap, group descriptor, inode block,
-		 * quota blocks, sb is already counted in previous macros).
-		 */
-		credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-			  EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-			  EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
-	}
-retry:
-	handle = ext3_journal_start(dir, credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
-
-	inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out_stop;
-
-	if (l > EXT3_N_BLOCKS * 4) {
-		inode->i_op = &ext3_symlink_inode_operations;
-		ext3_set_aops(inode);
-		/*
-		 * We cannot call page_symlink() with transaction started
-		 * because it calls into ext3_write_begin() which acquires page
-		 * lock which ranks below transaction start (and it can also
-		 * wait for journal commit if we are running out of space). So
-		 * we have to stop transaction now and restart it when symlink
-		 * contents is written. 
-		 *
-		 * To keep fs consistent in case of crash, we have to put inode
-		 * to orphan list in the mean time.
-		 */
-		drop_nlink(inode);
-		err = ext3_orphan_add(handle, inode);
-		ext3_journal_stop(handle);
-		if (err)
-			goto err_drop_inode;
-		err = __page_symlink(inode, symname, l, 1);
-		if (err)
-			goto err_drop_inode;
-		/*
-		 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
-		 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
-		 */
-		handle = ext3_journal_start(dir,
-				EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-				EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
-		if (IS_ERR(handle)) {
-			err = PTR_ERR(handle);
-			goto err_drop_inode;
-		}
-		set_nlink(inode, 1);
-		err = ext3_orphan_del(handle, inode);
-		if (err) {
-			ext3_journal_stop(handle);
-			drop_nlink(inode);
-			goto err_drop_inode;
-		}
-	} else {
-		inode->i_op = &ext3_fast_symlink_inode_operations;
-		inode->i_link = (char*)&EXT3_I(inode)->i_data;
-		memcpy(inode->i_link, symname, l);
-		inode->i_size = l-1;
-	}
-	EXT3_I(inode)->i_disksize = inode->i_size;
-	err = ext3_add_nondir(handle, dentry, inode);
-out_stop:
-	ext3_journal_stop(handle);
-	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
-		goto retry;
-	return err;
-err_drop_inode:
-	unlock_new_inode(inode);
-	iput(inode);
-	return err;
-}
-
-static int ext3_link (struct dentry * old_dentry,
-		struct inode * dir, struct dentry *dentry)
-{
-	handle_t *handle;
-	struct inode *inode = d_inode(old_dentry);
-	int err, retries = 0;
-
-	if (inode->i_nlink >= EXT3_LINK_MAX)
-		return -EMLINK;
-
-	dquot_initialize(dir);
-
-retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
-
-	inode->i_ctime = CURRENT_TIME_SEC;
-	inc_nlink(inode);
-	ihold(inode);
-
-	err = ext3_add_entry(handle, dentry, inode);
-	if (!err) {
-		ext3_mark_inode_dirty(handle, inode);
-		/* this can happen only for tmpfile being
-		 * linked the first time
-		 */
-		if (inode->i_nlink == 1)
-			ext3_orphan_del(handle, inode);
-		d_instantiate(dentry, inode);
-	} else {
-		drop_nlink(inode);
-		iput(inode);
-	}
-	ext3_journal_stop(handle);
-	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
-		goto retry;
-	return err;
-}
-
-#define PARENT_INO(buffer) \
-	(ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
-
-/*
- * Anybody can rename anything with this: the permission checks are left to the
- * higher-level routines.
- */
-static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
-			   struct inode * new_dir,struct dentry *new_dentry)
-{
-	handle_t *handle;
-	struct inode * old_inode, * new_inode;
-	struct buffer_head * old_bh, * new_bh, * dir_bh;
-	struct ext3_dir_entry_2 * old_de, * new_de;
-	int retval, flush_file = 0;
-
-	dquot_initialize(old_dir);
-	dquot_initialize(new_dir);
-
-	old_bh = new_bh = dir_bh = NULL;
-
-	/* Initialize quotas before so that eventual writes go
-	 * in separate transaction */
-	if (d_really_is_positive(new_dentry))
-		dquot_initialize(d_inode(new_dentry));
-	handle = ext3_journal_start(old_dir, 2 *
-					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-		handle->h_sync = 1;
-
-	old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
-	/*
-	 *  Check for inode number is _not_ due to possible IO errors.
-	 *  We might rmdir the source, keep it as pwd of some process
-	 *  and merrily kill the link to whatever was created under the
-	 *  same name. Goodbye sticky bit ;-<
-	 */
-	old_inode = d_inode(old_dentry);
-	retval = -ENOENT;
-	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
-		goto end_rename;
-
-	new_inode = d_inode(new_dentry);
-	new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
-	if (new_bh) {
-		if (!new_inode) {
-			brelse (new_bh);
-			new_bh = NULL;
-		}
-	}
-	if (S_ISDIR(old_inode->i_mode)) {
-		if (new_inode) {
-			retval = -ENOTEMPTY;
-			if (!empty_dir (new_inode))
-				goto end_rename;
-		}
-		retval = -EIO;
-		dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
-		if (!dir_bh)
-			goto end_rename;
-		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
-			goto end_rename;
-		retval = -EMLINK;
-		if (!new_inode && new_dir!=old_dir &&
-				new_dir->i_nlink >= EXT3_LINK_MAX)
-			goto end_rename;
-	}
-	if (!new_bh) {
-		retval = ext3_add_entry (handle, new_dentry, old_inode);
-		if (retval)
-			goto end_rename;
-	} else {
-		BUFFER_TRACE(new_bh, "get write access");
-		retval = ext3_journal_get_write_access(handle, new_bh);
-		if (retval)
-			goto journal_error;
-		new_de->inode = cpu_to_le32(old_inode->i_ino);
-		if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
-					      EXT3_FEATURE_INCOMPAT_FILETYPE))
-			new_de->file_type = old_de->file_type;
-		new_dir->i_version++;
-		new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
-		ext3_mark_inode_dirty(handle, new_dir);
-		BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
-		retval = ext3_journal_dirty_metadata(handle, new_bh);
-		if (retval)
-			goto journal_error;
-		brelse(new_bh);
-		new_bh = NULL;
-	}
-
-	/*
-	 * Like most other Unix systems, set the ctime for inodes on a
-	 * rename.
-	 */
-	old_inode->i_ctime = CURRENT_TIME_SEC;
-	ext3_mark_inode_dirty(handle, old_inode);
-
-	/*
-	 * ok, that's it
-	 */
-	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
-	    old_de->name_len != old_dentry->d_name.len ||
-	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
-	    (retval = ext3_delete_entry(handle, old_dir,
-					old_de, old_bh)) == -ENOENT) {
-		/* old_de could have moved from under us during htree split, so
-		 * make sure that we are deleting the right entry.  We might
-		 * also be pointing to a stale entry in the unused part of
-		 * old_bh so just checking inum and the name isn't enough. */
-		struct buffer_head *old_bh2;
-		struct ext3_dir_entry_2 *old_de2;
-
-		old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
-					  &old_de2);
-		if (old_bh2) {
-			retval = ext3_delete_entry(handle, old_dir,
-						   old_de2, old_bh2);
-			brelse(old_bh2);
-		}
-	}
-	if (retval) {
-		ext3_warning(old_dir->i_sb, "ext3_rename",
-				"Deleting old file (%lu), %d, error=%d",
-				old_dir->i_ino, old_dir->i_nlink, retval);
-	}
-
-	if (new_inode) {
-		drop_nlink(new_inode);
-		new_inode->i_ctime = CURRENT_TIME_SEC;
-	}
-	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-	ext3_update_dx_flag(old_dir);
-	if (dir_bh) {
-		BUFFER_TRACE(dir_bh, "get_write_access");
-		retval = ext3_journal_get_write_access(handle, dir_bh);
-		if (retval)
-			goto journal_error;
-		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
-		BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
-		retval = ext3_journal_dirty_metadata(handle, dir_bh);
-		if (retval) {
-journal_error:
-			ext3_std_error(new_dir->i_sb, retval);
-			goto end_rename;
-		}
-		drop_nlink(old_dir);
-		if (new_inode) {
-			drop_nlink(new_inode);
-		} else {
-			inc_nlink(new_dir);
-			ext3_update_dx_flag(new_dir);
-			ext3_mark_inode_dirty(handle, new_dir);
-		}
-	}
-	ext3_mark_inode_dirty(handle, old_dir);
-	if (new_inode) {
-		ext3_mark_inode_dirty(handle, new_inode);
-		if (!new_inode->i_nlink)
-			ext3_orphan_add(handle, new_inode);
-		if (ext3_should_writeback_data(new_inode))
-			flush_file = 1;
-	}
-	retval = 0;
-
-end_rename:
-	brelse (dir_bh);
-	brelse (old_bh);
-	brelse (new_bh);
-	ext3_journal_stop(handle);
-	if (retval == 0 && flush_file)
-		filemap_flush(old_inode->i_mapping);
-	return retval;
-}
-
-/*
- * directories can handle most operations...
- */
-const struct inode_operations ext3_dir_inode_operations = {
-	.create		= ext3_create,
-	.lookup		= ext3_lookup,
-	.link		= ext3_link,
-	.unlink		= ext3_unlink,
-	.symlink	= ext3_symlink,
-	.mkdir		= ext3_mkdir,
-	.rmdir		= ext3_rmdir,
-	.mknod		= ext3_mknod,
-	.tmpfile	= ext3_tmpfile,
-	.rename		= ext3_rename,
-	.setattr	= ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
-	.listxattr	= ext3_listxattr,
-	.removexattr	= generic_removexattr,
-#endif
-	.get_acl	= ext3_get_acl,
-	.set_acl	= ext3_set_acl,
-};
-
-const struct inode_operations ext3_special_inode_operations = {
-	.setattr	= ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
-	.listxattr	= ext3_listxattr,
-	.removexattr	= generic_removexattr,
-#endif
-	.get_acl	= ext3_get_acl,
-	.set_acl	= ext3_set_acl,
-};
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
deleted file mode 100644
index 46304d8c9f0a..000000000000
--- a/fs/ext3/namei.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*  linux/fs/ext3/namei.h
- *
- * Copyright (C) 2005 Simtec Electronics
- *	Ben Dooks <ben@simtec.co.uk>
- *
-*/
-
-extern struct dentry *ext3_get_parent(struct dentry *child);
-
-static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
-						 struct inode *inode,
-						 int block, int create,
-						 int *err)
-{
-	struct buffer_head *bh;
-
-	bh = ext3_bread(handle, inode, block, create, err);
-
-	if (!bh && !(*err)) {
-		*err = -EIO;
-		ext3_error(inode->i_sb, __func__,
-			   "Directory hole detected on inode %lu\n",
-			   inode->i_ino);
-		return NULL;
-	}
-	return bh;
-}
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
deleted file mode 100644
index 27105655502c..000000000000
--- a/fs/ext3/resize.c
+++ /dev/null
@@ -1,1117 +0,0 @@
-/*
- *  linux/fs/ext3/resize.c
- *
- * Support for resizing an ext3 filesystem while it is mounted.
- *
- * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
- *
- * This could probably be made into a module, because it is not often in use.
- */
-
-
-#define EXT3FS_DEBUG
-
-#include "ext3.h"
-
-
-#define outside(b, first, last)	((b) < (first) || (b) >= (last))
-#define inside(b, first, last)	((b) >= (first) && (b) < (last))
-
-static int verify_group_input(struct super_block *sb,
-			      struct ext3_new_group_data *input)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	struct ext3_super_block *es = sbi->s_es;
-	ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
-	ext3_fsblk_t end = start + input->blocks_count;
-	unsigned group = input->group;
-	ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
-	unsigned overhead = ext3_bg_has_super(sb, group) ?
-		(1 + ext3_bg_num_gdb(sb, group) +
-		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
-	ext3_fsblk_t metaend = start + overhead;
-	struct buffer_head *bh = NULL;
-	ext3_grpblk_t free_blocks_count;
-	int err = -EINVAL;
-
-	input->free_blocks_count = free_blocks_count =
-		input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
-
-	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks "
-		       "(%d free, %u reserved)\n",
-		       ext3_bg_has_super(sb, input->group) ? "normal" :
-		       "no-super", input->group, input->blocks_count,
-		       free_blocks_count, input->reserved_blocks);
-
-	if (group != sbi->s_groups_count)
-		ext3_warning(sb, __func__,
-			     "Cannot add at group %u (only %lu groups)",
-			     input->group, sbi->s_groups_count);
-	else if ((start - le32_to_cpu(es->s_first_data_block)) %
-		 EXT3_BLOCKS_PER_GROUP(sb))
-		ext3_warning(sb, __func__, "Last group not full");
-	else if (input->reserved_blocks > input->blocks_count / 5)
-		ext3_warning(sb, __func__, "Reserved blocks too high (%u)",
-			     input->reserved_blocks);
-	else if (free_blocks_count < 0)
-		ext3_warning(sb, __func__, "Bad blocks count %u",
-			     input->blocks_count);
-	else if (!(bh = sb_bread(sb, end - 1)))
-		ext3_warning(sb, __func__,
-			     "Cannot read last block ("E3FSBLK")",
-			     end - 1);
-	else if (outside(input->block_bitmap, start, end))
-		ext3_warning(sb, __func__,
-			     "Block bitmap not in group (block %u)",
-			     input->block_bitmap);
-	else if (outside(input->inode_bitmap, start, end))
-		ext3_warning(sb, __func__,
-			     "Inode bitmap not in group (block %u)",
-			     input->inode_bitmap);
-	else if (outside(input->inode_table, start, end) ||
-	         outside(itend - 1, start, end))
-		ext3_warning(sb, __func__,
-			     "Inode table not in group (blocks %u-"E3FSBLK")",
-			     input->inode_table, itend - 1);
-	else if (input->inode_bitmap == input->block_bitmap)
-		ext3_warning(sb, __func__,
-			     "Block bitmap same as inode bitmap (%u)",
-			     input->block_bitmap);
-	else if (inside(input->block_bitmap, input->inode_table, itend))
-		ext3_warning(sb, __func__,
-			     "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
-			     input->block_bitmap, input->inode_table, itend-1);
-	else if (inside(input->inode_bitmap, input->inode_table, itend))
-		ext3_warning(sb, __func__,
-			     "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
-			     input->inode_bitmap, input->inode_table, itend-1);
-	else if (inside(input->block_bitmap, start, metaend))
-		ext3_warning(sb, __func__,
-			     "Block bitmap (%u) in GDT table"
-			     " ("E3FSBLK"-"E3FSBLK")",
-			     input->block_bitmap, start, metaend - 1);
-	else if (inside(input->inode_bitmap, start, metaend))
-		ext3_warning(sb, __func__,
-			     "Inode bitmap (%u) in GDT table"
-			     " ("E3FSBLK"-"E3FSBLK")",
-			     input->inode_bitmap, start, metaend - 1);
-	else if (inside(input->inode_table, start, metaend) ||
-	         inside(itend - 1, start, metaend))
-		ext3_warning(sb, __func__,
-			     "Inode table (%u-"E3FSBLK") overlaps"
-			     "GDT table ("E3FSBLK"-"E3FSBLK")",
-			     input->inode_table, itend - 1, start, metaend - 1);
-	else
-		err = 0;
-	brelse(bh);
-
-	return err;
-}
-
-static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
-				  ext3_fsblk_t blk)
-{
-	struct buffer_head *bh;
-	int err;
-
-	bh = sb_getblk(sb, blk);
-	if (unlikely(!bh))
-		return ERR_PTR(-ENOMEM);
-	if ((err = ext3_journal_get_write_access(handle, bh))) {
-		brelse(bh);
-		bh = ERR_PTR(err);
-	} else {
-		lock_buffer(bh);
-		memset(bh->b_data, 0, sb->s_blocksize);
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-	}
-
-	return bh;
-}
-
-/*
- * To avoid calling the atomic setbit hundreds or thousands of times, we only
- * need to use it within a single byte (to ensure we get endianness right).
- * We can use memset for the rest of the bitmap as there are no other users.
- */
-static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
-{
-	int i;
-
-	if (start_bit >= end_bit)
-		return;
-
-	ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
-	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
-		ext3_set_bit(i, bitmap);
-	if (i < end_bit)
-		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
-}
-
-/*
- * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA.
- * If that fails, restart the transaction & regain write access for the
- * buffer head which is used for block_bitmap modifications.
- */
-static int extend_or_restart_transaction(handle_t *handle, int thresh,
-					 struct buffer_head *bh)
-{
-	int err;
-
-	if (handle->h_buffer_credits >= thresh)
-		return 0;
-
-	err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA);
-	if (err < 0)
-		return err;
-	if (err) {
-		err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA);
-		if (err)
-			return err;
-		err = ext3_journal_get_write_access(handle, bh);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
-/*
- * Set up the block and inode bitmaps, and the inode table for the new group.
- * This doesn't need to be part of the main transaction, since we are only
- * changing blocks outside the actual filesystem.  We still do journaling to
- * ensure the recovery is correct in case of a failure just after resize.
- * If any part of this fails, we simply abort the resize.
- */
-static int setup_new_group_blocks(struct super_block *sb,
-				  struct ext3_new_group_data *input)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
-	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
-		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
-	unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
-	struct buffer_head *bh;
-	handle_t *handle;
-	ext3_fsblk_t block;
-	ext3_grpblk_t bit;
-	int i;
-	int err = 0, err2;
-
-	/* This transaction may be extended/restarted along the way */
-	handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
-
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	mutex_lock(&sbi->s_resize_lock);
-	if (input->group != sbi->s_groups_count) {
-		err = -EBUSY;
-		goto exit_journal;
-	}
-
-	if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
-		err = PTR_ERR(bh);
-		goto exit_journal;
-	}
-
-	if (ext3_bg_has_super(sb, input->group)) {
-		ext3_debug("mark backup superblock %#04lx (+0)\n", start);
-		ext3_set_bit(0, bh->b_data);
-	}
-
-	/* Copy all of the GDT blocks into the backup in this group */
-	for (i = 0, bit = 1, block = start + 1;
-	     i < gdblocks; i++, block++, bit++) {
-		struct buffer_head *gdb;
-
-		ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
-
-		err = extend_or_restart_transaction(handle, 1, bh);
-		if (err)
-			goto exit_bh;
-
-		gdb = sb_getblk(sb, block);
-		if (unlikely(!gdb)) {
-			err = -ENOMEM;
-			goto exit_bh;
-		}
-		if ((err = ext3_journal_get_write_access(handle, gdb))) {
-			brelse(gdb);
-			goto exit_bh;
-		}
-		lock_buffer(gdb);
-		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
-		set_buffer_uptodate(gdb);
-		unlock_buffer(gdb);
-		err = ext3_journal_dirty_metadata(handle, gdb);
-		if (err) {
-			brelse(gdb);
-			goto exit_bh;
-		}
-		ext3_set_bit(bit, bh->b_data);
-		brelse(gdb);
-	}
-
-	/* Zero out all of the reserved backup group descriptor table blocks */
-	for (i = 0, bit = gdblocks + 1, block = start + bit;
-	     i < reserved_gdb; i++, block++, bit++) {
-		struct buffer_head *gdb;
-
-		ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit);
-
-		err = extend_or_restart_transaction(handle, 1, bh);
-		if (err)
-			goto exit_bh;
-
-		if (IS_ERR(gdb = bclean(handle, sb, block))) {
-			err = PTR_ERR(gdb);
-			goto exit_bh;
-		}
-		err = ext3_journal_dirty_metadata(handle, gdb);
-		if (err) {
-			brelse(gdb);
-			goto exit_bh;
-		}
-		ext3_set_bit(bit, bh->b_data);
-		brelse(gdb);
-	}
-	ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
-		   input->block_bitmap - start);
-	ext3_set_bit(input->block_bitmap - start, bh->b_data);
-	ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
-		   input->inode_bitmap - start);
-	ext3_set_bit(input->inode_bitmap - start, bh->b_data);
-
-	/* Zero out all of the inode table blocks */
-	for (i = 0, block = input->inode_table, bit = block - start;
-	     i < sbi->s_itb_per_group; i++, bit++, block++) {
-		struct buffer_head *it;
-
-		ext3_debug("clear inode block %#04lx (+%d)\n", block, bit);
-
-		err = extend_or_restart_transaction(handle, 1, bh);
-		if (err)
-			goto exit_bh;
-
-		if (IS_ERR(it = bclean(handle, sb, block))) {
-			err = PTR_ERR(it);
-			goto exit_bh;
-		}
-		err = ext3_journal_dirty_metadata(handle, it);
-		if (err) {
-			brelse(it);
-			goto exit_bh;
-		}
-		brelse(it);
-		ext3_set_bit(bit, bh->b_data);
-	}
-
-	err = extend_or_restart_transaction(handle, 2, bh);
-	if (err)
-		goto exit_bh;
-
-	mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
-			bh->b_data);
-	err = ext3_journal_dirty_metadata(handle, bh);
-	if (err)
-		goto exit_bh;
-	brelse(bh);
-
-	/* Mark unused entries in inode bitmap used */
-	ext3_debug("clear inode bitmap %#04x (+%ld)\n",
-		   input->inode_bitmap, input->inode_bitmap - start);
-	if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
-		err = PTR_ERR(bh);
-		goto exit_journal;
-	}
-
-	mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
-			bh->b_data);
-	err = ext3_journal_dirty_metadata(handle, bh);
-exit_bh:
-	brelse(bh);
-
-exit_journal:
-	mutex_unlock(&sbi->s_resize_lock);
-	if ((err2 = ext3_journal_stop(handle)) && !err)
-		err = err2;
-
-	return err;
-}
-
-/*
- * Iterate through the groups which hold BACKUP superblock/GDT copies in an
- * ext3 filesystem.  The counters should be initialized to 1, 5, and 7 before
- * calling this for the first time.  In a sparse filesystem it will be the
- * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
- * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
- */
-static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
-				  unsigned *five, unsigned *seven)
-{
-	unsigned *min = three;
-	int mult = 3;
-	unsigned ret;
-
-	if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
-					EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
-		ret = *min;
-		*min += 1;
-		return ret;
-	}
-
-	if (*five < *min) {
-		min = five;
-		mult = 5;
-	}
-	if (*seven < *min) {
-		min = seven;
-		mult = 7;
-	}
-
-	ret = *min;
-	*min *= mult;
-
-	return ret;
-}
-
-/*
- * Check that all of the backup GDT blocks are held in the primary GDT block.
- * It is assumed that they are stored in group order.  Returns the number of
- * groups in current filesystem that have BACKUPS, or -ve error code.
- */
-static int verify_reserved_gdb(struct super_block *sb,
-			       struct buffer_head *primary)
-{
-	const ext3_fsblk_t blk = primary->b_blocknr;
-	const unsigned long end = EXT3_SB(sb)->s_groups_count;
-	unsigned three = 1;
-	unsigned five = 5;
-	unsigned seven = 7;
-	unsigned grp;
-	__le32 *p = (__le32 *)primary->b_data;
-	int gdbackups = 0;
-
-	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
-		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
-			ext3_warning(sb, __func__,
-				     "reserved GDT "E3FSBLK
-				     " missing grp %d ("E3FSBLK")",
-				     blk, grp,
-				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
-			return -EINVAL;
-		}
-		if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb))
-			return -EFBIG;
-	}
-
-	return gdbackups;
-}
-
-/*
- * Called when we need to bring a reserved group descriptor table block into
- * use from the resize inode.  The primary copy of the new GDT block currently
- * is an indirect block (under the double indirect block in the resize inode).
- * The new backup GDT blocks will be stored as leaf blocks in this indirect
- * block, in group order.  Even though we know all the block numbers we need,
- * we check to ensure that the resize inode has actually reserved these blocks.
- *
- * Don't need to update the block bitmaps because the blocks are still in use.
- *
- * We get all of the error cases out of the way, so that we are sure to not
- * fail once we start modifying the data on disk, because JBD has no rollback.
- */
-static int add_new_gdb(handle_t *handle, struct inode *inode,
-		       struct ext3_new_group_data *input,
-		       struct buffer_head **primary)
-{
-	struct super_block *sb = inode->i_sb;
-	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-	unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
-	ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
-	struct buffer_head **o_group_desc, **n_group_desc;
-	struct buffer_head *dind;
-	int gdbackups;
-	struct ext3_iloc iloc;
-	__le32 *data;
-	int err;
-
-	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG
-		       "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n",
-		       gdb_num);
-
-	/*
-	 * If we are not using the primary superblock/GDT copy don't resize,
-	 * because the user tools have no way of handling this.  Probably a
-	 * bad time to do it anyways.
-	 */
-	if (EXT3_SB(sb)->s_sbh->b_blocknr !=
-	    le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
-		ext3_warning(sb, __func__,
-			"won't resize using backup superblock at %llu",
-			(unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
-		return -EPERM;
-	}
-
-	*primary = sb_bread(sb, gdblock);
-	if (!*primary)
-		return -EIO;
-
-	if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
-		err = gdbackups;
-		goto exit_bh;
-	}
-
-	data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
-	dind = sb_bread(sb, le32_to_cpu(*data));
-	if (!dind) {
-		err = -EIO;
-		goto exit_bh;
-	}
-
-	data = (__le32 *)dind->b_data;
-	if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
-		ext3_warning(sb, __func__,
-			     "new group %u GDT block "E3FSBLK" not reserved",
-			     input->group, gdblock);
-		err = -EINVAL;
-		goto exit_dind;
-	}
-
-	if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh)))
-		goto exit_dind;
-
-	if ((err = ext3_journal_get_write_access(handle, *primary)))
-		goto exit_sbh;
-
-	if ((err = ext3_journal_get_write_access(handle, dind)))
-		goto exit_primary;
-
-	/* ext3_reserve_inode_write() gets a reference on the iloc */
-	if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
-		goto exit_dindj;
-
-	n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
-			GFP_NOFS);
-	if (!n_group_desc) {
-		err = -ENOMEM;
-		ext3_warning (sb, __func__,
-			      "not enough memory for %lu groups", gdb_num + 1);
-		goto exit_inode;
-	}
-
-	/*
-	 * Finally, we have all of the possible failures behind us...
-	 *
-	 * Remove new GDT block from inode double-indirect block and clear out
-	 * the new GDT block for use (which also "frees" the backup GDT blocks
-	 * from the reserved inode).  We don't need to change the bitmaps for
-	 * these blocks, because they are marked as in-use from being in the
-	 * reserved inode, and will become GDT blocks (primary and backup).
-	 */
-	data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
-	err = ext3_journal_dirty_metadata(handle, dind);
-	if (err)
-		goto exit_group_desc;
-	brelse(dind);
-	dind = NULL;
-	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
-	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-	if (err)
-		goto exit_group_desc;
-	memset((*primary)->b_data, 0, sb->s_blocksize);
-	err = ext3_journal_dirty_metadata(handle, *primary);
-	if (err)
-		goto exit_group_desc;
-
-	o_group_desc = EXT3_SB(sb)->s_group_desc;
-	memcpy(n_group_desc, o_group_desc,
-	       EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
-	n_group_desc[gdb_num] = *primary;
-	EXT3_SB(sb)->s_group_desc = n_group_desc;
-	EXT3_SB(sb)->s_gdb_count++;
-	kfree(o_group_desc);
-
-	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	if (err)
-		goto exit_inode;
-
-	return 0;
-
-exit_group_desc:
-	kfree(n_group_desc);
-exit_inode:
-	//ext3_journal_release_buffer(handle, iloc.bh);
-	brelse(iloc.bh);
-exit_dindj:
-	//ext3_journal_release_buffer(handle, dind);
-exit_primary:
-	//ext3_journal_release_buffer(handle, *primary);
-exit_sbh:
-	//ext3_journal_release_buffer(handle, *primary);
-exit_dind:
-	brelse(dind);
-exit_bh:
-	brelse(*primary);
-
-	ext3_debug("leaving with error %d\n", err);
-	return err;
-}
-
-/*
- * Called when we are adding a new group which has a backup copy of each of
- * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
- * We need to add these reserved backup GDT blocks to the resize inode, so
- * that they are kept for future resizing and not allocated to files.
- *
- * Each reserved backup GDT block will go into a different indirect block.
- * The indirect blocks are actually the primary reserved GDT blocks,
- * so we know in advance what their block numbers are.  We only get the
- * double-indirect block to verify it is pointing to the primary reserved
- * GDT blocks so we don't overwrite a data block by accident.  The reserved
- * backup GDT blocks are stored in their reserved primary GDT block.
- */
-static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
-			      struct ext3_new_group_data *input)
-{
-	struct super_block *sb = inode->i_sb;
-	int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks);
-	struct buffer_head **primary;
-	struct buffer_head *dind;
-	struct ext3_iloc iloc;
-	ext3_fsblk_t blk;
-	__le32 *data, *end;
-	int gdbackups = 0;
-	int res, i;
-	int err;
-
-	primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
-	if (!primary)
-		return -ENOMEM;
-
-	data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
-	dind = sb_bread(sb, le32_to_cpu(*data));
-	if (!dind) {
-		err = -EIO;
-		goto exit_free;
-	}
-
-	blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
-	data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count %
-					 EXT3_ADDR_PER_BLOCK(sb));
-	end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
-
-	/* Get each reserved primary GDT block and verify it holds backups */
-	for (res = 0; res < reserved_gdb; res++, blk++) {
-		if (le32_to_cpu(*data) != blk) {
-			ext3_warning(sb, __func__,
-				     "reserved block "E3FSBLK
-				     " not at offset %ld",
-				     blk,
-				     (long)(data - (__le32 *)dind->b_data));
-			err = -EINVAL;
-			goto exit_bh;
-		}
-		primary[res] = sb_bread(sb, blk);
-		if (!primary[res]) {
-			err = -EIO;
-			goto exit_bh;
-		}
-		if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
-			brelse(primary[res]);
-			err = gdbackups;
-			goto exit_bh;
-		}
-		if (++data >= end)
-			data = (__le32 *)dind->b_data;
-	}
-
-	for (i = 0; i < reserved_gdb; i++) {
-		if ((err = ext3_journal_get_write_access(handle, primary[i]))) {
-			/*
-			int j;
-			for (j = 0; j < i; j++)
-				ext3_journal_release_buffer(handle, primary[j]);
-			 */
-			goto exit_bh;
-		}
-	}
-
-	if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
-		goto exit_bh;
-
-	/*
-	 * Finally we can add each of the reserved backup GDT blocks from
-	 * the new group to its reserved primary GDT block.
-	 */
-	blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
-	for (i = 0; i < reserved_gdb; i++) {
-		int err2;
-		data = (__le32 *)primary[i]->b_data;
-		/* printk("reserving backup %lu[%u] = %lu\n",
-		       primary[i]->b_blocknr, gdbackups,
-		       blk + primary[i]->b_blocknr); */
-		data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
-		err2 = ext3_journal_dirty_metadata(handle, primary[i]);
-		if (!err)
-			err = err2;
-	}
-	inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
-	ext3_mark_iloc_dirty(handle, inode, &iloc);
-
-exit_bh:
-	while (--res >= 0)
-		brelse(primary[res]);
-	brelse(dind);
-
-exit_free:
-	kfree(primary);
-
-	return err;
-}
-
-/*
- * Update the backup copies of the ext3 metadata.  These don't need to be part
- * of the main resize transaction, because e2fsck will re-write them if there
- * is a problem (basically only OOM will cause a problem).  However, we
- * _should_ update the backups if possible, in case the primary gets trashed
- * for some reason and we need to run e2fsck from a backup superblock.  The
- * important part is that the new block and inode counts are in the backup
- * superblocks, and the location of the new group metadata in the GDT backups.
- *
- * We do not need take the s_resize_lock for this, because these
- * blocks are not otherwise touched by the filesystem code when it is
- * mounted.  We don't need to worry about last changing from
- * sbi->s_groups_count, because the worst that can happen is that we
- * do not copy the full number of backups at this time.  The resize
- * which changed s_groups_count will backup again.
- */
-static void update_backups(struct super_block *sb,
-			   int blk_off, char *data, int size)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	const unsigned long last = sbi->s_groups_count;
-	const int bpg = EXT3_BLOCKS_PER_GROUP(sb);
-	unsigned three = 1;
-	unsigned five = 5;
-	unsigned seven = 7;
-	unsigned group;
-	int rest = sb->s_blocksize - size;
-	handle_t *handle;
-	int err = 0, err2;
-
-	handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
-	if (IS_ERR(handle)) {
-		group = 1;
-		err = PTR_ERR(handle);
-		goto exit_err;
-	}
-
-	while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) {
-		struct buffer_head *bh;
-
-		/* Out of journal space, and can't get more - abort - so sad */
-		if (handle->h_buffer_credits == 0 &&
-		    ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) &&
-		    (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA)))
-			break;
-
-		bh = sb_getblk(sb, group * bpg + blk_off);
-		if (unlikely(!bh)) {
-			err = -ENOMEM;
-			break;
-		}
-		ext3_debug("update metadata backup %#04lx\n",
-			  (unsigned long)bh->b_blocknr);
-		if ((err = ext3_journal_get_write_access(handle, bh))) {
-			brelse(bh);
-			break;
-		}
-		lock_buffer(bh);
-		memcpy(bh->b_data, data, size);
-		if (rest)
-			memset(bh->b_data + size, 0, rest);
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-		err = ext3_journal_dirty_metadata(handle, bh);
-		brelse(bh);
-		if (err)
-			break;
-	}
-	if ((err2 = ext3_journal_stop(handle)) && !err)
-		err = err2;
-
-	/*
-	 * Ugh! Need to have e2fsck write the backup copies.  It is too
-	 * late to revert the resize, we shouldn't fail just because of
-	 * the backup copies (they are only needed in case of corruption).
-	 *
-	 * However, if we got here we have a journal problem too, so we
-	 * can't really start a transaction to mark the superblock.
-	 * Chicken out and just set the flag on the hope it will be written
-	 * to disk, and if not - we will simply wait until next fsck.
-	 */
-exit_err:
-	if (err) {
-		ext3_warning(sb, __func__,
-			     "can't update backup for group %d (err %d), "
-			     "forcing fsck on next reboot", group, err);
-		sbi->s_mount_state &= ~EXT3_VALID_FS;
-		sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
-		mark_buffer_dirty(sbi->s_sbh);
-	}
-}
-
-/* Add group descriptor data to an existing or new group descriptor block.
- * Ensure we handle all possible error conditions _before_ we start modifying
- * the filesystem, because we cannot abort the transaction and not have it
- * write the data to disk.
- *
- * If we are on a GDT block boundary, we need to get the reserved GDT block.
- * Otherwise, we may need to add backup GDT blocks for a sparse group.
- *
- * We only need to hold the superblock lock while we are actually adding
- * in the new group's counts to the superblock.  Prior to that we have
- * not really "added" the group at all.  We re-check that we are still
- * adding in the last group in case things have changed since verifying.
- */
-int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	struct ext3_super_block *es = sbi->s_es;
-	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
-		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
-	struct buffer_head *primary = NULL;
-	struct ext3_group_desc *gdp;
-	struct inode *inode = NULL;
-	handle_t *handle;
-	int gdb_off, gdb_num;
-	int err, err2;
-
-	gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
-	gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb);
-
-	if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
-					EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
-		ext3_warning(sb, __func__,
-			     "Can't resize non-sparse filesystem further");
-		return -EPERM;
-	}
-
-	if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
-	    le32_to_cpu(es->s_blocks_count)) {
-		ext3_warning(sb, __func__, "blocks_count overflow\n");
-		return -EINVAL;
-	}
-
-	if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
-	    le32_to_cpu(es->s_inodes_count)) {
-		ext3_warning(sb, __func__, "inodes_count overflow\n");
-		return -EINVAL;
-	}
-
-	if (reserved_gdb || gdb_off == 0) {
-		if (!EXT3_HAS_COMPAT_FEATURE(sb,
-					     EXT3_FEATURE_COMPAT_RESIZE_INODE)
-		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
-			ext3_warning(sb, __func__,
-				     "No reserved GDT blocks, can't resize");
-			return -EPERM;
-		}
-		inode = ext3_iget(sb, EXT3_RESIZE_INO);
-		if (IS_ERR(inode)) {
-			ext3_warning(sb, __func__,
-				     "Error opening resize inode");
-			return PTR_ERR(inode);
-		}
-	}
-
-	if ((err = verify_group_input(sb, input)))
-		goto exit_put;
-
-	if ((err = setup_new_group_blocks(sb, input)))
-		goto exit_put;
-
-	/*
-	 * We will always be modifying at least the superblock and a GDT
-	 * block.  If we are adding a group past the last current GDT block,
-	 * we will also modify the inode and the dindirect block.  If we
-	 * are adding a group with superblock/GDT backups  we will also
-	 * modify each of the reserved GDT dindirect blocks.
-	 */
-	handle = ext3_journal_start_sb(sb,
-				       ext3_bg_has_super(sb, input->group) ?
-				       3 + reserved_gdb : 4);
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		goto exit_put;
-	}
-
-	mutex_lock(&sbi->s_resize_lock);
-	if (input->group != sbi->s_groups_count) {
-		ext3_warning(sb, __func__,
-			     "multiple resizers run on filesystem!");
-		err = -EBUSY;
-		goto exit_journal;
-	}
-
-	if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh)))
-		goto exit_journal;
-
-	/*
-	 * We will only either add reserved group blocks to a backup group
-	 * or remove reserved blocks for the first group in a new group block.
-	 * Doing both would be mean more complex code, and sane people don't
-	 * use non-sparse filesystems anymore.  This is already checked above.
-	 */
-	if (gdb_off) {
-		primary = sbi->s_group_desc[gdb_num];
-		if ((err = ext3_journal_get_write_access(handle, primary)))
-			goto exit_journal;
-
-		if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) &&
-		    (err = reserve_backup_gdb(handle, inode, input)))
-			goto exit_journal;
-	} else if ((err = add_new_gdb(handle, inode, input, &primary)))
-		goto exit_journal;
-
-	/*
-	 * OK, now we've set up the new group.  Time to make it active.
-	 *
-	 * We do not lock all allocations via s_resize_lock
-	 * so we have to be safe wrt. concurrent accesses the group
-	 * data.  So we need to be careful to set all of the relevant
-	 * group descriptor data etc. *before* we enable the group.
-	 *
-	 * The key field here is sbi->s_groups_count: as long as
-	 * that retains its old value, nobody is going to access the new
-	 * group.
-	 *
-	 * So first we update all the descriptor metadata for the new
-	 * group; then we update the total disk blocks count; then we
-	 * update the groups count to enable the group; then finally we
-	 * update the free space counts so that the system can start
-	 * using the new disk blocks.
-	 */
-
-	/* Update group descriptor block for new group */
-	gdp = (struct ext3_group_desc *)primary->b_data + gdb_off;
-
-	gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap);
-	gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap);
-	gdp->bg_inode_table = cpu_to_le32(input->inode_table);
-	gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
-	gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb));
-
-	/*
-	 * Make the new blocks and inodes valid next.  We do this before
-	 * increasing the group count so that once the group is enabled,
-	 * all of its blocks and inodes are already valid.
-	 *
-	 * We always allocate group-by-group, then block-by-block or
-	 * inode-by-inode within a group, so enabling these
-	 * blocks/inodes before the group is live won't actually let us
-	 * allocate the new space yet.
-	 */
-	le32_add_cpu(&es->s_blocks_count, input->blocks_count);
-	le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb));
-
-	/*
-	 * We need to protect s_groups_count against other CPUs seeing
-	 * inconsistent state in the superblock.
-	 *
-	 * The precise rules we use are:
-	 *
-	 * * Writers of s_groups_count *must* hold s_resize_lock
-	 * AND
-	 * * Writers must perform a smp_wmb() after updating all dependent
-	 *   data and before modifying the groups count
-	 *
-	 * * Readers must hold s_resize_lock over the access
-	 * OR
-	 * * Readers must perform an smp_rmb() after reading the groups count
-	 *   and before reading any dependent data.
-	 *
-	 * NB. These rules can be relaxed when checking the group count
-	 * while freeing data, as we can only allocate from a block
-	 * group after serialising against the group count, and we can
-	 * only then free after serialising in turn against that
-	 * allocation.
-	 */
-	smp_wmb();
-
-	/* Update the global fs size fields */
-	sbi->s_groups_count++;
-
-	err = ext3_journal_dirty_metadata(handle, primary);
-	if (err)
-		goto exit_journal;
-
-	/* Update the reserved block counts only once the new group is
-	 * active. */
-	le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks);
-
-	/* Update the free space counts */
-	percpu_counter_add(&sbi->s_freeblocks_counter,
-			   input->free_blocks_count);
-	percpu_counter_add(&sbi->s_freeinodes_counter,
-			   EXT3_INODES_PER_GROUP(sb));
-
-	err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-
-exit_journal:
-	mutex_unlock(&sbi->s_resize_lock);
-	if ((err2 = ext3_journal_stop(handle)) && !err)
-		err = err2;
-	if (!err) {
-		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
-			       sizeof(struct ext3_super_block));
-		update_backups(sb, primary->b_blocknr, primary->b_data,
-			       primary->b_size);
-	}
-exit_put:
-	iput(inode);
-	return err;
-} /* ext3_group_add */
-
-/* Extend the filesystem to the new number of blocks specified.  This entry
- * point is only used to extend the current filesystem to the end of the last
- * existing group.  It can be accessed via ioctl, or by "remount,resize=<size>"
- * for emergencies (because it has no dependencies on reserved blocks).
- *
- * If we _really_ wanted, we could use default values to call ext3_group_add()
- * allow the "remount" trick to work for arbitrary resizing, assuming enough
- * GDT blocks are reserved to grow to the desired size.
- */
-int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
-		      ext3_fsblk_t n_blocks_count)
-{
-	ext3_fsblk_t o_blocks_count;
-	ext3_grpblk_t last;
-	ext3_grpblk_t add;
-	struct buffer_head * bh;
-	handle_t *handle;
-	int err;
-	unsigned long freed_blocks;
-
-	/* We don't need to worry about locking wrt other resizers just
-	 * yet: we're going to revalidate es->s_blocks_count after
-	 * taking the s_resize_lock below. */
-	o_blocks_count = le32_to_cpu(es->s_blocks_count);
-
-	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
-		       " up to "E3FSBLK" blocks\n",
-		       o_blocks_count, n_blocks_count);
-
-	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
-		return 0;
-
-	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
-		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
-			" too large to resize to "E3FSBLK" blocks safely\n",
-			sb->s_id, n_blocks_count);
-		if (sizeof(sector_t) < 8)
-			ext3_warning(sb, __func__,
-			"CONFIG_LBDAF not enabled\n");
-		return -EINVAL;
-	}
-
-	if (n_blocks_count < o_blocks_count) {
-		ext3_warning(sb, __func__,
-			     "can't shrink FS - resize aborted");
-		return -EBUSY;
-	}
-
-	/* Handle the remaining blocks in the last group only. */
-	last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) %
-		EXT3_BLOCKS_PER_GROUP(sb);
-
-	if (last == 0) {
-		ext3_warning(sb, __func__,
-			     "need to use ext2online to resize further");
-		return -EPERM;
-	}
-
-	add = EXT3_BLOCKS_PER_GROUP(sb) - last;
-
-	if (o_blocks_count + add < o_blocks_count) {
-		ext3_warning(sb, __func__, "blocks_count overflow");
-		return -EINVAL;
-	}
-
-	if (o_blocks_count + add > n_blocks_count)
-		add = n_blocks_count - o_blocks_count;
-
-	if (o_blocks_count + add < n_blocks_count)
-		ext3_warning(sb, __func__,
-			     "will only finish group ("E3FSBLK
-			     " blocks, %u new)",
-			     o_blocks_count + add, add);
-
-	/* See if the device is actually as big as what was requested */
-	bh = sb_bread(sb, o_blocks_count + add -1);
-	if (!bh) {
-		ext3_warning(sb, __func__,
-			     "can't read last block, resize aborted");
-		return -ENOSPC;
-	}
-	brelse(bh);
-
-	/* We will update the superblock, one block bitmap, and
-	 * one group descriptor via ext3_free_blocks().
-	 */
-	handle = ext3_journal_start_sb(sb, 3);
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		ext3_warning(sb, __func__, "error %d on journal start",err);
-		goto exit_put;
-	}
-
-	mutex_lock(&EXT3_SB(sb)->s_resize_lock);
-	if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
-		ext3_warning(sb, __func__,
-			     "multiple resizers run on filesystem!");
-		mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
-		ext3_journal_stop(handle);
-		err = -EBUSY;
-		goto exit_put;
-	}
-
-	if ((err = ext3_journal_get_write_access(handle,
-						 EXT3_SB(sb)->s_sbh))) {
-		ext3_warning(sb, __func__,
-			     "error %d on journal write access", err);
-		mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
-		ext3_journal_stop(handle);
-		goto exit_put;
-	}
-	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
-	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
-	if (err) {
-		ext3_warning(sb, __func__,
-			     "error %d on journal dirty metadata", err);
-		ext3_journal_stop(handle);
-		goto exit_put;
-	}
-	ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
-		   o_blocks_count, o_blocks_count + add);
-	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
-		   o_blocks_count, o_blocks_count + add);
-	if ((err = ext3_journal_stop(handle)))
-		goto exit_put;
-	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n",
-		       le32_to_cpu(es->s_blocks_count));
-	update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es,
-		       sizeof(struct ext3_super_block));
-exit_put:
-	return err;
-} /* ext3_group_extend */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
deleted file mode 100644
index 5ed0044fbb37..000000000000
--- a/fs/ext3/super.c
+++ /dev/null
@@ -1,3165 +0,0 @@
-/*
- *  linux/fs/ext3/super.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/fs/minix/inode.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- */
-
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <linux/parser.h>
-#include <linux/exportfs.h>
-#include <linux/statfs.h>
-#include <linux/random.h>
-#include <linux/mount.h>
-#include <linux/quotaops.h>
-#include <linux/seq_file.h>
-#include <linux/log2.h>
-#include <linux/cleancache.h>
-#include <linux/namei.h>
-
-#include <asm/uaccess.h>
-
-#define CREATE_TRACE_POINTS
-
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-#include "namei.h"
-
-#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
-  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
-#else
-  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
-#endif
-
-static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
-			     unsigned long journal_devnum);
-static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
-			       unsigned int);
-static int ext3_commit_super(struct super_block *sb,
-			       struct ext3_super_block *es,
-			       int sync);
-static void ext3_mark_recovery_complete(struct super_block * sb,
-					struct ext3_super_block * es);
-static void ext3_clear_journal_err(struct super_block * sb,
-				   struct ext3_super_block * es);
-static int ext3_sync_fs(struct super_block *sb, int wait);
-static const char *ext3_decode_error(struct super_block * sb, int errno,
-				     char nbuf[16]);
-static int ext3_remount (struct super_block * sb, int * flags, char * data);
-static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
-static int ext3_unfreeze(struct super_block *sb);
-static int ext3_freeze(struct super_block *sb);
-
-/*
- * Wrappers for journal_start/end.
- */
-handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
-{
-	journal_t *journal;
-
-	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
-	/* Special case here: if the journal has aborted behind our
-	 * backs (eg. EIO in the commit thread), then we still need to
-	 * take the FS itself readonly cleanly. */
-	journal = EXT3_SB(sb)->s_journal;
-	if (is_journal_aborted(journal)) {
-		ext3_abort(sb, __func__,
-			   "Detected aborted journal");
-		return ERR_PTR(-EROFS);
-	}
-
-	return journal_start(journal, nblocks);
-}
-
-int __ext3_journal_stop(const char *where, handle_t *handle)
-{
-	struct super_block *sb;
-	int err;
-	int rc;
-
-	sb = handle->h_transaction->t_journal->j_private;
-	err = handle->h_err;
-	rc = journal_stop(handle);
-
-	if (!err)
-		err = rc;
-	if (err)
-		__ext3_std_error(sb, where, err);
-	return err;
-}
-
-void ext3_journal_abort_handle(const char *caller, const char *err_fn,
-		struct buffer_head *bh, handle_t *handle, int err)
-{
-	char nbuf[16];
-	const char *errstr = ext3_decode_error(NULL, err, nbuf);
-
-	if (bh)
-		BUFFER_TRACE(bh, "abort");
-
-	if (!handle->h_err)
-		handle->h_err = err;
-
-	if (is_handle_aborted(handle))
-		return;
-
-	printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
-		caller, errstr, err_fn);
-
-	journal_abort_handle(handle);
-}
-
-void ext3_msg(struct super_block *sb, const char *prefix,
-		const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
-
-	va_end(args);
-}
-
-/* Deal with the reporting of failure conditions on a filesystem such as
- * inconsistencies detected or read IO failures.
- *
- * On ext2, we can store the error state of the filesystem in the
- * superblock.  That is not possible on ext3, because we may have other
- * write ordering constraints on the superblock which prevent us from
- * writing it out straight away; and given that the journal is about to
- * be aborted, we can't rely on the current, or future, transactions to
- * write out the superblock safely.
- *
- * We'll just use the journal_abort() error code to record an error in
- * the journal instead.  On recovery, the journal will complain about
- * that error until we've noted it down and cleared it.
- */
-
-static void ext3_handle_error(struct super_block *sb)
-{
-	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-
-	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
-	es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
-
-	if (sb->s_flags & MS_RDONLY)
-		return;
-
-	if (!test_opt (sb, ERRORS_CONT)) {
-		journal_t *journal = EXT3_SB(sb)->s_journal;
-
-		set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
-		if (journal)
-			journal_abort(journal, -EIO);
-	}
-	if (test_opt (sb, ERRORS_RO)) {
-		ext3_msg(sb, KERN_CRIT,
-			"error: remounting filesystem read-only");
-		/*
-		 * Make sure updated value of ->s_mount_state will be visible
-		 * before ->s_flags update.
-		 */
-		smp_wmb();
-		sb->s_flags |= MS_RDONLY;
-	}
-	ext3_commit_super(sb, es, 1);
-	if (test_opt(sb, ERRORS_PANIC))
-		panic("EXT3-fs (%s): panic forced after error\n",
-			sb->s_id);
-}
-
-void ext3_error(struct super_block *sb, const char *function,
-		const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
-	       sb->s_id, function, &vaf);
-
-	va_end(args);
-
-	ext3_handle_error(sb);
-}
-
-static const char *ext3_decode_error(struct super_block * sb, int errno,
-				     char nbuf[16])
-{
-	char *errstr = NULL;
-
-	switch (errno) {
-	case -EIO:
-		errstr = "IO failure";
-		break;
-	case -ENOMEM:
-		errstr = "Out of memory";
-		break;
-	case -EROFS:
-		if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
-			errstr = "Journal has aborted";
-		else
-			errstr = "Readonly filesystem";
-		break;
-	default:
-		/* If the caller passed in an extra buffer for unknown
-		 * errors, textualise them now.  Else we just return
-		 * NULL. */
-		if (nbuf) {
-			/* Check for truncated error codes... */
-			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
-				errstr = nbuf;
-		}
-		break;
-	}
-
-	return errstr;
-}
-
-/* __ext3_std_error decodes expected errors from journaling functions
- * automatically and invokes the appropriate error response.  */
-
-void __ext3_std_error (struct super_block * sb, const char * function,
-		       int errno)
-{
-	char nbuf[16];
-	const char *errstr;
-
-	/* Special case: if the error is EROFS, and we're not already
-	 * inside a transaction, then there's really no point in logging
-	 * an error. */
-	if (errno == -EROFS && journal_current_handle() == NULL &&
-	    (sb->s_flags & MS_RDONLY))
-		return;
-
-	errstr = ext3_decode_error(sb, errno, nbuf);
-	ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
-
-	ext3_handle_error(sb);
-}
-
-/*
- * ext3_abort is a much stronger failure handler than ext3_error.  The
- * abort function may be used to deal with unrecoverable failures such
- * as journal IO errors or ENOMEM at a critical moment in log management.
- *
- * We unconditionally force the filesystem into an ABORT|READONLY state,
- * unless the error response on the fs has been set to panic in which
- * case we take the easy way out and panic immediately.
- */
-
-void ext3_abort(struct super_block *sb, const char *function,
-		 const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
-	       sb->s_id, function, &vaf);
-
-	va_end(args);
-
-	if (test_opt(sb, ERRORS_PANIC))
-		panic("EXT3-fs: panic from previous error\n");
-
-	if (sb->s_flags & MS_RDONLY)
-		return;
-
-	ext3_msg(sb, KERN_CRIT,
-		"error: remounting filesystem read-only");
-	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
-	set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
-	/*
-	 * Make sure updated value of ->s_mount_state will be visible
-	 * before ->s_flags update.
-	 */
-	smp_wmb();
-	sb->s_flags |= MS_RDONLY;
-
-	if (EXT3_SB(sb)->s_journal)
-		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
-}
-
-void ext3_warning(struct super_block *sb, const char *function,
-		  const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
-	       sb->s_id, function, &vaf);
-
-	va_end(args);
-}
-
-void ext3_update_dynamic_rev(struct super_block *sb)
-{
-	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-
-	if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
-		return;
-
-	ext3_msg(sb, KERN_WARNING,
-		"warning: updating to rev %d because of "
-		"new feature flag, running e2fsck is recommended",
-		EXT3_DYNAMIC_REV);
-
-	es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
-	es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
-	es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
-	/* leave es->s_feature_*compat flags alone */
-	/* es->s_uuid will be set by e2fsck if empty */
-
-	/*
-	 * The rest of the superblock fields should be zero, and if not it
-	 * means they are likely already in use, so leave them alone.  We
-	 * can leave it up to e2fsck to clean up any inconsistencies there.
-	 */
-}
-
-/*
- * Open the external journal device
- */
-static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
-{
-	struct block_device *bdev;
-	char b[BDEVNAME_SIZE];
-
-	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
-	if (IS_ERR(bdev))
-		goto fail;
-	return bdev;
-
-fail:
-	ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
-		__bdevname(dev, b), PTR_ERR(bdev));
-
-	return NULL;
-}
-
-/*
- * Release the journal device
- */
-static void ext3_blkdev_put(struct block_device *bdev)
-{
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-}
-
-static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
-{
-	struct block_device *bdev;
-	bdev = sbi->journal_bdev;
-	if (bdev) {
-		ext3_blkdev_put(bdev);
-		sbi->journal_bdev = NULL;
-	}
-}
-
-static inline struct inode *orphan_list_entry(struct list_head *l)
-{
-	return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode;
-}
-
-static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
-{
-	struct list_head *l;
-
-	ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
-	       le32_to_cpu(sbi->s_es->s_last_orphan));
-
-	ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
-	list_for_each(l, &sbi->s_orphan) {
-		struct inode *inode = orphan_list_entry(l);
-		ext3_msg(sb, KERN_ERR, "  "
-		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
-		       inode->i_sb->s_id, inode->i_ino, inode,
-		       inode->i_mode, inode->i_nlink,
-		       NEXT_ORPHAN(inode));
-	}
-}
-
-static void ext3_put_super (struct super_block * sb)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	struct ext3_super_block *es = sbi->s_es;
-	int i, err;
-
-	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
-	ext3_xattr_put_super(sb);
-	err = journal_destroy(sbi->s_journal);
-	sbi->s_journal = NULL;
-	if (err < 0)
-		ext3_abort(sb, __func__, "Couldn't clean up the journal");
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
-		mark_buffer_dirty(sbi->s_sbh);
-		ext3_commit_super(sb, es, 1);
-	}
-
-	for (i = 0; i < sbi->s_gdb_count; i++)
-		brelse(sbi->s_group_desc[i]);
-	kfree(sbi->s_group_desc);
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	brelse(sbi->s_sbh);
-#ifdef CONFIG_QUOTA
-	for (i = 0; i < EXT3_MAXQUOTAS; i++)
-		kfree(sbi->s_qf_names[i]);
-#endif
-
-	/* Debugging code just in case the in-memory inode orphan list
-	 * isn't empty.  The on-disk one can be non-empty if we've
-	 * detected an error and taken the fs readonly, but the
-	 * in-memory list had better be clean by this point. */
-	if (!list_empty(&sbi->s_orphan))
-		dump_orphan_list(sb, sbi);
-	J_ASSERT(list_empty(&sbi->s_orphan));
-
-	invalidate_bdev(sb->s_bdev);
-	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
-		/*
-		 * Invalidate the journal device's buffers.  We don't want them
-		 * floating about in memory - the physical journal device may
-		 * hotswapped, and it breaks the `ro-after' testing code.
-		 */
-		sync_blockdev(sbi->journal_bdev);
-		invalidate_bdev(sbi->journal_bdev);
-		ext3_blkdev_remove(sbi);
-	}
-	sb->s_fs_info = NULL;
-	kfree(sbi->s_blockgroup_lock);
-	mutex_destroy(&sbi->s_orphan_lock);
-	mutex_destroy(&sbi->s_resize_lock);
-	kfree(sbi);
-}
-
-static struct kmem_cache *ext3_inode_cachep;
-
-/*
- * Called inside transaction, so use GFP_NOFS
- */
-static struct inode *ext3_alloc_inode(struct super_block *sb)
-{
-	struct ext3_inode_info *ei;
-
-	ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
-	if (!ei)
-		return NULL;
-	ei->i_block_alloc_info = NULL;
-	ei->vfs_inode.i_version = 1;
-	atomic_set(&ei->i_datasync_tid, 0);
-	atomic_set(&ei->i_sync_tid, 0);
-#ifdef CONFIG_QUOTA
-	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
-#endif
-
-	return &ei->vfs_inode;
-}
-
-static int ext3_drop_inode(struct inode *inode)
-{
-	int drop = generic_drop_inode(inode);
-
-	trace_ext3_drop_inode(inode, drop);
-	return drop;
-}
-
-static void ext3_i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
-}
-
-static void ext3_destroy_inode(struct inode *inode)
-{
-	if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
-		printk("EXT3 Inode %p: orphan list check failed!\n",
-			EXT3_I(inode));
-		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
-				EXT3_I(inode), sizeof(struct ext3_inode_info),
-				false);
-		dump_stack();
-	}
-	call_rcu(&inode->i_rcu, ext3_i_callback);
-}
-
-static void init_once(void *foo)
-{
-	struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
-
-	INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT3_FS_XATTR
-	init_rwsem(&ei->xattr_sem);
-#endif
-	mutex_init(&ei->truncate_mutex);
-	inode_init_once(&ei->vfs_inode);
-}
-
-static int __init init_inodecache(void)
-{
-	ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
-					     sizeof(struct ext3_inode_info),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     init_once);
-	if (ext3_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
-static void destroy_inodecache(void)
-{
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-	kmem_cache_destroy(ext3_inode_cachep);
-}
-
-static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
-{
-#if defined(CONFIG_QUOTA)
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-
-	if (sbi->s_jquota_fmt) {
-		char *fmtname = "";
-
-		switch (sbi->s_jquota_fmt) {
-		case QFMT_VFS_OLD:
-			fmtname = "vfsold";
-			break;
-		case QFMT_VFS_V0:
-			fmtname = "vfsv0";
-			break;
-		case QFMT_VFS_V1:
-			fmtname = "vfsv1";
-			break;
-		}
-		seq_printf(seq, ",jqfmt=%s", fmtname);
-	}
-
-	if (sbi->s_qf_names[USRQUOTA])
-		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
-
-	if (sbi->s_qf_names[GRPQUOTA])
-		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
-	if (test_opt(sb, USRQUOTA))
-		seq_puts(seq, ",usrquota");
-
-	if (test_opt(sb, GRPQUOTA))
-		seq_puts(seq, ",grpquota");
-#endif
-}
-
-static char *data_mode_string(unsigned long mode)
-{
-	switch (mode) {
-	case EXT3_MOUNT_JOURNAL_DATA:
-		return "journal";
-	case EXT3_MOUNT_ORDERED_DATA:
-		return "ordered";
-	case EXT3_MOUNT_WRITEBACK_DATA:
-		return "writeback";
-	}
-	return "unknown";
-}
-
-/*
- * Show an option if
- *  - it's set to a non-default value OR
- *  - if the per-sb default is different from the global default
- */
-static int ext3_show_options(struct seq_file *seq, struct dentry *root)
-{
-	struct super_block *sb = root->d_sb;
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	struct ext3_super_block *es = sbi->s_es;
-	unsigned long def_mount_opts;
-
-	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
-
-	if (sbi->s_sb_block != 1)
-		seq_printf(seq, ",sb=%lu", sbi->s_sb_block);
-	if (test_opt(sb, MINIX_DF))
-		seq_puts(seq, ",minixdf");
-	if (test_opt(sb, GRPID))
-		seq_puts(seq, ",grpid");
-	if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS))
-		seq_puts(seq, ",nogrpid");
-	if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) ||
-	    le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) {
-		seq_printf(seq, ",resuid=%u",
-				from_kuid_munged(&init_user_ns, sbi->s_resuid));
-	}
-	if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) ||
-	    le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) {
-		seq_printf(seq, ",resgid=%u",
-				from_kgid_munged(&init_user_ns, sbi->s_resgid));
-	}
-	if (test_opt(sb, ERRORS_RO)) {
-		int def_errors = le16_to_cpu(es->s_errors);
-
-		if (def_errors == EXT3_ERRORS_PANIC ||
-		    def_errors == EXT3_ERRORS_CONTINUE) {
-			seq_puts(seq, ",errors=remount-ro");
-		}
-	}
-	if (test_opt(sb, ERRORS_CONT))
-		seq_puts(seq, ",errors=continue");
-	if (test_opt(sb, ERRORS_PANIC))
-		seq_puts(seq, ",errors=panic");
-	if (test_opt(sb, NO_UID32))
-		seq_puts(seq, ",nouid32");
-	if (test_opt(sb, DEBUG))
-		seq_puts(seq, ",debug");
-#ifdef CONFIG_EXT3_FS_XATTR
-	if (test_opt(sb, XATTR_USER))
-		seq_puts(seq, ",user_xattr");
-	if (!test_opt(sb, XATTR_USER) &&
-	    (def_mount_opts & EXT3_DEFM_XATTR_USER)) {
-		seq_puts(seq, ",nouser_xattr");
-	}
-#endif
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	if (test_opt(sb, POSIX_ACL))
-		seq_puts(seq, ",acl");
-	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL))
-		seq_puts(seq, ",noacl");
-#endif
-	if (!test_opt(sb, RESERVATION))
-		seq_puts(seq, ",noreservation");
-	if (sbi->s_commit_interval) {
-		seq_printf(seq, ",commit=%u",
-			   (unsigned) (sbi->s_commit_interval / HZ));
-	}
-
-	/*
-	 * Always display barrier state so it's clear what the status is.
-	 */
-	seq_puts(seq, ",barrier=");
-	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
-	seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
-	if (test_opt(sb, DATA_ERR_ABORT))
-		seq_puts(seq, ",data_err=abort");
-
-	if (test_opt(sb, NOLOAD))
-		seq_puts(seq, ",norecovery");
-
-	ext3_show_quota_options(seq, sb);
-
-	return 0;
-}
-
-
-static struct inode *ext3_nfs_get_inode(struct super_block *sb,
-		u64 ino, u32 generation)
-{
-	struct inode *inode;
-
-	if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
-		return ERR_PTR(-ESTALE);
-	if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
-		return ERR_PTR(-ESTALE);
-
-	/* iget isn't really right if the inode is currently unallocated!!
-	 *
-	 * ext3_read_inode will return a bad_inode if the inode had been
-	 * deleted, so we should be safe.
-	 *
-	 * Currently we don't know the generation for parent directory, so
-	 * a generation of 0 means "accept any"
-	 */
-	inode = ext3_iget(sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	if (generation && inode->i_generation != generation) {
-		iput(inode);
-		return ERR_PTR(-ESTALE);
-	}
-
-	return inode;
-}
-
-static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-				    ext3_nfs_get_inode);
-}
-
-static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-				    ext3_nfs_get_inode);
-}
-
-/*
- * Try to release metadata pages (indirect blocks, directories) which are
- * mapped via the block device.  Since these pages could have journal heads
- * which would prevent try_to_free_buffers() from freeing them, we must use
- * jbd layer's try_to_free_buffers() function to release them.
- */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
-				 gfp_t wait)
-{
-	journal_t *journal = EXT3_SB(sb)->s_journal;
-
-	WARN_ON(PageChecked(page));
-	if (!page_has_buffers(page))
-		return 0;
-	if (journal)
-		return journal_try_to_free_buffers(journal, page, 
-						   wait & ~__GFP_WAIT);
-	return try_to_free_buffers(page);
-}
-
-#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
-#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
-
-static int ext3_write_dquot(struct dquot *dquot);
-static int ext3_acquire_dquot(struct dquot *dquot);
-static int ext3_release_dquot(struct dquot *dquot);
-static int ext3_mark_dquot_dirty(struct dquot *dquot);
-static int ext3_write_info(struct super_block *sb, int type);
-static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-			 struct path *path);
-static int ext3_quota_on_mount(struct super_block *sb, int type);
-static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
-			       size_t len, loff_t off);
-static ssize_t ext3_quota_write(struct super_block *sb, int type,
-				const char *data, size_t len, loff_t off);
-static struct dquot **ext3_get_dquots(struct inode *inode)
-{
-	return EXT3_I(inode)->i_dquot;
-}
-
-static const struct dquot_operations ext3_quota_operations = {
-	.write_dquot	= ext3_write_dquot,
-	.acquire_dquot	= ext3_acquire_dquot,
-	.release_dquot	= ext3_release_dquot,
-	.mark_dirty	= ext3_mark_dquot_dirty,
-	.write_info	= ext3_write_info,
-	.alloc_dquot	= dquot_alloc,
-	.destroy_dquot	= dquot_destroy,
-};
-
-static const struct quotactl_ops ext3_qctl_operations = {
-	.quota_on	= ext3_quota_on,
-	.quota_off	= dquot_quota_off,
-	.quota_sync	= dquot_quota_sync,
-	.get_state	= dquot_get_state,
-	.set_info	= dquot_set_dqinfo,
-	.get_dqblk	= dquot_get_dqblk,
-	.set_dqblk	= dquot_set_dqblk
-};
-#endif
-
-static const struct super_operations ext3_sops = {
-	.alloc_inode	= ext3_alloc_inode,
-	.destroy_inode	= ext3_destroy_inode,
-	.write_inode	= ext3_write_inode,
-	.dirty_inode	= ext3_dirty_inode,
-	.drop_inode	= ext3_drop_inode,
-	.evict_inode	= ext3_evict_inode,
-	.put_super	= ext3_put_super,
-	.sync_fs	= ext3_sync_fs,
-	.freeze_fs	= ext3_freeze,
-	.unfreeze_fs	= ext3_unfreeze,
-	.statfs		= ext3_statfs,
-	.remount_fs	= ext3_remount,
-	.show_options	= ext3_show_options,
-#ifdef CONFIG_QUOTA
-	.quota_read	= ext3_quota_read,
-	.quota_write	= ext3_quota_write,
-	.get_dquots	= ext3_get_dquots,
-#endif
-	.bdev_try_to_free_page = bdev_try_to_free_page,
-};
-
-static const struct export_operations ext3_export_ops = {
-	.fh_to_dentry = ext3_fh_to_dentry,
-	.fh_to_parent = ext3_fh_to_parent,
-	.get_parent = ext3_get_parent,
-};
-
-enum {
-	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
-	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
-	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
-	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
-	Opt_journal_path,
-	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-	Opt_data_err_abort, Opt_data_err_ignore,
-	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
-	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
-	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
-	Opt_resize, Opt_usrquota, Opt_grpquota
-};
-
-static const match_table_t tokens = {
-	{Opt_bsd_df, "bsddf"},
-	{Opt_minix_df, "minixdf"},
-	{Opt_grpid, "grpid"},
-	{Opt_grpid, "bsdgroups"},
-	{Opt_nogrpid, "nogrpid"},
-	{Opt_nogrpid, "sysvgroups"},
-	{Opt_resgid, "resgid=%u"},
-	{Opt_resuid, "resuid=%u"},
-	{Opt_sb, "sb=%u"},
-	{Opt_err_cont, "errors=continue"},
-	{Opt_err_panic, "errors=panic"},
-	{Opt_err_ro, "errors=remount-ro"},
-	{Opt_nouid32, "nouid32"},
-	{Opt_nocheck, "nocheck"},
-	{Opt_nocheck, "check=none"},
-	{Opt_debug, "debug"},
-	{Opt_oldalloc, "oldalloc"},
-	{Opt_orlov, "orlov"},
-	{Opt_user_xattr, "user_xattr"},
-	{Opt_nouser_xattr, "nouser_xattr"},
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-	{Opt_reservation, "reservation"},
-	{Opt_noreservation, "noreservation"},
-	{Opt_noload, "noload"},
-	{Opt_noload, "norecovery"},
-	{Opt_nobh, "nobh"},
-	{Opt_bh, "bh"},
-	{Opt_commit, "commit=%u"},
-	{Opt_journal_update, "journal=update"},
-	{Opt_journal_inum, "journal=%u"},
-	{Opt_journal_dev, "journal_dev=%u"},
-	{Opt_journal_path, "journal_path=%s"},
-	{Opt_abort, "abort"},
-	{Opt_data_journal, "data=journal"},
-	{Opt_data_ordered, "data=ordered"},
-	{Opt_data_writeback, "data=writeback"},
-	{Opt_data_err_abort, "data_err=abort"},
-	{Opt_data_err_ignore, "data_err=ignore"},
-	{Opt_offusrjquota, "usrjquota="},
-	{Opt_usrjquota, "usrjquota=%s"},
-	{Opt_offgrpjquota, "grpjquota="},
-	{Opt_grpjquota, "grpjquota=%s"},
-	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
-	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
-	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
-	{Opt_grpquota, "grpquota"},
-	{Opt_noquota, "noquota"},
-	{Opt_quota, "quota"},
-	{Opt_usrquota, "usrquota"},
-	{Opt_barrier, "barrier=%u"},
-	{Opt_barrier, "barrier"},
-	{Opt_nobarrier, "nobarrier"},
-	{Opt_resize, "resize"},
-	{Opt_err, NULL},
-};
-
-static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
-{
-	ext3_fsblk_t	sb_block;
-	char		*options = (char *) *data;
-
-	if (!options || strncmp(options, "sb=", 3) != 0)
-		return 1;	/* Default location */
-	options += 3;
-	/*todo: use simple_strtoll with >32bit ext3 */
-	sb_block = simple_strtoul(options, &options, 0);
-	if (*options && *options != ',') {
-		ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",
-		       (char *) *data);
-		return 1;
-	}
-	if (*options == ',')
-		options++;
-	*data = (void *) options;
-	return sb_block;
-}
-
-#ifdef CONFIG_QUOTA
-static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	char *qname;
-
-	if (sb_any_quota_loaded(sb) &&
-		!sbi->s_qf_names[qtype]) {
-		ext3_msg(sb, KERN_ERR,
-			"Cannot change journaled "
-			"quota options when quota turned on");
-		return 0;
-	}
-	qname = match_strdup(args);
-	if (!qname) {
-		ext3_msg(sb, KERN_ERR,
-			"Not enough memory for storing quotafile name");
-		return 0;
-	}
-	if (sbi->s_qf_names[qtype]) {
-		int same = !strcmp(sbi->s_qf_names[qtype], qname);
-
-		kfree(qname);
-		if (!same) {
-			ext3_msg(sb, KERN_ERR,
-				 "%s quota file already specified",
-				 QTYPE2NAME(qtype));
-		}
-		return same;
-	}
-	if (strchr(qname, '/')) {
-		ext3_msg(sb, KERN_ERR,
-			"quotafile must be on filesystem root");
-		kfree(qname);
-		return 0;
-	}
-	sbi->s_qf_names[qtype] = qname;
-	set_opt(sbi->s_mount_opt, QUOTA);
-	return 1;
-}
-
-static int clear_qf_name(struct super_block *sb, int qtype) {
-
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-
-	if (sb_any_quota_loaded(sb) &&
-		sbi->s_qf_names[qtype]) {
-		ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
-			" when quota turned on");
-		return 0;
-	}
-	if (sbi->s_qf_names[qtype]) {
-		kfree(sbi->s_qf_names[qtype]);
-		sbi->s_qf_names[qtype] = NULL;
-	}
-	return 1;
-}
-#endif
-
-static int parse_options (char *options, struct super_block *sb,
-			  unsigned int *inum, unsigned long *journal_devnum,
-			  ext3_fsblk_t *n_blocks_count, int is_remount)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	char * p;
-	substring_t args[MAX_OPT_ARGS];
-	int data_opt = 0;
-	int option;
-	kuid_t uid;
-	kgid_t gid;
-	char *journal_path;
-	struct inode *journal_inode;
-	struct path path;
-	int error;
-
-#ifdef CONFIG_QUOTA
-	int qfmt;
-#endif
-
-	if (!options)
-		return 1;
-
-	while ((p = strsep (&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
-		/*
-		 * Initialize args struct so we know whether arg was
-		 * found; some options take optional arguments.
-		 */
-		args[0].to = args[0].from = NULL;
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_bsd_df:
-			clear_opt (sbi->s_mount_opt, MINIX_DF);
-			break;
-		case Opt_minix_df:
-			set_opt (sbi->s_mount_opt, MINIX_DF);
-			break;
-		case Opt_grpid:
-			set_opt (sbi->s_mount_opt, GRPID);
-			break;
-		case Opt_nogrpid:
-			clear_opt (sbi->s_mount_opt, GRPID);
-			break;
-		case Opt_resuid:
-			if (match_int(&args[0], &option))
-				return 0;
-			uid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(uid)) {
-				ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
-				return 0;
-
-			}
-			sbi->s_resuid = uid;
-			break;
-		case Opt_resgid:
-			if (match_int(&args[0], &option))
-				return 0;
-			gid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(gid)) {
-				ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
-				return 0;
-			}
-			sbi->s_resgid = gid;
-			break;
-		case Opt_sb:
-			/* handled by get_sb_block() instead of here */
-			/* *sb_block = match_int(&args[0]); */
-			break;
-		case Opt_err_panic:
-			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
-			clear_opt (sbi->s_mount_opt, ERRORS_RO);
-			set_opt (sbi->s_mount_opt, ERRORS_PANIC);
-			break;
-		case Opt_err_ro:
-			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
-			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
-			set_opt (sbi->s_mount_opt, ERRORS_RO);
-			break;
-		case Opt_err_cont:
-			clear_opt (sbi->s_mount_opt, ERRORS_RO);
-			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
-			set_opt (sbi->s_mount_opt, ERRORS_CONT);
-			break;
-		case Opt_nouid32:
-			set_opt (sbi->s_mount_opt, NO_UID32);
-			break;
-		case Opt_nocheck:
-			clear_opt (sbi->s_mount_opt, CHECK);
-			break;
-		case Opt_debug:
-			set_opt (sbi->s_mount_opt, DEBUG);
-			break;
-		case Opt_oldalloc:
-			ext3_msg(sb, KERN_WARNING,
-				"Ignoring deprecated oldalloc option");
-			break;
-		case Opt_orlov:
-			ext3_msg(sb, KERN_WARNING,
-				"Ignoring deprecated orlov option");
-			break;
-#ifdef CONFIG_EXT3_FS_XATTR
-		case Opt_user_xattr:
-			set_opt (sbi->s_mount_opt, XATTR_USER);
-			break;
-		case Opt_nouser_xattr:
-			clear_opt (sbi->s_mount_opt, XATTR_USER);
-			break;
-#else
-		case Opt_user_xattr:
-		case Opt_nouser_xattr:
-			ext3_msg(sb, KERN_INFO,
-				"(no)user_xattr options not supported");
-			break;
-#endif
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-		case Opt_acl:
-			set_opt(sbi->s_mount_opt, POSIX_ACL);
-			break;
-		case Opt_noacl:
-			clear_opt(sbi->s_mount_opt, POSIX_ACL);
-			break;
-#else
-		case Opt_acl:
-		case Opt_noacl:
-			ext3_msg(sb, KERN_INFO,
-				"(no)acl options not supported");
-			break;
-#endif
-		case Opt_reservation:
-			set_opt(sbi->s_mount_opt, RESERVATION);
-			break;
-		case Opt_noreservation:
-			clear_opt(sbi->s_mount_opt, RESERVATION);
-			break;
-		case Opt_journal_update:
-			/* @@@ FIXME */
-			/* Eventually we will want to be able to create
-			   a journal file here.  For now, only allow the
-			   user to specify an existing inode to be the
-			   journal file. */
-			if (is_remount) {
-				ext3_msg(sb, KERN_ERR, "error: cannot specify "
-					"journal on remount");
-				return 0;
-			}
-			set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
-			break;
-		case Opt_journal_inum:
-			if (is_remount) {
-				ext3_msg(sb, KERN_ERR, "error: cannot specify "
-				       "journal on remount");
-				return 0;
-			}
-			if (match_int(&args[0], &option))
-				return 0;
-			*inum = option;
-			break;
-		case Opt_journal_dev:
-			if (is_remount) {
-				ext3_msg(sb, KERN_ERR, "error: cannot specify "
-				       "journal on remount");
-				return 0;
-			}
-			if (match_int(&args[0], &option))
-				return 0;
-			*journal_devnum = option;
-			break;
-		case Opt_journal_path:
-			if (is_remount) {
-				ext3_msg(sb, KERN_ERR, "error: cannot specify "
-				       "journal on remount");
-				return 0;
-			}
-
-			journal_path = match_strdup(&args[0]);
-			if (!journal_path) {
-				ext3_msg(sb, KERN_ERR, "error: could not dup "
-					"journal device string");
-				return 0;
-			}
-
-			error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
-			if (error) {
-				ext3_msg(sb, KERN_ERR, "error: could not find "
-					"journal device path: error %d", error);
-				kfree(journal_path);
-				return 0;
-			}
-
-			journal_inode = d_inode(path.dentry);
-			if (!S_ISBLK(journal_inode->i_mode)) {
-				ext3_msg(sb, KERN_ERR, "error: journal path %s "
-					"is not a block device", journal_path);
-				path_put(&path);
-				kfree(journal_path);
-				return 0;
-			}
-
-			*journal_devnum = new_encode_dev(journal_inode->i_rdev);
-			path_put(&path);
-			kfree(journal_path);
-			break;
-		case Opt_noload:
-			set_opt (sbi->s_mount_opt, NOLOAD);
-			break;
-		case Opt_commit:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < 0)
-				return 0;
-			if (option == 0)
-				option = JBD_DEFAULT_MAX_COMMIT_AGE;
-			sbi->s_commit_interval = HZ * option;
-			break;
-		case Opt_data_journal:
-			data_opt = EXT3_MOUNT_JOURNAL_DATA;
-			goto datacheck;
-		case Opt_data_ordered:
-			data_opt = EXT3_MOUNT_ORDERED_DATA;
-			goto datacheck;
-		case Opt_data_writeback:
-			data_opt = EXT3_MOUNT_WRITEBACK_DATA;
-		datacheck:
-			if (is_remount) {
-				if (test_opt(sb, DATA_FLAGS) == data_opt)
-					break;
-				ext3_msg(sb, KERN_ERR,
-					"error: cannot change "
-					"data mode on remount. The filesystem "
-					"is mounted in data=%s mode and you "
-					"try to remount it in data=%s mode.",
-					data_mode_string(test_opt(sb,
-							DATA_FLAGS)),
-					data_mode_string(data_opt));
-				return 0;
-			} else {
-				clear_opt(sbi->s_mount_opt, DATA_FLAGS);
-				sbi->s_mount_opt |= data_opt;
-			}
-			break;
-		case Opt_data_err_abort:
-			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
-			break;
-		case Opt_data_err_ignore:
-			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
-			break;
-#ifdef CONFIG_QUOTA
-		case Opt_usrjquota:
-			if (!set_qf_name(sb, USRQUOTA, &args[0]))
-				return 0;
-			break;
-		case Opt_grpjquota:
-			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
-				return 0;
-			break;
-		case Opt_offusrjquota:
-			if (!clear_qf_name(sb, USRQUOTA))
-				return 0;
-			break;
-		case Opt_offgrpjquota:
-			if (!clear_qf_name(sb, GRPQUOTA))
-				return 0;
-			break;
-		case Opt_jqfmt_vfsold:
-			qfmt = QFMT_VFS_OLD;
-			goto set_qf_format;
-		case Opt_jqfmt_vfsv0:
-			qfmt = QFMT_VFS_V0;
-			goto set_qf_format;
-		case Opt_jqfmt_vfsv1:
-			qfmt = QFMT_VFS_V1;
-set_qf_format:
-			if (sb_any_quota_loaded(sb) &&
-			    sbi->s_jquota_fmt != qfmt) {
-				ext3_msg(sb, KERN_ERR, "error: cannot change "
-					"journaled quota options when "
-					"quota turned on.");
-				return 0;
-			}
-			sbi->s_jquota_fmt = qfmt;
-			break;
-		case Opt_quota:
-		case Opt_usrquota:
-			set_opt(sbi->s_mount_opt, QUOTA);
-			set_opt(sbi->s_mount_opt, USRQUOTA);
-			break;
-		case Opt_grpquota:
-			set_opt(sbi->s_mount_opt, QUOTA);
-			set_opt(sbi->s_mount_opt, GRPQUOTA);
-			break;
-		case Opt_noquota:
-			if (sb_any_quota_loaded(sb)) {
-				ext3_msg(sb, KERN_ERR, "error: cannot change "
-					"quota options when quota turned on.");
-				return 0;
-			}
-			clear_opt(sbi->s_mount_opt, QUOTA);
-			clear_opt(sbi->s_mount_opt, USRQUOTA);
-			clear_opt(sbi->s_mount_opt, GRPQUOTA);
-			break;
-#else
-		case Opt_quota:
-		case Opt_usrquota:
-		case Opt_grpquota:
-			ext3_msg(sb, KERN_ERR,
-				"error: quota options not supported.");
-			break;
-		case Opt_usrjquota:
-		case Opt_grpjquota:
-		case Opt_offusrjquota:
-		case Opt_offgrpjquota:
-		case Opt_jqfmt_vfsold:
-		case Opt_jqfmt_vfsv0:
-		case Opt_jqfmt_vfsv1:
-			ext3_msg(sb, KERN_ERR,
-				"error: journaled quota options not "
-				"supported.");
-			break;
-		case Opt_noquota:
-			break;
-#endif
-		case Opt_abort:
-			set_opt(sbi->s_mount_opt, ABORT);
-			break;
-		case Opt_nobarrier:
-			clear_opt(sbi->s_mount_opt, BARRIER);
-			break;
-		case Opt_barrier:
-			if (args[0].from) {
-				if (match_int(&args[0], &option))
-					return 0;
-			} else
-				option = 1;	/* No argument, default to 1 */
-			if (option)
-				set_opt(sbi->s_mount_opt, BARRIER);
-			else
-				clear_opt(sbi->s_mount_opt, BARRIER);
-			break;
-		case Opt_ignore:
-			break;
-		case Opt_resize:
-			if (!is_remount) {
-				ext3_msg(sb, KERN_ERR,
-					"error: resize option only available "
-					"for remount");
-				return 0;
-			}
-			if (match_int(&args[0], &option) != 0)
-				return 0;
-			*n_blocks_count = option;
-			break;
-		case Opt_nobh:
-			ext3_msg(sb, KERN_WARNING,
-				"warning: ignoring deprecated nobh option");
-			break;
-		case Opt_bh:
-			ext3_msg(sb, KERN_WARNING,
-				"warning: ignoring deprecated bh option");
-			break;
-		default:
-			ext3_msg(sb, KERN_ERR,
-				"error: unrecognized mount option \"%s\" "
-				"or missing value", p);
-			return 0;
-		}
-	}
-#ifdef CONFIG_QUOTA
-	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
-		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
-			clear_opt(sbi->s_mount_opt, USRQUOTA);
-		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
-			clear_opt(sbi->s_mount_opt, GRPQUOTA);
-
-		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
-			ext3_msg(sb, KERN_ERR, "error: old and new quota "
-					"format mixing.");
-			return 0;
-		}
-
-		if (!sbi->s_jquota_fmt) {
-			ext3_msg(sb, KERN_ERR, "error: journaled quota format "
-					"not specified.");
-			return 0;
-		}
-	}
-#endif
-	return 1;
-}
-
-static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
-			    int read_only)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	int res = 0;
-
-	if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
-		ext3_msg(sb, KERN_ERR,
-			"error: revision level too high, "
-			"forcing read-only mode");
-		res = MS_RDONLY;
-	}
-	if (read_only)
-		return res;
-	if (!(sbi->s_mount_state & EXT3_VALID_FS))
-		ext3_msg(sb, KERN_WARNING,
-			"warning: mounting unchecked fs, "
-			"running e2fsck is recommended");
-	else if ((sbi->s_mount_state & EXT3_ERROR_FS))
-		ext3_msg(sb, KERN_WARNING,
-			"warning: mounting fs with errors, "
-			"running e2fsck is recommended");
-	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
-		 le16_to_cpu(es->s_mnt_count) >=
-			le16_to_cpu(es->s_max_mnt_count))
-		ext3_msg(sb, KERN_WARNING,
-			"warning: maximal mount count reached, "
-			"running e2fsck is recommended");
-	else if (le32_to_cpu(es->s_checkinterval) &&
-		(le32_to_cpu(es->s_lastcheck) +
-			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-		ext3_msg(sb, KERN_WARNING,
-			"warning: checktime reached, "
-			"running e2fsck is recommended");
-#if 0
-		/* @@@ We _will_ want to clear the valid bit if we find
-                   inconsistencies, to force a fsck at reboot.  But for
-                   a plain journaled filesystem we can keep it set as
-                   valid forever! :) */
-	es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
-#endif
-	if (!le16_to_cpu(es->s_max_mnt_count))
-		es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
-	le16_add_cpu(&es->s_mnt_count, 1);
-	es->s_mtime = cpu_to_le32(get_seconds());
-	ext3_update_dynamic_rev(sb);
-	EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-
-	ext3_commit_super(sb, es, 1);
-	if (test_opt(sb, DEBUG))
-		ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
-				"bpg=%lu, ipg=%lu, mo=%04lx]",
-			sb->s_blocksize,
-			sbi->s_groups_count,
-			EXT3_BLOCKS_PER_GROUP(sb),
-			EXT3_INODES_PER_GROUP(sb),
-			sbi->s_mount_opt);
-
-	if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
-		char b[BDEVNAME_SIZE];
-		ext3_msg(sb, KERN_INFO, "using external journal on %s",
-			bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
-	} else {
-		ext3_msg(sb, KERN_INFO, "using internal journal");
-	}
-	cleancache_init_fs(sb);
-	return res;
-}
-
-/* Called at mount-time, super-block is locked */
-static int ext3_check_descriptors(struct super_block *sb)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	int i;
-
-	ext3_debug ("Checking group descriptors");
-
-	for (i = 0; i < sbi->s_groups_count; i++) {
-		struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
-		ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i);
-		ext3_fsblk_t last_block;
-
-		if (i == sbi->s_groups_count - 1)
-			last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
-		else
-			last_block = first_block +
-				(EXT3_BLOCKS_PER_GROUP(sb) - 1);
-
-		if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
-		    le32_to_cpu(gdp->bg_block_bitmap) > last_block)
-		{
-			ext3_error (sb, "ext3_check_descriptors",
-				    "Block bitmap for group %d"
-				    " not in group (block %lu)!",
-				    i, (unsigned long)
-					le32_to_cpu(gdp->bg_block_bitmap));
-			return 0;
-		}
-		if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
-		    le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
-		{
-			ext3_error (sb, "ext3_check_descriptors",
-				    "Inode bitmap for group %d"
-				    " not in group (block %lu)!",
-				    i, (unsigned long)
-					le32_to_cpu(gdp->bg_inode_bitmap));
-			return 0;
-		}
-		if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
-		    le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 >
-		    last_block)
-		{
-			ext3_error (sb, "ext3_check_descriptors",
-				    "Inode table for group %d"
-				    " not in group (block %lu)!",
-				    i, (unsigned long)
-					le32_to_cpu(gdp->bg_inode_table));
-			return 0;
-		}
-	}
-
-	sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb));
-	sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb));
-	return 1;
-}
-
-
-/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
- * the superblock) which were deleted from all directories, but held open by
- * a process at the time of a crash.  We walk the list and try to delete these
- * inodes at recovery time (only with a read-write filesystem).
- *
- * In order to keep the orphan inode chain consistent during traversal (in
- * case of crash during recovery), we link each inode into the superblock
- * orphan list_head and handle it the same way as an inode deletion during
- * normal operation (which journals the operations for us).
- *
- * We only do an iget() and an iput() on each inode, which is very safe if we
- * accidentally point at an in-use or already deleted inode.  The worst that
- * can happen in this case is that we get a "bit already cleared" message from
- * ext3_free_inode().  The only reason we would point at a wrong inode is if
- * e2fsck was run on this filesystem, and it must have already done the orphan
- * inode cleanup for us, so we can safely abort without any further action.
- */
-static void ext3_orphan_cleanup (struct super_block * sb,
-				 struct ext3_super_block * es)
-{
-	unsigned int s_flags = sb->s_flags;
-	int nr_orphans = 0, nr_truncates = 0;
-#ifdef CONFIG_QUOTA
-	int i;
-#endif
-	if (!es->s_last_orphan) {
-		jbd_debug(4, "no orphan inodes to clean up\n");
-		return;
-	}
-
-	if (bdev_read_only(sb->s_bdev)) {
-		ext3_msg(sb, KERN_ERR, "error: write access "
-			"unavailable, skipping orphan cleanup.");
-		return;
-	}
-
-	/* Check if feature set allows readwrite operations */
-	if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
-		ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
-			 "unknown ROCOMPAT features");
-		return;
-	}
-
-	if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
-		/* don't clear list on RO mount w/ errors */
-		if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
-			jbd_debug(1, "Errors on filesystem, "
-				  "clearing orphan list.\n");
-			es->s_last_orphan = 0;
-		}
-		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
-		return;
-	}
-
-	if (s_flags & MS_RDONLY) {
-		ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
-		sb->s_flags &= ~MS_RDONLY;
-	}
-#ifdef CONFIG_QUOTA
-	/* Needed for iput() to work correctly and not trash data */
-	sb->s_flags |= MS_ACTIVE;
-	/* Turn on quotas so that they are updated correctly */
-	for (i = 0; i < EXT3_MAXQUOTAS; i++) {
-		if (EXT3_SB(sb)->s_qf_names[i]) {
-			int ret = ext3_quota_on_mount(sb, i);
-			if (ret < 0)
-				ext3_msg(sb, KERN_ERR,
-					"error: cannot turn on journaled "
-					"quota: %d", ret);
-		}
-	}
-#endif
-
-	while (es->s_last_orphan) {
-		struct inode *inode;
-
-		inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
-		if (IS_ERR(inode)) {
-			es->s_last_orphan = 0;
-			break;
-		}
-
-		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-		dquot_initialize(inode);
-		if (inode->i_nlink) {
-			printk(KERN_DEBUG
-				"%s: truncating inode %lu to %Ld bytes\n",
-				__func__, inode->i_ino, inode->i_size);
-			jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
-				  inode->i_ino, inode->i_size);
-			ext3_truncate(inode);
-			nr_truncates++;
-		} else {
-			printk(KERN_DEBUG
-				"%s: deleting unreferenced inode %lu\n",
-				__func__, inode->i_ino);
-			jbd_debug(2, "deleting unreferenced inode %lu\n",
-				  inode->i_ino);
-			nr_orphans++;
-		}
-		iput(inode);  /* The delete magic happens here! */
-	}
-
-#define PLURAL(x) (x), ((x)==1) ? "" : "s"
-
-	if (nr_orphans)
-		ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
-		       PLURAL(nr_orphans));
-	if (nr_truncates)
-		ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
-		       PLURAL(nr_truncates));
-#ifdef CONFIG_QUOTA
-	/* Turn quotas off */
-	for (i = 0; i < EXT3_MAXQUOTAS; i++) {
-		if (sb_dqopt(sb)->files[i])
-			dquot_quota_off(sb, i);
-	}
-#endif
-	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
-}
-
-/*
- * Maximal file size.  There is a direct, and {,double-,triple-}indirect
- * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
- * We need to be 1 filesystem block less than the 2^32 sector limit.
- */
-static loff_t ext3_max_size(int bits)
-{
-	loff_t res = EXT3_NDIR_BLOCKS;
-	int meta_blocks;
-	loff_t upper_limit;
-
-	/* This is calculated to be the largest file size for a
-	 * dense, file such that the total number of
-	 * sectors in the file, including data and all indirect blocks,
-	 * does not exceed 2^32 -1
-	 * __u32 i_blocks representing the total number of
-	 * 512 bytes blocks of the file
-	 */
-	upper_limit = (1LL << 32) - 1;
-
-	/* total blocks in file system block size */
-	upper_limit >>= (bits - 9);
-
-
-	/* indirect blocks */
-	meta_blocks = 1;
-	/* double indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2));
-	/* tripple indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
-	upper_limit -= meta_blocks;
-	upper_limit <<= bits;
-
-	res += 1LL << (bits-2);
-	res += 1LL << (2*(bits-2));
-	res += 1LL << (3*(bits-2));
-	res <<= bits;
-	if (res > upper_limit)
-		res = upper_limit;
-
-	if (res > MAX_LFS_FILESIZE)
-		res = MAX_LFS_FILESIZE;
-
-	return res;
-}
-
-static ext3_fsblk_t descriptor_loc(struct super_block *sb,
-				    ext3_fsblk_t logic_sb_block,
-				    int nr)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long bg, first_meta_bg;
-	int has_super = 0;
-
-	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
-
-	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
-	    nr < first_meta_bg)
-		return (logic_sb_block + nr + 1);
-	bg = sbi->s_desc_per_block * nr;
-	if (ext3_bg_has_super(sb, bg))
-		has_super = 1;
-	return (has_super + ext3_group_first_block_no(sb, bg));
-}
-
-
-static int ext3_fill_super (struct super_block *sb, void *data, int silent)
-{
-	struct buffer_head * bh;
-	struct ext3_super_block *es = NULL;
-	struct ext3_sb_info *sbi;
-	ext3_fsblk_t block;
-	ext3_fsblk_t sb_block = get_sb_block(&data, sb);
-	ext3_fsblk_t logic_sb_block;
-	unsigned long offset = 0;
-	unsigned int journal_inum = 0;
-	unsigned long journal_devnum = 0;
-	unsigned long def_mount_opts;
-	struct inode *root;
-	int blocksize;
-	int hblock;
-	int db_count;
-	int i;
-	int needs_recovery;
-	int ret = -EINVAL;
-	__le32 features;
-	int err;
-
-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
-		return -ENOMEM;
-
-	sbi->s_blockgroup_lock =
-		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
-	if (!sbi->s_blockgroup_lock) {
-		kfree(sbi);
-		return -ENOMEM;
-	}
-	sb->s_fs_info = sbi;
-	sbi->s_sb_block = sb_block;
-
-	blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
-	if (!blocksize) {
-		ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
-		goto out_fail;
-	}
-
-	/*
-	 * The ext3 superblock will not be buffer aligned for other than 1kB
-	 * block sizes.  We need to calculate the offset from buffer start.
-	 */
-	if (blocksize != EXT3_MIN_BLOCK_SIZE) {
-		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
-		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
-	} else {
-		logic_sb_block = sb_block;
-	}
-
-	if (!(bh = sb_bread(sb, logic_sb_block))) {
-		ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
-		goto out_fail;
-	}
-	/*
-	 * Note: s_es must be initialized as soon as possible because
-	 *       some ext3 macro-instructions depend on its value
-	 */
-	es = (struct ext3_super_block *) (bh->b_data + offset);
-	sbi->s_es = es;
-	sb->s_magic = le16_to_cpu(es->s_magic);
-	if (sb->s_magic != EXT3_SUPER_MAGIC)
-		goto cantfind_ext3;
-
-	/* Set defaults before we parse the mount options */
-	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
-	if (def_mount_opts & EXT3_DEFM_DEBUG)
-		set_opt(sbi->s_mount_opt, DEBUG);
-	if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
-		set_opt(sbi->s_mount_opt, GRPID);
-	if (def_mount_opts & EXT3_DEFM_UID16)
-		set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT3_FS_XATTR
-	if (def_mount_opts & EXT3_DEFM_XATTR_USER)
-		set_opt(sbi->s_mount_opt, XATTR_USER);
-#endif
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	if (def_mount_opts & EXT3_DEFM_ACL)
-		set_opt(sbi->s_mount_opt, POSIX_ACL);
-#endif
-	if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
-		set_opt(sbi->s_mount_opt, JOURNAL_DATA);
-	else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
-		set_opt(sbi->s_mount_opt, ORDERED_DATA);
-	else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
-		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
-
-	if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
-		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
-	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE)
-		set_opt(sbi->s_mount_opt, ERRORS_CONT);
-	else
-		set_opt(sbi->s_mount_opt, ERRORS_RO);
-
-	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
-	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
-
-	/* enable barriers by default */
-	set_opt(sbi->s_mount_opt, BARRIER);
-	set_opt(sbi->s_mount_opt, RESERVATION);
-
-	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
-			    NULL, 0))
-		goto failed_mount;
-
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
-
-	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
-	    (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
-	     EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
-	     EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-		ext3_msg(sb, KERN_WARNING,
-			"warning: feature flags set on rev 0 fs, "
-			"running e2fsck is recommended");
-	/*
-	 * Check feature flags regardless of the revision level, since we
-	 * previously didn't change the revision level when setting the flags,
-	 * so there is a chance incompat flags are set on a rev 0 filesystem.
-	 */
-	features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
-	if (features) {
-		ext3_msg(sb, KERN_ERR,
-			"error: couldn't mount because of unsupported "
-			"optional features (%x)", le32_to_cpu(features));
-		goto failed_mount;
-	}
-	features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
-	if (!(sb->s_flags & MS_RDONLY) && features) {
-		ext3_msg(sb, KERN_ERR,
-			"error: couldn't mount RDWR because of unsupported "
-			"optional features (%x)", le32_to_cpu(features));
-		goto failed_mount;
-	}
-	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-
-	if (blocksize < EXT3_MIN_BLOCK_SIZE ||
-	    blocksize > EXT3_MAX_BLOCK_SIZE) {
-		ext3_msg(sb, KERN_ERR,
-			"error: couldn't mount because of unsupported "
-			"filesystem blocksize %d", blocksize);
-		goto failed_mount;
-	}
-
-	hblock = bdev_logical_block_size(sb->s_bdev);
-	if (sb->s_blocksize != blocksize) {
-		/*
-		 * Make sure the blocksize for the filesystem is larger
-		 * than the hardware sectorsize for the machine.
-		 */
-		if (blocksize < hblock) {
-			ext3_msg(sb, KERN_ERR,
-				"error: fsblocksize %d too small for "
-				"hardware sectorsize %d", blocksize, hblock);
-			goto failed_mount;
-		}
-
-		brelse (bh);
-		if (!sb_set_blocksize(sb, blocksize)) {
-			ext3_msg(sb, KERN_ERR,
-				"error: bad blocksize %d", blocksize);
-			goto out_fail;
-		}
-		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
-		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
-		bh = sb_bread(sb, logic_sb_block);
-		if (!bh) {
-			ext3_msg(sb, KERN_ERR,
-			       "error: can't read superblock on 2nd try");
-			goto failed_mount;
-		}
-		es = (struct ext3_super_block *)(bh->b_data + offset);
-		sbi->s_es = es;
-		if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
-			ext3_msg(sb, KERN_ERR,
-				"error: magic mismatch");
-			goto failed_mount;
-		}
-	}
-
-	sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
-
-	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
-		sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
-		sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
-	} else {
-		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
-		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
-		if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
-		    (!is_power_of_2(sbi->s_inode_size)) ||
-		    (sbi->s_inode_size > blocksize)) {
-			ext3_msg(sb, KERN_ERR,
-				"error: unsupported inode size: %d",
-				sbi->s_inode_size);
-			goto failed_mount;
-		}
-	}
-	sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
-				   le32_to_cpu(es->s_log_frag_size);
-	if (blocksize != sbi->s_frag_size) {
-		ext3_msg(sb, KERN_ERR,
-		       "error: fragsize %lu != blocksize %u (unsupported)",
-		       sbi->s_frag_size, blocksize);
-		goto failed_mount;
-	}
-	sbi->s_frags_per_block = 1;
-	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
-	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
-	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-	if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
-		goto cantfind_ext3;
-	sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
-	if (sbi->s_inodes_per_block == 0)
-		goto cantfind_ext3;
-	sbi->s_itb_per_group = sbi->s_inodes_per_group /
-					sbi->s_inodes_per_block;
-	sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
-	sbi->s_sbh = bh;
-	sbi->s_mount_state = le16_to_cpu(es->s_state);
-	sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
-	sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
-	for (i = 0; i < 4; i++)
-		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
-	sbi->s_def_hash_version = es->s_def_hash_version;
-	i = le32_to_cpu(es->s_flags);
-	if (i & EXT2_FLAGS_UNSIGNED_HASH)
-		sbi->s_hash_unsigned = 3;
-	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
-#ifdef __CHAR_UNSIGNED__
-		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
-		sbi->s_hash_unsigned = 3;
-#else
-		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
-#endif
-	}
-
-	if (sbi->s_blocks_per_group > blocksize * 8) {
-		ext3_msg(sb, KERN_ERR,
-			"#blocks per group too big: %lu",
-			sbi->s_blocks_per_group);
-		goto failed_mount;
-	}
-	if (sbi->s_frags_per_group > blocksize * 8) {
-		ext3_msg(sb, KERN_ERR,
-			"error: #fragments per group too big: %lu",
-			sbi->s_frags_per_group);
-		goto failed_mount;
-	}
-	if (sbi->s_inodes_per_group > blocksize * 8) {
-		ext3_msg(sb, KERN_ERR,
-			"error: #inodes per group too big: %lu",
-			sbi->s_inodes_per_group);
-		goto failed_mount;
-	}
-
-	err = generic_check_addressable(sb->s_blocksize_bits,
-					le32_to_cpu(es->s_blocks_count));
-	if (err) {
-		ext3_msg(sb, KERN_ERR,
-			"error: filesystem is too large to mount safely");
-		if (sizeof(sector_t) < 8)
-			ext3_msg(sb, KERN_ERR,
-				"error: CONFIG_LBDAF not enabled");
-		ret = err;
-		goto failed_mount;
-	}
-
-	if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
-		goto cantfind_ext3;
-	sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
-			       le32_to_cpu(es->s_first_data_block) - 1)
-				       / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
-	db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
-	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
-				    GFP_KERNEL);
-	if (sbi->s_group_desc == NULL) {
-		ext3_msg(sb, KERN_ERR,
-			"error: not enough memory");
-		ret = -ENOMEM;
-		goto failed_mount;
-	}
-
-	bgl_lock_init(sbi->s_blockgroup_lock);
-
-	for (i = 0; i < db_count; i++) {
-		block = descriptor_loc(sb, logic_sb_block, i);
-		sbi->s_group_desc[i] = sb_bread(sb, block);
-		if (!sbi->s_group_desc[i]) {
-			ext3_msg(sb, KERN_ERR,
-				"error: can't read group descriptor %d", i);
-			db_count = i;
-			goto failed_mount2;
-		}
-	}
-	if (!ext3_check_descriptors (sb)) {
-		ext3_msg(sb, KERN_ERR,
-			"error: group descriptors corrupted");
-		goto failed_mount2;
-	}
-	sbi->s_gdb_count = db_count;
-	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
-	spin_lock_init(&sbi->s_next_gen_lock);
-
-	/* per fileystem reservation list head & lock */
-	spin_lock_init(&sbi->s_rsv_window_lock);
-	sbi->s_rsv_window_root = RB_ROOT;
-	/* Add a single, static dummy reservation to the start of the
-	 * reservation window list --- it gives us a placeholder for
-	 * append-at-start-of-list which makes the allocation logic
-	 * _much_ simpler. */
-	sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
-	sbi->s_rsv_window_head.rsv_goal_size = 0;
-	ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
-	/*
-	 * set up enough so that it can read an inode
-	 */
-	sb->s_op = &ext3_sops;
-	sb->s_export_op = &ext3_export_ops;
-	sb->s_xattr = ext3_xattr_handlers;
-#ifdef CONFIG_QUOTA
-	sb->s_qcop = &ext3_qctl_operations;
-	sb->dq_op = &ext3_quota_operations;
-	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
-#endif
-	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
-	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
-	mutex_init(&sbi->s_orphan_lock);
-	mutex_init(&sbi->s_resize_lock);
-
-	sb->s_root = NULL;
-
-	needs_recovery = (es->s_last_orphan != 0 ||
-			  EXT3_HAS_INCOMPAT_FEATURE(sb,
-				    EXT3_FEATURE_INCOMPAT_RECOVER));
-
-	/*
-	 * The first inode we look at is the journal inode.  Don't try
-	 * root first: it may be modified in the journal!
-	 */
-	if (!test_opt(sb, NOLOAD) &&
-	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
-		if (ext3_load_journal(sb, es, journal_devnum))
-			goto failed_mount2;
-	} else if (journal_inum) {
-		if (ext3_create_journal(sb, es, journal_inum))
-			goto failed_mount2;
-	} else {
-		if (!silent)
-			ext3_msg(sb, KERN_ERR,
-				"error: no journal found. "
-				"mounting ext3 over ext2?");
-		goto failed_mount2;
-	}
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext3_count_free_blocks(sb), GFP_KERNEL);
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext3_count_free_inodes(sb), GFP_KERNEL);
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext3_count_dirs(sb), GFP_KERNEL);
-	}
-	if (err) {
-		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
-		ret = err;
-		goto failed_mount3;
-	}
-
-	/* We have now updated the journal if required, so we can
-	 * validate the data journaling mode. */
-	switch (test_opt(sb, DATA_FLAGS)) {
-	case 0:
-		/* No mode set, assume a default based on the journal
-                   capabilities: ORDERED_DATA if the journal can
-                   cope, else JOURNAL_DATA */
-		if (journal_check_available_features
-		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-			set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
-		else
-			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
-		break;
-
-	case EXT3_MOUNT_ORDERED_DATA:
-	case EXT3_MOUNT_WRITEBACK_DATA:
-		if (!journal_check_available_features
-		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
-			ext3_msg(sb, KERN_ERR,
-				"error: journal does not support "
-				"requested data journaling mode");
-			goto failed_mount3;
-		}
-	default:
-		break;
-	}
-
-	/*
-	 * The journal_load will have done any necessary log recovery,
-	 * so we can safely mount the rest of the filesystem now.
-	 */
-
-	root = ext3_iget(sb, EXT3_ROOT_INO);
-	if (IS_ERR(root)) {
-		ext3_msg(sb, KERN_ERR, "error: get root inode failed");
-		ret = PTR_ERR(root);
-		goto failed_mount3;
-	}
-	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
-		iput(root);
-		ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
-		goto failed_mount3;
-	}
-	sb->s_root = d_make_root(root);
-	if (!sb->s_root) {
-		ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
-		ret = -ENOMEM;
-		goto failed_mount3;
-	}
-
-	if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY))
-		sb->s_flags |= MS_RDONLY;
-
-	EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
-	ext3_orphan_cleanup(sb, es);
-	EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
-	if (needs_recovery) {
-		ext3_mark_recovery_complete(sb, es);
-		ext3_msg(sb, KERN_INFO, "recovery complete");
-	}
-	ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
-		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
-		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
-		"writeback");
-
-	return 0;
-
-cantfind_ext3:
-	if (!silent)
-		ext3_msg(sb, KERN_INFO,
-			"error: can't find ext3 filesystem on dev %s.",
-		       sb->s_id);
-	goto failed_mount;
-
-failed_mount3:
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	journal_destroy(sbi->s_journal);
-failed_mount2:
-	for (i = 0; i < db_count; i++)
-		brelse(sbi->s_group_desc[i]);
-	kfree(sbi->s_group_desc);
-failed_mount:
-#ifdef CONFIG_QUOTA
-	for (i = 0; i < EXT3_MAXQUOTAS; i++)
-		kfree(sbi->s_qf_names[i]);
-#endif
-	ext3_blkdev_remove(sbi);
-	brelse(bh);
-out_fail:
-	sb->s_fs_info = NULL;
-	kfree(sbi->s_blockgroup_lock);
-	kfree(sbi);
-	return ret;
-}
-
-/*
- * Setup any per-fs journal parameters now.  We'll do this both on
- * initial mount, once the journal has been initialised but before we've
- * done any recovery; and again on any subsequent remount.
- */
-static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
-{
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-
-	if (sbi->s_commit_interval)
-		journal->j_commit_interval = sbi->s_commit_interval;
-	/* We could also set up an ext3-specific default for the commit
-	 * interval here, but for now we'll just fall back to the jbd
-	 * default. */
-
-	spin_lock(&journal->j_state_lock);
-	if (test_opt(sb, BARRIER))
-		journal->j_flags |= JFS_BARRIER;
-	else
-		journal->j_flags &= ~JFS_BARRIER;
-	if (test_opt(sb, DATA_ERR_ABORT))
-		journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
-	else
-		journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
-	spin_unlock(&journal->j_state_lock);
-}
-
-static journal_t *ext3_get_journal(struct super_block *sb,
-				   unsigned int journal_inum)
-{
-	struct inode *journal_inode;
-	journal_t *journal;
-
-	/* First, test for the existence of a valid inode on disk.  Bad
-	 * things happen if we iget() an unused inode, as the subsequent
-	 * iput() will try to delete it. */
-
-	journal_inode = ext3_iget(sb, journal_inum);
-	if (IS_ERR(journal_inode)) {
-		ext3_msg(sb, KERN_ERR, "error: no journal found");
-		return NULL;
-	}
-	if (!journal_inode->i_nlink) {
-		make_bad_inode(journal_inode);
-		iput(journal_inode);
-		ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
-		return NULL;
-	}
-
-	jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
-		  journal_inode, journal_inode->i_size);
-	if (!S_ISREG(journal_inode->i_mode)) {
-		ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
-		iput(journal_inode);
-		return NULL;
-	}
-
-	journal = journal_init_inode(journal_inode);
-	if (!journal) {
-		ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
-		iput(journal_inode);
-		return NULL;
-	}
-	journal->j_private = sb;
-	ext3_init_journal_params(sb, journal);
-	return journal;
-}
-
-static journal_t *ext3_get_dev_journal(struct super_block *sb,
-				       dev_t j_dev)
-{
-	struct buffer_head * bh;
-	journal_t *journal;
-	ext3_fsblk_t start;
-	ext3_fsblk_t len;
-	int hblock, blocksize;
-	ext3_fsblk_t sb_block;
-	unsigned long offset;
-	struct ext3_super_block * es;
-	struct block_device *bdev;
-
-	bdev = ext3_blkdev_get(j_dev, sb);
-	if (bdev == NULL)
-		return NULL;
-
-	blocksize = sb->s_blocksize;
-	hblock = bdev_logical_block_size(bdev);
-	if (blocksize < hblock) {
-		ext3_msg(sb, KERN_ERR,
-			"error: blocksize too small for journal device");
-		goto out_bdev;
-	}
-
-	sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
-	offset = EXT3_MIN_BLOCK_SIZE % blocksize;
-	set_blocksize(bdev, blocksize);
-	if (!(bh = __bread(bdev, sb_block, blocksize))) {
-		ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
-			"external journal");
-		goto out_bdev;
-	}
-
-	es = (struct ext3_super_block *) (bh->b_data + offset);
-	if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
-	    !(le32_to_cpu(es->s_feature_incompat) &
-	      EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
-		ext3_msg(sb, KERN_ERR, "error: external journal has "
-			"bad superblock");
-		brelse(bh);
-		goto out_bdev;
-	}
-
-	if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
-		ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
-		brelse(bh);
-		goto out_bdev;
-	}
-
-	len = le32_to_cpu(es->s_blocks_count);
-	start = sb_block + 1;
-	brelse(bh);	/* we're done with the superblock */
-
-	journal = journal_init_dev(bdev, sb->s_bdev,
-					start, len, blocksize);
-	if (!journal) {
-		ext3_msg(sb, KERN_ERR,
-			"error: failed to create device journal");
-		goto out_bdev;
-	}
-	journal->j_private = sb;
-	if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
-		if (bh_submit_read(journal->j_sb_buffer)) {
-			ext3_msg(sb, KERN_ERR, "I/O error on journal device");
-			goto out_journal;
-		}
-	}
-	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
-		ext3_msg(sb, KERN_ERR,
-			"error: external journal has more than one "
-			"user (unsupported) - %d",
-			be32_to_cpu(journal->j_superblock->s_nr_users));
-		goto out_journal;
-	}
-	EXT3_SB(sb)->journal_bdev = bdev;
-	ext3_init_journal_params(sb, journal);
-	return journal;
-out_journal:
-	journal_destroy(journal);
-out_bdev:
-	ext3_blkdev_put(bdev);
-	return NULL;
-}
-
-static int ext3_load_journal(struct super_block *sb,
-			     struct ext3_super_block *es,
-			     unsigned long journal_devnum)
-{
-	journal_t *journal;
-	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
-	dev_t journal_dev;
-	int err = 0;
-	int really_read_only;
-
-	if (journal_devnum &&
-	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
-		ext3_msg(sb, KERN_INFO, "external journal device major/minor "
-			"numbers have changed");
-		journal_dev = new_decode_dev(journal_devnum);
-	} else
-		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
-
-	really_read_only = bdev_read_only(sb->s_bdev);
-
-	/*
-	 * Are we loading a blank journal or performing recovery after a
-	 * crash?  For recovery, we need to check in advance whether we
-	 * can get read-write access to the device.
-	 */
-
-	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
-		if (sb->s_flags & MS_RDONLY) {
-			ext3_msg(sb, KERN_INFO,
-				"recovery required on readonly filesystem");
-			if (really_read_only) {
-				ext3_msg(sb, KERN_ERR, "error: write access "
-					"unavailable, cannot proceed");
-				return -EROFS;
-			}
-			ext3_msg(sb, KERN_INFO,
-				"write access will be enabled during recovery");
-		}
-	}
-
-	if (journal_inum && journal_dev) {
-		ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
-		       "and inode journals");
-		return -EINVAL;
-	}
-
-	if (journal_inum) {
-		if (!(journal = ext3_get_journal(sb, journal_inum)))
-			return -EINVAL;
-	} else {
-		if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
-			return -EINVAL;
-	}
-
-	if (!(journal->j_flags & JFS_BARRIER))
-		printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
-
-	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
-		err = journal_update_format(journal);
-		if (err)  {
-			ext3_msg(sb, KERN_ERR, "error updating journal");
-			journal_destroy(journal);
-			return err;
-		}
-	}
-
-	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
-		err = journal_wipe(journal, !really_read_only);
-	if (!err)
-		err = journal_load(journal);
-
-	if (err) {
-		ext3_msg(sb, KERN_ERR, "error loading journal");
-		journal_destroy(journal);
-		return err;
-	}
-
-	EXT3_SB(sb)->s_journal = journal;
-	ext3_clear_journal_err(sb, es);
-
-	if (!really_read_only && journal_devnum &&
-	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
-		es->s_journal_dev = cpu_to_le32(journal_devnum);
-
-		/* Make sure we flush the recovery flag to disk. */
-		ext3_commit_super(sb, es, 1);
-	}
-
-	return 0;
-}
-
-static int ext3_create_journal(struct super_block *sb,
-			       struct ext3_super_block *es,
-			       unsigned int journal_inum)
-{
-	journal_t *journal;
-	int err;
-
-	if (sb->s_flags & MS_RDONLY) {
-		ext3_msg(sb, KERN_ERR,
-			"error: readonly filesystem when trying to "
-			"create journal");
-		return -EROFS;
-	}
-
-	journal = ext3_get_journal(sb, journal_inum);
-	if (!journal)
-		return -EINVAL;
-
-	ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
-	       journal_inum);
-
-	err = journal_create(journal);
-	if (err) {
-		ext3_msg(sb, KERN_ERR, "error creating journal");
-		journal_destroy(journal);
-		return -EIO;
-	}
-
-	EXT3_SB(sb)->s_journal = journal;
-
-	ext3_update_dynamic_rev(sb);
-	EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-	EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
-
-	es->s_journal_inum = cpu_to_le32(journal_inum);
-
-	/* Make sure we flush the recovery flag to disk. */
-	ext3_commit_super(sb, es, 1);
-
-	return 0;
-}
-
-static int ext3_commit_super(struct super_block *sb,
-			       struct ext3_super_block *es,
-			       int sync)
-{
-	struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
-	int error = 0;
-
-	if (!sbh)
-		return error;
-
-	if (buffer_write_io_error(sbh)) {
-		/*
-		 * Oh, dear.  A previous attempt to write the
-		 * superblock failed.  This could happen because the
-		 * USB device was yanked out.  Or it could happen to
-		 * be a transient write error and maybe the block will
-		 * be remapped.  Nothing we can do but to retry the
-		 * write and hope for the best.
-		 */
-		ext3_msg(sb, KERN_ERR, "previous I/O error to "
-		       "superblock detected");
-		clear_buffer_write_io_error(sbh);
-		set_buffer_uptodate(sbh);
-	}
-	/*
-	 * If the file system is mounted read-only, don't update the
-	 * superblock write time.  This avoids updating the superblock
-	 * write time when we are mounting the root file system
-	 * read/only but we need to replay the journal; at that point,
-	 * for people who are east of GMT and who make their clock
-	 * tick in localtime for Windows bug-for-bug compatibility,
-	 * the clock is set in the future, and this will cause e2fsck
-	 * to complain and force a full file system check.
-	 */
-	if (!(sb->s_flags & MS_RDONLY))
-		es->s_wtime = cpu_to_le32(get_seconds());
-	es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
-	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
-	BUFFER_TRACE(sbh, "marking dirty");
-	mark_buffer_dirty(sbh);
-	if (sync) {
-		error = sync_dirty_buffer(sbh);
-		if (buffer_write_io_error(sbh)) {
-			ext3_msg(sb, KERN_ERR, "I/O error while writing "
-			       "superblock");
-			clear_buffer_write_io_error(sbh);
-			set_buffer_uptodate(sbh);
-		}
-	}
-	return error;
-}
-
-
-/*
- * Have we just finished recovery?  If so, and if we are mounting (or
- * remounting) the filesystem readonly, then we will end up with a
- * consistent fs on disk.  Record that fact.
- */
-static void ext3_mark_recovery_complete(struct super_block * sb,
-					struct ext3_super_block * es)
-{
-	journal_t *journal = EXT3_SB(sb)->s_journal;
-
-	journal_lock_updates(journal);
-	if (journal_flush(journal) < 0)
-		goto out;
-
-	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
-	    sb->s_flags & MS_RDONLY) {
-		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		ext3_commit_super(sb, es, 1);
-	}
-
-out:
-	journal_unlock_updates(journal);
-}
-
-/*
- * If we are mounting (or read-write remounting) a filesystem whose journal
- * has recorded an error from a previous lifetime, move that error to the
- * main filesystem now.
- */
-static void ext3_clear_journal_err(struct super_block *sb,
-				   struct ext3_super_block *es)
-{
-	journal_t *journal;
-	int j_errno;
-	const char *errstr;
-
-	journal = EXT3_SB(sb)->s_journal;
-
-	/*
-	 * Now check for any error status which may have been recorded in the
-	 * journal by a prior ext3_error() or ext3_abort()
-	 */
-
-	j_errno = journal_errno(journal);
-	if (j_errno) {
-		char nbuf[16];
-
-		errstr = ext3_decode_error(sb, j_errno, nbuf);
-		ext3_warning(sb, __func__, "Filesystem error recorded "
-			     "from previous mount: %s", errstr);
-		ext3_warning(sb, __func__, "Marking fs in need of "
-			     "filesystem check.");
-
-		EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
-		es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
-		ext3_commit_super (sb, es, 1);
-
-		journal_clear_err(journal);
-	}
-}
-
-/*
- * Force the running and committing transactions to commit,
- * and wait on the commit.
- */
-int ext3_force_commit(struct super_block *sb)
-{
-	journal_t *journal;
-	int ret;
-
-	if (sb->s_flags & MS_RDONLY)
-		return 0;
-
-	journal = EXT3_SB(sb)->s_journal;
-	ret = ext3_journal_force_commit(journal);
-	return ret;
-}
-
-static int ext3_sync_fs(struct super_block *sb, int wait)
-{
-	tid_t target;
-
-	trace_ext3_sync_fs(sb, wait);
-	/*
-	 * Writeback quota in non-journalled quota case - journalled quota has
-	 * no dirty dquots
-	 */
-	dquot_writeback_dquots(sb, -1);
-	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
-		if (wait)
-			log_wait_commit(EXT3_SB(sb)->s_journal, target);
-	}
-	return 0;
-}
-
-/*
- * LVM calls this function before a (read-only) snapshot is created.  This
- * gives us a chance to flush the journal completely and mark the fs clean.
- */
-static int ext3_freeze(struct super_block *sb)
-{
-	int error = 0;
-	journal_t *journal;
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		journal = EXT3_SB(sb)->s_journal;
-
-		/* Now we set up the journal barrier. */
-		journal_lock_updates(journal);
-
-		/*
-		 * We don't want to clear needs_recovery flag when we failed
-		 * to flush the journal.
-		 */
-		error = journal_flush(journal);
-		if (error < 0)
-			goto out;
-
-		/* Journal blocked and flushed, clear needs_recovery flag. */
-		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
-		if (error)
-			goto out;
-	}
-	return 0;
-
-out:
-	journal_unlock_updates(journal);
-	return error;
-}
-
-/*
- * Called by LVM after the snapshot is done.  We need to reset the RECOVER
- * flag here, even though the filesystem is not technically dirty yet.
- */
-static int ext3_unfreeze(struct super_block *sb)
-{
-	if (!(sb->s_flags & MS_RDONLY)) {
-		/* Reser the needs_recovery flag before the fs is unlocked. */
-		EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
-		journal_unlock_updates(EXT3_SB(sb)->s_journal);
-	}
-	return 0;
-}
-
-static int ext3_remount (struct super_block * sb, int * flags, char * data)
-{
-	struct ext3_super_block * es;
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	ext3_fsblk_t n_blocks_count = 0;
-	unsigned long old_sb_flags;
-	struct ext3_mount_options old_opts;
-	int enable_quota = 0;
-	int err;
-#ifdef CONFIG_QUOTA
-	int i;
-#endif
-
-	sync_filesystem(sb);
-
-	/* Store the original options */
-	old_sb_flags = sb->s_flags;
-	old_opts.s_mount_opt = sbi->s_mount_opt;
-	old_opts.s_resuid = sbi->s_resuid;
-	old_opts.s_resgid = sbi->s_resgid;
-	old_opts.s_commit_interval = sbi->s_commit_interval;
-#ifdef CONFIG_QUOTA
-	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
-	for (i = 0; i < EXT3_MAXQUOTAS; i++)
-		if (sbi->s_qf_names[i]) {
-			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
-							 GFP_KERNEL);
-			if (!old_opts.s_qf_names[i]) {
-				int j;
-
-				for (j = 0; j < i; j++)
-					kfree(old_opts.s_qf_names[j]);
-				return -ENOMEM;
-			}
-		} else
-			old_opts.s_qf_names[i] = NULL;
-#endif
-
-	/*
-	 * Allow the "check" option to be passed as a remount option.
-	 */
-	if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
-		err = -EINVAL;
-		goto restore_opts;
-	}
-
-	if (test_opt(sb, ABORT))
-		ext3_abort(sb, __func__, "Abort forced by user");
-
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
-
-	es = sbi->s_es;
-
-	ext3_init_journal_params(sb, sbi->s_journal);
-
-	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
-		n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
-		if (test_opt(sb, ABORT)) {
-			err = -EROFS;
-			goto restore_opts;
-		}
-
-		if (*flags & MS_RDONLY) {
-			err = dquot_suspend(sb, -1);
-			if (err < 0)
-				goto restore_opts;
-
-			/*
-			 * First of all, the unconditional stuff we have to do
-			 * to disable replay of the journal when we next remount
-			 */
-			sb->s_flags |= MS_RDONLY;
-
-			/*
-			 * OK, test if we are remounting a valid rw partition
-			 * readonly, and if so set the rdonly flag and then
-			 * mark the partition as valid again.
-			 */
-			if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
-			    (sbi->s_mount_state & EXT3_VALID_FS))
-				es->s_state = cpu_to_le16(sbi->s_mount_state);
-
-			ext3_mark_recovery_complete(sb, es);
-		} else {
-			__le32 ret;
-			if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
-					~EXT3_FEATURE_RO_COMPAT_SUPP))) {
-				ext3_msg(sb, KERN_WARNING,
-					"warning: couldn't remount RDWR "
-					"because of unsupported optional "
-					"features (%x)", le32_to_cpu(ret));
-				err = -EROFS;
-				goto restore_opts;
-			}
-
-			/*
-			 * If we have an unprocessed orphan list hanging
-			 * around from a previously readonly bdev mount,
-			 * require a full umount & mount for now.
-			 */
-			if (es->s_last_orphan) {
-				ext3_msg(sb, KERN_WARNING, "warning: couldn't "
-				       "remount RDWR because of unprocessed "
-				       "orphan inode list.  Please "
-				       "umount & mount instead.");
-				err = -EINVAL;
-				goto restore_opts;
-			}
-
-			/*
-			 * Mounting a RDONLY partition read-write, so reread
-			 * and store the current valid flag.  (It may have
-			 * been changed by e2fsck since we originally mounted
-			 * the partition.)
-			 */
-			ext3_clear_journal_err(sb, es);
-			sbi->s_mount_state = le16_to_cpu(es->s_state);
-			if ((err = ext3_group_extend(sb, es, n_blocks_count)))
-				goto restore_opts;
-			if (!ext3_setup_super (sb, es, 0))
-				sb->s_flags &= ~MS_RDONLY;
-			enable_quota = 1;
-		}
-	}
-#ifdef CONFIG_QUOTA
-	/* Release old quota file names */
-	for (i = 0; i < EXT3_MAXQUOTAS; i++)
-		kfree(old_opts.s_qf_names[i]);
-#endif
-	if (enable_quota)
-		dquot_resume(sb, -1);
-	return 0;
-restore_opts:
-	sb->s_flags = old_sb_flags;
-	sbi->s_mount_opt = old_opts.s_mount_opt;
-	sbi->s_resuid = old_opts.s_resuid;
-	sbi->s_resgid = old_opts.s_resgid;
-	sbi->s_commit_interval = old_opts.s_commit_interval;
-#ifdef CONFIG_QUOTA
-	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
-	for (i = 0; i < EXT3_MAXQUOTAS; i++) {
-		kfree(sbi->s_qf_names[i]);
-		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
-	}
-#endif
-	return err;
-}
-
-static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
-{
-	struct super_block *sb = dentry->d_sb;
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	struct ext3_super_block *es = sbi->s_es;
-	u64 fsid;
-
-	if (test_opt(sb, MINIX_DF)) {
-		sbi->s_overhead_last = 0;
-	} else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
-		unsigned long ngroups = sbi->s_groups_count, i;
-		ext3_fsblk_t overhead = 0;
-		smp_rmb();
-
-		/*
-		 * Compute the overhead (FS structures).  This is constant
-		 * for a given filesystem unless the number of block groups
-		 * changes so we cache the previous value until it does.
-		 */
-
-		/*
-		 * All of the blocks before first_data_block are
-		 * overhead
-		 */
-		overhead = le32_to_cpu(es->s_first_data_block);
-
-		/*
-		 * Add the overhead attributed to the superblock and
-		 * block group descriptors.  If the sparse superblocks
-		 * feature is turned on, then not all groups have this.
-		 */
-		for (i = 0; i < ngroups; i++) {
-			overhead += ext3_bg_has_super(sb, i) +
-				ext3_bg_num_gdb(sb, i);
-			cond_resched();
-		}
-
-		/*
-		 * Every block group has an inode bitmap, a block
-		 * bitmap, and an inode table.
-		 */
-		overhead += ngroups * (2 + sbi->s_itb_per_group);
-
-		/* Add the internal journal blocks as well */
-		if (sbi->s_journal && !sbi->journal_bdev)
-			overhead += sbi->s_journal->j_maxlen;
-
-		sbi->s_overhead_last = overhead;
-		smp_wmb();
-		sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
-	}
-
-	buf->f_type = EXT3_SUPER_MAGIC;
-	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
-	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
-	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
-	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
-		buf->f_bavail = 0;
-	buf->f_files = le32_to_cpu(es->s_inodes_count);
-	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
-	buf->f_namelen = EXT3_NAME_LEN;
-	fsid = le64_to_cpup((void *)es->s_uuid) ^
-	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
-	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
-	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
-	return 0;
-}
-
-/* Helper function for writing quotas on sync - we need to start transaction before quota file
- * is locked for write. Otherwise the are possible deadlocks:
- * Process 1                         Process 2
- * ext3_create()                     quota_sync()
- *   journal_start()                   write_dquot()
- *   dquot_initialize()                       down(dqio_mutex)
- *     down(dqio_mutex)                    journal_start()
- *
- */
-
-#ifdef CONFIG_QUOTA
-
-static inline struct inode *dquot_to_inode(struct dquot *dquot)
-{
-	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
-}
-
-static int ext3_write_dquot(struct dquot *dquot)
-{
-	int ret, err;
-	handle_t *handle;
-	struct inode *inode;
-
-	inode = dquot_to_inode(dquot);
-	handle = ext3_journal_start(inode,
-					EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ret = dquot_commit(dquot);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-static int ext3_acquire_dquot(struct dquot *dquot)
-{
-	int ret, err;
-	handle_t *handle;
-
-	handle = ext3_journal_start(dquot_to_inode(dquot),
-					EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ret = dquot_acquire(dquot);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-static int ext3_release_dquot(struct dquot *dquot)
-{
-	int ret, err;
-	handle_t *handle;
-
-	handle = ext3_journal_start(dquot_to_inode(dquot),
-					EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
-	if (IS_ERR(handle)) {
-		/* Release dquot anyway to avoid endless cycle in dqput() */
-		dquot_release(dquot);
-		return PTR_ERR(handle);
-	}
-	ret = dquot_release(dquot);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-static int ext3_mark_dquot_dirty(struct dquot *dquot)
-{
-	/* Are we journaling quotas? */
-	if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
-	    EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
-		dquot_mark_dquot_dirty(dquot);
-		return ext3_write_dquot(dquot);
-	} else {
-		return dquot_mark_dquot_dirty(dquot);
-	}
-}
-
-static int ext3_write_info(struct super_block *sb, int type)
-{
-	int ret, err;
-	handle_t *handle;
-
-	/* Data block + inode block */
-	handle = ext3_journal_start(d_inode(sb->s_root), 2);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ret = dquot_commit_info(sb, type);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-/*
- * Turn on quotas during mount time - we need to find
- * the quota file and such...
- */
-static int ext3_quota_on_mount(struct super_block *sb, int type)
-{
-	return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
-					EXT3_SB(sb)->s_jquota_fmt, type);
-}
-
-/*
- * Standard function to be called on quota_on
- */
-static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-			 struct path *path)
-{
-	int err;
-
-	if (!test_opt(sb, QUOTA))
-		return -EINVAL;
-
-	/* Quotafile not on the same filesystem? */
-	if (path->dentry->d_sb != sb)
-		return -EXDEV;
-	/* Journaling quota? */
-	if (EXT3_SB(sb)->s_qf_names[type]) {
-		/* Quotafile not of fs root? */
-		if (path->dentry->d_parent != sb->s_root)
-			ext3_msg(sb, KERN_WARNING,
-				"warning: Quota file not on filesystem root. "
-				"Journaled quota will not work.");
-	}
-
-	/*
-	 * When we journal data on quota file, we have to flush journal to see
-	 * all updates to the file when we bypass pagecache...
-	 */
-	if (ext3_should_journal_data(d_inode(path->dentry))) {
-		/*
-		 * We don't need to lock updates but journal_flush() could
-		 * otherwise be livelocked...
-		 */
-		journal_lock_updates(EXT3_SB(sb)->s_journal);
-		err = journal_flush(EXT3_SB(sb)->s_journal);
-		journal_unlock_updates(EXT3_SB(sb)->s_journal);
-		if (err)
-			return err;
-	}
-
-	return dquot_quota_on(sb, type, format_id, path);
-}
-
-/* Read data from quotafile - avoid pagecache and such because we cannot afford
- * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and no one else should touch the files)
- * we don't have to be afraid of races */
-static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
-			       size_t len, loff_t off)
-{
-	struct inode *inode = sb_dqopt(sb)->files[type];
-	sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
-	int err = 0;
-	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
-	size_t toread;
-	struct buffer_head *bh;
-	loff_t i_size = i_size_read(inode);
-
-	if (off > i_size)
-		return 0;
-	if (off+len > i_size)
-		len = i_size-off;
-	toread = len;
-	while (toread > 0) {
-		tocopy = sb->s_blocksize - offset < toread ?
-				sb->s_blocksize - offset : toread;
-		bh = ext3_bread(NULL, inode, blk, 0, &err);
-		if (err)
-			return err;
-		if (!bh)	/* A hole? */
-			memset(data, 0, tocopy);
-		else
-			memcpy(data, bh->b_data+offset, tocopy);
-		brelse(bh);
-		offset = 0;
-		toread -= tocopy;
-		data += tocopy;
-		blk++;
-	}
-	return len;
-}
-
-/* Write to quotafile (we know the transaction is already started and has
- * enough credits) */
-static ssize_t ext3_quota_write(struct super_block *sb, int type,
-				const char *data, size_t len, loff_t off)
-{
-	struct inode *inode = sb_dqopt(sb)->files[type];
-	sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
-	int err = 0;
-	int offset = off & (sb->s_blocksize - 1);
-	int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
-	struct buffer_head *bh;
-	handle_t *handle = journal_current_handle();
-
-	if (!handle) {
-		ext3_msg(sb, KERN_WARNING,
-			"warning: quota write (off=%llu, len=%llu)"
-			" cancelled because transaction is not started.",
-			(unsigned long long)off, (unsigned long long)len);
-		return -EIO;
-	}
-
-	/*
-	 * Since we account only one data block in transaction credits,
-	 * then it is impossible to cross a block boundary.
-	 */
-	if (sb->s_blocksize - offset < len) {
-		ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
-			" cancelled because not block aligned",
-			(unsigned long long)off, (unsigned long long)len);
-		return -EIO;
-	}
-	bh = ext3_bread(handle, inode, blk, 1, &err);
-	if (!bh)
-		goto out;
-	if (journal_quota) {
-		err = ext3_journal_get_write_access(handle, bh);
-		if (err) {
-			brelse(bh);
-			goto out;
-		}
-	}
-	lock_buffer(bh);
-	memcpy(bh->b_data+offset, data, len);
-	flush_dcache_page(bh->b_page);
-	unlock_buffer(bh);
-	if (journal_quota)
-		err = ext3_journal_dirty_metadata(handle, bh);
-	else {
-		/* Always do at least ordered writes for quotas */
-		err = ext3_journal_dirty_data(handle, bh);
-		mark_buffer_dirty(bh);
-	}
-	brelse(bh);
-out:
-	if (err)
-		return err;
-	if (inode->i_size < off + len) {
-		i_size_write(inode, off + len);
-		EXT3_I(inode)->i_disksize = inode->i_size;
-	}
-	inode->i_version++;
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	ext3_mark_inode_dirty(handle, inode);
-	return len;
-}
-
-#endif
-
-static struct dentry *ext3_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
-}
-
-static struct file_system_type ext3_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext3",
-	.mount		= ext3_mount,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("ext3");
-
-static int __init init_ext3_fs(void)
-{
-	int err = init_ext3_xattr();
-	if (err)
-		return err;
-	err = init_inodecache();
-	if (err)
-		goto out1;
-        err = register_filesystem(&ext3_fs_type);
-	if (err)
-		goto out;
-	return 0;
-out:
-	destroy_inodecache();
-out1:
-	exit_ext3_xattr();
-	return err;
-}
-
-static void __exit exit_ext3_fs(void)
-{
-	unregister_filesystem(&ext3_fs_type);
-	destroy_inodecache();
-	exit_ext3_xattr();
-}
-
-MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
-MODULE_LICENSE("GPL");
-module_init(init_ext3_fs)
-module_exit(exit_ext3_fs)
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
deleted file mode 100644
index c08c59094ae6..000000000000
--- a/fs/ext3/symlink.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- *  linux/fs/ext3/symlink.c
- *
- * Only fast symlinks left here - the rest is done by generic code. AV, 1999
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/fs/minix/symlink.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  ext3 symlink handling code
- */
-
-#include "ext3.h"
-#include "xattr.h"
-
-const struct inode_operations ext3_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
-	.setattr	= ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
-	.listxattr	= ext3_listxattr,
-	.removexattr	= generic_removexattr,
-#endif
-};
-
-const struct inode_operations ext3_fast_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= simple_follow_link,
-	.setattr	= ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
-	.listxattr	= ext3_listxattr,
-	.removexattr	= generic_removexattr,
-#endif
-};
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
deleted file mode 100644
index 7cf36501ccf4..000000000000
--- a/fs/ext3/xattr.c
+++ /dev/null
@@ -1,1330 +0,0 @@
-/*
- * linux/fs/ext3/xattr.c
- *
- * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * Fix by Harrison Xing <harrison@mountainviewdata.com>.
- * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
- * Extended attributes for symlinks and special files added per
- *  suggestion of Luka Renko <luka.renko@hermes.si>.
- * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
- *  Red Hat Inc.
- * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
- *  and Andreas Gruenbacher <agruen@suse.de>.
- */
-
-/*
- * Extended attributes are stored directly in inodes (on file systems with
- * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
- * field contains the block number if an inode uses an additional block. All
- * attributes must fit in the inode and one additional block. Blocks that
- * contain the identical set of attributes may be shared among several inodes.
- * Identical blocks are detected by keeping a cache of blocks that have
- * recently been accessed.
- *
- * The attributes in inodes and on blocks have a different header; the entries
- * are stored in the same format:
- *
- *   +------------------+
- *   | header           |
- *   | entry 1          | |
- *   | entry 2          | | growing downwards
- *   | entry 3          | v
- *   | four null bytes  |
- *   | . . .            |
- *   | value 1          | ^
- *   | value 3          | | growing upwards
- *   | value 2          | |
- *   +------------------+
- *
- * The header is followed by multiple entry descriptors. In disk blocks, the
- * entry descriptors are kept sorted. In inodes, they are unsorted. The
- * attribute values are aligned to the end of the block in no specific order.
- *
- * Locking strategy
- * ----------------
- * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
- * EA blocks are only changed if they are exclusive to an inode, so
- * holding xattr_sem also means that nothing but the EA block's reference
- * count can change. Multiple writers to the same block are synchronized
- * by the buffer lock.
- */
-
-#include "ext3.h"
-#include <linux/mbcache.h>
-#include <linux/quotaops.h>
-#include "xattr.h"
-#include "acl.h"
-
-#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
-#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
-#define BFIRST(bh) ENTRY(BHDR(bh)+1)
-#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-
-#define IHDR(inode, raw_inode) \
-	((struct ext3_xattr_ibody_header *) \
-		((void *)raw_inode + \
-		 EXT3_GOOD_OLD_INODE_SIZE + \
-		 EXT3_I(inode)->i_extra_isize))
-#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1))
-
-#ifdef EXT3_XATTR_DEBUG
-# define ea_idebug(inode, f...) do { \
-		printk(KERN_DEBUG "inode %s:%lu: ", \
-			inode->i_sb->s_id, inode->i_ino); \
-		printk(f); \
-		printk("\n"); \
-	} while (0)
-# define ea_bdebug(bh, f...) do { \
-		char b[BDEVNAME_SIZE]; \
-		printk(KERN_DEBUG "block %s:%lu: ", \
-			bdevname(bh->b_bdev, b), \
-			(unsigned long) bh->b_blocknr); \
-		printk(f); \
-		printk("\n"); \
-	} while (0)
-#else
-# define ea_idebug(f...)
-# define ea_bdebug(f...)
-#endif
-
-static void ext3_xattr_cache_insert(struct buffer_head *);
-static struct buffer_head *ext3_xattr_cache_find(struct inode *,
-						 struct ext3_xattr_header *,
-						 struct mb_cache_entry **);
-static void ext3_xattr_rehash(struct ext3_xattr_header *,
-			      struct ext3_xattr_entry *);
-static int ext3_xattr_list(struct dentry *dentry, char *buffer,
-			   size_t buffer_size);
-
-static struct mb_cache *ext3_xattr_cache;
-
-static const struct xattr_handler *ext3_xattr_handler_map[] = {
-	[EXT3_XATTR_INDEX_USER]		     = &ext3_xattr_user_handler,
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
-	[EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
-#endif
-	[EXT3_XATTR_INDEX_TRUSTED]	     = &ext3_xattr_trusted_handler,
-#ifdef CONFIG_EXT3_FS_SECURITY
-	[EXT3_XATTR_INDEX_SECURITY]	     = &ext3_xattr_security_handler,
-#endif
-};
-
-const struct xattr_handler *ext3_xattr_handlers[] = {
-	&ext3_xattr_user_handler,
-	&ext3_xattr_trusted_handler,
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	&posix_acl_access_xattr_handler,
-	&posix_acl_default_xattr_handler,
-#endif
-#ifdef CONFIG_EXT3_FS_SECURITY
-	&ext3_xattr_security_handler,
-#endif
-	NULL
-};
-
-static inline const struct xattr_handler *
-ext3_xattr_handler(int name_index)
-{
-	const struct xattr_handler *handler = NULL;
-
-	if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
-		handler = ext3_xattr_handler_map[name_index];
-	return handler;
-}
-
-/*
- * Inode operation listxattr()
- *
- * d_inode(dentry)->i_mutex: don't care
- */
-ssize_t
-ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	return ext3_xattr_list(dentry, buffer, size);
-}
-
-static int
-ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end)
-{
-	while (!IS_LAST_ENTRY(entry)) {
-		struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry);
-		if ((void *)next >= end)
-			return -EIO;
-		entry = next;
-	}
-	return 0;
-}
-
-static inline int
-ext3_xattr_check_block(struct buffer_head *bh)
-{
-	int error;
-
-	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-	    BHDR(bh)->h_blocks != cpu_to_le32(1))
-		return -EIO;
-	error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
-	return error;
-}
-
-static inline int
-ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size)
-{
-	size_t value_size = le32_to_cpu(entry->e_value_size);
-
-	if (entry->e_value_block != 0 || value_size > size ||
-	    le16_to_cpu(entry->e_value_offs) + value_size > size)
-		return -EIO;
-	return 0;
-}
-
-static int
-ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
-		      const char *name, size_t size, int sorted)
-{
-	struct ext3_xattr_entry *entry;
-	size_t name_len;
-	int cmp = 1;
-
-	if (name == NULL)
-		return -EINVAL;
-	name_len = strlen(name);
-	entry = *pentry;
-	for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
-		cmp = name_index - entry->e_name_index;
-		if (!cmp)
-			cmp = name_len - entry->e_name_len;
-		if (!cmp)
-			cmp = memcmp(name, entry->e_name, name_len);
-		if (cmp <= 0 && (sorted || cmp == 0))
-			break;
-	}
-	*pentry = entry;
-	if (!cmp && ext3_xattr_check_entry(entry, size))
-			return -EIO;
-	return cmp ? -ENODATA : 0;
-}
-
-static int
-ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
-		     void *buffer, size_t buffer_size)
-{
-	struct buffer_head *bh = NULL;
-	struct ext3_xattr_entry *entry;
-	size_t size;
-	int error;
-
-	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
-		  name_index, name, buffer, (long)buffer_size);
-
-	error = -ENODATA;
-	if (!EXT3_I(inode)->i_file_acl)
-		goto cleanup;
-	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
-	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
-	if (!bh)
-		goto cleanup;
-	ea_bdebug(bh, "b_count=%d, refcount=%d",
-		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
-	if (ext3_xattr_check_block(bh)) {
-bad_block:	ext3_error(inode->i_sb, __func__,
-			   "inode %lu: bad block "E3FSBLK, inode->i_ino,
-			   EXT3_I(inode)->i_file_acl);
-		error = -EIO;
-		goto cleanup;
-	}
-	ext3_xattr_cache_insert(bh);
-	entry = BFIRST(bh);
-	error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
-	if (error == -EIO)
-		goto bad_block;
-	if (error)
-		goto cleanup;
-	size = le32_to_cpu(entry->e_value_size);
-	if (buffer) {
-		error = -ERANGE;
-		if (size > buffer_size)
-			goto cleanup;
-		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
-		       size);
-	}
-	error = size;
-
-cleanup:
-	brelse(bh);
-	return error;
-}
-
-static int
-ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
-		     void *buffer, size_t buffer_size)
-{
-	struct ext3_xattr_ibody_header *header;
-	struct ext3_xattr_entry *entry;
-	struct ext3_inode *raw_inode;
-	struct ext3_iloc iloc;
-	size_t size;
-	void *end;
-	int error;
-
-	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
-		return -ENODATA;
-	error = ext3_get_inode_loc(inode, &iloc);
-	if (error)
-		return error;
-	raw_inode = ext3_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
-	entry = IFIRST(header);
-	end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
-	error = ext3_xattr_check_names(entry, end);
-	if (error)
-		goto cleanup;
-	error = ext3_xattr_find_entry(&entry, name_index, name,
-				      end - (void *)entry, 0);
-	if (error)
-		goto cleanup;
-	size = le32_to_cpu(entry->e_value_size);
-	if (buffer) {
-		error = -ERANGE;
-		if (size > buffer_size)
-			goto cleanup;
-		memcpy(buffer, (void *)IFIRST(header) +
-		       le16_to_cpu(entry->e_value_offs), size);
-	}
-	error = size;
-
-cleanup:
-	brelse(iloc.bh);
-	return error;
-}
-
-/*
- * ext3_xattr_get()
- *
- * Copy an extended attribute into the buffer
- * provided, or compute the buffer size required.
- * Buffer is NULL to compute the size of the buffer required.
- *
- * Returns a negative error number on failure, or the number of bytes
- * used / required on success.
- */
-int
-ext3_xattr_get(struct inode *inode, int name_index, const char *name,
-	       void *buffer, size_t buffer_size)
-{
-	int error;
-
-	down_read(&EXT3_I(inode)->xattr_sem);
-	error = ext3_xattr_ibody_get(inode, name_index, name, buffer,
-				     buffer_size);
-	if (error == -ENODATA)
-		error = ext3_xattr_block_get(inode, name_index, name, buffer,
-					     buffer_size);
-	up_read(&EXT3_I(inode)->xattr_sem);
-	return error;
-}
-
-static int
-ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
-			char *buffer, size_t buffer_size)
-{
-	size_t rest = buffer_size;
-
-	for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
-		const struct xattr_handler *handler =
-			ext3_xattr_handler(entry->e_name_index);
-
-		if (handler) {
-			size_t size = handler->list(dentry, buffer, rest,
-						    entry->e_name,
-						    entry->e_name_len,
-						    handler->flags);
-			if (buffer) {
-				if (size > rest)
-					return -ERANGE;
-				buffer += size;
-			}
-			rest -= size;
-		}
-	}
-	return buffer_size - rest;
-}
-
-static int
-ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
-	struct inode *inode = d_inode(dentry);
-	struct buffer_head *bh = NULL;
-	int error;
-
-	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
-		  buffer, (long)buffer_size);
-
-	error = 0;
-	if (!EXT3_I(inode)->i_file_acl)
-		goto cleanup;
-	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
-	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
-	error = -EIO;
-	if (!bh)
-		goto cleanup;
-	ea_bdebug(bh, "b_count=%d, refcount=%d",
-		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
-	if (ext3_xattr_check_block(bh)) {
-		ext3_error(inode->i_sb, __func__,
-			   "inode %lu: bad block "E3FSBLK, inode->i_ino,
-			   EXT3_I(inode)->i_file_acl);
-		error = -EIO;
-		goto cleanup;
-	}
-	ext3_xattr_cache_insert(bh);
-	error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
-
-cleanup:
-	brelse(bh);
-
-	return error;
-}
-
-static int
-ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
-	struct inode *inode = d_inode(dentry);
-	struct ext3_xattr_ibody_header *header;
-	struct ext3_inode *raw_inode;
-	struct ext3_iloc iloc;
-	void *end;
-	int error;
-
-	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
-		return 0;
-	error = ext3_get_inode_loc(inode, &iloc);
-	if (error)
-		return error;
-	raw_inode = ext3_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
-	end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
-	error = ext3_xattr_check_names(IFIRST(header), end);
-	if (error)
-		goto cleanup;
-	error = ext3_xattr_list_entries(dentry, IFIRST(header),
-					buffer, buffer_size);
-
-cleanup:
-	brelse(iloc.bh);
-	return error;
-}
-
-/*
- * ext3_xattr_list()
- *
- * Copy a list of attribute names into the buffer
- * provided, or compute the buffer size required.
- * Buffer is NULL to compute the size of the buffer required.
- *
- * Returns a negative error number on failure, or the number of bytes
- * used / required on success.
- */
-static int
-ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
-	int i_error, b_error;
-
-	down_read(&EXT3_I(d_inode(dentry))->xattr_sem);
-	i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
-	if (i_error < 0) {
-		b_error = 0;
-	} else {
-		if (buffer) {
-			buffer += i_error;
-			buffer_size -= i_error;
-		}
-		b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
-		if (b_error < 0)
-			i_error = 0;
-	}
-	up_read(&EXT3_I(d_inode(dentry))->xattr_sem);
-	return i_error + b_error;
-}
-
-/*
- * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
- * not set, set it.
- */
-static void ext3_xattr_update_super_block(handle_t *handle,
-					  struct super_block *sb)
-{
-	if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
-		return;
-
-	if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
-		EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
-		ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	}
-}
-
-/*
- * Release the xattr block BH: If the reference count is > 1, decrement
- * it; otherwise free the block.
- */
-static void
-ext3_xattr_release_block(handle_t *handle, struct inode *inode,
-			 struct buffer_head *bh)
-{
-	struct mb_cache_entry *ce = NULL;
-	int error = 0;
-
-	ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
-	error = ext3_journal_get_write_access(handle, bh);
-	if (error)
-		 goto out;
-
-	lock_buffer(bh);
-
-	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
-		ea_bdebug(bh, "refcount now=0; freeing");
-		if (ce)
-			mb_cache_entry_free(ce);
-		ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
-		get_bh(bh);
-		ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
-	} else {
-		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
-		error = ext3_journal_dirty_metadata(handle, bh);
-		if (IS_SYNC(inode))
-			handle->h_sync = 1;
-		dquot_free_block(inode, 1);
-		ea_bdebug(bh, "refcount now=%d; releasing",
-			  le32_to_cpu(BHDR(bh)->h_refcount));
-		if (ce)
-			mb_cache_entry_release(ce);
-	}
-	unlock_buffer(bh);
-out:
-	ext3_std_error(inode->i_sb, error);
-	return;
-}
-
-struct ext3_xattr_info {
-	int name_index;
-	const char *name;
-	const void *value;
-	size_t value_len;
-};
-
-struct ext3_xattr_search {
-	struct ext3_xattr_entry *first;
-	void *base;
-	void *end;
-	struct ext3_xattr_entry *here;
-	int not_found;
-};
-
-static int
-ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s)
-{
-	struct ext3_xattr_entry *last;
-	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
-
-	/* Compute min_offs and last. */
-	last = s->first;
-	for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) {
-		if (!last->e_value_block && last->e_value_size) {
-			size_t offs = le16_to_cpu(last->e_value_offs);
-			if (offs < min_offs)
-				min_offs = offs;
-		}
-	}
-	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
-	if (!s->not_found) {
-		if (!s->here->e_value_block && s->here->e_value_size) {
-			size_t size = le32_to_cpu(s->here->e_value_size);
-			free += EXT3_XATTR_SIZE(size);
-		}
-		free += EXT3_XATTR_LEN(name_len);
-	}
-	if (i->value) {
-		if (free < EXT3_XATTR_LEN(name_len) +
-			   EXT3_XATTR_SIZE(i->value_len))
-			return -ENOSPC;
-	}
-
-	if (i->value && s->not_found) {
-		/* Insert the new name. */
-		size_t size = EXT3_XATTR_LEN(name_len);
-		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
-		memmove((void *)s->here + size, s->here, rest);
-		memset(s->here, 0, size);
-		s->here->e_name_index = i->name_index;
-		s->here->e_name_len = name_len;
-		memcpy(s->here->e_name, i->name, name_len);
-	} else {
-		if (!s->here->e_value_block && s->here->e_value_size) {
-			void *first_val = s->base + min_offs;
-			size_t offs = le16_to_cpu(s->here->e_value_offs);
-			void *val = s->base + offs;
-			size_t size = EXT3_XATTR_SIZE(
-				le32_to_cpu(s->here->e_value_size));
-
-			if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) {
-				/* The old and the new value have the same
-				   size. Just replace. */
-				s->here->e_value_size =
-					cpu_to_le32(i->value_len);
-				memset(val + size - EXT3_XATTR_PAD, 0,
-				       EXT3_XATTR_PAD); /* Clear pad bytes. */
-				memcpy(val, i->value, i->value_len);
-				return 0;
-			}
-
-			/* Remove the old value. */
-			memmove(first_val + size, first_val, val - first_val);
-			memset(first_val, 0, size);
-			s->here->e_value_size = 0;
-			s->here->e_value_offs = 0;
-			min_offs += size;
-
-			/* Adjust all value offsets. */
-			last = s->first;
-			while (!IS_LAST_ENTRY(last)) {
-				size_t o = le16_to_cpu(last->e_value_offs);
-				if (!last->e_value_block &&
-				    last->e_value_size && o < offs)
-					last->e_value_offs =
-						cpu_to_le16(o + size);
-				last = EXT3_XATTR_NEXT(last);
-			}
-		}
-		if (!i->value) {
-			/* Remove the old name. */
-			size_t size = EXT3_XATTR_LEN(name_len);
-			last = ENTRY((void *)last - size);
-			memmove(s->here, (void *)s->here + size,
-				(void *)last - (void *)s->here + sizeof(__u32));
-			memset(last, 0, size);
-		}
-	}
-
-	if (i->value) {
-		/* Insert the new value. */
-		s->here->e_value_size = cpu_to_le32(i->value_len);
-		if (i->value_len) {
-			size_t size = EXT3_XATTR_SIZE(i->value_len);
-			void *val = s->base + min_offs - size;
-			s->here->e_value_offs = cpu_to_le16(min_offs - size);
-			memset(val + size - EXT3_XATTR_PAD, 0,
-			       EXT3_XATTR_PAD); /* Clear the pad bytes. */
-			memcpy(val, i->value, i->value_len);
-		}
-	}
-	return 0;
-}
-
-struct ext3_xattr_block_find {
-	struct ext3_xattr_search s;
-	struct buffer_head *bh;
-};
-
-static int
-ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
-		      struct ext3_xattr_block_find *bs)
-{
-	struct super_block *sb = inode->i_sb;
-	int error;
-
-	ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
-		  i->name_index, i->name, i->value, (long)i->value_len);
-
-	if (EXT3_I(inode)->i_file_acl) {
-		/* The inode already has an extended attribute block. */
-		bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
-		error = -EIO;
-		if (!bs->bh)
-			goto cleanup;
-		ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
-			atomic_read(&(bs->bh->b_count)),
-			le32_to_cpu(BHDR(bs->bh)->h_refcount));
-		if (ext3_xattr_check_block(bs->bh)) {
-			ext3_error(sb, __func__,
-				"inode %lu: bad block "E3FSBLK, inode->i_ino,
-				EXT3_I(inode)->i_file_acl);
-			error = -EIO;
-			goto cleanup;
-		}
-		/* Find the named attribute. */
-		bs->s.base = BHDR(bs->bh);
-		bs->s.first = BFIRST(bs->bh);
-		bs->s.end = bs->bh->b_data + bs->bh->b_size;
-		bs->s.here = bs->s.first;
-		error = ext3_xattr_find_entry(&bs->s.here, i->name_index,
-					      i->name, bs->bh->b_size, 1);
-		if (error && error != -ENODATA)
-			goto cleanup;
-		bs->s.not_found = error;
-	}
-	error = 0;
-
-cleanup:
-	return error;
-}
-
-static int
-ext3_xattr_block_set(handle_t *handle, struct inode *inode,
-		     struct ext3_xattr_info *i,
-		     struct ext3_xattr_block_find *bs)
-{
-	struct super_block *sb = inode->i_sb;
-	struct buffer_head *new_bh = NULL;
-	struct ext3_xattr_search *s = &bs->s;
-	struct mb_cache_entry *ce = NULL;
-	int error = 0;
-
-#define header(x) ((struct ext3_xattr_header *)(x))
-
-	if (i->value && i->value_len > sb->s_blocksize)
-		return -ENOSPC;
-	if (s->base) {
-		ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
-					bs->bh->b_blocknr);
-		error = ext3_journal_get_write_access(handle, bs->bh);
-		if (error)
-			goto cleanup;
-		lock_buffer(bs->bh);
-
-		if (header(s->base)->h_refcount == cpu_to_le32(1)) {
-			if (ce) {
-				mb_cache_entry_free(ce);
-				ce = NULL;
-			}
-			ea_bdebug(bs->bh, "modifying in-place");
-			error = ext3_xattr_set_entry(i, s);
-			if (!error) {
-				if (!IS_LAST_ENTRY(s->first))
-					ext3_xattr_rehash(header(s->base),
-							  s->here);
-				ext3_xattr_cache_insert(bs->bh);
-			}
-			unlock_buffer(bs->bh);
-			if (error == -EIO)
-				goto bad_block;
-			if (!error)
-				error = ext3_journal_dirty_metadata(handle,
-								    bs->bh);
-			if (error)
-				goto cleanup;
-			goto inserted;
-		} else {
-			int offset = (char *)s->here - bs->bh->b_data;
-
-			unlock_buffer(bs->bh);
-			journal_release_buffer(handle, bs->bh);
-
-			if (ce) {
-				mb_cache_entry_release(ce);
-				ce = NULL;
-			}
-			ea_bdebug(bs->bh, "cloning");
-			s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
-			error = -ENOMEM;
-			if (s->base == NULL)
-				goto cleanup;
-			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
-			s->first = ENTRY(header(s->base)+1);
-			header(s->base)->h_refcount = cpu_to_le32(1);
-			s->here = ENTRY(s->base + offset);
-			s->end = s->base + bs->bh->b_size;
-		}
-	} else {
-		/* Allocate a buffer where we construct the new block. */
-		s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
-		/* assert(header == s->base) */
-		error = -ENOMEM;
-		if (s->base == NULL)
-			goto cleanup;
-		header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
-		header(s->base)->h_blocks = cpu_to_le32(1);
-		header(s->base)->h_refcount = cpu_to_le32(1);
-		s->first = ENTRY(header(s->base)+1);
-		s->here = ENTRY(header(s->base)+1);
-		s->end = s->base + sb->s_blocksize;
-	}
-
-	error = ext3_xattr_set_entry(i, s);
-	if (error == -EIO)
-		goto bad_block;
-	if (error)
-		goto cleanup;
-	if (!IS_LAST_ENTRY(s->first))
-		ext3_xattr_rehash(header(s->base), s->here);
-
-inserted:
-	if (!IS_LAST_ENTRY(s->first)) {
-		new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce);
-		if (new_bh) {
-			/* We found an identical block in the cache. */
-			if (new_bh == bs->bh)
-				ea_bdebug(new_bh, "keeping");
-			else {
-				/* The old block is released after updating
-				   the inode. */
-				error = dquot_alloc_block(inode, 1);
-				if (error)
-					goto cleanup;
-				error = ext3_journal_get_write_access(handle,
-								      new_bh);
-				if (error)
-					goto cleanup_dquot;
-				lock_buffer(new_bh);
-				le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
-				ea_bdebug(new_bh, "reusing; refcount now=%d",
-					le32_to_cpu(BHDR(new_bh)->h_refcount));
-				unlock_buffer(new_bh);
-				error = ext3_journal_dirty_metadata(handle,
-								    new_bh);
-				if (error)
-					goto cleanup_dquot;
-			}
-			mb_cache_entry_release(ce);
-			ce = NULL;
-		} else if (bs->bh && s->base == bs->bh->b_data) {
-			/* We were modifying this block in-place. */
-			ea_bdebug(bs->bh, "keeping this block");
-			new_bh = bs->bh;
-			get_bh(new_bh);
-		} else {
-			/* We need to allocate a new block */
-			ext3_fsblk_t goal = ext3_group_first_block_no(sb,
-						EXT3_I(inode)->i_block_group);
-			ext3_fsblk_t block;
-
-			/*
-			 * Protect us agaist concurrent allocations to the
-			 * same inode from ext3_..._writepage(). Reservation
-			 * code does not expect racing allocations.
-			 */
-			mutex_lock(&EXT3_I(inode)->truncate_mutex);
-			block = ext3_new_block(handle, inode, goal, &error);
-			mutex_unlock(&EXT3_I(inode)->truncate_mutex);
-			if (error)
-				goto cleanup;
-			ea_idebug(inode, "creating block %d", block);
-
-			new_bh = sb_getblk(sb, block);
-			if (unlikely(!new_bh)) {
-getblk_failed:
-				ext3_free_blocks(handle, inode, block, 1);
-				error = -ENOMEM;
-				goto cleanup;
-			}
-			lock_buffer(new_bh);
-			error = ext3_journal_get_create_access(handle, new_bh);
-			if (error) {
-				unlock_buffer(new_bh);
-				goto getblk_failed;
-			}
-			memcpy(new_bh->b_data, s->base, new_bh->b_size);
-			set_buffer_uptodate(new_bh);
-			unlock_buffer(new_bh);
-			ext3_xattr_cache_insert(new_bh);
-			error = ext3_journal_dirty_metadata(handle, new_bh);
-			if (error)
-				goto cleanup;
-		}
-	}
-
-	/* Update the inode. */
-	EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
-
-	/* Drop the previous xattr block. */
-	if (bs->bh && bs->bh != new_bh)
-		ext3_xattr_release_block(handle, inode, bs->bh);
-	error = 0;
-
-cleanup:
-	if (ce)
-		mb_cache_entry_release(ce);
-	brelse(new_bh);
-	if (!(bs->bh && s->base == bs->bh->b_data))
-		kfree(s->base);
-
-	return error;
-
-cleanup_dquot:
-	dquot_free_block(inode, 1);
-	goto cleanup;
-
-bad_block:
-	ext3_error(inode->i_sb, __func__,
-		   "inode %lu: bad block "E3FSBLK, inode->i_ino,
-		   EXT3_I(inode)->i_file_acl);
-	goto cleanup;
-
-#undef header
-}
-
-struct ext3_xattr_ibody_find {
-	struct ext3_xattr_search s;
-	struct ext3_iloc iloc;
-};
-
-static int
-ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
-		      struct ext3_xattr_ibody_find *is)
-{
-	struct ext3_xattr_ibody_header *header;
-	struct ext3_inode *raw_inode;
-	int error;
-
-	if (EXT3_I(inode)->i_extra_isize == 0)
-		return 0;
-	raw_inode = ext3_raw_inode(&is->iloc);
-	header = IHDR(inode, raw_inode);
-	is->s.base = is->s.first = IFIRST(header);
-	is->s.here = is->s.first;
-	is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
-	if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
-		error = ext3_xattr_check_names(IFIRST(header), is->s.end);
-		if (error)
-			return error;
-		/* Find the named attribute. */
-		error = ext3_xattr_find_entry(&is->s.here, i->name_index,
-					      i->name, is->s.end -
-					      (void *)is->s.base, 0);
-		if (error && error != -ENODATA)
-			return error;
-		is->s.not_found = error;
-	}
-	return 0;
-}
-
-static int
-ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
-		     struct ext3_xattr_info *i,
-		     struct ext3_xattr_ibody_find *is)
-{
-	struct ext3_xattr_ibody_header *header;
-	struct ext3_xattr_search *s = &is->s;
-	int error;
-
-	if (EXT3_I(inode)->i_extra_isize == 0)
-		return -ENOSPC;
-	error = ext3_xattr_set_entry(i, s);
-	if (error)
-		return error;
-	header = IHDR(inode, ext3_raw_inode(&is->iloc));
-	if (!IS_LAST_ENTRY(s->first)) {
-		header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
-		ext3_set_inode_state(inode, EXT3_STATE_XATTR);
-	} else {
-		header->h_magic = cpu_to_le32(0);
-		ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
-	}
-	return 0;
-}
-
-/*
- * ext3_xattr_set_handle()
- *
- * Create, replace or remove an extended attribute for this inode.  Value
- * is NULL to remove an existing extended attribute, and non-NULL to
- * either replace an existing extended attribute, or create a new extended
- * attribute. The flags XATTR_REPLACE and XATTR_CREATE
- * specify that an extended attribute must exist and must not exist
- * previous to the call, respectively.
- *
- * Returns 0, or a negative error number on failure.
- */
-int
-ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
-		      const char *name, const void *value, size_t value_len,
-		      int flags)
-{
-	struct ext3_xattr_info i = {
-		.name_index = name_index,
-		.name = name,
-		.value = value,
-		.value_len = value_len,
-
-	};
-	struct ext3_xattr_ibody_find is = {
-		.s = { .not_found = -ENODATA, },
-	};
-	struct ext3_xattr_block_find bs = {
-		.s = { .not_found = -ENODATA, },
-	};
-	int error;
-
-	if (!name)
-		return -EINVAL;
-	if (strlen(name) > 255)
-		return -ERANGE;
-	down_write(&EXT3_I(inode)->xattr_sem);
-	error = ext3_get_inode_loc(inode, &is.iloc);
-	if (error)
-		goto cleanup;
-
-	error = ext3_journal_get_write_access(handle, is.iloc.bh);
-	if (error)
-		goto cleanup;
-
-	if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
-		struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
-		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
-		ext3_clear_inode_state(inode, EXT3_STATE_NEW);
-	}
-
-	error = ext3_xattr_ibody_find(inode, &i, &is);
-	if (error)
-		goto cleanup;
-	if (is.s.not_found)
-		error = ext3_xattr_block_find(inode, &i, &bs);
-	if (error)
-		goto cleanup;
-	if (is.s.not_found && bs.s.not_found) {
-		error = -ENODATA;
-		if (flags & XATTR_REPLACE)
-			goto cleanup;
-		error = 0;
-		if (!value)
-			goto cleanup;
-	} else {
-		error = -EEXIST;
-		if (flags & XATTR_CREATE)
-			goto cleanup;
-	}
-	if (!value) {
-		if (!is.s.not_found)
-			error = ext3_xattr_ibody_set(handle, inode, &i, &is);
-		else if (!bs.s.not_found)
-			error = ext3_xattr_block_set(handle, inode, &i, &bs);
-	} else {
-		error = ext3_xattr_ibody_set(handle, inode, &i, &is);
-		if (!error && !bs.s.not_found) {
-			i.value = NULL;
-			error = ext3_xattr_block_set(handle, inode, &i, &bs);
-		} else if (error == -ENOSPC) {
-			if (EXT3_I(inode)->i_file_acl && !bs.s.base) {
-				error = ext3_xattr_block_find(inode, &i, &bs);
-				if (error)
-					goto cleanup;
-			}
-			error = ext3_xattr_block_set(handle, inode, &i, &bs);
-			if (error)
-				goto cleanup;
-			if (!is.s.not_found) {
-				i.value = NULL;
-				error = ext3_xattr_ibody_set(handle, inode, &i,
-							     &is);
-			}
-		}
-	}
-	if (!error) {
-		ext3_xattr_update_super_block(handle, inode->i_sb);
-		inode->i_ctime = CURRENT_TIME_SEC;
-		error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
-		/*
-		 * The bh is consumed by ext3_mark_iloc_dirty, even with
-		 * error != 0.
-		 */
-		is.iloc.bh = NULL;
-		if (IS_SYNC(inode))
-			handle->h_sync = 1;
-	}
-
-cleanup:
-	brelse(is.iloc.bh);
-	brelse(bs.bh);
-	up_write(&EXT3_I(inode)->xattr_sem);
-	return error;
-}
-
-/*
- * ext3_xattr_set()
- *
- * Like ext3_xattr_set_handle, but start from an inode. This extended
- * attribute modification is a filesystem transaction by itself.
- *
- * Returns 0, or a negative error number on failure.
- */
-int
-ext3_xattr_set(struct inode *inode, int name_index, const char *name,
-	       const void *value, size_t value_len, int flags)
-{
-	handle_t *handle;
-	int error, retries = 0;
-
-retry:
-	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle)) {
-		error = PTR_ERR(handle);
-	} else {
-		int error2;
-
-		error = ext3_xattr_set_handle(handle, inode, name_index, name,
-					      value, value_len, flags);
-		error2 = ext3_journal_stop(handle);
-		if (error == -ENOSPC &&
-		    ext3_should_retry_alloc(inode->i_sb, &retries))
-			goto retry;
-		if (error == 0)
-			error = error2;
-	}
-
-	return error;
-}
-
-/*
- * ext3_xattr_delete_inode()
- *
- * Free extended attribute resources associated with this inode. This
- * is called immediately before an inode is freed. We have exclusive
- * access to the inode.
- */
-void
-ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
-{
-	struct buffer_head *bh = NULL;
-
-	if (!EXT3_I(inode)->i_file_acl)
-		goto cleanup;
-	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
-	if (!bh) {
-		ext3_error(inode->i_sb, __func__,
-			"inode %lu: block "E3FSBLK" read error", inode->i_ino,
-			EXT3_I(inode)->i_file_acl);
-		goto cleanup;
-	}
-	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		ext3_error(inode->i_sb, __func__,
-			"inode %lu: bad block "E3FSBLK, inode->i_ino,
-			EXT3_I(inode)->i_file_acl);
-		goto cleanup;
-	}
-	ext3_xattr_release_block(handle, inode, bh);
-	EXT3_I(inode)->i_file_acl = 0;
-
-cleanup:
-	brelse(bh);
-}
-
-/*
- * ext3_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext3_xattr_put_super(struct super_block *sb)
-{
-	mb_cache_shrink(sb->s_bdev);
-}
-
-/*
- * ext3_xattr_cache_insert()
- *
- * Create a new entry in the extended attribute cache, and insert
- * it unless such an entry is already in the cache.
- *
- * Returns 0, or a negative error number on failure.
- */
-static void
-ext3_xattr_cache_insert(struct buffer_head *bh)
-{
-	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
-	struct mb_cache_entry *ce;
-	int error;
-
-	ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS);
-	if (!ce) {
-		ea_bdebug(bh, "out of memory");
-		return;
-	}
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
-	if (error) {
-		mb_cache_entry_free(ce);
-		if (error == -EBUSY) {
-			ea_bdebug(bh, "already in cache");
-			error = 0;
-		}
-	} else {
-		ea_bdebug(bh, "inserting [%x]", (int)hash);
-		mb_cache_entry_release(ce);
-	}
-}
-
-/*
- * ext3_xattr_cmp()
- *
- * Compare two extended attribute blocks for equality.
- *
- * Returns 0 if the blocks are equal, 1 if they differ, and
- * a negative error number on errors.
- */
-static int
-ext3_xattr_cmp(struct ext3_xattr_header *header1,
-	       struct ext3_xattr_header *header2)
-{
-	struct ext3_xattr_entry *entry1, *entry2;
-
-	entry1 = ENTRY(header1+1);
-	entry2 = ENTRY(header2+1);
-	while (!IS_LAST_ENTRY(entry1)) {
-		if (IS_LAST_ENTRY(entry2))
-			return 1;
-		if (entry1->e_hash != entry2->e_hash ||
-		    entry1->e_name_index != entry2->e_name_index ||
-		    entry1->e_name_len != entry2->e_name_len ||
-		    entry1->e_value_size != entry2->e_value_size ||
-		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
-			return 1;
-		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
-			return -EIO;
-		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
-			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
-			   le32_to_cpu(entry1->e_value_size)))
-			return 1;
-
-		entry1 = EXT3_XATTR_NEXT(entry1);
-		entry2 = EXT3_XATTR_NEXT(entry2);
-	}
-	if (!IS_LAST_ENTRY(entry2))
-		return 1;
-	return 0;
-}
-
-/*
- * ext3_xattr_cache_find()
- *
- * Find an identical extended attribute block.
- *
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
- */
-static struct buffer_head *
-ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
-		      struct mb_cache_entry **pce)
-{
-	__u32 hash = le32_to_cpu(header->h_hash);
-	struct mb_cache_entry *ce;
-
-	if (!header->h_hash)
-		return NULL;  /* never share */
-	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
-	ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
-				       hash);
-	while (ce) {
-		struct buffer_head *bh;
-
-		if (IS_ERR(ce)) {
-			if (PTR_ERR(ce) == -EAGAIN)
-				goto again;
-			break;
-		}
-		bh = sb_bread(inode->i_sb, ce->e_block);
-		if (!bh) {
-			ext3_error(inode->i_sb, __func__,
-				"inode %lu: block %lu read error",
-				inode->i_ino, (unsigned long) ce->e_block);
-		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
-				EXT3_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %lu refcount %d>=%d",
-				  (unsigned long) ce->e_block,
-				  le32_to_cpu(BHDR(bh)->h_refcount),
-					  EXT3_XATTR_REFCOUNT_MAX);
-		} else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) {
-			*pce = ce;
-			return bh;
-		}
-		brelse(bh);
-		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
-	}
-	return NULL;
-}
-
-#define NAME_HASH_SHIFT 5
-#define VALUE_HASH_SHIFT 16
-
-/*
- * ext3_xattr_hash_entry()
- *
- * Compute the hash of an extended attribute.
- */
-static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
-					 struct ext3_xattr_entry *entry)
-{
-	__u32 hash = 0;
-	char *name = entry->e_name;
-	int n;
-
-	for (n=0; n < entry->e_name_len; n++) {
-		hash = (hash << NAME_HASH_SHIFT) ^
-		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
-		       *name++;
-	}
-
-	if (entry->e_value_block == 0 && entry->e_value_size != 0) {
-		__le32 *value = (__le32 *)((char *)header +
-			le16_to_cpu(entry->e_value_offs));
-		for (n = (le32_to_cpu(entry->e_value_size) +
-		     EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
-			hash = (hash << VALUE_HASH_SHIFT) ^
-			       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
-			       le32_to_cpu(*value++);
-		}
-	}
-	entry->e_hash = cpu_to_le32(hash);
-}
-
-#undef NAME_HASH_SHIFT
-#undef VALUE_HASH_SHIFT
-
-#define BLOCK_HASH_SHIFT 16
-
-/*
- * ext3_xattr_rehash()
- *
- * Re-compute the extended attribute hash value after an entry has changed.
- */
-static void ext3_xattr_rehash(struct ext3_xattr_header *header,
-			      struct ext3_xattr_entry *entry)
-{
-	struct ext3_xattr_entry *here;
-	__u32 hash = 0;
-
-	ext3_xattr_hash_entry(header, entry);
-	here = ENTRY(header+1);
-	while (!IS_LAST_ENTRY(here)) {
-		if (!here->e_hash) {
-			/* Block is not shared if an entry's hash value == 0 */
-			hash = 0;
-			break;
-		}
-		hash = (hash << BLOCK_HASH_SHIFT) ^
-		       (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
-		       le32_to_cpu(here->e_hash);
-		here = EXT3_XATTR_NEXT(here);
-	}
-	header->h_hash = cpu_to_le32(hash);
-}
-
-#undef BLOCK_HASH_SHIFT
-
-int __init
-init_ext3_xattr(void)
-{
-	ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
-	if (!ext3_xattr_cache)
-		return -ENOMEM;
-	return 0;
-}
-
-void
-exit_ext3_xattr(void)
-{
-	if (ext3_xattr_cache)
-		mb_cache_destroy(ext3_xattr_cache);
-	ext3_xattr_cache = NULL;
-}
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
deleted file mode 100644
index 32e93ebf8031..000000000000
--- a/fs/ext3/xattr.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
-  File: fs/ext3/xattr.h
-
-  On-disk format of extended attributes for the ext3 filesystem.
-
-  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-*/
-
-#include <linux/xattr.h>
-
-/* Magic value in attribute blocks */
-#define EXT3_XATTR_MAGIC		0xEA020000
-
-/* Maximum number of references to one attribute block */
-#define EXT3_XATTR_REFCOUNT_MAX		1024
-
-/* Name indexes */
-#define EXT3_XATTR_INDEX_USER			1
-#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS	2
-#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT	3
-#define EXT3_XATTR_INDEX_TRUSTED		4
-#define	EXT3_XATTR_INDEX_LUSTRE			5
-#define EXT3_XATTR_INDEX_SECURITY	        6
-
-struct ext3_xattr_header {
-	__le32	h_magic;	/* magic number for identification */
-	__le32	h_refcount;	/* reference count */
-	__le32	h_blocks;	/* number of disk blocks used */
-	__le32	h_hash;		/* hash value of all attributes */
-	__u32	h_reserved[4];	/* zero right now */
-};
-
-struct ext3_xattr_ibody_header {
-	__le32	h_magic;	/* magic number for identification */
-};
-
-struct ext3_xattr_entry {
-	__u8	e_name_len;	/* length of name */
-	__u8	e_name_index;	/* attribute name index */
-	__le16	e_value_offs;	/* offset in disk block of value */
-	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
-	__le32	e_value_size;	/* size of attribute value */
-	__le32	e_hash;		/* hash value of name and value */
-	char	e_name[0];	/* attribute name */
-};
-
-#define EXT3_XATTR_PAD_BITS		2
-#define EXT3_XATTR_PAD		(1<<EXT3_XATTR_PAD_BITS)
-#define EXT3_XATTR_ROUND		(EXT3_XATTR_PAD-1)
-#define EXT3_XATTR_LEN(name_len) \
-	(((name_len) + EXT3_XATTR_ROUND + \
-	sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
-#define EXT3_XATTR_NEXT(entry) \
-	( (struct ext3_xattr_entry *)( \
-	  (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
-#define EXT3_XATTR_SIZE(size) \
-	(((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
-
-# ifdef CONFIG_EXT3_FS_XATTR
-
-extern const struct xattr_handler ext3_xattr_user_handler;
-extern const struct xattr_handler ext3_xattr_trusted_handler;
-extern const struct xattr_handler ext3_xattr_security_handler;
-
-extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
-
-extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
-extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
-extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
-
-extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
-extern void ext3_xattr_put_super(struct super_block *);
-
-extern int init_ext3_xattr(void);
-extern void exit_ext3_xattr(void);
-
-extern const struct xattr_handler *ext3_xattr_handlers[];
-
-# else  /* CONFIG_EXT3_FS_XATTR */
-
-static inline int
-ext3_xattr_get(struct inode *inode, int name_index, const char *name,
-	       void *buffer, size_t size, int flags)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-ext3_xattr_set(struct inode *inode, int name_index, const char *name,
-	       const void *value, size_t size, int flags)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
-	       const char *name, const void *value, size_t size, int flags)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void
-ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
-{
-}
-
-static inline void
-ext3_xattr_put_super(struct super_block *sb)
-{
-}
-
-static inline int
-init_ext3_xattr(void)
-{
-	return 0;
-}
-
-static inline void
-exit_ext3_xattr(void)
-{
-}
-
-#define ext3_xattr_handlers	NULL
-
-# endif  /* CONFIG_EXT3_FS_XATTR */
-
-#ifdef CONFIG_EXT3_FS_SECURITY
-extern int ext3_init_security(handle_t *handle, struct inode *inode,
-			      struct inode *dir, const struct qstr *qstr);
-#else
-static inline int ext3_init_security(handle_t *handle, struct inode *inode,
-				     struct inode *dir, const struct qstr *qstr)
-{
-	return 0;
-}
-#endif
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
deleted file mode 100644
index c9506d5e3b13..000000000000
--- a/fs/ext3/xattr_security.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * linux/fs/ext3/xattr_security.c
- * Handler for storing security labels as extended attributes.
- */
-
-#include <linux/security.h>
-#include "ext3.h"
-#include "xattr.h"
-
-static size_t
-ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
-			 const char *name, size_t name_len, int type)
-{
-	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
-static int
-ext3_xattr_security_get(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
-			      name, buffer, size);
-}
-
-static int
-ext3_xattr_security_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
-			      name, value, size, flags);
-}
-
-static int ext3_initxattrs(struct inode *inode,
-			   const struct xattr *xattr_array,
-			   void *fs_info)
-{
-	const struct xattr *xattr;
-	handle_t *handle = fs_info;
-	int err = 0;
-
-	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
-		err = ext3_xattr_set_handle(handle, inode,
-					    EXT3_XATTR_INDEX_SECURITY,
-					    xattr->name, xattr->value,
-					    xattr->value_len, 0);
-		if (err < 0)
-			break;
-	}
-	return err;
-}
-
-int
-ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
-		   const struct qstr *qstr)
-{
-	return security_inode_init_security(inode, dir, qstr,
-					    &ext3_initxattrs, handle);
-}
-
-const struct xattr_handler ext3_xattr_security_handler = {
-	.prefix	= XATTR_SECURITY_PREFIX,
-	.list	= ext3_xattr_security_list,
-	.get	= ext3_xattr_security_get,
-	.set	= ext3_xattr_security_set,
-};
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
deleted file mode 100644
index 206cc66dc285..000000000000
--- a/fs/ext3/xattr_trusted.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * linux/fs/ext3/xattr_trusted.c
- * Handler for trusted extended attributes.
- *
- * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
- */
-
-#include "ext3.h"
-#include "xattr.h"
-
-static size_t
-ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t name_len, int type)
-{
-	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
-static int
-ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
-		       void *buffer, size_t size, int type)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED,
-			      name, buffer, size);
-}
-
-static int
-ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name,
-			      value, size, flags);
-}
-
-const struct xattr_handler ext3_xattr_trusted_handler = {
-	.prefix	= XATTR_TRUSTED_PREFIX,
-	.list	= ext3_xattr_trusted_list,
-	.get	= ext3_xattr_trusted_get,
-	.set	= ext3_xattr_trusted_set,
-};
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
deleted file mode 100644
index 021508ad1616..000000000000
--- a/fs/ext3/xattr_user.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * linux/fs/ext3/xattr_user.c
- * Handler for extended user attributes.
- *
- * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
- */
-
-#include "ext3.h"
-#include "xattr.h"
-
-static size_t
-ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t name_len, int type)
-{
-	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (!test_opt(dentry->d_sb, XATTR_USER))
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_USER_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
-static int
-ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
-		size_t size, int type)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	if (!test_opt(dentry->d_sb, XATTR_USER))
-		return -EOPNOTSUPP;
-	return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER,
-			      name, buffer, size);
-}
-
-static int
-ext3_xattr_user_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	if (!test_opt(dentry->d_sb, XATTR_USER))
-		return -EOPNOTSUPP;
-	return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER,
-			      name, value, size, flags);
-}
-
-const struct xattr_handler ext3_xattr_user_handler = {
-	.prefix	= XATTR_USER_PREFIX,
-	.list	= ext3_xattr_user_list,
-	.get	= ext3_xattr_user_get,
-	.set	= ext3_xattr_user_set,
-};
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index bf8bc8aba471..219a190ccae9 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,5 +1,38 @@
+# Ext3 configs are here for backward compatibility with old configs which may
+# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
+# kernels after the removal of ext3 driver.
+config EXT3_FS
+	tristate "The Extended 3 (ext3) filesystem"
+	# These must match EXT4_FS selects...
+	select EXT4_FS
+	select JBD2
+	select CRC16
+	select CRYPTO
+	select CRYPTO_CRC32C
+	help
+	  This config option is here only for backward compatibility. ext3
+	  filesystem is now handled by the ext4 driver.
+
+config EXT3_FS_POSIX_ACL
+	bool "Ext3 POSIX Access Control Lists"
+	depends on EXT3_FS
+	select EXT4_FS_POSIX_ACL
+	select FS_POSIX_ACL
+	help
+	  This config option is here only for backward compatibility. ext3
+	  filesystem is now handled by the ext4 driver.
+
+config EXT3_FS_SECURITY
+	bool "Ext3 Security Labels"
+	depends on EXT3_FS
+	select EXT4_FS_SECURITY
+	help
+	  This config option is here only for backward compatibility. ext3
+	  filesystem is now handled by the ext4 driver.
+
 config EXT4_FS
 	tristate "The Extended 4 (ext4) filesystem"
+	# Please update EXT3_FS selects when changing these
 	select JBD2
 	select CRC16
 	select CRYPTO
@@ -28,14 +61,14 @@ config EXT4_FS
 
 	  If unsure, say N.
 
-config EXT4_USE_FOR_EXT23
+config EXT4_USE_FOR_EXT2
 	bool "Use ext4 for ext2/ext3 file systems"
 	depends on EXT4_FS
-	depends on EXT3_FS=n || EXT2_FS=n
+	depends on EXT2_FS=n
 	default y
 	help
-	  Allow the ext4 file system driver code to be used for ext2 or
-	  ext3 file system mounts.  This allows users to reduce their
+	  Allow the ext4 file system driver code to be used for ext2
+	  file system mounts.  This allows users to reduce their
 	  compiled kernel size by using one file system driver for
 	  ext2, ext3, and ext4 file systems.
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 58987b5c514b..06b4b14e8aa0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -84,7 +84,7 @@ static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
 static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
 
-#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
@@ -100,7 +100,6 @@ MODULE_ALIAS("ext2");
 #endif
 
 
-#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext3",
@@ -111,9 +110,6 @@ static struct file_system_type ext3_fs_type = {
 MODULE_ALIAS_FS("ext3");
 MODULE_ALIAS("ext3");
 #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
-#else
-#define IS_EXT3_SB(sb) (0)
-#endif
 
 static int ext4_verify_csum_type(struct super_block *sb,
 				 struct ext4_super_block *es)
@@ -5500,7 +5496,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
 }
 
-#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
 static inline void register_as_ext2(void)
 {
 	int err = register_filesystem(&ext2_fs_type);
@@ -5530,7 +5526,6 @@ static inline void unregister_as_ext2(void) { }
 static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
-#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static inline void register_as_ext3(void)
 {
 	int err = register_filesystem(&ext3_fs_type);
@@ -5556,11 +5551,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
 		return 0;
 	return 1;
 }
-#else
-static inline void register_as_ext3(void) { }
-static inline void unregister_as_ext3(void) { }
-static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
-#endif
 
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
deleted file mode 100644
index 4e28beeed157..000000000000
--- a/fs/jbd/Kconfig
+++ /dev/null
@@ -1,30 +0,0 @@
-config JBD
-	tristate
-	help
-	  This is a generic journalling layer for block devices.  It is
-	  currently used by the ext3 file system, but it could also be
-	  used to add journal support to other file systems or block
-	  devices such as RAID or LVM.
-
-	  If you are using the ext3 file system, you need to say Y here.
-	  If you are not using ext3 then you will probably want to say N.
-
-	  To compile this device as a module, choose M here: the module will be
-	  called jbd.  If you are compiling ext3 into the kernel, you
-	  cannot compile this code as a module.
-
-config JBD_DEBUG
-	bool "JBD (ext3) debugging support"
-	depends on JBD && DEBUG_FS
-	help
-	  If you are using the ext3 journaled file system (or potentially any
-	  other file system/device using JBD), this option allows you to
-	  enable debugging output while the system is running, in order to
-	  help track down any problems you are having.  By default the
-	  debugging output will be turned off.
-
-	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
-	  number between 1 and 5, the higher the number, the more debugging
-	  output is generated.  To turn debugging off again, do
-	  "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile
deleted file mode 100644
index 54aca4868a36..000000000000
--- a/fs/jbd/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux journaling routines.
-#
-
-obj-$(CONFIG_JBD) += jbd.o
-
-jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
deleted file mode 100644
index 08c03044abdd..000000000000
--- a/fs/jbd/checkpoint.c
+++ /dev/null
@@ -1,782 +0,0 @@
-/*
- * linux/fs/jbd/checkpoint.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
- *
- * Copyright 1999 Red Hat Software --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Checkpoint routines for the generic filesystem journaling code.
- * Part of the ext2fs journaling system.
- *
- * Checkpointing is the process of ensuring that a section of the log is
- * committed fully to disk, so that that portion of the log can be
- * reused.
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-#include <trace/events/jbd.h>
-
-/*
- * Unlink a buffer from a transaction checkpoint list.
- *
- * Called with j_list_lock held.
- */
-static inline void __buffer_unlink_first(struct journal_head *jh)
-{
-	transaction_t *transaction = jh->b_cp_transaction;
-
-	jh->b_cpnext->b_cpprev = jh->b_cpprev;
-	jh->b_cpprev->b_cpnext = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh) {
-		transaction->t_checkpoint_list = jh->b_cpnext;
-		if (transaction->t_checkpoint_list == jh)
-			transaction->t_checkpoint_list = NULL;
-	}
-}
-
-/*
- * Unlink a buffer from a transaction checkpoint(io) list.
- *
- * Called with j_list_lock held.
- */
-static inline void __buffer_unlink(struct journal_head *jh)
-{
-	transaction_t *transaction = jh->b_cp_transaction;
-
-	__buffer_unlink_first(jh);
-	if (transaction->t_checkpoint_io_list == jh) {
-		transaction->t_checkpoint_io_list = jh->b_cpnext;
-		if (transaction->t_checkpoint_io_list == jh)
-			transaction->t_checkpoint_io_list = NULL;
-	}
-}
-
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
-	transaction_t *transaction = jh->b_cp_transaction;
-
-	__buffer_unlink_first(jh);
-
-	if (!transaction->t_checkpoint_io_list) {
-		jh->b_cpnext = jh->b_cpprev = jh;
-	} else {
-		jh->b_cpnext = transaction->t_checkpoint_io_list;
-		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
-		jh->b_cpprev->b_cpnext = jh;
-		jh->b_cpnext->b_cpprev = jh;
-	}
-	transaction->t_checkpoint_io_list = jh;
-}
-
-/*
- * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it and 2 if we also released the
- * whole transaction.
- *
- * Requires j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
- */
-static int __try_to_free_cp_buf(struct journal_head *jh)
-{
-	int ret = 0;
-	struct buffer_head *bh = jh2bh(jh);
-
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
-	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
-		/*
-		 * Get our reference so that bh cannot be freed before
-		 * we unlock it
-		 */
-		get_bh(bh);
-		JBUFFER_TRACE(jh, "remove from checkpoint list");
-		ret = __journal_remove_checkpoint(jh) + 1;
-		jbd_unlock_bh_state(bh);
-		BUFFER_TRACE(bh, "release");
-		__brelse(bh);
-	} else {
-		jbd_unlock_bh_state(bh);
-	}
-	return ret;
-}
-
-/*
- * __log_wait_for_space: wait until there is space in the journal.
- *
- * Called under j-state_lock *only*.  It will be unlocked if we have to wait
- * for a checkpoint to free up some space in the log.
- */
-void __log_wait_for_space(journal_t *journal)
-{
-	int nblocks, space_left;
-	assert_spin_locked(&journal->j_state_lock);
-
-	nblocks = jbd_space_needed(journal);
-	while (__log_space_left(journal) < nblocks) {
-		if (journal->j_flags & JFS_ABORT)
-			return;
-		spin_unlock(&journal->j_state_lock);
-		mutex_lock(&journal->j_checkpoint_mutex);
-
-		/*
-		 * Test again, another process may have checkpointed while we
-		 * were waiting for the checkpoint lock. If there are no
-		 * transactions ready to be checkpointed, try to recover
-		 * journal space by calling cleanup_journal_tail(), and if
-		 * that doesn't work, by waiting for the currently committing
-		 * transaction to complete.  If there is absolutely no way
-		 * to make progress, this is either a BUG or corrupted
-		 * filesystem, so abort the journal and leave a stack
-		 * trace for forensic evidence.
-		 */
-		spin_lock(&journal->j_state_lock);
-		spin_lock(&journal->j_list_lock);
-		nblocks = jbd_space_needed(journal);
-		space_left = __log_space_left(journal);
-		if (space_left < nblocks) {
-			int chkpt = journal->j_checkpoint_transactions != NULL;
-			tid_t tid = 0;
-
-			if (journal->j_committing_transaction)
-				tid = journal->j_committing_transaction->t_tid;
-			spin_unlock(&journal->j_list_lock);
-			spin_unlock(&journal->j_state_lock);
-			if (chkpt) {
-				log_do_checkpoint(journal);
-			} else if (cleanup_journal_tail(journal) == 0) {
-				/* We were able to recover space; yay! */
-				;
-			} else if (tid) {
-				log_wait_commit(journal, tid);
-			} else {
-				printk(KERN_ERR "%s: needed %d blocks and "
-				       "only had %d space available\n",
-				       __func__, nblocks, space_left);
-				printk(KERN_ERR "%s: no way to get more "
-				       "journal space\n", __func__);
-				WARN_ON(1);
-				journal_abort(journal, 0);
-			}
-			spin_lock(&journal->j_state_lock);
-		} else {
-			spin_unlock(&journal->j_list_lock);
-		}
-		mutex_unlock(&journal->j_checkpoint_mutex);
-	}
-}
-
-/*
- * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
- * The caller must restart a list walk.  Wait for someone else to run
- * jbd_unlock_bh_state().
- */
-static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
-	__releases(journal->j_list_lock)
-{
-	get_bh(bh);
-	spin_unlock(&journal->j_list_lock);
-	jbd_lock_bh_state(bh);
-	jbd_unlock_bh_state(bh);
-	put_bh(bh);
-}
-
-/*
- * Clean up transaction's list of buffers submitted for io.
- * We wait for any pending IO to complete and remove any clean
- * buffers. Note that we take the buffers in the opposite ordering
- * from the one in which they were submitted for IO.
- *
- * Return 0 on success, and return <0 if some buffers have failed
- * to be written out.
- *
- * Called with j_list_lock held.
- */
-static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
-{
-	struct journal_head *jh;
-	struct buffer_head *bh;
-	tid_t this_tid;
-	int released = 0;
-	int ret = 0;
-
-	this_tid = transaction->t_tid;
-restart:
-	/* Did somebody clean up the transaction in the meanwhile? */
-	if (journal->j_checkpoint_transactions != transaction ||
-			transaction->t_tid != this_tid)
-		return ret;
-	while (!released && transaction->t_checkpoint_io_list) {
-		jh = transaction->t_checkpoint_io_list;
-		bh = jh2bh(jh);
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
-			wait_on_buffer(bh);
-			/* the journal_head may have gone by now */
-			BUFFER_TRACE(bh, "brelse");
-			__brelse(bh);
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
-		if (unlikely(buffer_write_io_error(bh)))
-			ret = -EIO;
-
-		/*
-		 * Now in whatever state the buffer currently is, we know that
-		 * it has been written out and so we can drop it from the list
-		 */
-		released = __journal_remove_checkpoint(jh);
-		jbd_unlock_bh_state(bh);
-		__brelse(bh);
-	}
-
-	return ret;
-}
-
-#define NR_BATCH	64
-
-static void
-__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
-{
-	int i;
-	struct blk_plug plug;
-
-	blk_start_plug(&plug);
-	for (i = 0; i < *batch_count; i++)
-		write_dirty_buffer(bhs[i], WRITE_SYNC);
-	blk_finish_plug(&plug);
-
-	for (i = 0; i < *batch_count; i++) {
-		struct buffer_head *bh = bhs[i];
-		clear_buffer_jwrite(bh);
-		BUFFER_TRACE(bh, "brelse");
-		__brelse(bh);
-	}
-	*batch_count = 0;
-}
-
-/*
- * Try to flush one buffer from the checkpoint list to disk.
- *
- * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.  Return <0 if the buffer has failed to
- * be written out.
- *
- * Called with j_list_lock held and drops it if 1 is returned
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
- */
-static int __process_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count)
-{
-	struct buffer_head *bh = jh2bh(jh);
-	int ret = 0;
-
-	if (buffer_locked(bh)) {
-		get_bh(bh);
-		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
-		wait_on_buffer(bh);
-		/* the journal_head may have gone by now */
-		BUFFER_TRACE(bh, "brelse");
-		__brelse(bh);
-		ret = 1;
-	} else if (jh->b_transaction != NULL) {
-		transaction_t *t = jh->b_transaction;
-		tid_t tid = t->t_tid;
-
-		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
-		log_start_commit(journal, tid);
-		log_wait_commit(journal, tid);
-		ret = 1;
-	} else if (!buffer_dirty(bh)) {
-		ret = 1;
-		if (unlikely(buffer_write_io_error(bh)))
-			ret = -EIO;
-		get_bh(bh);
-		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
-		BUFFER_TRACE(bh, "remove from checkpoint");
-		__journal_remove_checkpoint(jh);
-		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
-		__brelse(bh);
-	} else {
-		/*
-		 * Important: we are about to write the buffer, and
-		 * possibly block, while still holding the journal lock.
-		 * We cannot afford to let the transaction logic start
-		 * messing around with this buffer before we write it to
-		 * disk, as that would break recoverability.
-		 */
-		BUFFER_TRACE(bh, "queue");
-		get_bh(bh);
-		J_ASSERT_BH(bh, !buffer_jwrite(bh));
-		set_buffer_jwrite(bh);
-		bhs[*batch_count] = bh;
-		__buffer_relink_io(jh);
-		jbd_unlock_bh_state(bh);
-		(*batch_count)++;
-		if (*batch_count == NR_BATCH) {
-			spin_unlock(&journal->j_list_lock);
-			__flush_batch(journal, bhs, batch_count);
-			ret = 1;
-		}
-	}
-	return ret;
-}
-
-/*
- * Perform an actual checkpoint. We take the first transaction on the
- * list of transactions to be checkpointed and send all its buffers
- * to disk. We submit larger chunks of data at once.
- *
- * The journal should be locked before calling this function.
- * Called with j_checkpoint_mutex held.
- */
-int log_do_checkpoint(journal_t *journal)
-{
-	transaction_t *transaction;
-	tid_t this_tid;
-	int result;
-
-	jbd_debug(1, "Start checkpoint\n");
-
-	/*
-	 * First thing: if there are any transactions in the log which
-	 * don't need checkpointing, just eliminate them from the
-	 * journal straight away.
-	 */
-	result = cleanup_journal_tail(journal);
-	trace_jbd_checkpoint(journal, result);
-	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
-	if (result <= 0)
-		return result;
-
-	/*
-	 * OK, we need to start writing disk blocks.  Take one transaction
-	 * and write it.
-	 */
-	result = 0;
-	spin_lock(&journal->j_list_lock);
-	if (!journal->j_checkpoint_transactions)
-		goto out;
-	transaction = journal->j_checkpoint_transactions;
-	this_tid = transaction->t_tid;
-restart:
-	/*
-	 * If someone cleaned up this transaction while we slept, we're
-	 * done (maybe it's a new transaction, but it fell at the same
-	 * address).
-	 */
-	if (journal->j_checkpoint_transactions == transaction &&
-			transaction->t_tid == this_tid) {
-		int batch_count = 0;
-		struct buffer_head *bhs[NR_BATCH];
-		struct journal_head *jh;
-		int retry = 0, err;
-
-		while (!retry && transaction->t_checkpoint_list) {
-			struct buffer_head *bh;
-
-			jh = transaction->t_checkpoint_list;
-			bh = jh2bh(jh);
-			if (!jbd_trylock_bh_state(bh)) {
-				jbd_sync_bh(journal, bh);
-				retry = 1;
-				break;
-			}
-			retry = __process_buffer(journal, jh, bhs,&batch_count);
-			if (retry < 0 && !result)
-				result = retry;
-			if (!retry && (need_resched() ||
-				spin_needbreak(&journal->j_list_lock))) {
-				spin_unlock(&journal->j_list_lock);
-				retry = 1;
-				break;
-			}
-		}
-
-		if (batch_count) {
-			if (!retry) {
-				spin_unlock(&journal->j_list_lock);
-				retry = 1;
-			}
-			__flush_batch(journal, bhs, &batch_count);
-		}
-
-		if (retry) {
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
-		/*
-		 * Now we have cleaned up the first transaction's checkpoint
-		 * list. Let's clean up the second one
-		 */
-		err = __wait_cp_io(journal, transaction);
-		if (!result)
-			result = err;
-	}
-out:
-	spin_unlock(&journal->j_list_lock);
-	if (result < 0)
-		journal_abort(journal, result);
-	else
-		result = cleanup_journal_tail(journal);
-
-	return (result < 0) ? result : 0;
-}
-
-/*
- * Check the list of checkpoint transactions for the journal to see if
- * we have already got rid of any since the last update of the log tail
- * in the journal superblock.  If so, we can instantly roll the
- * superblock forward to remove those transactions from the log.
- *
- * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
- *
- * This is the only part of the journaling code which really needs to be
- * aware of transaction aborts.  Checkpointing involves writing to the
- * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the super block if
- * checkpointing may have failed.  Otherwise, we would lose some metadata
- * buffers which should be written-back to the filesystem.
- */
-
-int cleanup_journal_tail(journal_t *journal)
-{
-	transaction_t * transaction;
-	tid_t		first_tid;
-	unsigned int	blocknr, freed;
-
-	if (is_journal_aborted(journal))
-		return 1;
-
-	/*
-	 * OK, work out the oldest transaction remaining in the log, and
-	 * the log block it starts at.
-	 *
-	 * If the log is now empty, we need to work out which is the
-	 * next transaction ID we will write, and where it will
-	 * start.
-	 */
-	spin_lock(&journal->j_state_lock);
-	spin_lock(&journal->j_list_lock);
-	transaction = journal->j_checkpoint_transactions;
-	if (transaction) {
-		first_tid = transaction->t_tid;
-		blocknr = transaction->t_log_start;
-	} else if ((transaction = journal->j_committing_transaction) != NULL) {
-		first_tid = transaction->t_tid;
-		blocknr = transaction->t_log_start;
-	} else if ((transaction = journal->j_running_transaction) != NULL) {
-		first_tid = transaction->t_tid;
-		blocknr = journal->j_head;
-	} else {
-		first_tid = journal->j_transaction_sequence;
-		blocknr = journal->j_head;
-	}
-	spin_unlock(&journal->j_list_lock);
-	J_ASSERT(blocknr != 0);
-
-	/* If the oldest pinned transaction is at the tail of the log
-           already then there's not much we can do right now. */
-	if (journal->j_tail_sequence == first_tid) {
-		spin_unlock(&journal->j_state_lock);
-		return 1;
-	}
-	spin_unlock(&journal->j_state_lock);
-
-	/*
-	 * We need to make sure that any blocks that were recently written out
-	 * --- perhaps by log_do_checkpoint() --- are flushed out before we
-	 * drop the transactions from the journal. Similarly we need to be sure
-	 * superblock makes it to disk before next transaction starts reusing
-	 * freed space (otherwise we could replay some blocks of the new
-	 * transaction thinking they belong to the old one). So we use
-	 * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially
-	 * with an appropriately sized journal, but we need this to guarantee
-	 * correctness.  Fortunately cleanup_journal_tail() doesn't get called
-	 * all that often.
-	 */
-	journal_update_sb_log_tail(journal, first_tid, blocknr,
-				   WRITE_FLUSH_FUA);
-
-	spin_lock(&journal->j_state_lock);
-	/* OK, update the superblock to recover the freed space.
-	 * Physical blocks come first: have we wrapped beyond the end of
-	 * the log?  */
-	freed = blocknr - journal->j_tail;
-	if (blocknr < journal->j_tail)
-		freed = freed + journal->j_last - journal->j_first;
-
-	trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
-	jbd_debug(1,
-		  "Cleaning journal tail from %d to %d (offset %u), "
-		  "freeing %u\n",
-		  journal->j_tail_sequence, first_tid, blocknr, freed);
-
-	journal->j_free += freed;
-	journal->j_tail_sequence = first_tid;
-	journal->j_tail = blocknr;
-	spin_unlock(&journal->j_state_lock);
-	return 0;
-}
-
-
-/* Checkpoint list management */
-
-/*
- * journal_clean_one_cp_list
- *
- * Find all the written-back checkpoint buffers in the given list and release
- * them.
- *
- * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
- */
-
-static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
-{
-	struct journal_head *last_jh;
-	struct journal_head *next_jh = jh;
-	int ret, freed = 0;
-
-	*released = 0;
-	if (!jh)
-		return 0;
-
-	last_jh = jh->b_cpprev;
-	do {
-		jh = next_jh;
-		next_jh = jh->b_cpnext;
-		/* Use trylock because of the ranking */
-		if (jbd_trylock_bh_state(jh2bh(jh))) {
-			ret = __try_to_free_cp_buf(jh);
-			if (ret) {
-				freed++;
-				if (ret == 2) {
-					*released = 1;
-					return freed;
-				}
-			}
-		}
-		/*
-		 * This function only frees up some memory
-		 * if possible so we dont have an obligation
-		 * to finish processing. Bail out if preemption
-		 * requested:
-		 */
-		if (need_resched())
-			return freed;
-	} while (jh != last_jh);
-
-	return freed;
-}
-
-/*
- * journal_clean_checkpoint_list
- *
- * Find all the written-back checkpoint buffers in the journal and release them.
- *
- * Called with the journal locked.
- * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
- */
-
-int __journal_clean_checkpoint_list(journal_t *journal)
-{
-	transaction_t *transaction, *last_transaction, *next_transaction;
-	int ret = 0;
-	int released;
-
-	transaction = journal->j_checkpoint_transactions;
-	if (!transaction)
-		goto out;
-
-	last_transaction = transaction->t_cpprev;
-	next_transaction = transaction;
-	do {
-		transaction = next_transaction;
-		next_transaction = transaction->t_cpnext;
-		ret += journal_clean_one_cp_list(transaction->
-				t_checkpoint_list, &released);
-		/*
-		 * This function only frees up some memory if possible so we
-		 * dont have an obligation to finish processing. Bail out if
-		 * preemption requested:
-		 */
-		if (need_resched())
-			goto out;
-		if (released)
-			continue;
-		/*
-		 * It is essential that we are as careful as in the case of
-		 * t_checkpoint_list with removing the buffer from the list as
-		 * we can possibly see not yet submitted buffers on io_list
-		 */
-		ret += journal_clean_one_cp_list(transaction->
-				t_checkpoint_io_list, &released);
-		if (need_resched())
-			goto out;
-	} while (transaction != last_transaction);
-out:
-	return ret;
-}
-
-/*
- * journal_remove_checkpoint: called after a buffer has been committed
- * to disk (either by being write-back flushed to disk, or being
- * committed to the log).
- *
- * We cannot safely clean a transaction out of the log until all of the
- * buffer updates committed in that transaction have safely been stored
- * elsewhere on disk.  To achieve this, all of the buffers in a
- * transaction need to be maintained on the transaction's checkpoint
- * lists until they have been rewritten, at which point this function is
- * called to remove the buffer from the existing transaction's
- * checkpoint lists.
- *
- * The function returns 1 if it frees the transaction, 0 otherwise.
- * The function can free jh and bh.
- *
- * This function is called with j_list_lock held.
- * This function is called with jbd_lock_bh_state(jh2bh(jh))
- */
-
-int __journal_remove_checkpoint(struct journal_head *jh)
-{
-	transaction_t *transaction;
-	journal_t *journal;
-	int ret = 0;
-
-	JBUFFER_TRACE(jh, "entry");
-
-	if ((transaction = jh->b_cp_transaction) == NULL) {
-		JBUFFER_TRACE(jh, "not on transaction");
-		goto out;
-	}
-	journal = transaction->t_journal;
-
-	JBUFFER_TRACE(jh, "removing from transaction");
-	__buffer_unlink(jh);
-	jh->b_cp_transaction = NULL;
-	journal_put_journal_head(jh);
-
-	if (transaction->t_checkpoint_list != NULL ||
-	    transaction->t_checkpoint_io_list != NULL)
-		goto out;
-
-	/*
-	 * There is one special case to worry about: if we have just pulled the
-	 * buffer off a running or committing transaction's checkpoing list,
-	 * then even if the checkpoint list is empty, the transaction obviously
-	 * cannot be dropped!
-	 *
-	 * The locking here around t_state is a bit sleazy.
-	 * See the comment at the end of journal_commit_transaction().
-	 */
-	if (transaction->t_state != T_FINISHED)
-		goto out;
-
-	/* OK, that was the last buffer for the transaction: we can now
-	   safely remove this transaction from the log */
-
-	__journal_drop_transaction(journal, transaction);
-
-	/* Just in case anybody was waiting for more transactions to be
-           checkpointed... */
-	wake_up(&journal->j_wait_logspace);
-	ret = 1;
-out:
-	return ret;
-}
-
-/*
- * journal_insert_checkpoint: put a committed buffer onto a checkpoint
- * list so that we know when it is safe to clean the transaction out of
- * the log.
- *
- * Called with the journal locked.
- * Called with j_list_lock held.
- */
-void __journal_insert_checkpoint(struct journal_head *jh,
-			       transaction_t *transaction)
-{
-	JBUFFER_TRACE(jh, "entry");
-	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
-	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
-
-	/* Get reference for checkpointing transaction */
-	journal_grab_journal_head(jh2bh(jh));
-	jh->b_cp_transaction = transaction;
-
-	if (!transaction->t_checkpoint_list) {
-		jh->b_cpnext = jh->b_cpprev = jh;
-	} else {
-		jh->b_cpnext = transaction->t_checkpoint_list;
-		jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
-		jh->b_cpprev->b_cpnext = jh;
-		jh->b_cpnext->b_cpprev = jh;
-	}
-	transaction->t_checkpoint_list = jh;
-}
-
-/*
- * We've finished with this transaction structure: adios...
- *
- * The transaction must have no links except for the checkpoint by this
- * point.
- *
- * Called with the journal locked.
- * Called with j_list_lock held.
- */
-
-void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
-{
-	assert_spin_locked(&journal->j_list_lock);
-	if (transaction->t_cpnext) {
-		transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
-		transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
-		if (journal->j_checkpoint_transactions == transaction)
-			journal->j_checkpoint_transactions =
-				transaction->t_cpnext;
-		if (journal->j_checkpoint_transactions == transaction)
-			journal->j_checkpoint_transactions = NULL;
-	}
-
-	J_ASSERT(transaction->t_state == T_FINISHED);
-	J_ASSERT(transaction->t_buffers == NULL);
-	J_ASSERT(transaction->t_sync_datalist == NULL);
-	J_ASSERT(transaction->t_forget == NULL);
-	J_ASSERT(transaction->t_iobuf_list == NULL);
-	J_ASSERT(transaction->t_shadow_list == NULL);
-	J_ASSERT(transaction->t_log_list == NULL);
-	J_ASSERT(transaction->t_checkpoint_list == NULL);
-	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
-	J_ASSERT(transaction->t_updates == 0);
-	J_ASSERT(journal->j_committing_transaction != transaction);
-	J_ASSERT(journal->j_running_transaction != transaction);
-
-	trace_jbd_drop_transaction(journal, transaction);
-	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
-	kfree(transaction);
-}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
deleted file mode 100644
index bb217dcb41af..000000000000
--- a/fs/jbd/commit.c
+++ /dev/null
@@ -1,1021 +0,0 @@
-/*
- * linux/fs/jbd/commit.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
- *
- * Copyright 1998 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Journal commit routines for the generic filesystem journaling code;
- * part of the ext2fs journaling system.
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <trace/events/jbd.h>
-
-/*
- * Default IO end handler for temporary BJ_IO buffer_heads.
- */
-static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
-{
-	BUFFER_TRACE(bh, "");
-	if (uptodate)
-		set_buffer_uptodate(bh);
-	else
-		clear_buffer_uptodate(bh);
-	unlock_buffer(bh);
-}
-
-/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not successfully freed, because they are attached to a committing transaction.
- * After the transaction commits, these pages are left on the LRU, with no
- * ->mapping, and with attached buffers.  These pages are trivially reclaimable
- * by the VM, but their apparent absence upsets the VM accounting, and it makes
- * the numbers in /proc/meminfo look odd.
- *
- * So here, we have a buffer which has just come off the forget list.  Look to
- * see if we can strip all buffers from the backing page.
- *
- * Called under journal->j_list_lock.  The caller provided us with a ref
- * against the buffer, and we drop that here.
- */
-static void release_buffer_page(struct buffer_head *bh)
-{
-	struct page *page;
-
-	if (buffer_dirty(bh))
-		goto nope;
-	if (atomic_read(&bh->b_count) != 1)
-		goto nope;
-	page = bh->b_page;
-	if (!page)
-		goto nope;
-	if (page->mapping)
-		goto nope;
-
-	/* OK, it's a truncated page */
-	if (!trylock_page(page))
-		goto nope;
-
-	page_cache_get(page);
-	__brelse(bh);
-	try_to_free_buffers(page);
-	unlock_page(page);
-	page_cache_release(page);
-	return;
-
-nope:
-	__brelse(bh);
-}
-
-/*
- * Decrement reference counter for data buffer. If it has been marked
- * 'BH_Freed', release it and the page to which it belongs if possible.
- */
-static void release_data_buffer(struct buffer_head *bh)
-{
-	if (buffer_freed(bh)) {
-		WARN_ON_ONCE(buffer_dirty(bh));
-		clear_buffer_freed(bh);
-		clear_buffer_mapped(bh);
-		clear_buffer_new(bh);
-		clear_buffer_req(bh);
-		bh->b_bdev = NULL;
-		release_buffer_page(bh);
-	} else
-		put_bh(bh);
-}
-
-/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-	if (!jbd_trylock_bh_state(bh)) {
-		spin_unlock(&journal->j_list_lock);
-		schedule();
-		return 0;
-	}
-	return 1;
-}
-
-/* Done it all: now write the commit record.  We should have
- * cleaned up our previous buffers by now, so if we are in abort
- * mode we can now just skip the rest of the journal write
- * entirely.
- *
- * Returns 1 if the journal needs to be aborted or 0 on success
- */
-static int journal_write_commit_record(journal_t *journal,
-					transaction_t *commit_transaction)
-{
-	struct journal_head *descriptor;
-	struct buffer_head *bh;
-	journal_header_t *header;
-	int ret;
-
-	if (is_journal_aborted(journal))
-		return 0;
-
-	descriptor = journal_get_descriptor_buffer(journal);
-	if (!descriptor)
-		return 1;
-
-	bh = jh2bh(descriptor);
-
-	header = (journal_header_t *)(bh->b_data);
-	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
-	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
-	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-
-	JBUFFER_TRACE(descriptor, "write commit block");
-	set_buffer_dirty(bh);
-
-	if (journal->j_flags & JFS_BARRIER)
-		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
-	else
-		ret = sync_dirty_buffer(bh);
-
-	put_bh(bh);		/* One for getblk() */
-	journal_put_journal_head(descriptor);
-
-	return (ret == -EIO);
-}
-
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
-				   int write_op)
-{
-	int i;
-
-	for (i = 0; i < bufs; i++) {
-		wbuf[i]->b_end_io = end_buffer_write_sync;
-		/*
-		 * Here we write back pagecache data that may be mmaped. Since
-		 * we cannot afford to clean the page and set PageWriteback
-		 * here due to lock ordering (page lock ranks above transaction
-		 * start), the data can change while IO is in flight. Tell the
-		 * block layer it should bounce the bio pages if stable data
-		 * during write is required.
-		 *
-		 * We use up our safety reference in submit_bh().
-		 */
-		_submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
-	}
-}
-
-/*
- *  Submit all the data buffers to disk
- */
-static int journal_submit_data_buffers(journal_t *journal,
-				       transaction_t *commit_transaction,
-				       int write_op)
-{
-	struct journal_head *jh;
-	struct buffer_head *bh;
-	int locked;
-	int bufs = 0;
-	struct buffer_head **wbuf = journal->j_wbuf;
-	int err = 0;
-
-	/*
-	 * Whenever we unlock the journal and sleep, things can get added
-	 * onto ->t_sync_datalist, so we have to keep looping back to
-	 * write_out_data until we *know* that the list is empty.
-	 *
-	 * Cleanup any flushed data buffers from the data list.  Even in
-	 * abort mode, we want to flush this out as soon as possible.
-	 */
-write_out_data:
-	cond_resched();
-	spin_lock(&journal->j_list_lock);
-
-	while (commit_transaction->t_sync_datalist) {
-		jh = commit_transaction->t_sync_datalist;
-		bh = jh2bh(jh);
-		locked = 0;
-
-		/* Get reference just to make sure buffer does not disappear
-		 * when we are forced to drop various locks */
-		get_bh(bh);
-		/* If the buffer is dirty, we need to submit IO and hence
-		 * we need the buffer lock. We try to lock the buffer without
-		 * blocking. If we fail, we need to drop j_list_lock and do
-		 * blocking lock_buffer().
-		 */
-		if (buffer_dirty(bh)) {
-			if (!trylock_buffer(bh)) {
-				BUFFER_TRACE(bh, "needs blocking lock");
-				spin_unlock(&journal->j_list_lock);
-				trace_jbd_do_submit_data(journal,
-						     commit_transaction);
-				/* Write out all data to prevent deadlocks */
-				journal_do_submit_data(wbuf, bufs, write_op);
-				bufs = 0;
-				lock_buffer(bh);
-				spin_lock(&journal->j_list_lock);
-			}
-			locked = 1;
-		}
-		/* We have to get bh_state lock. Again out of order, sigh. */
-		if (!inverted_lock(journal, bh)) {
-			jbd_lock_bh_state(bh);
-			spin_lock(&journal->j_list_lock);
-		}
-		/* Someone already cleaned up the buffer? */
-		if (!buffer_jbd(bh) || bh2jh(bh) != jh
-			|| jh->b_transaction != commit_transaction
-			|| jh->b_jlist != BJ_SyncData) {
-			jbd_unlock_bh_state(bh);
-			if (locked)
-				unlock_buffer(bh);
-			BUFFER_TRACE(bh, "already cleaned up");
-			release_data_buffer(bh);
-			continue;
-		}
-		if (locked && test_clear_buffer_dirty(bh)) {
-			BUFFER_TRACE(bh, "needs writeout, adding to array");
-			wbuf[bufs++] = bh;
-			__journal_file_buffer(jh, commit_transaction,
-						BJ_Locked);
-			jbd_unlock_bh_state(bh);
-			if (bufs == journal->j_wbufsize) {
-				spin_unlock(&journal->j_list_lock);
-				trace_jbd_do_submit_data(journal,
-						     commit_transaction);
-				journal_do_submit_data(wbuf, bufs, write_op);
-				bufs = 0;
-				goto write_out_data;
-			}
-		} else if (!locked && buffer_locked(bh)) {
-			__journal_file_buffer(jh, commit_transaction,
-						BJ_Locked);
-			jbd_unlock_bh_state(bh);
-			put_bh(bh);
-		} else {
-			BUFFER_TRACE(bh, "writeout complete: unfile");
-			if (unlikely(!buffer_uptodate(bh)))
-				err = -EIO;
-			__journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			if (locked)
-				unlock_buffer(bh);
-			release_data_buffer(bh);
-		}
-
-		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-			spin_unlock(&journal->j_list_lock);
-			goto write_out_data;
-		}
-	}
-	spin_unlock(&journal->j_list_lock);
-	trace_jbd_do_submit_data(journal, commit_transaction);
-	journal_do_submit_data(wbuf, bufs, write_op);
-
-	return err;
-}
-
-/*
- * journal_commit_transaction
- *
- * The primary function for committing a transaction to the log.  This
- * function is called by the journal thread to begin a complete commit.
- */
-void journal_commit_transaction(journal_t *journal)
-{
-	transaction_t *commit_transaction;
-	struct journal_head *jh, *new_jh, *descriptor;
-	struct buffer_head **wbuf = journal->j_wbuf;
-	int bufs;
-	int flags;
-	int err;
-	unsigned int blocknr;
-	ktime_t start_time;
-	u64 commit_time;
-	char *tagp = NULL;
-	journal_header_t *header;
-	journal_block_tag_t *tag = NULL;
-	int space_left = 0;
-	int first_tag = 0;
-	int tag_flag;
-	int i;
-	struct blk_plug plug;
-	int write_op = WRITE;
-
-	/*
-	 * First job: lock down the current transaction and wait for
-	 * all outstanding updates to complete.
-	 */
-
-	/* Do we need to erase the effects of a prior journal_flush? */
-	if (journal->j_flags & JFS_FLUSHED) {
-		jbd_debug(3, "super block updated\n");
-		mutex_lock(&journal->j_checkpoint_mutex);
-		/*
-		 * We hold j_checkpoint_mutex so tail cannot change under us.
-		 * We don't need any special data guarantees for writing sb
-		 * since journal is empty and it is ok for write to be
-		 * flushed only with transaction commit.
-		 */
-		journal_update_sb_log_tail(journal, journal->j_tail_sequence,
-					   journal->j_tail, WRITE_SYNC);
-		mutex_unlock(&journal->j_checkpoint_mutex);
-	} else {
-		jbd_debug(3, "superblock not updated\n");
-	}
-
-	J_ASSERT(journal->j_running_transaction != NULL);
-	J_ASSERT(journal->j_committing_transaction == NULL);
-
-	commit_transaction = journal->j_running_transaction;
-
-	trace_jbd_start_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD: starting commit of transaction %d\n",
-			commit_transaction->t_tid);
-
-	spin_lock(&journal->j_state_lock);
-	J_ASSERT(commit_transaction->t_state == T_RUNNING);
-	commit_transaction->t_state = T_LOCKED;
-
-	trace_jbd_commit_locking(journal, commit_transaction);
-	spin_lock(&commit_transaction->t_handle_lock);
-	while (commit_transaction->t_updates) {
-		DEFINE_WAIT(wait);
-
-		prepare_to_wait(&journal->j_wait_updates, &wait,
-					TASK_UNINTERRUPTIBLE);
-		if (commit_transaction->t_updates) {
-			spin_unlock(&commit_transaction->t_handle_lock);
-			spin_unlock(&journal->j_state_lock);
-			schedule();
-			spin_lock(&journal->j_state_lock);
-			spin_lock(&commit_transaction->t_handle_lock);
-		}
-		finish_wait(&journal->j_wait_updates, &wait);
-	}
-	spin_unlock(&commit_transaction->t_handle_lock);
-
-	J_ASSERT (commit_transaction->t_outstanding_credits <=
-			journal->j_max_transaction_buffers);
-
-	/*
-	 * First thing we are allowed to do is to discard any remaining
-	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
-	 * that there are no such buffers: if a large filesystem
-	 * operation like a truncate needs to split itself over multiple
-	 * transactions, then it may try to do a journal_restart() while
-	 * there are still BJ_Reserved buffers outstanding.  These must
-	 * be released cleanly from the current transaction.
-	 *
-	 * In this case, the filesystem must still reserve write access
-	 * again before modifying the buffer in the new transaction, but
-	 * we do not require it to remember exactly which old buffers it
-	 * has reserved.  This is consistent with the existing behaviour
-	 * that multiple journal_get_write_access() calls to the same
-	 * buffer are perfectly permissible.
-	 */
-	while (commit_transaction->t_reserved_list) {
-		jh = commit_transaction->t_reserved_list;
-		JBUFFER_TRACE(jh, "reserved, unused: refile");
-		/*
-		 * A journal_get_undo_access()+journal_release_buffer() may
-		 * leave undo-committed data.
-		 */
-		if (jh->b_committed_data) {
-			struct buffer_head *bh = jh2bh(jh);
-
-			jbd_lock_bh_state(bh);
-			jbd_free(jh->b_committed_data, bh->b_size);
-			jh->b_committed_data = NULL;
-			jbd_unlock_bh_state(bh);
-		}
-		journal_refile_buffer(journal, jh);
-	}
-
-	/*
-	 * Now try to drop any written-back buffers from the journal's
-	 * checkpoint lists.  We do this *before* commit because it potentially
-	 * frees some memory
-	 */
-	spin_lock(&journal->j_list_lock);
-	__journal_clean_checkpoint_list(journal);
-	spin_unlock(&journal->j_list_lock);
-
-	jbd_debug (3, "JBD: commit phase 1\n");
-
-	/*
-	 * Clear revoked flag to reflect there is no revoked buffers
-	 * in the next transaction which is going to be started.
-	 */
-	journal_clear_buffer_revoked_flags(journal);
-
-	/*
-	 * Switch to a new revoke table.
-	 */
-	journal_switch_revoke_table(journal);
-
-	trace_jbd_commit_flushing(journal, commit_transaction);
-	commit_transaction->t_state = T_FLUSH;
-	journal->j_committing_transaction = commit_transaction;
-	journal->j_running_transaction = NULL;
-	start_time = ktime_get();
-	commit_transaction->t_log_start = journal->j_head;
-	wake_up(&journal->j_wait_transaction_locked);
-	spin_unlock(&journal->j_state_lock);
-
-	jbd_debug (3, "JBD: commit phase 2\n");
-
-	if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
-		write_op = WRITE_SYNC;
-
-	/*
-	 * Now start flushing things to disk, in the order they appear
-	 * on the transaction lists.  Data blocks go first.
-	 */
-	blk_start_plug(&plug);
-	err = journal_submit_data_buffers(journal, commit_transaction,
-					  write_op);
-	blk_finish_plug(&plug);
-
-	/*
-	 * Wait for all previously submitted IO to complete.
-	 */
-	spin_lock(&journal->j_list_lock);
-	while (commit_transaction->t_locked_list) {
-		struct buffer_head *bh;
-
-		jh = commit_transaction->t_locked_list->b_tprev;
-		bh = jh2bh(jh);
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			wait_on_buffer(bh);
-			spin_lock(&journal->j_list_lock);
-		}
-		if (unlikely(!buffer_uptodate(bh))) {
-			if (!trylock_page(bh->b_page)) {
-				spin_unlock(&journal->j_list_lock);
-				lock_page(bh->b_page);
-				spin_lock(&journal->j_list_lock);
-			}
-			if (bh->b_page->mapping)
-				set_bit(AS_EIO, &bh->b_page->mapping->flags);
-
-			unlock_page(bh->b_page);
-			SetPageError(bh->b_page);
-			err = -EIO;
-		}
-		if (!inverted_lock(journal, bh)) {
-			put_bh(bh);
-			spin_lock(&journal->j_list_lock);
-			continue;
-		}
-		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
-		    jh->b_transaction == commit_transaction &&
-		    jh->b_jlist == BJ_Locked)
-			__journal_unfile_buffer(jh);
-		jbd_unlock_bh_state(bh);
-		release_data_buffer(bh);
-		cond_resched_lock(&journal->j_list_lock);
-	}
-	spin_unlock(&journal->j_list_lock);
-
-	if (err) {
-		char b[BDEVNAME_SIZE];
-
-		printk(KERN_WARNING
-			"JBD: Detected IO errors while flushing file data "
-			"on %s\n", bdevname(journal->j_fs_dev, b));
-		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
-			journal_abort(journal, err);
-		err = 0;
-	}
-
-	blk_start_plug(&plug);
-
-	journal_write_revoke_records(journal, commit_transaction, write_op);
-
-	/*
-	 * If we found any dirty or locked buffers, then we should have
-	 * looped back up to the write_out_data label.  If there weren't
-	 * any then journal_clean_data_list should have wiped the list
-	 * clean by now, so check that it is in fact empty.
-	 */
-	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
-	jbd_debug (3, "JBD: commit phase 3\n");
-
-	/*
-	 * Way to go: we have now written out all of the data for a
-	 * transaction!  Now comes the tricky part: we need to write out
-	 * metadata.  Loop over the transaction's entire buffer list:
-	 */
-	spin_lock(&journal->j_state_lock);
-	commit_transaction->t_state = T_COMMIT;
-	spin_unlock(&journal->j_state_lock);
-
-	trace_jbd_commit_logging(journal, commit_transaction);
-	J_ASSERT(commit_transaction->t_nr_buffers <=
-		 commit_transaction->t_outstanding_credits);
-
-	descriptor = NULL;
-	bufs = 0;
-	while (commit_transaction->t_buffers) {
-
-		/* Find the next buffer to be journaled... */
-
-		jh = commit_transaction->t_buffers;
-
-		/* If we're in abort mode, we just un-journal the buffer and
-		   release it. */
-
-		if (is_journal_aborted(journal)) {
-			clear_buffer_jbddirty(jh2bh(jh));
-			JBUFFER_TRACE(jh, "journal is aborting: refile");
-			journal_refile_buffer(journal, jh);
-			/* If that was the last one, we need to clean up
-			 * any descriptor buffers which may have been
-			 * already allocated, even if we are now
-			 * aborting. */
-			if (!commit_transaction->t_buffers)
-				goto start_journal_io;
-			continue;
-		}
-
-		/* Make sure we have a descriptor block in which to
-		   record the metadata buffer. */
-
-		if (!descriptor) {
-			struct buffer_head *bh;
-
-			J_ASSERT (bufs == 0);
-
-			jbd_debug(4, "JBD: get descriptor\n");
-
-			descriptor = journal_get_descriptor_buffer(journal);
-			if (!descriptor) {
-				journal_abort(journal, -EIO);
-				continue;
-			}
-
-			bh = jh2bh(descriptor);
-			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
-				(unsigned long long)bh->b_blocknr, bh->b_data);
-			header = (journal_header_t *)&bh->b_data[0];
-			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
-			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
-			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
-
-			tagp = &bh->b_data[sizeof(journal_header_t)];
-			space_left = bh->b_size - sizeof(journal_header_t);
-			first_tag = 1;
-			set_buffer_jwrite(bh);
-			set_buffer_dirty(bh);
-			wbuf[bufs++] = bh;
-
-			/* Record it so that we can wait for IO
-                           completion later */
-			BUFFER_TRACE(bh, "ph3: file as descriptor");
-			journal_file_buffer(descriptor, commit_transaction,
-					BJ_LogCtl);
-		}
-
-		/* Where is the buffer to be written? */
-
-		err = journal_next_log_block(journal, &blocknr);
-		/* If the block mapping failed, just abandon the buffer
-		   and repeat this loop: we'll fall into the
-		   refile-on-abort condition above. */
-		if (err) {
-			journal_abort(journal, err);
-			continue;
-		}
-
-		/*
-		 * start_this_handle() uses t_outstanding_credits to determine
-		 * the free space in the log, but this counter is changed
-		 * by journal_next_log_block() also.
-		 */
-		commit_transaction->t_outstanding_credits--;
-
-		/* Bump b_count to prevent truncate from stumbling over
-                   the shadowed buffer!  @@@ This can go if we ever get
-                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
-		get_bh(jh2bh(jh));
-
-		/* Make a temporary IO buffer with which to write it out
-                   (this will requeue both the metadata buffer and the
-                   temporary IO buffer). new_bh goes on BJ_IO*/
-
-		set_buffer_jwrite(jh2bh(jh));
-		/*
-		 * akpm: journal_write_metadata_buffer() sets
-		 * new_bh->b_transaction to commit_transaction.
-		 * We need to clean this up before we release new_bh
-		 * (which is of type BJ_IO)
-		 */
-		JBUFFER_TRACE(jh, "ph3: write metadata");
-		flags = journal_write_metadata_buffer(commit_transaction,
-						      jh, &new_jh, blocknr);
-		set_buffer_jwrite(jh2bh(new_jh));
-		wbuf[bufs++] = jh2bh(new_jh);
-
-		/* Record the new block's tag in the current descriptor
-                   buffer */
-
-		tag_flag = 0;
-		if (flags & 1)
-			tag_flag |= JFS_FLAG_ESCAPE;
-		if (!first_tag)
-			tag_flag |= JFS_FLAG_SAME_UUID;
-
-		tag = (journal_block_tag_t *) tagp;
-		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
-		tag->t_flags = cpu_to_be32(tag_flag);
-		tagp += sizeof(journal_block_tag_t);
-		space_left -= sizeof(journal_block_tag_t);
-
-		if (first_tag) {
-			memcpy (tagp, journal->j_uuid, 16);
-			tagp += 16;
-			space_left -= 16;
-			first_tag = 0;
-		}
-
-		/* If there's no more to do, or if the descriptor is full,
-		   let the IO rip! */
-
-		if (bufs == journal->j_wbufsize ||
-		    commit_transaction->t_buffers == NULL ||
-		    space_left < sizeof(journal_block_tag_t) + 16) {
-
-			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
-
-			/* Write an end-of-descriptor marker before
-                           submitting the IOs.  "tag" still points to
-                           the last tag we set up. */
-
-			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
-
-start_journal_io:
-			for (i = 0; i < bufs; i++) {
-				struct buffer_head *bh = wbuf[i];
-				lock_buffer(bh);
-				clear_buffer_dirty(bh);
-				set_buffer_uptodate(bh);
-				bh->b_end_io = journal_end_buffer_io_sync;
-				/*
-				 * In data=journal mode, here we can end up
-				 * writing pagecache data that might be
-				 * mmapped. Since we can't afford to clean the
-				 * page and set PageWriteback (see the comment
-				 * near the other use of _submit_bh()), the
-				 * data can change while the write is in
-				 * flight.  Tell the block layer to bounce the
-				 * bio pages if stable pages are required.
-				 */
-				_submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
-			}
-			cond_resched();
-
-			/* Force a new descriptor to be generated next
-                           time round the loop. */
-			descriptor = NULL;
-			bufs = 0;
-		}
-	}
-
-	blk_finish_plug(&plug);
-
-	/* Lo and behold: we have just managed to send a transaction to
-           the log.  Before we can commit it, wait for the IO so far to
-           complete.  Control buffers being written are on the
-           transaction's t_log_list queue, and metadata buffers are on
-           the t_iobuf_list queue.
-
-	   Wait for the buffers in reverse order.  That way we are
-	   less likely to be woken up until all IOs have completed, and
-	   so we incur less scheduling load.
-	*/
-
-	jbd_debug(3, "JBD: commit phase 4\n");
-
-	/*
-	 * akpm: these are BJ_IO, and j_list_lock is not needed.
-	 * See __journal_try_to_free_buffer.
-	 */
-wait_for_iobuf:
-	while (commit_transaction->t_iobuf_list != NULL) {
-		struct buffer_head *bh;
-
-		jh = commit_transaction->t_iobuf_list->b_tprev;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			wait_on_buffer(bh);
-			goto wait_for_iobuf;
-		}
-		if (cond_resched())
-			goto wait_for_iobuf;
-
-		if (unlikely(!buffer_uptodate(bh)))
-			err = -EIO;
-
-		clear_buffer_jwrite(bh);
-
-		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
-		journal_unfile_buffer(journal, jh);
-
-		/*
-		 * ->t_iobuf_list should contain only dummy buffer_heads
-		 * which were created by journal_write_metadata_buffer().
-		 */
-		BUFFER_TRACE(bh, "dumping temporary bh");
-		journal_put_journal_head(jh);
-		__brelse(bh);
-		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
-		free_buffer_head(bh);
-
-		/* We also have to unlock and free the corresponding
-                   shadowed buffer */
-		jh = commit_transaction->t_shadow_list->b_tprev;
-		bh = jh2bh(jh);
-		clear_buffer_jwrite(bh);
-		J_ASSERT_BH(bh, buffer_jbddirty(bh));
-
-		/* The metadata is now released for reuse, but we need
-                   to remember it against this transaction so that when
-                   we finally commit, we can do any checkpointing
-                   required. */
-		JBUFFER_TRACE(jh, "file as BJ_Forget");
-		journal_file_buffer(jh, commit_transaction, BJ_Forget);
-		/*
-		 * Wake up any transactions which were waiting for this
-		 * IO to complete. The barrier must be here so that changes
-		 * by journal_file_buffer() take effect before wake_up_bit()
-		 * does the waitqueue check.
-		 */
-		smp_mb();
-		wake_up_bit(&bh->b_state, BH_Unshadow);
-		JBUFFER_TRACE(jh, "brelse shadowed buffer");
-		__brelse(bh);
-	}
-
-	J_ASSERT (commit_transaction->t_shadow_list == NULL);
-
-	jbd_debug(3, "JBD: commit phase 5\n");
-
-	/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
-	while (commit_transaction->t_log_list != NULL) {
-		struct buffer_head *bh;
-
-		jh = commit_transaction->t_log_list->b_tprev;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			wait_on_buffer(bh);
-			goto wait_for_ctlbuf;
-		}
-		if (cond_resched())
-			goto wait_for_ctlbuf;
-
-		if (unlikely(!buffer_uptodate(bh)))
-			err = -EIO;
-
-		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
-		clear_buffer_jwrite(bh);
-		journal_unfile_buffer(journal, jh);
-		journal_put_journal_head(jh);
-		__brelse(bh);		/* One for getblk */
-		/* AKPM: bforget here */
-	}
-
-	if (err)
-		journal_abort(journal, err);
-
-	jbd_debug(3, "JBD: commit phase 6\n");
-
-	/* All metadata is written, now write commit record and do cleanup */
-	spin_lock(&journal->j_state_lock);
-	J_ASSERT(commit_transaction->t_state == T_COMMIT);
-	commit_transaction->t_state = T_COMMIT_RECORD;
-	spin_unlock(&journal->j_state_lock);
-
-	if (journal_write_commit_record(journal, commit_transaction))
-		err = -EIO;
-
-	if (err)
-		journal_abort(journal, err);
-
-	/* End of a transaction!  Finally, we can do checkpoint
-           processing: any buffers committed as a result of this
-           transaction can be removed from any checkpoint list it was on
-           before. */
-
-	jbd_debug(3, "JBD: commit phase 7\n");
-
-	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
-	J_ASSERT(commit_transaction->t_buffers == NULL);
-	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
-	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
-	J_ASSERT(commit_transaction->t_shadow_list == NULL);
-	J_ASSERT(commit_transaction->t_log_list == NULL);
-
-restart_loop:
-	/*
-	 * As there are other places (journal_unmap_buffer()) adding buffers
-	 * to this list we have to be careful and hold the j_list_lock.
-	 */
-	spin_lock(&journal->j_list_lock);
-	while (commit_transaction->t_forget) {
-		transaction_t *cp_transaction;
-		struct buffer_head *bh;
-		int try_to_free = 0;
-
-		jh = commit_transaction->t_forget;
-		spin_unlock(&journal->j_list_lock);
-		bh = jh2bh(jh);
-		/*
-		 * Get a reference so that bh cannot be freed before we are
-		 * done with it.
-		 */
-		get_bh(bh);
-		jbd_lock_bh_state(bh);
-		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
-			jh->b_transaction == journal->j_running_transaction);
-
-		/*
-		 * If there is undo-protected committed data against
-		 * this buffer, then we can remove it now.  If it is a
-		 * buffer needing such protection, the old frozen_data
-		 * field now points to a committed version of the
-		 * buffer, so rotate that field to the new committed
-		 * data.
-		 *
-		 * Otherwise, we can just throw away the frozen data now.
-		 */
-		if (jh->b_committed_data) {
-			jbd_free(jh->b_committed_data, bh->b_size);
-			jh->b_committed_data = NULL;
-			if (jh->b_frozen_data) {
-				jh->b_committed_data = jh->b_frozen_data;
-				jh->b_frozen_data = NULL;
-			}
-		} else if (jh->b_frozen_data) {
-			jbd_free(jh->b_frozen_data, bh->b_size);
-			jh->b_frozen_data = NULL;
-		}
-
-		spin_lock(&journal->j_list_lock);
-		cp_transaction = jh->b_cp_transaction;
-		if (cp_transaction) {
-			JBUFFER_TRACE(jh, "remove from old cp transaction");
-			__journal_remove_checkpoint(jh);
-		}
-
-		/* Only re-checkpoint the buffer_head if it is marked
-		 * dirty.  If the buffer was added to the BJ_Forget list
-		 * by journal_forget, it may no longer be dirty and
-		 * there's no point in keeping a checkpoint record for
-		 * it. */
-
-		/*
-		 * A buffer which has been freed while still being journaled by
-		 * a previous transaction.
-		 */
-		if (buffer_freed(bh)) {
-			/*
-			 * If the running transaction is the one containing
-			 * "add to orphan" operation (b_next_transaction !=
-			 * NULL), we have to wait for that transaction to
-			 * commit before we can really get rid of the buffer.
-			 * So just clear b_modified to not confuse transaction
-			 * credit accounting and refile the buffer to
-			 * BJ_Forget of the running transaction. If the just
-			 * committed transaction contains "add to orphan"
-			 * operation, we can completely invalidate the buffer
-			 * now. We are rather throughout in that since the
-			 * buffer may be still accessible when blocksize <
-			 * pagesize and it is attached to the last partial
-			 * page.
-			 */
-			jh->b_modified = 0;
-			if (!jh->b_next_transaction) {
-				clear_buffer_freed(bh);
-				clear_buffer_jbddirty(bh);
-				clear_buffer_mapped(bh);
-				clear_buffer_new(bh);
-				clear_buffer_req(bh);
-				bh->b_bdev = NULL;
-			}
-		}
-
-		if (buffer_jbddirty(bh)) {
-			JBUFFER_TRACE(jh, "add to new checkpointing trans");
-			__journal_insert_checkpoint(jh, commit_transaction);
-			if (is_journal_aborted(journal))
-				clear_buffer_jbddirty(bh);
-		} else {
-			J_ASSERT_BH(bh, !buffer_dirty(bh));
-			/*
-			 * The buffer on BJ_Forget list and not jbddirty means
-			 * it has been freed by this transaction and hence it
-			 * could not have been reallocated until this
-			 * transaction has committed. *BUT* it could be
-			 * reallocated once we have written all the data to
-			 * disk and before we process the buffer on BJ_Forget
-			 * list.
-			 */
-			if (!jh->b_next_transaction)
-				try_to_free = 1;
-		}
-		JBUFFER_TRACE(jh, "refile or unfile freed buffer");
-		__journal_refile_buffer(jh);
-		jbd_unlock_bh_state(bh);
-		if (try_to_free)
-			release_buffer_page(bh);
-		else
-			__brelse(bh);
-		cond_resched_lock(&journal->j_list_lock);
-	}
-	spin_unlock(&journal->j_list_lock);
-	/*
-	 * This is a bit sleazy.  We use j_list_lock to protect transition
-	 * of a transaction into T_FINISHED state and calling
-	 * __journal_drop_transaction(). Otherwise we could race with
-	 * other checkpointing code processing the transaction...
-	 */
-	spin_lock(&journal->j_state_lock);
-	spin_lock(&journal->j_list_lock);
-	/*
-	 * Now recheck if some buffers did not get attached to the transaction
-	 * while the lock was dropped...
-	 */
-	if (commit_transaction->t_forget) {
-		spin_unlock(&journal->j_list_lock);
-		spin_unlock(&journal->j_state_lock);
-		goto restart_loop;
-	}
-
-	/* Done with this transaction! */
-
-	jbd_debug(3, "JBD: commit phase 8\n");
-
-	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
-
-	commit_transaction->t_state = T_FINISHED;
-	J_ASSERT(commit_transaction == journal->j_committing_transaction);
-	journal->j_commit_sequence = commit_transaction->t_tid;
-	journal->j_committing_transaction = NULL;
-	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
-
-	/*
-	 * weight the commit time higher than the average time so we don't
-	 * react too strongly to vast changes in commit time
-	 */
-	if (likely(journal->j_average_commit_time))
-		journal->j_average_commit_time = (commit_time*3 +
-				journal->j_average_commit_time) / 4;
-	else
-		journal->j_average_commit_time = commit_time;
-
-	spin_unlock(&journal->j_state_lock);
-
-	if (commit_transaction->t_checkpoint_list == NULL &&
-	    commit_transaction->t_checkpoint_io_list == NULL) {
-		__journal_drop_transaction(journal, commit_transaction);
-	} else {
-		if (journal->j_checkpoint_transactions == NULL) {
-			journal->j_checkpoint_transactions = commit_transaction;
-			commit_transaction->t_cpnext = commit_transaction;
-			commit_transaction->t_cpprev = commit_transaction;
-		} else {
-			commit_transaction->t_cpnext =
-				journal->j_checkpoint_transactions;
-			commit_transaction->t_cpprev =
-				commit_transaction->t_cpnext->t_cpprev;
-			commit_transaction->t_cpnext->t_cpprev =
-				commit_transaction;
-			commit_transaction->t_cpprev->t_cpnext =
-				commit_transaction;
-		}
-	}
-	spin_unlock(&journal->j_list_lock);
-
-	trace_jbd_end_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD: commit %d complete, head %d\n",
-		  journal->j_commit_sequence, journal->j_tail_sequence);
-
-	wake_up(&journal->j_wait_done_commit);
-}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
deleted file mode 100644
index c46a79adb6ad..000000000000
--- a/fs/jbd/journal.c
+++ /dev/null
@@ -1,2145 +0,0 @@
-/*
- * linux/fs/jbd/journal.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
- *
- * Copyright 1998 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Generic filesystem journal-writing code; part of the ext2fs
- * journaling system.
- *
- * This file manages journals: areas of disk reserved for logging
- * transactional updates.  This includes the kernel journaling thread
- * which is responsible for scheduling updates to the log.
- *
- * We do not actually manage the physical storage of the journal in this
- * file: that is left to a per-journal policy function, which allows us
- * to store the journal within a filesystem-specified area for ext2
- * journaling (ext2 can use a reserved inode for storing the log).
- */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/freezer.h>
-#include <linux/pagemap.h>
-#include <linux/kthread.h>
-#include <linux/poison.h>
-#include <linux/proc_fs.h>
-#include <linux/debugfs.h>
-#include <linux/ratelimit.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/jbd.h>
-
-#include <asm/uaccess.h>
-#include <asm/page.h>
-
-EXPORT_SYMBOL(journal_start);
-EXPORT_SYMBOL(journal_restart);
-EXPORT_SYMBOL(journal_extend);
-EXPORT_SYMBOL(journal_stop);
-EXPORT_SYMBOL(journal_lock_updates);
-EXPORT_SYMBOL(journal_unlock_updates);
-EXPORT_SYMBOL(journal_get_write_access);
-EXPORT_SYMBOL(journal_get_create_access);
-EXPORT_SYMBOL(journal_get_undo_access);
-EXPORT_SYMBOL(journal_dirty_data);
-EXPORT_SYMBOL(journal_dirty_metadata);
-EXPORT_SYMBOL(journal_release_buffer);
-EXPORT_SYMBOL(journal_forget);
-#if 0
-EXPORT_SYMBOL(journal_sync_buffer);
-#endif
-EXPORT_SYMBOL(journal_flush);
-EXPORT_SYMBOL(journal_revoke);
-
-EXPORT_SYMBOL(journal_init_dev);
-EXPORT_SYMBOL(journal_init_inode);
-EXPORT_SYMBOL(journal_update_format);
-EXPORT_SYMBOL(journal_check_used_features);
-EXPORT_SYMBOL(journal_check_available_features);
-EXPORT_SYMBOL(journal_set_features);
-EXPORT_SYMBOL(journal_create);
-EXPORT_SYMBOL(journal_load);
-EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_abort);
-EXPORT_SYMBOL(journal_errno);
-EXPORT_SYMBOL(journal_ack_err);
-EXPORT_SYMBOL(journal_clear_err);
-EXPORT_SYMBOL(log_wait_commit);
-EXPORT_SYMBOL(log_start_commit);
-EXPORT_SYMBOL(journal_start_commit);
-EXPORT_SYMBOL(journal_force_commit_nested);
-EXPORT_SYMBOL(journal_wipe);
-EXPORT_SYMBOL(journal_blocks_per_page);
-EXPORT_SYMBOL(journal_invalidatepage);
-EXPORT_SYMBOL(journal_try_to_free_buffers);
-EXPORT_SYMBOL(journal_force_commit);
-
-static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
-static void __journal_abort_soft (journal_t *journal, int errno);
-static const char *journal_dev_name(journal_t *journal, char *buffer);
-
-#ifdef CONFIG_JBD_DEBUG
-void __jbd_debug(int level, const char *file, const char *func,
-		 unsigned int line, const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	if (level > journal_enable_debug)
-		return;
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
-	va_end(args);
-}
-EXPORT_SYMBOL(__jbd_debug);
-#endif
-
-/*
- * Helper function used to manage commit timeouts
- */
-
-static void commit_timeout(unsigned long __data)
-{
-	struct task_struct * p = (struct task_struct *) __data;
-
-	wake_up_process(p);
-}
-
-/*
- * kjournald: The main thread function used to manage a logging device
- * journal.
- *
- * This kernel thread is responsible for two things:
- *
- * 1) COMMIT:  Every so often we need to commit the current state of the
- *    filesystem to disk.  The journal thread is responsible for writing
- *    all of the metadata buffers to disk.
- *
- * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
- *    of the data in that part of the log has been rewritten elsewhere on
- *    the disk.  Flushing these old buffers to reclaim space in the log is
- *    known as checkpointing, and this thread is responsible for that job.
- */
-
-static int kjournald(void *arg)
-{
-	journal_t *journal = arg;
-	transaction_t *transaction;
-
-	/*
-	 * Set up an interval timer which can be used to trigger a commit wakeup
-	 * after the commit interval expires
-	 */
-	setup_timer(&journal->j_commit_timer, commit_timeout,
-			(unsigned long)current);
-
-	set_freezable();
-
-	/* Record that the journal thread is running */
-	journal->j_task = current;
-	wake_up(&journal->j_wait_done_commit);
-
-	printk(KERN_INFO "kjournald starting.  Commit interval %ld seconds\n",
-			journal->j_commit_interval / HZ);
-
-	/*
-	 * And now, wait forever for commit wakeup events.
-	 */
-	spin_lock(&journal->j_state_lock);
-
-loop:
-	if (journal->j_flags & JFS_UNMOUNT)
-		goto end_loop;
-
-	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
-		journal->j_commit_sequence, journal->j_commit_request);
-
-	if (journal->j_commit_sequence != journal->j_commit_request) {
-		jbd_debug(1, "OK, requests differ\n");
-		spin_unlock(&journal->j_state_lock);
-		del_timer_sync(&journal->j_commit_timer);
-		journal_commit_transaction(journal);
-		spin_lock(&journal->j_state_lock);
-		goto loop;
-	}
-
-	wake_up(&journal->j_wait_done_commit);
-	if (freezing(current)) {
-		/*
-		 * The simpler the better. Flushing journal isn't a
-		 * good idea, because that depends on threads that may
-		 * be already stopped.
-		 */
-		jbd_debug(1, "Now suspending kjournald\n");
-		spin_unlock(&journal->j_state_lock);
-		try_to_freeze();
-		spin_lock(&journal->j_state_lock);
-	} else {
-		/*
-		 * We assume on resume that commits are already there,
-		 * so we don't sleep
-		 */
-		DEFINE_WAIT(wait);
-		int should_sleep = 1;
-
-		prepare_to_wait(&journal->j_wait_commit, &wait,
-				TASK_INTERRUPTIBLE);
-		if (journal->j_commit_sequence != journal->j_commit_request)
-			should_sleep = 0;
-		transaction = journal->j_running_transaction;
-		if (transaction && time_after_eq(jiffies,
-						transaction->t_expires))
-			should_sleep = 0;
-		if (journal->j_flags & JFS_UNMOUNT)
-			should_sleep = 0;
-		if (should_sleep) {
-			spin_unlock(&journal->j_state_lock);
-			schedule();
-			spin_lock(&journal->j_state_lock);
-		}
-		finish_wait(&journal->j_wait_commit, &wait);
-	}
-
-	jbd_debug(1, "kjournald wakes\n");
-
-	/*
-	 * Were we woken up by a commit wakeup event?
-	 */
-	transaction = journal->j_running_transaction;
-	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
-		journal->j_commit_request = transaction->t_tid;
-		jbd_debug(1, "woke because of timeout\n");
-	}
-	goto loop;
-
-end_loop:
-	spin_unlock(&journal->j_state_lock);
-	del_timer_sync(&journal->j_commit_timer);
-	journal->j_task = NULL;
-	wake_up(&journal->j_wait_done_commit);
-	jbd_debug(1, "Journal thread exiting.\n");
-	return 0;
-}
-
-static int journal_start_thread(journal_t *journal)
-{
-	struct task_struct *t;
-
-	t = kthread_run(kjournald, journal, "kjournald");
-	if (IS_ERR(t))
-		return PTR_ERR(t);
-
-	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
-	return 0;
-}
-
-static void journal_kill_thread(journal_t *journal)
-{
-	spin_lock(&journal->j_state_lock);
-	journal->j_flags |= JFS_UNMOUNT;
-
-	while (journal->j_task) {
-		wake_up(&journal->j_wait_commit);
-		spin_unlock(&journal->j_state_lock);
-		wait_event(journal->j_wait_done_commit,
-				journal->j_task == NULL);
-		spin_lock(&journal->j_state_lock);
-	}
-	spin_unlock(&journal->j_state_lock);
-}
-
-/*
- * journal_write_metadata_buffer: write a metadata buffer to the journal.
- *
- * Writes a metadata buffer to a given disk block.  The actual IO is not
- * performed but a new buffer_head is constructed which labels the data
- * to be written with the correct destination disk block.
- *
- * Any magic-number escaping which needs to be done will cause a
- * copy-out here.  If the buffer happens to start with the
- * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
- * magic number is only written to the log for descripter blocks.  In
- * this case, we copy the data and replace the first word with 0, and we
- * return a result code which indicates that this buffer needs to be
- * marked as an escaped buffer in the corresponding log descriptor
- * block.  The missing word can then be restored when the block is read
- * during recovery.
- *
- * If the source buffer has already been modified by a new transaction
- * since we took the last commit snapshot, we use the frozen copy of
- * that data for IO.  If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
- *
- * The function returns a pointer to the buffer_heads to be used for IO.
- *
- * We assume that the journal has already been locked in this function.
- *
- * Return value:
- *  <0: Error
- * >=0: Finished OK
- *
- * On success:
- * Bit 0 set == escape performed on the data
- * Bit 1 set == buffer copy-out performed (kfree the data after IO)
- */
-
-int journal_write_metadata_buffer(transaction_t *transaction,
-				  struct journal_head  *jh_in,
-				  struct journal_head **jh_out,
-				  unsigned int blocknr)
-{
-	int need_copy_out = 0;
-	int done_copy_out = 0;
-	int do_escape = 0;
-	char *mapped_data;
-	struct buffer_head *new_bh;
-	struct journal_head *new_jh;
-	struct page *new_page;
-	unsigned int new_offset;
-	struct buffer_head *bh_in = jh2bh(jh_in);
-	journal_t *journal = transaction->t_journal;
-
-	/*
-	 * The buffer really shouldn't be locked: only the current committing
-	 * transaction is allowed to write it, so nobody else is allowed
-	 * to do any IO.
-	 *
-	 * akpm: except if we're journalling data, and write() output is
-	 * also part of a shared mapping, and another thread has
-	 * decided to launch a writepage() against this buffer.
-	 */
-	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
-
-	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
-	/* keep subsequent assertions sane */
-	atomic_set(&new_bh->b_count, 1);
-	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */
-
-	/*
-	 * If a new transaction has already done a buffer copy-out, then
-	 * we use that version of the data for the commit.
-	 */
-	jbd_lock_bh_state(bh_in);
-repeat:
-	if (jh_in->b_frozen_data) {
-		done_copy_out = 1;
-		new_page = virt_to_page(jh_in->b_frozen_data);
-		new_offset = offset_in_page(jh_in->b_frozen_data);
-	} else {
-		new_page = jh2bh(jh_in)->b_page;
-		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
-	}
-
-	mapped_data = kmap_atomic(new_page);
-	/*
-	 * Check for escaping
-	 */
-	if (*((__be32 *)(mapped_data + new_offset)) ==
-				cpu_to_be32(JFS_MAGIC_NUMBER)) {
-		need_copy_out = 1;
-		do_escape = 1;
-	}
-	kunmap_atomic(mapped_data);
-
-	/*
-	 * Do we need to do a data copy?
-	 */
-	if (need_copy_out && !done_copy_out) {
-		char *tmp;
-
-		jbd_unlock_bh_state(bh_in);
-		tmp = jbd_alloc(bh_in->b_size, GFP_NOFS);
-		jbd_lock_bh_state(bh_in);
-		if (jh_in->b_frozen_data) {
-			jbd_free(tmp, bh_in->b_size);
-			goto repeat;
-		}
-
-		jh_in->b_frozen_data = tmp;
-		mapped_data = kmap_atomic(new_page);
-		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
-		kunmap_atomic(mapped_data);
-
-		new_page = virt_to_page(tmp);
-		new_offset = offset_in_page(tmp);
-		done_copy_out = 1;
-	}
-
-	/*
-	 * Did we need to do an escaping?  Now we've done all the
-	 * copying, we can finally do so.
-	 */
-	if (do_escape) {
-		mapped_data = kmap_atomic(new_page);
-		*((unsigned int *)(mapped_data + new_offset)) = 0;
-		kunmap_atomic(mapped_data);
-	}
-
-	set_bh_page(new_bh, new_page, new_offset);
-	new_jh->b_transaction = NULL;
-	new_bh->b_size = jh2bh(jh_in)->b_size;
-	new_bh->b_bdev = transaction->t_journal->j_dev;
-	new_bh->b_blocknr = blocknr;
-	set_buffer_mapped(new_bh);
-	set_buffer_dirty(new_bh);
-
-	*jh_out = new_jh;
-
-	/*
-	 * The to-be-written buffer needs to get moved to the io queue,
-	 * and the original buffer whose contents we are shadowing or
-	 * copying is moved to the transaction's shadow queue.
-	 */
-	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-	spin_lock(&journal->j_list_lock);
-	__journal_file_buffer(jh_in, transaction, BJ_Shadow);
-	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(bh_in);
-
-	JBUFFER_TRACE(new_jh, "file as BJ_IO");
-	journal_file_buffer(new_jh, transaction, BJ_IO);
-
-	return do_escape | (done_copy_out << 1);
-}
-
-/*
- * Allocation code for the journal file.  Manage the space left in the
- * journal, so that we can begin checkpointing when appropriate.
- */
-
-/*
- * __log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __log_space_left(journal_t *journal)
-{
-	int left = journal->j_free;
-
-	assert_spin_locked(&journal->j_state_lock);
-
-	/*
-	 * Be pessimistic here about the number of those free blocks which
-	 * might be required for log descriptor control blocks.
-	 */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
-	left -= MIN_LOG_RESERVED_BLOCKS;
-
-	if (left <= 0)
-		return 0;
-	left -= (left >> 3);
-	return left;
-}
-
-/*
- * Called under j_state_lock.  Returns true if a transaction commit was started.
- */
-int __log_start_commit(journal_t *journal, tid_t target)
-{
-	/*
-	 * The only transaction we can possibly wait upon is the
-	 * currently running transaction (if it exists).  Otherwise,
-	 * the target tid must be an old one.
-	 */
-	if (journal->j_commit_request != target &&
-	    journal->j_running_transaction &&
-	    journal->j_running_transaction->t_tid == target) {
-		/*
-		 * We want a new commit: OK, mark the request and wakeup the
-		 * commit thread.  We do _not_ do the commit ourselves.
-		 */
-
-		journal->j_commit_request = target;
-		jbd_debug(1, "JBD: requesting commit %d/%d\n",
-			  journal->j_commit_request,
-			  journal->j_commit_sequence);
-		wake_up(&journal->j_wait_commit);
-		return 1;
-	} else if (!tid_geq(journal->j_commit_request, target))
-		/* This should never happen, but if it does, preserve
-		   the evidence before kjournald goes into a loop and
-		   increments j_commit_sequence beyond all recognition. */
-		WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
-		    journal->j_commit_request, journal->j_commit_sequence,
-		    target, journal->j_running_transaction ?
-		    journal->j_running_transaction->t_tid : 0);
-	return 0;
-}
-
-int log_start_commit(journal_t *journal, tid_t tid)
-{
-	int ret;
-
-	spin_lock(&journal->j_state_lock);
-	ret = __log_start_commit(journal, tid);
-	spin_unlock(&journal->j_state_lock);
-	return ret;
-}
-
-/*
- * Force and wait upon a commit if the calling process is not within
- * transaction.  This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
- */
-int journal_force_commit_nested(journal_t *journal)
-{
-	transaction_t *transaction = NULL;
-	tid_t tid;
-
-	spin_lock(&journal->j_state_lock);
-	if (journal->j_running_transaction && !current->journal_info) {
-		transaction = journal->j_running_transaction;
-		__log_start_commit(journal, transaction->t_tid);
-	} else if (journal->j_committing_transaction)
-		transaction = journal->j_committing_transaction;
-
-	if (!transaction) {
-		spin_unlock(&journal->j_state_lock);
-		return 0;	/* Nothing to retry */
-	}
-
-	tid = transaction->t_tid;
-	spin_unlock(&journal->j_state_lock);
-	log_wait_commit(journal, tid);
-	return 1;
-}
-
-/*
- * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction is going to be committed (or is currently already
- * committing), and fills its tid in at *ptid
- */
-int journal_start_commit(journal_t *journal, tid_t *ptid)
-{
-	int ret = 0;
-
-	spin_lock(&journal->j_state_lock);
-	if (journal->j_running_transaction) {
-		tid_t tid = journal->j_running_transaction->t_tid;
-
-		__log_start_commit(journal, tid);
-		/* There's a running transaction and we've just made sure
-		 * it's commit has been scheduled. */
-		if (ptid)
-			*ptid = tid;
-		ret = 1;
-	} else if (journal->j_committing_transaction) {
-		/*
-		 * If commit has been started, then we have to wait for
-		 * completion of that transaction.
-		 */
-		if (ptid)
-			*ptid = journal->j_committing_transaction->t_tid;
-		ret = 1;
-	}
-	spin_unlock(&journal->j_state_lock);
-	return ret;
-}
-
-/*
- * Wait for a specified commit to complete.
- * The caller may not hold the journal lock.
- */
-int log_wait_commit(journal_t *journal, tid_t tid)
-{
-	int err = 0;
-
-#ifdef CONFIG_JBD_DEBUG
-	spin_lock(&journal->j_state_lock);
-	if (!tid_geq(journal->j_commit_request, tid)) {
-		printk(KERN_ERR
-		       "%s: error: j_commit_request=%d, tid=%d\n",
-		       __func__, journal->j_commit_request, tid);
-	}
-	spin_unlock(&journal->j_state_lock);
-#endif
-	spin_lock(&journal->j_state_lock);
-	/*
-	 * Not running or committing trans? Must be already committed. This
-	 * saves us from waiting for a *long* time when tid overflows.
-	 */
-	if (!((journal->j_running_transaction &&
-	       journal->j_running_transaction->t_tid == tid) ||
-	      (journal->j_committing_transaction &&
-	       journal->j_committing_transaction->t_tid == tid)))
-		goto out_unlock;
-
-	if (!tid_geq(journal->j_commit_waited, tid))
-		journal->j_commit_waited = tid;
-	while (tid_gt(tid, journal->j_commit_sequence)) {
-		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
-				  tid, journal->j_commit_sequence);
-		wake_up(&journal->j_wait_commit);
-		spin_unlock(&journal->j_state_lock);
-		wait_event(journal->j_wait_done_commit,
-				!tid_gt(tid, journal->j_commit_sequence));
-		spin_lock(&journal->j_state_lock);
-	}
-out_unlock:
-	spin_unlock(&journal->j_state_lock);
-
-	if (unlikely(is_journal_aborted(journal)))
-		err = -EIO;
-	return err;
-}
-
-/*
- * Return 1 if a given transaction has not yet sent barrier request
- * connected with a transaction commit. If 0 is returned, transaction
- * may or may not have sent the barrier. Used to avoid sending barrier
- * twice in common cases.
- */
-int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
-{
-	int ret = 0;
-	transaction_t *commit_trans;
-
-	if (!(journal->j_flags & JFS_BARRIER))
-		return 0;
-	spin_lock(&journal->j_state_lock);
-	/* Transaction already committed? */
-	if (tid_geq(journal->j_commit_sequence, tid))
-		goto out;
-	/*
-	 * Transaction is being committed and we already proceeded to
-	 * writing commit record?
-	 */
-	commit_trans = journal->j_committing_transaction;
-	if (commit_trans && commit_trans->t_tid == tid &&
-	    commit_trans->t_state >= T_COMMIT_RECORD)
-		goto out;
-	ret = 1;
-out:
-	spin_unlock(&journal->j_state_lock);
-	return ret;
-}
-EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
-
-/*
- * Log buffer allocation routines:
- */
-
-int journal_next_log_block(journal_t *journal, unsigned int *retp)
-{
-	unsigned int blocknr;
-
-	spin_lock(&journal->j_state_lock);
-	J_ASSERT(journal->j_free > 1);
-
-	blocknr = journal->j_head;
-	journal->j_head++;
-	journal->j_free--;
-	if (journal->j_head == journal->j_last)
-		journal->j_head = journal->j_first;
-	spin_unlock(&journal->j_state_lock);
-	return journal_bmap(journal, blocknr, retp);
-}
-
-/*
- * Conversion of logical to physical block numbers for the journal
- *
- * On external journals the journal blocks are identity-mapped, so
- * this is a no-op.  If needed, we can use j_blk_offset - everything is
- * ready.
- */
-int journal_bmap(journal_t *journal, unsigned int blocknr,
-		 unsigned int *retp)
-{
-	int err = 0;
-	unsigned int ret;
-
-	if (journal->j_inode) {
-		ret = bmap(journal->j_inode, blocknr);
-		if (ret)
-			*retp = ret;
-		else {
-			char b[BDEVNAME_SIZE];
-
-			printk(KERN_ALERT "%s: journal block not found "
-					"at offset %u on %s\n",
-				__func__,
-				blocknr,
-				bdevname(journal->j_dev, b));
-			err = -EIO;
-			__journal_abort_soft(journal, err);
-		}
-	} else {
-		*retp = blocknr; /* +journal->j_blk_offset */
-	}
-	return err;
-}
-
-/*
- * We play buffer_head aliasing tricks to write data/metadata blocks to
- * the journal without copying their contents, but for journal
- * descriptor blocks we do need to generate bona fide buffers.
- *
- * After the caller of journal_get_descriptor_buffer() has finished modifying
- * the buffer's contents they really should run flush_dcache_page(bh->b_page).
- * But we don't bother doing that, so there will be coherency problems with
- * mmaps of blockdevs which hold live JBD-controlled filesystems.
- */
-struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
-{
-	struct buffer_head *bh;
-	unsigned int blocknr;
-	int err;
-
-	err = journal_next_log_block(journal, &blocknr);
-
-	if (err)
-		return NULL;
-
-	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-	if (!bh)
-		return NULL;
-	lock_buffer(bh);
-	memset(bh->b_data, 0, journal->j_blocksize);
-	set_buffer_uptodate(bh);
-	unlock_buffer(bh);
-	BUFFER_TRACE(bh, "return this buffer");
-	return journal_add_journal_head(bh);
-}
-
-/*
- * Management for journal control blocks: functions to create and
- * destroy journal_t structures, and to initialise and read existing
- * journal blocks from disk.  */
-
-/* First: create and setup a journal_t object in memory.  We initialise
- * very few fields yet: that has to wait until we have created the
- * journal structures from from scratch, or loaded them from disk. */
-
-static journal_t * journal_init_common (void)
-{
-	journal_t *journal;
-	int err;
-
-	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
-	if (!journal)
-		goto fail;
-
-	init_waitqueue_head(&journal->j_wait_transaction_locked);
-	init_waitqueue_head(&journal->j_wait_logspace);
-	init_waitqueue_head(&journal->j_wait_done_commit);
-	init_waitqueue_head(&journal->j_wait_checkpoint);
-	init_waitqueue_head(&journal->j_wait_commit);
-	init_waitqueue_head(&journal->j_wait_updates);
-	mutex_init(&journal->j_checkpoint_mutex);
-	spin_lock_init(&journal->j_revoke_lock);
-	spin_lock_init(&journal->j_list_lock);
-	spin_lock_init(&journal->j_state_lock);
-
-	journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
-
-	/* The journal is marked for error until we succeed with recovery! */
-	journal->j_flags = JFS_ABORT;
-
-	/* Set up a default-sized revoke table for the new mount. */
-	err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
-	if (err) {
-		kfree(journal);
-		goto fail;
-	}
-	return journal;
-fail:
-	return NULL;
-}
-
-/* journal_init_dev and journal_init_inode:
- *
- * Create a journal structure assigned some fixed set of disk blocks to
- * the journal.  We don't actually touch those disk blocks yet, but we
- * need to set up all of the mapping information to tell the journaling
- * system where the journal blocks are.
- *
- */
-
-/**
- *  journal_t * journal_init_dev() - creates and initialises a journal structure
- *  @bdev: Block device on which to create the journal
- *  @fs_dev: Device which hold journalled filesystem for this journal.
- *  @start: Block nr Start of journal.
- *  @len:  Length of the journal in blocks.
- *  @blocksize: blocksize of journalling device
- *
- *  Returns: a newly created journal_t *
- *
- *  journal_init_dev creates a journal which maps a fixed contiguous
- *  range of blocks on an arbitrary block device.
- *
- */
-journal_t * journal_init_dev(struct block_device *bdev,
-			struct block_device *fs_dev,
-			int start, int len, int blocksize)
-{
-	journal_t *journal = journal_init_common();
-	struct buffer_head *bh;
-	int n;
-
-	if (!journal)
-		return NULL;
-
-	/* journal descriptor can store up to n blocks -bzzz */
-	journal->j_blocksize = blocksize;
-	n = journal->j_blocksize / sizeof(journal_block_tag_t);
-	journal->j_wbufsize = n;
-	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
-	if (!journal->j_wbuf) {
-		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
-			__func__);
-		goto out_err;
-	}
-	journal->j_dev = bdev;
-	journal->j_fs_dev = fs_dev;
-	journal->j_blk_offset = start;
-	journal->j_maxlen = len;
-
-	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-	if (!bh) {
-		printk(KERN_ERR
-		       "%s: Cannot get buffer for journal superblock\n",
-		       __func__);
-		goto out_err;
-	}
-	journal->j_sb_buffer = bh;
-	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
-	return journal;
-out_err:
-	kfree(journal->j_wbuf);
-	kfree(journal);
-	return NULL;
-}
-
-/**
- *  journal_t * journal_init_inode () - creates a journal which maps to a inode.
- *  @inode: An inode to create the journal in
- *
- * journal_init_inode creates a journal which maps an on-disk inode as
- * the journal.  The inode must exist already, must support bmap() and
- * must have all data blocks preallocated.
- */
-journal_t * journal_init_inode (struct inode *inode)
-{
-	struct buffer_head *bh;
-	journal_t *journal = journal_init_common();
-	int err;
-	int n;
-	unsigned int blocknr;
-
-	if (!journal)
-		return NULL;
-
-	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
-	journal->j_inode = inode;
-	jbd_debug(1,
-		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
-		  journal, inode->i_sb->s_id, inode->i_ino,
-		  (long long) inode->i_size,
-		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
-
-	journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
-	journal->j_blocksize = inode->i_sb->s_blocksize;
-
-	/* journal descriptor can store up to n blocks -bzzz */
-	n = journal->j_blocksize / sizeof(journal_block_tag_t);
-	journal->j_wbufsize = n;
-	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
-	if (!journal->j_wbuf) {
-		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
-			__func__);
-		goto out_err;
-	}
-
-	err = journal_bmap(journal, 0, &blocknr);
-	/* If that failed, give up */
-	if (err) {
-		printk(KERN_ERR "%s: Cannot locate journal superblock\n",
-		       __func__);
-		goto out_err;
-	}
-
-	bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
-	if (!bh) {
-		printk(KERN_ERR
-		       "%s: Cannot get buffer for journal superblock\n",
-		       __func__);
-		goto out_err;
-	}
-	journal->j_sb_buffer = bh;
-	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
-	return journal;
-out_err:
-	kfree(journal->j_wbuf);
-	kfree(journal);
-	return NULL;
-}
-
-/*
- * If the journal init or create aborts, we need to mark the journal
- * superblock as being NULL to prevent the journal destroy from writing
- * back a bogus superblock.
- */
-static void journal_fail_superblock (journal_t *journal)
-{
-	struct buffer_head *bh = journal->j_sb_buffer;
-	brelse(bh);
-	journal->j_sb_buffer = NULL;
-}
-
-/*
- * Given a journal_t structure, initialise the various fields for
- * startup of a new journaling session.  We use this both when creating
- * a journal, and after recovering an old journal to reset it for
- * subsequent use.
- */
-
-static int journal_reset(journal_t *journal)
-{
-	journal_superblock_t *sb = journal->j_superblock;
-	unsigned int first, last;
-
-	first = be32_to_cpu(sb->s_first);
-	last = be32_to_cpu(sb->s_maxlen);
-	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
-		printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
-		       first, last);
-		journal_fail_superblock(journal);
-		return -EINVAL;
-	}
-
-	journal->j_first = first;
-	journal->j_last = last;
-
-	journal->j_head = first;
-	journal->j_tail = first;
-	journal->j_free = last - first;
-
-	journal->j_tail_sequence = journal->j_transaction_sequence;
-	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
-	journal->j_commit_request = journal->j_commit_sequence;
-
-	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
-
-	/*
-	 * As a special case, if the on-disk copy is already marked as needing
-	 * no recovery (s_start == 0), then we can safely defer the superblock
-	 * update until the next commit by setting JFS_FLUSHED.  This avoids
-	 * attempting a write to a potential-readonly device.
-	 */
-	if (sb->s_start == 0) {
-		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
-			"(start %u, seq %d, errno %d)\n",
-			journal->j_tail, journal->j_tail_sequence,
-			journal->j_errno);
-		journal->j_flags |= JFS_FLUSHED;
-	} else {
-		/* Lock here to make assertions happy... */
-		mutex_lock(&journal->j_checkpoint_mutex);
-		/*
-		 * Update log tail information. We use WRITE_FUA since new
-		 * transaction will start reusing journal space and so we
-		 * must make sure information about current log tail is on
-		 * disk before that.
-		 */
-		journal_update_sb_log_tail(journal,
-					   journal->j_tail_sequence,
-					   journal->j_tail,
-					   WRITE_FUA);
-		mutex_unlock(&journal->j_checkpoint_mutex);
-	}
-	return journal_start_thread(journal);
-}
-
-/**
- * int journal_create() - Initialise the new journal file
- * @journal: Journal to create. This structure must have been initialised
- *
- * Given a journal_t structure which tells us which disk blocks we can
- * use, create a new journal superblock and initialise all of the
- * journal fields from scratch.
- **/
-int journal_create(journal_t *journal)
-{
-	unsigned int blocknr;
-	struct buffer_head *bh;
-	journal_superblock_t *sb;
-	int i, err;
-
-	if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
-		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
-			journal->j_maxlen);
-		journal_fail_superblock(journal);
-		return -EINVAL;
-	}
-
-	if (journal->j_inode == NULL) {
-		/*
-		 * We don't know what block to start at!
-		 */
-		printk(KERN_EMERG
-		       "%s: creation of journal on external device!\n",
-		       __func__);
-		BUG();
-	}
-
-	/* Zero out the entire journal on disk.  We cannot afford to
-	   have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
-	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
-	for (i = 0; i < journal->j_maxlen; i++) {
-		err = journal_bmap(journal, i, &blocknr);
-		if (err)
-			return err;
-		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-		if (unlikely(!bh))
-			return -ENOMEM;
-		lock_buffer(bh);
-		memset (bh->b_data, 0, journal->j_blocksize);
-		BUFFER_TRACE(bh, "marking dirty");
-		mark_buffer_dirty(bh);
-		BUFFER_TRACE(bh, "marking uptodate");
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-		__brelse(bh);
-	}
-
-	sync_blockdev(journal->j_dev);
-	jbd_debug(1, "JBD: journal cleared.\n");
-
-	/* OK, fill in the initial static fields in the new superblock */
-	sb = journal->j_superblock;
-
-	sb->s_header.h_magic	 = cpu_to_be32(JFS_MAGIC_NUMBER);
-	sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
-
-	sb->s_blocksize	= cpu_to_be32(journal->j_blocksize);
-	sb->s_maxlen	= cpu_to_be32(journal->j_maxlen);
-	sb->s_first	= cpu_to_be32(1);
-
-	journal->j_transaction_sequence = 1;
-
-	journal->j_flags &= ~JFS_ABORT;
-	journal->j_format_version = 2;
-
-	return journal_reset(journal);
-}
-
-static void journal_write_superblock(journal_t *journal, int write_op)
-{
-	struct buffer_head *bh = journal->j_sb_buffer;
-	int ret;
-
-	trace_journal_write_superblock(journal, write_op);
-	if (!(journal->j_flags & JFS_BARRIER))
-		write_op &= ~(REQ_FUA | REQ_FLUSH);
-	lock_buffer(bh);
-	if (buffer_write_io_error(bh)) {
-		char b[BDEVNAME_SIZE];
-		/*
-		 * Oh, dear.  A previous attempt to write the journal
-		 * superblock failed.  This could happen because the
-		 * USB device was yanked out.  Or it could happen to
-		 * be a transient write error and maybe the block will
-		 * be remapped.  Nothing we can do but to retry the
-		 * write and hope for the best.
-		 */
-		printk(KERN_ERR "JBD: previous I/O error detected "
-		       "for journal superblock update for %s.\n",
-		       journal_dev_name(journal, b));
-		clear_buffer_write_io_error(bh);
-		set_buffer_uptodate(bh);
-	}
-
-	get_bh(bh);
-	bh->b_end_io = end_buffer_write_sync;
-	ret = submit_bh(write_op, bh);
-	wait_on_buffer(bh);
-	if (buffer_write_io_error(bh)) {
-		clear_buffer_write_io_error(bh);
-		set_buffer_uptodate(bh);
-		ret = -EIO;
-	}
-	if (ret) {
-		char b[BDEVNAME_SIZE];
-		printk(KERN_ERR "JBD: Error %d detected "
-		       "when updating journal superblock for %s.\n",
-		       ret, journal_dev_name(journal, b));
-	}
-}
-
-/**
- * journal_update_sb_log_tail() - Update log tail in journal sb on disk.
- * @journal: The journal to update.
- * @tail_tid: TID of the new transaction at the tail of the log
- * @tail_block: The first block of the transaction at the tail of the log
- * @write_op: With which operation should we write the journal sb
- *
- * Update a journal's superblock information about log tail and write it to
- * disk, waiting for the IO to complete.
- */
-void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
-				unsigned int tail_block, int write_op)
-{
-	journal_superblock_t *sb = journal->j_superblock;
-
-	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
-	jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n",
-		  tail_block, tail_tid);
-
-	sb->s_sequence = cpu_to_be32(tail_tid);
-	sb->s_start    = cpu_to_be32(tail_block);
-
-	journal_write_superblock(journal, write_op);
-
-	/* Log is no longer empty */
-	spin_lock(&journal->j_state_lock);
-	WARN_ON(!sb->s_sequence);
-	journal->j_flags &= ~JFS_FLUSHED;
-	spin_unlock(&journal->j_state_lock);
-}
-
-/**
- * mark_journal_empty() - Mark on disk journal as empty.
- * @journal: The journal to update.
- *
- * Update a journal's dynamic superblock fields to show that journal is empty.
- * Write updated superblock to disk waiting for IO to complete.
- */
-static void mark_journal_empty(journal_t *journal)
-{
-	journal_superblock_t *sb = journal->j_superblock;
-
-	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
-	spin_lock(&journal->j_state_lock);
-	/* Is it already empty? */
-	if (sb->s_start == 0) {
-		spin_unlock(&journal->j_state_lock);
-		return;
-	}
-	jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
-        	  journal->j_tail_sequence);
-
-	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
-	sb->s_start    = cpu_to_be32(0);
-	spin_unlock(&journal->j_state_lock);
-
-	journal_write_superblock(journal, WRITE_FUA);
-
-	spin_lock(&journal->j_state_lock);
-	/* Log is empty */
-	journal->j_flags |= JFS_FLUSHED;
-	spin_unlock(&journal->j_state_lock);
-}
-
-/**
- * journal_update_sb_errno() - Update error in the journal.
- * @journal: The journal to update.
- *
- * Update a journal's errno.  Write updated superblock to disk waiting for IO
- * to complete.
- */
-static void journal_update_sb_errno(journal_t *journal)
-{
-	journal_superblock_t *sb = journal->j_superblock;
-
-	spin_lock(&journal->j_state_lock);
-	jbd_debug(1, "JBD: updating superblock error (errno %d)\n",
-        	  journal->j_errno);
-	sb->s_errno = cpu_to_be32(journal->j_errno);
-	spin_unlock(&journal->j_state_lock);
-
-	journal_write_superblock(journal, WRITE_SYNC);
-}
-
-/*
- * Read the superblock for a given journal, performing initial
- * validation of the format.
- */
-
-static int journal_get_superblock(journal_t *journal)
-{
-	struct buffer_head *bh;
-	journal_superblock_t *sb;
-	int err = -EIO;
-
-	bh = journal->j_sb_buffer;
-
-	J_ASSERT(bh != NULL);
-	if (!buffer_uptodate(bh)) {
-		ll_rw_block(READ, 1, &bh);
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
-			printk (KERN_ERR
-				"JBD: IO error reading journal superblock\n");
-			goto out;
-		}
-	}
-
-	sb = journal->j_superblock;
-
-	err = -EINVAL;
-
-	if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
-	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
-		printk(KERN_WARNING "JBD: no valid journal superblock found\n");
-		goto out;
-	}
-
-	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
-	case JFS_SUPERBLOCK_V1:
-		journal->j_format_version = 1;
-		break;
-	case JFS_SUPERBLOCK_V2:
-		journal->j_format_version = 2;
-		break;
-	default:
-		printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
-		goto out;
-	}
-
-	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
-		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
-	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
-		printk (KERN_WARNING "JBD: journal file too short\n");
-		goto out;
-	}
-
-	if (be32_to_cpu(sb->s_first) == 0 ||
-	    be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
-		printk(KERN_WARNING
-			"JBD: Invalid start block of journal: %u\n",
-			be32_to_cpu(sb->s_first));
-		goto out;
-	}
-
-	return 0;
-
-out:
-	journal_fail_superblock(journal);
-	return err;
-}
-
-/*
- * Load the on-disk journal superblock and read the key fields into the
- * journal_t.
- */
-
-static int load_superblock(journal_t *journal)
-{
-	int err;
-	journal_superblock_t *sb;
-
-	err = journal_get_superblock(journal);
-	if (err)
-		return err;
-
-	sb = journal->j_superblock;
-
-	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
-	journal->j_tail = be32_to_cpu(sb->s_start);
-	journal->j_first = be32_to_cpu(sb->s_first);
-	journal->j_last = be32_to_cpu(sb->s_maxlen);
-	journal->j_errno = be32_to_cpu(sb->s_errno);
-
-	return 0;
-}
-
-
-/**
- * int journal_load() - Read journal from disk.
- * @journal: Journal to act on.
- *
- * Given a journal_t structure which tells us which disk blocks contain
- * a journal, read the journal from disk to initialise the in-memory
- * structures.
- */
-int journal_load(journal_t *journal)
-{
-	int err;
-	journal_superblock_t *sb;
-
-	err = load_superblock(journal);
-	if (err)
-		return err;
-
-	sb = journal->j_superblock;
-	/* If this is a V2 superblock, then we have to check the
-	 * features flags on it. */
-
-	if (journal->j_format_version >= 2) {
-		if ((sb->s_feature_ro_compat &
-		     ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
-		    (sb->s_feature_incompat &
-		     ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
-			printk (KERN_WARNING
-				"JBD: Unrecognised features on journal\n");
-			return -EINVAL;
-		}
-	}
-
-	/* Let the recovery code check whether it needs to recover any
-	 * data from the journal. */
-	if (journal_recover(journal))
-		goto recovery_error;
-
-	/* OK, we've finished with the dynamic journal bits:
-	 * reinitialise the dynamic contents of the superblock in memory
-	 * and reset them on disk. */
-	if (journal_reset(journal))
-		goto recovery_error;
-
-	journal->j_flags &= ~JFS_ABORT;
-	journal->j_flags |= JFS_LOADED;
-	return 0;
-
-recovery_error:
-	printk (KERN_WARNING "JBD: recovery failed\n");
-	return -EIO;
-}
-
-/**
- * void journal_destroy() - Release a journal_t structure.
- * @journal: Journal to act on.
- *
- * Release a journal_t structure once it is no longer in use by the
- * journaled object.
- * Return <0 if we couldn't clean up the journal.
- */
-int journal_destroy(journal_t *journal)
-{
-	int err = 0;
-
-	
-	/* Wait for the commit thread to wake up and die. */
-	journal_kill_thread(journal);
-
-	/* Force a final log commit */
-	if (journal->j_running_transaction)
-		journal_commit_transaction(journal);
-
-	/* Force any old transactions to disk */
-
-	/* We cannot race with anybody but must keep assertions happy */
-	mutex_lock(&journal->j_checkpoint_mutex);
-	/* Totally anal locking here... */
-	spin_lock(&journal->j_list_lock);
-	while (journal->j_checkpoint_transactions != NULL) {
-		spin_unlock(&journal->j_list_lock);
-		log_do_checkpoint(journal);
-		spin_lock(&journal->j_list_lock);
-	}
-
-	J_ASSERT(journal->j_running_transaction == NULL);
-	J_ASSERT(journal->j_committing_transaction == NULL);
-	J_ASSERT(journal->j_checkpoint_transactions == NULL);
-	spin_unlock(&journal->j_list_lock);
-
-	if (journal->j_sb_buffer) {
-		if (!is_journal_aborted(journal)) {
-			journal->j_tail_sequence =
-				++journal->j_transaction_sequence;
-			mark_journal_empty(journal);
-		} else
-			err = -EIO;
-		brelse(journal->j_sb_buffer);
-	}
-	mutex_unlock(&journal->j_checkpoint_mutex);
-
-	iput(journal->j_inode);
-	if (journal->j_revoke)
-		journal_destroy_revoke(journal);
-	kfree(journal->j_wbuf);
-	kfree(journal);
-
-	return err;
-}
-
-
-/**
- *int journal_check_used_features () - Check if features specified are used.
- * @journal: Journal to check.
- * @compat: bitmask of compatible features
- * @ro: bitmask of features that force read-only mount
- * @incompat: bitmask of incompatible features
- *
- * Check whether the journal uses all of a given set of
- * features.  Return true (non-zero) if it does.
- **/
-
-int journal_check_used_features (journal_t *journal, unsigned long compat,
-				 unsigned long ro, unsigned long incompat)
-{
-	journal_superblock_t *sb;
-
-	if (!compat && !ro && !incompat)
-		return 1;
-	if (journal->j_format_version == 1)
-		return 0;
-
-	sb = journal->j_superblock;
-
-	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
-	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
-	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
-		return 1;
-
-	return 0;
-}
-
-/**
- * int journal_check_available_features() - Check feature set in journalling layer
- * @journal: Journal to check.
- * @compat: bitmask of compatible features
- * @ro: bitmask of features that force read-only mount
- * @incompat: bitmask of incompatible features
- *
- * Check whether the journaling code supports the use of
- * all of a given set of features on this journal.  Return true
- * (non-zero) if it can. */
-
-int journal_check_available_features (journal_t *journal, unsigned long compat,
-				      unsigned long ro, unsigned long incompat)
-{
-	if (!compat && !ro && !incompat)
-		return 1;
-
-	/* We can support any known requested features iff the
-	 * superblock is in version 2.  Otherwise we fail to support any
-	 * extended sb features. */
-
-	if (journal->j_format_version != 2)
-		return 0;
-
-	if ((compat   & JFS_KNOWN_COMPAT_FEATURES) == compat &&
-	    (ro       & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
-	    (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
-		return 1;
-
-	return 0;
-}
-
-/**
- * int journal_set_features () - Mark a given journal feature in the superblock
- * @journal: Journal to act on.
- * @compat: bitmask of compatible features
- * @ro: bitmask of features that force read-only mount
- * @incompat: bitmask of incompatible features
- *
- * Mark a given journal feature as present on the
- * superblock.  Returns true if the requested features could be set.
- *
- */
-
-int journal_set_features (journal_t *journal, unsigned long compat,
-			  unsigned long ro, unsigned long incompat)
-{
-	journal_superblock_t *sb;
-
-	if (journal_check_used_features(journal, compat, ro, incompat))
-		return 1;
-
-	if (!journal_check_available_features(journal, compat, ro, incompat))
-		return 0;
-
-	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
-		  compat, ro, incompat);
-
-	sb = journal->j_superblock;
-
-	sb->s_feature_compat    |= cpu_to_be32(compat);
-	sb->s_feature_ro_compat |= cpu_to_be32(ro);
-	sb->s_feature_incompat  |= cpu_to_be32(incompat);
-
-	return 1;
-}
-
-
-/**
- * int journal_update_format () - Update on-disk journal structure.
- * @journal: Journal to act on.
- *
- * Given an initialised but unloaded journal struct, poke about in the
- * on-disk structure to update it to the most recent supported version.
- */
-int journal_update_format (journal_t *journal)
-{
-	journal_superblock_t *sb;
-	int err;
-
-	err = journal_get_superblock(journal);
-	if (err)
-		return err;
-
-	sb = journal->j_superblock;
-
-	switch (be32_to_cpu(sb->s_header.h_blocktype)) {
-	case JFS_SUPERBLOCK_V2:
-		return 0;
-	case JFS_SUPERBLOCK_V1:
-		return journal_convert_superblock_v1(journal, sb);
-	default:
-		break;
-	}
-	return -EINVAL;
-}
-
-static int journal_convert_superblock_v1(journal_t *journal,
-					 journal_superblock_t *sb)
-{
-	int offset, blocksize;
-	struct buffer_head *bh;
-
-	printk(KERN_WARNING
-		"JBD: Converting superblock from version 1 to 2.\n");
-
-	/* Pre-initialise new fields to zero */
-	offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
-	blocksize = be32_to_cpu(sb->s_blocksize);
-	memset(&sb->s_feature_compat, 0, blocksize-offset);
-
-	sb->s_nr_users = cpu_to_be32(1);
-	sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
-	journal->j_format_version = 2;
-
-	bh = journal->j_sb_buffer;
-	BUFFER_TRACE(bh, "marking dirty");
-	mark_buffer_dirty(bh);
-	sync_dirty_buffer(bh);
-	return 0;
-}
-
-
-/**
- * int journal_flush () - Flush journal
- * @journal: Journal to act on.
- *
- * Flush all data for a given journal to disk and empty the journal.
- * Filesystems can use this when remounting readonly to ensure that
- * recovery does not need to happen on remount.
- */
-
-int journal_flush(journal_t *journal)
-{
-	int err = 0;
-	transaction_t *transaction = NULL;
-
-	spin_lock(&journal->j_state_lock);
-
-	/* Force everything buffered to the log... */
-	if (journal->j_running_transaction) {
-		transaction = journal->j_running_transaction;
-		__log_start_commit(journal, transaction->t_tid);
-	} else if (journal->j_committing_transaction)
-		transaction = journal->j_committing_transaction;
-
-	/* Wait for the log commit to complete... */
-	if (transaction) {
-		tid_t tid = transaction->t_tid;
-
-		spin_unlock(&journal->j_state_lock);
-		log_wait_commit(journal, tid);
-	} else {
-		spin_unlock(&journal->j_state_lock);
-	}
-
-	/* ...and flush everything in the log out to disk. */
-	spin_lock(&journal->j_list_lock);
-	while (!err && journal->j_checkpoint_transactions != NULL) {
-		spin_unlock(&journal->j_list_lock);
-		mutex_lock(&journal->j_checkpoint_mutex);
-		err = log_do_checkpoint(journal);
-		mutex_unlock(&journal->j_checkpoint_mutex);
-		spin_lock(&journal->j_list_lock);
-	}
-	spin_unlock(&journal->j_list_lock);
-
-	if (is_journal_aborted(journal))
-		return -EIO;
-
-	mutex_lock(&journal->j_checkpoint_mutex);
-	cleanup_journal_tail(journal);
-
-	/* Finally, mark the journal as really needing no recovery.
-	 * This sets s_start==0 in the underlying superblock, which is
-	 * the magic code for a fully-recovered superblock.  Any future
-	 * commits of data to the journal will restore the current
-	 * s_start value. */
-	mark_journal_empty(journal);
-	mutex_unlock(&journal->j_checkpoint_mutex);
-	spin_lock(&journal->j_state_lock);
-	J_ASSERT(!journal->j_running_transaction);
-	J_ASSERT(!journal->j_committing_transaction);
-	J_ASSERT(!journal->j_checkpoint_transactions);
-	J_ASSERT(journal->j_head == journal->j_tail);
-	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
-	spin_unlock(&journal->j_state_lock);
-	return 0;
-}
-
-/**
- * int journal_wipe() - Wipe journal contents
- * @journal: Journal to act on.
- * @write: flag (see below)
- *
- * Wipe out all of the contents of a journal, safely.  This will produce
- * a warning if the journal contains any valid recovery information.
- * Must be called between journal_init_*() and journal_load().
- *
- * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
- * we merely suppress recovery.
- */
-
-int journal_wipe(journal_t *journal, int write)
-{
-	int err = 0;
-
-	J_ASSERT (!(journal->j_flags & JFS_LOADED));
-
-	err = load_superblock(journal);
-	if (err)
-		return err;
-
-	if (!journal->j_tail)
-		goto no_recovery;
-
-	printk (KERN_WARNING "JBD: %s recovery information on journal\n",
-		write ? "Clearing" : "Ignoring");
-
-	err = journal_skip_recovery(journal);
-	if (write) {
-		/* Lock to make assertions happy... */
-		mutex_lock(&journal->j_checkpoint_mutex);
-		mark_journal_empty(journal);
-		mutex_unlock(&journal->j_checkpoint_mutex);
-	}
-
- no_recovery:
-	return err;
-}
-
-/*
- * journal_dev_name: format a character string to describe on what
- * device this journal is present.
- */
-
-static const char *journal_dev_name(journal_t *journal, char *buffer)
-{
-	struct block_device *bdev;
-
-	if (journal->j_inode)
-		bdev = journal->j_inode->i_sb->s_bdev;
-	else
-		bdev = journal->j_dev;
-
-	return bdevname(bdev, buffer);
-}
-
-/*
- * Journal abort has very specific semantics, which we describe
- * for journal abort.
- *
- * Two internal function, which provide abort to te jbd layer
- * itself are here.
- */
-
-/*
- * Quick version for internal journal use (doesn't lock the journal).
- * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
- * and don't attempt to make any other journal updates.
- */
-static void __journal_abort_hard(journal_t *journal)
-{
-	transaction_t *transaction;
-	char b[BDEVNAME_SIZE];
-
-	if (journal->j_flags & JFS_ABORT)
-		return;
-
-	printk(KERN_ERR "Aborting journal on device %s.\n",
-		journal_dev_name(journal, b));
-
-	spin_lock(&journal->j_state_lock);
-	journal->j_flags |= JFS_ABORT;
-	transaction = journal->j_running_transaction;
-	if (transaction)
-		__log_start_commit(journal, transaction->t_tid);
-	spin_unlock(&journal->j_state_lock);
-}
-
-/* Soft abort: record the abort error status in the journal superblock,
- * but don't do any other IO. */
-static void __journal_abort_soft (journal_t *journal, int errno)
-{
-	if (journal->j_flags & JFS_ABORT)
-		return;
-
-	if (!journal->j_errno)
-		journal->j_errno = errno;
-
-	__journal_abort_hard(journal);
-
-	if (errno)
-		journal_update_sb_errno(journal);
-}
-
-/**
- * void journal_abort () - Shutdown the journal immediately.
- * @journal: the journal to shutdown.
- * @errno:   an error number to record in the journal indicating
- *           the reason for the shutdown.
- *
- * Perform a complete, immediate shutdown of the ENTIRE
- * journal (not of a single transaction).  This operation cannot be
- * undone without closing and reopening the journal.
- *
- * The journal_abort function is intended to support higher level error
- * recovery mechanisms such as the ext2/ext3 remount-readonly error
- * mode.
- *
- * Journal abort has very specific semantics.  Any existing dirty,
- * unjournaled buffers in the main filesystem will still be written to
- * disk by bdflush, but the journaling mechanism will be suspended
- * immediately and no further transaction commits will be honoured.
- *
- * Any dirty, journaled buffers will be written back to disk without
- * hitting the journal.  Atomicity cannot be guaranteed on an aborted
- * filesystem, but we _do_ attempt to leave as much data as possible
- * behind for fsck to use for cleanup.
- *
- * Any attempt to get a new transaction handle on a journal which is in
- * ABORT state will just result in an -EROFS error return.  A
- * journal_stop on an existing handle will return -EIO if we have
- * entered abort state during the update.
- *
- * Recursive transactions are not disturbed by journal abort until the
- * final journal_stop, which will receive the -EIO error.
- *
- * Finally, the journal_abort call allows the caller to supply an errno
- * which will be recorded (if possible) in the journal superblock.  This
- * allows a client to record failure conditions in the middle of a
- * transaction without having to complete the transaction to record the
- * failure to disk.  ext3_error, for example, now uses this
- * functionality.
- *
- * Errors which originate from within the journaling layer will NOT
- * supply an errno; a null errno implies that absolutely no further
- * writes are done to the journal (unless there are any already in
- * progress).
- *
- */
-
-void journal_abort(journal_t *journal, int errno)
-{
-	__journal_abort_soft(journal, errno);
-}
-
-/**
- * int journal_errno () - returns the journal's error state.
- * @journal: journal to examine.
- *
- * This is the errno numbet set with journal_abort(), the last
- * time the journal was mounted - if the journal was stopped
- * without calling abort this will be 0.
- *
- * If the journal has been aborted on this mount time -EROFS will
- * be returned.
- */
-int journal_errno(journal_t *journal)
-{
-	int err;
-
-	spin_lock(&journal->j_state_lock);
-	if (journal->j_flags & JFS_ABORT)
-		err = -EROFS;
-	else
-		err = journal->j_errno;
-	spin_unlock(&journal->j_state_lock);
-	return err;
-}
-
-/**
- * int journal_clear_err () - clears the journal's error state
- * @journal: journal to act on.
- *
- * An error must be cleared or Acked to take a FS out of readonly
- * mode.
- */
-int journal_clear_err(journal_t *journal)
-{
-	int err = 0;
-
-	spin_lock(&journal->j_state_lock);
-	if (journal->j_flags & JFS_ABORT)
-		err = -EROFS;
-	else
-		journal->j_errno = 0;
-	spin_unlock(&journal->j_state_lock);
-	return err;
-}
-
-/**
- * void journal_ack_err() - Ack journal err.
- * @journal: journal to act on.
- *
- * An error must be cleared or Acked to take a FS out of readonly
- * mode.
- */
-void journal_ack_err(journal_t *journal)
-{
-	spin_lock(&journal->j_state_lock);
-	if (journal->j_errno)
-		journal->j_flags |= JFS_ACK_ERR;
-	spin_unlock(&journal->j_state_lock);
-}
-
-int journal_blocks_per_page(struct inode *inode)
-{
-	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-}
-
-/*
- * Journal_head storage management
- */
-static struct kmem_cache *journal_head_cache;
-#ifdef CONFIG_JBD_DEBUG
-static atomic_t nr_journal_heads = ATOMIC_INIT(0);
-#endif
-
-static int journal_init_journal_head_cache(void)
-{
-	int retval;
-
-	J_ASSERT(journal_head_cache == NULL);
-	journal_head_cache = kmem_cache_create("journal_head",
-				sizeof(struct journal_head),
-				0,		/* offset */
-				SLAB_TEMPORARY,	/* flags */
-				NULL);		/* ctor */
-	retval = 0;
-	if (!journal_head_cache) {
-		retval = -ENOMEM;
-		printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
-	}
-	return retval;
-}
-
-static void journal_destroy_journal_head_cache(void)
-{
-	if (journal_head_cache) {
-		kmem_cache_destroy(journal_head_cache);
-		journal_head_cache = NULL;
-	}
-}
-
-/*
- * journal_head splicing and dicing
- */
-static struct journal_head *journal_alloc_journal_head(void)
-{
-	struct journal_head *ret;
-
-#ifdef CONFIG_JBD_DEBUG
-	atomic_inc(&nr_journal_heads);
-#endif
-	ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
-	if (ret == NULL) {
-		jbd_debug(1, "out of memory for journal_head\n");
-		printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
-				   __func__);
-
-		while (ret == NULL) {
-			yield();
-			ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
-		}
-	}
-	return ret;
-}
-
-static void journal_free_journal_head(struct journal_head *jh)
-{
-#ifdef CONFIG_JBD_DEBUG
-	atomic_dec(&nr_journal_heads);
-	memset(jh, JBD_POISON_FREE, sizeof(*jh));
-#endif
-	kmem_cache_free(journal_head_cache, jh);
-}
-
-/*
- * A journal_head is attached to a buffer_head whenever JBD has an
- * interest in the buffer.
- *
- * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
- * is set.  This bit is tested in core kernel code where we need to take
- * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
- * there.
- *
- * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
- *
- * When a buffer has its BH_JBD bit set it is immune from being released by
- * core kernel code, mainly via ->b_count.
- *
- * A journal_head is detached from its buffer_head when the journal_head's
- * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
- * transaction (b_cp_transaction) hold their references to b_jcount.
- *
- * Various places in the kernel want to attach a journal_head to a buffer_head
- * _before_ attaching the journal_head to a transaction.  To protect the
- * journal_head in this situation, journal_add_journal_head elevates the
- * journal_head's b_jcount refcount by one.  The caller must call
- * journal_put_journal_head() to undo this.
- *
- * So the typical usage would be:
- *
- *	(Attach a journal_head if needed.  Increments b_jcount)
- *	struct journal_head *jh = journal_add_journal_head(bh);
- *	...
- *      (Get another reference for transaction)
- *      journal_grab_journal_head(bh);
- *      jh->b_transaction = xxx;
- *      (Put original reference)
- *      journal_put_journal_head(jh);
- */
-
-/*
- * Give a buffer_head a journal_head.
- *
- * May sleep.
- */
-struct journal_head *journal_add_journal_head(struct buffer_head *bh)
-{
-	struct journal_head *jh;
-	struct journal_head *new_jh = NULL;
-
-repeat:
-	if (!buffer_jbd(bh))
-		new_jh = journal_alloc_journal_head();
-
-	jbd_lock_bh_journal_head(bh);
-	if (buffer_jbd(bh)) {
-		jh = bh2jh(bh);
-	} else {
-		J_ASSERT_BH(bh,
-			(atomic_read(&bh->b_count) > 0) ||
-			(bh->b_page && bh->b_page->mapping));
-
-		if (!new_jh) {
-			jbd_unlock_bh_journal_head(bh);
-			goto repeat;
-		}
-
-		jh = new_jh;
-		new_jh = NULL;		/* We consumed it */
-		set_buffer_jbd(bh);
-		bh->b_private = jh;
-		jh->b_bh = bh;
-		get_bh(bh);
-		BUFFER_TRACE(bh, "added journal_head");
-	}
-	jh->b_jcount++;
-	jbd_unlock_bh_journal_head(bh);
-	if (new_jh)
-		journal_free_journal_head(new_jh);
-	return bh->b_private;
-}
-
-/*
- * Grab a ref against this buffer_head's journal_head.  If it ended up not
- * having a journal_head, return NULL
- */
-struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
-{
-	struct journal_head *jh = NULL;
-
-	jbd_lock_bh_journal_head(bh);
-	if (buffer_jbd(bh)) {
-		jh = bh2jh(bh);
-		jh->b_jcount++;
-	}
-	jbd_unlock_bh_journal_head(bh);
-	return jh;
-}
-
-static void __journal_remove_journal_head(struct buffer_head *bh)
-{
-	struct journal_head *jh = bh2jh(bh);
-
-	J_ASSERT_JH(jh, jh->b_jcount >= 0);
-	J_ASSERT_JH(jh, jh->b_transaction == NULL);
-	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
-	J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
-	J_ASSERT_BH(bh, buffer_jbd(bh));
-	J_ASSERT_BH(bh, jh2bh(jh) == bh);
-	BUFFER_TRACE(bh, "remove journal_head");
-	if (jh->b_frozen_data) {
-		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
-		jbd_free(jh->b_frozen_data, bh->b_size);
-	}
-	if (jh->b_committed_data) {
-		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
-		jbd_free(jh->b_committed_data, bh->b_size);
-	}
-	bh->b_private = NULL;
-	jh->b_bh = NULL;	/* debug, really */
-	clear_buffer_jbd(bh);
-	journal_free_journal_head(jh);
-}
-
-/*
- * Drop a reference on the passed journal_head.  If it fell to zero then
- * release the journal_head from the buffer_head.
- */
-void journal_put_journal_head(struct journal_head *jh)
-{
-	struct buffer_head *bh = jh2bh(jh);
-
-	jbd_lock_bh_journal_head(bh);
-	J_ASSERT_JH(jh, jh->b_jcount > 0);
-	--jh->b_jcount;
-	if (!jh->b_jcount) {
-		__journal_remove_journal_head(bh);
-		jbd_unlock_bh_journal_head(bh);
-		__brelse(bh);
-	} else
-		jbd_unlock_bh_journal_head(bh);
-}
-
-/*
- * debugfs tunables
- */
-#ifdef CONFIG_JBD_DEBUG
-
-u8 journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(journal_enable_debug);
-
-static struct dentry *jbd_debugfs_dir;
-static struct dentry *jbd_debug;
-
-static void __init jbd_create_debugfs_entry(void)
-{
-	jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
-	if (jbd_debugfs_dir)
-		jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
-					       jbd_debugfs_dir,
-					       &journal_enable_debug);
-}
-
-static void __exit jbd_remove_debugfs_entry(void)
-{
-	debugfs_remove(jbd_debug);
-	debugfs_remove(jbd_debugfs_dir);
-}
-
-#else
-
-static inline void jbd_create_debugfs_entry(void)
-{
-}
-
-static inline void jbd_remove_debugfs_entry(void)
-{
-}
-
-#endif
-
-struct kmem_cache *jbd_handle_cache;
-
-static int __init journal_init_handle_cache(void)
-{
-	jbd_handle_cache = kmem_cache_create("journal_handle",
-				sizeof(handle_t),
-				0,		/* offset */
-				SLAB_TEMPORARY,	/* flags */
-				NULL);		/* ctor */
-	if (jbd_handle_cache == NULL) {
-		printk(KERN_EMERG "JBD: failed to create handle cache\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-static void journal_destroy_handle_cache(void)
-{
-	if (jbd_handle_cache)
-		kmem_cache_destroy(jbd_handle_cache);
-}
-
-/*
- * Module startup and shutdown
- */
-
-static int __init journal_init_caches(void)
-{
-	int ret;
-
-	ret = journal_init_revoke_caches();
-	if (ret == 0)
-		ret = journal_init_journal_head_cache();
-	if (ret == 0)
-		ret = journal_init_handle_cache();
-	return ret;
-}
-
-static void journal_destroy_caches(void)
-{
-	journal_destroy_revoke_caches();
-	journal_destroy_journal_head_cache();
-	journal_destroy_handle_cache();
-}
-
-static int __init journal_init(void)
-{
-	int ret;
-
-	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
-
-	ret = journal_init_caches();
-	if (ret != 0)
-		journal_destroy_caches();
-	jbd_create_debugfs_entry();
-	return ret;
-}
-
-static void __exit journal_exit(void)
-{
-#ifdef CONFIG_JBD_DEBUG
-	int n = atomic_read(&nr_journal_heads);
-	if (n)
-		printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
-#endif
-	jbd_remove_debugfs_entry();
-	journal_destroy_caches();
-}
-
-MODULE_LICENSE("GPL");
-module_init(journal_init);
-module_exit(journal_exit);
-
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
deleted file mode 100644
index a748fe21465a..000000000000
--- a/fs/jbd/recovery.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * linux/fs/jbd/recovery.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
- *
- * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Journal recovery routines for the generic filesystem journaling code;
- * part of the ext2fs journaling system.
- */
-
-#ifndef __KERNEL__
-#include "jfs_user.h"
-#else
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/blkdev.h>
-#endif
-
-/*
- * Maintain information about the progress of the recovery job, so that
- * the different passes can carry information between them.
- */
-struct recovery_info
-{
-	tid_t		start_transaction;
-	tid_t		end_transaction;
-
-	int		nr_replays;
-	int		nr_revokes;
-	int		nr_revoke_hits;
-};
-
-enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
-static int do_one_pass(journal_t *journal,
-				struct recovery_info *info, enum passtype pass);
-static int scan_revoke_records(journal_t *, struct buffer_head *,
-				tid_t, struct recovery_info *);
-
-#ifdef __KERNEL__
-
-/* Release readahead buffers after use */
-static void journal_brelse_array(struct buffer_head *b[], int n)
-{
-	while (--n >= 0)
-		brelse (b[n]);
-}
-
-
-/*
- * When reading from the journal, we are going through the block device
- * layer directly and so there is no readahead being done for us.  We
- * need to implement any readahead ourselves if we want it to happen at
- * all.  Recovery is basically one long sequential read, so make sure we
- * do the IO in reasonably large chunks.
- *
- * This is not so critical that we need to be enormously clever about
- * the readahead size, though.  128K is a purely arbitrary, good-enough
- * fixed value.
- */
-
-#define MAXBUF 8
-static int do_readahead(journal_t *journal, unsigned int start)
-{
-	int err;
-	unsigned int max, nbufs, next;
-	unsigned int blocknr;
-	struct buffer_head *bh;
-
-	struct buffer_head * bufs[MAXBUF];
-
-	/* Do up to 128K of readahead */
-	max = start + (128 * 1024 / journal->j_blocksize);
-	if (max > journal->j_maxlen)
-		max = journal->j_maxlen;
-
-	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
-	 * a time to the block device IO layer. */
-
-	nbufs = 0;
-
-	for (next = start; next < max; next++) {
-		err = journal_bmap(journal, next, &blocknr);
-
-		if (err) {
-			printk (KERN_ERR "JBD: bad block at offset %u\n",
-				next);
-			goto failed;
-		}
-
-		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-		if (!bh) {
-			err = -ENOMEM;
-			goto failed;
-		}
-
-		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
-			bufs[nbufs++] = bh;
-			if (nbufs == MAXBUF) {
-				ll_rw_block(READ, nbufs, bufs);
-				journal_brelse_array(bufs, nbufs);
-				nbufs = 0;
-			}
-		} else
-			brelse(bh);
-	}
-
-	if (nbufs)
-		ll_rw_block(READ, nbufs, bufs);
-	err = 0;
-
-failed:
-	if (nbufs)
-		journal_brelse_array(bufs, nbufs);
-	return err;
-}
-
-#endif /* __KERNEL__ */
-
-
-/*
- * Read a block from the journal
- */
-
-static int jread(struct buffer_head **bhp, journal_t *journal,
-		 unsigned int offset)
-{
-	int err;
-	unsigned int blocknr;
-	struct buffer_head *bh;
-
-	*bhp = NULL;
-
-	if (offset >= journal->j_maxlen) {
-		printk(KERN_ERR "JBD: corrupted journal superblock\n");
-		return -EIO;
-	}
-
-	err = journal_bmap(journal, offset, &blocknr);
-
-	if (err) {
-		printk (KERN_ERR "JBD: bad block at offset %u\n",
-			offset);
-		return err;
-	}
-
-	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-	if (!bh)
-		return -ENOMEM;
-
-	if (!buffer_uptodate(bh)) {
-		/* If this is a brand new buffer, start readahead.
-                   Otherwise, we assume we are already reading it.  */
-		if (!buffer_req(bh))
-			do_readahead(journal, offset);
-		wait_on_buffer(bh);
-	}
-
-	if (!buffer_uptodate(bh)) {
-		printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
-			offset);
-		brelse(bh);
-		return -EIO;
-	}
-
-	*bhp = bh;
-	return 0;
-}
-
-
-/*
- * Count the number of in-use tags in a journal descriptor block.
- */
-
-static int count_tags(struct buffer_head *bh, int size)
-{
-	char *			tagp;
-	journal_block_tag_t *	tag;
-	int			nr = 0;
-
-	tagp = &bh->b_data[sizeof(journal_header_t)];
-
-	while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
-		tag = (journal_block_tag_t *) tagp;
-
-		nr++;
-		tagp += sizeof(journal_block_tag_t);
-		if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
-			tagp += 16;
-
-		if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
-			break;
-	}
-
-	return nr;
-}
-
-
-/* Make sure we wrap around the log correctly! */
-#define wrap(journal, var)						\
-do {									\
-	if (var >= (journal)->j_last)					\
-		var -= ((journal)->j_last - (journal)->j_first);	\
-} while (0)
-
-/**
- * journal_recover - recovers a on-disk journal
- * @journal: the journal to recover
- *
- * The primary function for recovering the log contents when mounting a
- * journaled device.
- *
- * Recovery is done in three passes.  In the first pass, we look for the
- * end of the log.  In the second, we assemble the list of revoke
- * blocks.  In the third and final pass, we replay any un-revoked blocks
- * in the log.
- */
-int journal_recover(journal_t *journal)
-{
-	int			err, err2;
-	journal_superblock_t *	sb;
-
-	struct recovery_info	info;
-
-	memset(&info, 0, sizeof(info));
-	sb = journal->j_superblock;
-
-	/*
-	 * The journal superblock's s_start field (the current log head)
-	 * is always zero if, and only if, the journal was cleanly
-	 * unmounted.
-	 */
-
-	if (!sb->s_start) {
-		jbd_debug(1, "No recovery required, last transaction %d\n",
-			  be32_to_cpu(sb->s_sequence));
-		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
-		return 0;
-	}
-
-	err = do_one_pass(journal, &info, PASS_SCAN);
-	if (!err)
-		err = do_one_pass(journal, &info, PASS_REVOKE);
-	if (!err)
-		err = do_one_pass(journal, &info, PASS_REPLAY);
-
-	jbd_debug(1, "JBD: recovery, exit status %d, "
-		  "recovered transactions %u to %u\n",
-		  err, info.start_transaction, info.end_transaction);
-	jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
-		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
-
-	/* Restart the log at the next transaction ID, thus invalidating
-	 * any existing commit records in the log. */
-	journal->j_transaction_sequence = ++info.end_transaction;
-
-	journal_clear_revoke(journal);
-	err2 = sync_blockdev(journal->j_fs_dev);
-	if (!err)
-		err = err2;
-	/* Flush disk caches to get replayed data on the permanent storage */
-	if (journal->j_flags & JFS_BARRIER) {
-		err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
-		if (!err)
-			err = err2;
-	}
-
-	return err;
-}
-
-/**
- * journal_skip_recovery - Start journal and wipe exiting records
- * @journal: journal to startup
- *
- * Locate any valid recovery information from the journal and set up the
- * journal structures in memory to ignore it (presumably because the
- * caller has evidence that it is out of date).
- * This function does'nt appear to be exorted..
- *
- * We perform one pass over the journal to allow us to tell the user how
- * much recovery information is being erased, and to let us initialise
- * the journal transaction sequence numbers to the next unused ID.
- */
-int journal_skip_recovery(journal_t *journal)
-{
-	int			err;
-	struct recovery_info	info;
-
-	memset (&info, 0, sizeof(info));
-
-	err = do_one_pass(journal, &info, PASS_SCAN);
-
-	if (err) {
-		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
-		++journal->j_transaction_sequence;
-	} else {
-#ifdef CONFIG_JBD_DEBUG
-		int dropped = info.end_transaction -
-			      be32_to_cpu(journal->j_superblock->s_sequence);
-		jbd_debug(1,
-			  "JBD: ignoring %d transaction%s from the journal.\n",
-			  dropped, (dropped == 1) ? "" : "s");
-#endif
-		journal->j_transaction_sequence = ++info.end_transaction;
-	}
-
-	journal->j_tail = 0;
-	return err;
-}
-
-static int do_one_pass(journal_t *journal,
-			struct recovery_info *info, enum passtype pass)
-{
-	unsigned int		first_commit_ID, next_commit_ID;
-	unsigned int		next_log_block;
-	int			err, success = 0;
-	journal_superblock_t *	sb;
-	journal_header_t *	tmp;
-	struct buffer_head *	bh;
-	unsigned int		sequence;
-	int			blocktype;
-
-	/*
-	 * First thing is to establish what we expect to find in the log
-	 * (in terms of transaction IDs), and where (in terms of log
-	 * block offsets): query the superblock.
-	 */
-
-	sb = journal->j_superblock;
-	next_commit_ID = be32_to_cpu(sb->s_sequence);
-	next_log_block = be32_to_cpu(sb->s_start);
-
-	first_commit_ID = next_commit_ID;
-	if (pass == PASS_SCAN)
-		info->start_transaction = first_commit_ID;
-
-	jbd_debug(1, "Starting recovery pass %d\n", pass);
-
-	/*
-	 * Now we walk through the log, transaction by transaction,
-	 * making sure that each transaction has a commit block in the
-	 * expected place.  Each complete transaction gets replayed back
-	 * into the main filesystem.
-	 */
-
-	while (1) {
-		int			flags;
-		char *			tagp;
-		journal_block_tag_t *	tag;
-		struct buffer_head *	obh;
-		struct buffer_head *	nbh;
-
-		cond_resched();
-
-		/* If we already know where to stop the log traversal,
-		 * check right now that we haven't gone past the end of
-		 * the log. */
-
-		if (pass != PASS_SCAN)
-			if (tid_geq(next_commit_ID, info->end_transaction))
-				break;
-
-		jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
-			  next_commit_ID, next_log_block, journal->j_last);
-
-		/* Skip over each chunk of the transaction looking
-		 * either the next descriptor block or the final commit
-		 * record. */
-
-		jbd_debug(3, "JBD: checking block %u\n", next_log_block);
-		err = jread(&bh, journal, next_log_block);
-		if (err)
-			goto failed;
-
-		next_log_block++;
-		wrap(journal, next_log_block);
-
-		/* What kind of buffer is it?
-		 *
-		 * If it is a descriptor block, check that it has the
-		 * expected sequence number.  Otherwise, we're all done
-		 * here. */
-
-		tmp = (journal_header_t *)bh->b_data;
-
-		if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
-			brelse(bh);
-			break;
-		}
-
-		blocktype = be32_to_cpu(tmp->h_blocktype);
-		sequence = be32_to_cpu(tmp->h_sequence);
-		jbd_debug(3, "Found magic %d, sequence %d\n",
-			  blocktype, sequence);
-
-		if (sequence != next_commit_ID) {
-			brelse(bh);
-			break;
-		}
-
-		/* OK, we have a valid descriptor block which matches
-		 * all of the sequence number checks.  What are we going
-		 * to do with it?  That depends on the pass... */
-
-		switch(blocktype) {
-		case JFS_DESCRIPTOR_BLOCK:
-			/* If it is a valid descriptor block, replay it
-			 * in pass REPLAY; otherwise, just skip over the
-			 * blocks it describes. */
-			if (pass != PASS_REPLAY) {
-				next_log_block +=
-					count_tags(bh, journal->j_blocksize);
-				wrap(journal, next_log_block);
-				brelse(bh);
-				continue;
-			}
-
-			/* A descriptor block: we can now write all of
-			 * the data blocks.  Yay, useful work is finally
-			 * getting done here! */
-
-			tagp = &bh->b_data[sizeof(journal_header_t)];
-			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
-			       <= journal->j_blocksize) {
-				unsigned int io_block;
-
-				tag = (journal_block_tag_t *) tagp;
-				flags = be32_to_cpu(tag->t_flags);
-
-				io_block = next_log_block++;
-				wrap(journal, next_log_block);
-				err = jread(&obh, journal, io_block);
-				if (err) {
-					/* Recover what we can, but
-					 * report failure at the end. */
-					success = err;
-					printk (KERN_ERR
-						"JBD: IO error %d recovering "
-						"block %u in log\n",
-						err, io_block);
-				} else {
-					unsigned int blocknr;
-
-					J_ASSERT(obh != NULL);
-					blocknr = be32_to_cpu(tag->t_blocknr);
-
-					/* If the block has been
-					 * revoked, then we're all done
-					 * here. */
-					if (journal_test_revoke
-					    (journal, blocknr,
-					     next_commit_ID)) {
-						brelse(obh);
-						++info->nr_revoke_hits;
-						goto skip_write;
-					}
-
-					/* Find a buffer for the new
-					 * data being restored */
-					nbh = __getblk(journal->j_fs_dev,
-							blocknr,
-							journal->j_blocksize);
-					if (nbh == NULL) {
-						printk(KERN_ERR
-						       "JBD: Out of memory "
-						       "during recovery.\n");
-						err = -ENOMEM;
-						brelse(bh);
-						brelse(obh);
-						goto failed;
-					}
-
-					lock_buffer(nbh);
-					memcpy(nbh->b_data, obh->b_data,
-							journal->j_blocksize);
-					if (flags & JFS_FLAG_ESCAPE) {
-						*((__be32 *)nbh->b_data) =
-						cpu_to_be32(JFS_MAGIC_NUMBER);
-					}
-
-					BUFFER_TRACE(nbh, "marking dirty");
-					set_buffer_uptodate(nbh);
-					mark_buffer_dirty(nbh);
-					BUFFER_TRACE(nbh, "marking uptodate");
-					++info->nr_replays;
-					/* ll_rw_block(WRITE, 1, &nbh); */
-					unlock_buffer(nbh);
-					brelse(obh);
-					brelse(nbh);
-				}
-
-			skip_write:
-				tagp += sizeof(journal_block_tag_t);
-				if (!(flags & JFS_FLAG_SAME_UUID))
-					tagp += 16;
-
-				if (flags & JFS_FLAG_LAST_TAG)
-					break;
-			}
-
-			brelse(bh);
-			continue;
-
-		case JFS_COMMIT_BLOCK:
-			/* Found an expected commit block: not much to
-			 * do other than move on to the next sequence
-			 * number. */
-			brelse(bh);
-			next_commit_ID++;
-			continue;
-
-		case JFS_REVOKE_BLOCK:
-			/* If we aren't in the REVOKE pass, then we can
-			 * just skip over this block. */
-			if (pass != PASS_REVOKE) {
-				brelse(bh);
-				continue;
-			}
-
-			err = scan_revoke_records(journal, bh,
-						  next_commit_ID, info);
-			brelse(bh);
-			if (err)
-				goto failed;
-			continue;
-
-		default:
-			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
-				  blocktype);
-			brelse(bh);
-			goto done;
-		}
-	}
-
- done:
-	/*
-	 * We broke out of the log scan loop: either we came to the
-	 * known end of the log or we found an unexpected block in the
-	 * log.  If the latter happened, then we know that the "current"
-	 * transaction marks the end of the valid log.
-	 */
-
-	if (pass == PASS_SCAN)
-		info->end_transaction = next_commit_ID;
-	else {
-		/* It's really bad news if different passes end up at
-		 * different places (but possible due to IO errors). */
-		if (info->end_transaction != next_commit_ID) {
-			printk (KERN_ERR "JBD: recovery pass %d ended at "
-				"transaction %u, expected %u\n",
-				pass, next_commit_ID, info->end_transaction);
-			if (!success)
-				success = -EIO;
-		}
-	}
-
-	return success;
-
- failed:
-	return err;
-}
-
-
-/* Scan a revoke record, marking all blocks mentioned as revoked. */
-
-static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
-			       tid_t sequence, struct recovery_info *info)
-{
-	journal_revoke_header_t *header;
-	int offset, max;
-
-	header = (journal_revoke_header_t *) bh->b_data;
-	offset = sizeof(journal_revoke_header_t);
-	max = be32_to_cpu(header->r_count);
-
-	while (offset < max) {
-		unsigned int blocknr;
-		int err;
-
-		blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
-		offset += 4;
-		err = journal_set_revoke(journal, blocknr, sequence);
-		if (err)
-			return err;
-		++info->nr_revokes;
-	}
-	return 0;
-}
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
deleted file mode 100644
index dcead636c33b..000000000000
--- a/fs/jbd/revoke.c
+++ /dev/null
@@ -1,733 +0,0 @@
-/*
- * linux/fs/jbd/revoke.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
- *
- * Copyright 2000 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Journal revoke routines for the generic filesystem journaling code;
- * part of the ext2fs journaling system.
- *
- * Revoke is the mechanism used to prevent old log records for deleted
- * metadata from being replayed on top of newer data using the same
- * blocks.  The revoke mechanism is used in two separate places:
- *
- * + Commit: during commit we write the entire list of the current
- *   transaction's revoked blocks to the journal
- *
- * + Recovery: during recovery we record the transaction ID of all
- *   revoked blocks.  If there are multiple revoke records in the log
- *   for a single block, only the last one counts, and if there is a log
- *   entry for a block beyond the last revoke, then that log entry still
- *   gets replayed.
- *
- * We can get interactions between revokes and new log data within a
- * single transaction:
- *
- * Block is revoked and then journaled:
- *   The desired end result is the journaling of the new block, so we
- *   cancel the revoke before the transaction commits.
- *
- * Block is journaled and then revoked:
- *   The revoke must take precedence over the write of the block, so we
- *   need either to cancel the journal entry or to write the revoke
- *   later in the log than the log block.  In this case, we choose the
- *   latter: journaling a block cancels any revoke record for that block
- *   in the current transaction, so any revoke for that block in the
- *   transaction must have happened after the block was journaled and so
- *   the revoke must take precedence.
- *
- * Block is revoked and then written as data:
- *   The data write is allowed to succeed, but the revoke is _not_
- *   cancelled.  We still need to prevent old log records from
- *   overwriting the new data.  We don't even need to clear the revoke
- *   bit here.
- *
- * We cache revoke status of a buffer in the current transaction in b_states
- * bits.  As the name says, revokevalid flag indicates that the cached revoke
- * status of a buffer is valid and we can rely on the cached status.
- *
- * Revoke information on buffers is a tri-state value:
- *
- * RevokeValid clear:	no cached revoke status, need to look it up
- * RevokeValid set, Revoked clear:
- *			buffer has not been revoked, and cancel_revoke
- *			need do nothing.
- * RevokeValid set, Revoked set:
- *			buffer has been revoked.
- *
- * Locking rules:
- * We keep two hash tables of revoke records. One hashtable belongs to the
- * running transaction (is pointed to by journal->j_revoke), the other one
- * belongs to the committing transaction. Accesses to the second hash table
- * happen only from the kjournald and no other thread touches this table.  Also
- * journal_switch_revoke_table() which switches which hashtable belongs to the
- * running and which to the committing transaction is called only from
- * kjournald. Therefore we need no locks when accessing the hashtable belonging
- * to the committing transaction.
- *
- * All users operating on the hash table belonging to the running transaction
- * have a handle to the transaction. Therefore they are safe from kjournald
- * switching hash tables under them. For operations on the lists of entries in
- * the hash table j_revoke_lock is used.
- *
- * Finally, also replay code uses the hash tables but at this moment no one else
- * can touch them (filesystem isn't mounted yet) and hence no locking is
- * needed.
- */
-
-#ifndef __KERNEL__
-#include "jfs_user.h"
-#else
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/init.h>
-#include <linux/bio.h>
-#endif
-#include <linux/log2.h>
-#include <linux/hash.h>
-
-static struct kmem_cache *revoke_record_cache;
-static struct kmem_cache *revoke_table_cache;
-
-/* Each revoke record represents one single revoked block.  During
-   journal replay, this involves recording the transaction ID of the
-   last transaction to revoke this block. */
-
-struct jbd_revoke_record_s
-{
-	struct list_head  hash;
-	tid_t		  sequence;	/* Used for recovery only */
-	unsigned int	  blocknr;
-};
-
-
-/* The revoke table is just a simple hash table of revoke records. */
-struct jbd_revoke_table_s
-{
-	/* It is conceivable that we might want a larger hash table
-	 * for recovery.  Must be a power of two. */
-	int		  hash_size;
-	int		  hash_shift;
-	struct list_head *hash_table;
-};
-
-
-#ifdef __KERNEL__
-static void write_one_revoke_record(journal_t *, transaction_t *,
-				    struct journal_head **, int *,
-				    struct jbd_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
-#endif
-
-/* Utility functions to maintain the revoke table */
-
-static inline int hash(journal_t *journal, unsigned int block)
-{
-	struct jbd_revoke_table_s *table = journal->j_revoke;
-
-	return hash_32(block, table->hash_shift);
-}
-
-static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
-			      tid_t seq)
-{
-	struct list_head *hash_list;
-	struct jbd_revoke_record_s *record;
-
-repeat:
-	record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
-	if (!record)
-		goto oom;
-
-	record->sequence = seq;
-	record->blocknr = blocknr;
-	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
-	spin_lock(&journal->j_revoke_lock);
-	list_add(&record->hash, hash_list);
-	spin_unlock(&journal->j_revoke_lock);
-	return 0;
-
-oom:
-	if (!journal_oom_retry)
-		return -ENOMEM;
-	jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
-	yield();
-	goto repeat;
-}
-
-/* Find a revoke record in the journal's hash table. */
-
-static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
-						      unsigned int blocknr)
-{
-	struct list_head *hash_list;
-	struct jbd_revoke_record_s *record;
-
-	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
-
-	spin_lock(&journal->j_revoke_lock);
-	record = (struct jbd_revoke_record_s *) hash_list->next;
-	while (&(record->hash) != hash_list) {
-		if (record->blocknr == blocknr) {
-			spin_unlock(&journal->j_revoke_lock);
-			return record;
-		}
-		record = (struct jbd_revoke_record_s *) record->hash.next;
-	}
-	spin_unlock(&journal->j_revoke_lock);
-	return NULL;
-}
-
-void journal_destroy_revoke_caches(void)
-{
-	if (revoke_record_cache) {
-		kmem_cache_destroy(revoke_record_cache);
-		revoke_record_cache = NULL;
-	}
-	if (revoke_table_cache) {
-		kmem_cache_destroy(revoke_table_cache);
-		revoke_table_cache = NULL;
-	}
-}
-
-int __init journal_init_revoke_caches(void)
-{
-	J_ASSERT(!revoke_record_cache);
-	J_ASSERT(!revoke_table_cache);
-
-	revoke_record_cache = kmem_cache_create("revoke_record",
-					   sizeof(struct jbd_revoke_record_s),
-					   0,
-					   SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
-					   NULL);
-	if (!revoke_record_cache)
-		goto record_cache_failure;
-
-	revoke_table_cache = kmem_cache_create("revoke_table",
-					   sizeof(struct jbd_revoke_table_s),
-					   0, SLAB_TEMPORARY, NULL);
-	if (!revoke_table_cache)
-		goto table_cache_failure;
-
-	return 0;
-
-table_cache_failure:
-	journal_destroy_revoke_caches();
-record_cache_failure:
-	return -ENOMEM;
-}
-
-static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
-{
-	int i;
-	struct jbd_revoke_table_s *table;
-
-	table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
-	if (!table)
-		goto out;
-
-	table->hash_size = hash_size;
-	table->hash_shift = ilog2(hash_size);
-	table->hash_table =
-		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
-	if (!table->hash_table) {
-		kmem_cache_free(revoke_table_cache, table);
-		table = NULL;
-		goto out;
-	}
-
-	for (i = 0; i < hash_size; i++)
-		INIT_LIST_HEAD(&table->hash_table[i]);
-
-out:
-	return table;
-}
-
-static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
-{
-	int i;
-	struct list_head *hash_list;
-
-	for (i = 0; i < table->hash_size; i++) {
-		hash_list = &table->hash_table[i];
-		J_ASSERT(list_empty(hash_list));
-	}
-
-	kfree(table->hash_table);
-	kmem_cache_free(revoke_table_cache, table);
-}
-
-/* Initialise the revoke table for a given journal to a given size. */
-int journal_init_revoke(journal_t *journal, int hash_size)
-{
-	J_ASSERT(journal->j_revoke_table[0] == NULL);
-	J_ASSERT(is_power_of_2(hash_size));
-
-	journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
-	if (!journal->j_revoke_table[0])
-		goto fail0;
-
-	journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
-	if (!journal->j_revoke_table[1])
-		goto fail1;
-
-	journal->j_revoke = journal->j_revoke_table[1];
-
-	spin_lock_init(&journal->j_revoke_lock);
-
-	return 0;
-
-fail1:
-	journal_destroy_revoke_table(journal->j_revoke_table[0]);
-fail0:
-	return -ENOMEM;
-}
-
-/* Destroy a journal's revoke table.  The table must already be empty! */
-void journal_destroy_revoke(journal_t *journal)
-{
-	journal->j_revoke = NULL;
-	if (journal->j_revoke_table[0])
-		journal_destroy_revoke_table(journal->j_revoke_table[0]);
-	if (journal->j_revoke_table[1])
-		journal_destroy_revoke_table(journal->j_revoke_table[1]);
-}
-
-
-#ifdef __KERNEL__
-
-/*
- * journal_revoke: revoke a given buffer_head from the journal.  This
- * prevents the block from being replayed during recovery if we take a
- * crash after this current transaction commits.  Any subsequent
- * metadata writes of the buffer in this transaction cancel the
- * revoke.
- *
- * Note that this call may block --- it is up to the caller to make
- * sure that there are no further calls to journal_write_metadata
- * before the revoke is complete.  In ext3, this implies calling the
- * revoke before clearing the block bitmap when we are deleting
- * metadata.
- *
- * Revoke performs a journal_forget on any buffer_head passed in as a
- * parameter, but does _not_ forget the buffer_head if the bh was only
- * found implicitly.
- *
- * bh_in may not be a journalled buffer - it may have come off
- * the hash tables without an attached journal_head.
- *
- * If bh_in is non-zero, journal_revoke() will decrement its b_count
- * by one.
- */
-
-int journal_revoke(handle_t *handle, unsigned int blocknr,
-		   struct buffer_head *bh_in)
-{
-	struct buffer_head *bh = NULL;
-	journal_t *journal;
-	struct block_device *bdev;
-	int err;
-
-	might_sleep();
-	if (bh_in)
-		BUFFER_TRACE(bh_in, "enter");
-
-	journal = handle->h_transaction->t_journal;
-	if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
-		J_ASSERT (!"Cannot set revoke feature!");
-		return -EINVAL;
-	}
-
-	bdev = journal->j_fs_dev;
-	bh = bh_in;
-
-	if (!bh) {
-		bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
-		if (bh)
-			BUFFER_TRACE(bh, "found on hash");
-	}
-#ifdef JBD_EXPENSIVE_CHECKING
-	else {
-		struct buffer_head *bh2;
-
-		/* If there is a different buffer_head lying around in
-		 * memory anywhere... */
-		bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
-		if (bh2) {
-			/* ... and it has RevokeValid status... */
-			if (bh2 != bh && buffer_revokevalid(bh2))
-				/* ...then it better be revoked too,
-				 * since it's illegal to create a revoke
-				 * record against a buffer_head which is
-				 * not marked revoked --- that would
-				 * risk missing a subsequent revoke
-				 * cancel. */
-				J_ASSERT_BH(bh2, buffer_revoked(bh2));
-			put_bh(bh2);
-		}
-	}
-#endif
-
-	/* We really ought not ever to revoke twice in a row without
-           first having the revoke cancelled: it's illegal to free a
-           block twice without allocating it in between! */
-	if (bh) {
-		if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
-				 "inconsistent data on disk")) {
-			if (!bh_in)
-				brelse(bh);
-			return -EIO;
-		}
-		set_buffer_revoked(bh);
-		set_buffer_revokevalid(bh);
-		if (bh_in) {
-			BUFFER_TRACE(bh_in, "call journal_forget");
-			journal_forget(handle, bh_in);
-		} else {
-			BUFFER_TRACE(bh, "call brelse");
-			__brelse(bh);
-		}
-	}
-
-	jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
-	err = insert_revoke_hash(journal, blocknr,
-				handle->h_transaction->t_tid);
-	BUFFER_TRACE(bh_in, "exit");
-	return err;
-}
-
-/*
- * Cancel an outstanding revoke.  For use only internally by the
- * journaling code (called from journal_get_write_access).
- *
- * We trust buffer_revoked() on the buffer if the buffer is already
- * being journaled: if there is no revoke pending on the buffer, then we
- * don't do anything here.
- *
- * This would break if it were possible for a buffer to be revoked and
- * discarded, and then reallocated within the same transaction.  In such
- * a case we would have lost the revoked bit, but when we arrived here
- * the second time we would still have a pending revoke to cancel.  So,
- * do not trust the Revoked bit on buffers unless RevokeValid is also
- * set.
- */
-int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
-{
-	struct jbd_revoke_record_s *record;
-	journal_t *journal = handle->h_transaction->t_journal;
-	int need_cancel;
-	int did_revoke = 0;	/* akpm: debug */
-	struct buffer_head *bh = jh2bh(jh);
-
-	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
-
-	/* Is the existing Revoke bit valid?  If so, we trust it, and
-	 * only perform the full cancel if the revoke bit is set.  If
-	 * not, we can't trust the revoke bit, and we need to do the
-	 * full search for a revoke record. */
-	if (test_set_buffer_revokevalid(bh)) {
-		need_cancel = test_clear_buffer_revoked(bh);
-	} else {
-		need_cancel = 1;
-		clear_buffer_revoked(bh);
-	}
-
-	if (need_cancel) {
-		record = find_revoke_record(journal, bh->b_blocknr);
-		if (record) {
-			jbd_debug(4, "cancelled existing revoke on "
-				  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
-			spin_lock(&journal->j_revoke_lock);
-			list_del(&record->hash);
-			spin_unlock(&journal->j_revoke_lock);
-			kmem_cache_free(revoke_record_cache, record);
-			did_revoke = 1;
-		}
-	}
-
-#ifdef JBD_EXPENSIVE_CHECKING
-	/* There better not be one left behind by now! */
-	record = find_revoke_record(journal, bh->b_blocknr);
-	J_ASSERT_JH(jh, record == NULL);
-#endif
-
-	/* Finally, have we just cleared revoke on an unhashed
-	 * buffer_head?  If so, we'd better make sure we clear the
-	 * revoked status on any hashed alias too, otherwise the revoke
-	 * state machine will get very upset later on. */
-	if (need_cancel) {
-		struct buffer_head *bh2;
-		bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
-		if (bh2) {
-			if (bh2 != bh)
-				clear_buffer_revoked(bh2);
-			__brelse(bh2);
-		}
-	}
-	return did_revoke;
-}
-
-/*
- * journal_clear_revoked_flags clears revoked flag of buffers in
- * revoke table to reflect there is no revoked buffer in the next
- * transaction which is going to be started.
- */
-void journal_clear_buffer_revoked_flags(journal_t *journal)
-{
-	struct jbd_revoke_table_s *revoke = journal->j_revoke;
-	int i = 0;
-
-	for (i = 0; i < revoke->hash_size; i++) {
-		struct list_head *hash_list;
-		struct list_head *list_entry;
-		hash_list = &revoke->hash_table[i];
-
-		list_for_each(list_entry, hash_list) {
-			struct jbd_revoke_record_s *record;
-			struct buffer_head *bh;
-			record = (struct jbd_revoke_record_s *)list_entry;
-			bh = __find_get_block(journal->j_fs_dev,
-					      record->blocknr,
-					      journal->j_blocksize);
-			if (bh) {
-				clear_buffer_revoked(bh);
-				__brelse(bh);
-			}
-		}
-	}
-}
-
-/* journal_switch_revoke table select j_revoke for next transaction
- * we do not want to suspend any processing until all revokes are
- * written -bzzz
- */
-void journal_switch_revoke_table(journal_t *journal)
-{
-	int i;
-
-	if (journal->j_revoke == journal->j_revoke_table[0])
-		journal->j_revoke = journal->j_revoke_table[1];
-	else
-		journal->j_revoke = journal->j_revoke_table[0];
-
-	for (i = 0; i < journal->j_revoke->hash_size; i++)
-		INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
-}
-
-/*
- * Write revoke records to the journal for all entries in the current
- * revoke hash, deleting the entries as we go.
- */
-void journal_write_revoke_records(journal_t *journal,
-				  transaction_t *transaction, int write_op)
-{
-	struct journal_head *descriptor;
-	struct jbd_revoke_record_s *record;
-	struct jbd_revoke_table_s *revoke;
-	struct list_head *hash_list;
-	int i, offset, count;
-
-	descriptor = NULL;
-	offset = 0;
-	count = 0;
-
-	/* select revoke table for committing transaction */
-	revoke = journal->j_revoke == journal->j_revoke_table[0] ?
-		journal->j_revoke_table[1] : journal->j_revoke_table[0];
-
-	for (i = 0; i < revoke->hash_size; i++) {
-		hash_list = &revoke->hash_table[i];
-
-		while (!list_empty(hash_list)) {
-			record = (struct jbd_revoke_record_s *)
-				hash_list->next;
-			write_one_revoke_record(journal, transaction,
-						&descriptor, &offset,
-						record, write_op);
-			count++;
-			list_del(&record->hash);
-			kmem_cache_free(revoke_record_cache, record);
-		}
-	}
-	if (descriptor)
-		flush_descriptor(journal, descriptor, offset, write_op);
-	jbd_debug(1, "Wrote %d revoke records\n", count);
-}
-
-/*
- * Write out one revoke record.  We need to create a new descriptor
- * block if the old one is full or if we have not already created one.
- */
-
-static void write_one_revoke_record(journal_t *journal,
-				    transaction_t *transaction,
-				    struct journal_head **descriptorp,
-				    int *offsetp,
-				    struct jbd_revoke_record_s *record,
-				    int write_op)
-{
-	struct journal_head *descriptor;
-	int offset;
-	journal_header_t *header;
-
-	/* If we are already aborting, this all becomes a noop.  We
-           still need to go round the loop in
-           journal_write_revoke_records in order to free all of the
-           revoke records: only the IO to the journal is omitted. */
-	if (is_journal_aborted(journal))
-		return;
-
-	descriptor = *descriptorp;
-	offset = *offsetp;
-
-	/* Make sure we have a descriptor with space left for the record */
-	if (descriptor) {
-		if (offset == journal->j_blocksize) {
-			flush_descriptor(journal, descriptor, offset, write_op);
-			descriptor = NULL;
-		}
-	}
-
-	if (!descriptor) {
-		descriptor = journal_get_descriptor_buffer(journal);
-		if (!descriptor)
-			return;
-		header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
-		header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
-		header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
-		header->h_sequence  = cpu_to_be32(transaction->t_tid);
-
-		/* Record it so that we can wait for IO completion later */
-		JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
-		journal_file_buffer(descriptor, transaction, BJ_LogCtl);
-
-		offset = sizeof(journal_revoke_header_t);
-		*descriptorp = descriptor;
-	}
-
-	* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
-		cpu_to_be32(record->blocknr);
-	offset += 4;
-	*offsetp = offset;
-}
-
-/*
- * Flush a revoke descriptor out to the journal.  If we are aborting,
- * this is a noop; otherwise we are generating a buffer which needs to
- * be waited for during commit, so it has to go onto the appropriate
- * journal buffer list.
- */
-
-static void flush_descriptor(journal_t *journal,
-			     struct journal_head *descriptor,
-			     int offset, int write_op)
-{
-	journal_revoke_header_t *header;
-	struct buffer_head *bh = jh2bh(descriptor);
-
-	if (is_journal_aborted(journal)) {
-		put_bh(bh);
-		return;
-	}
-
-	header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
-	header->r_count = cpu_to_be32(offset);
-	set_buffer_jwrite(bh);
-	BUFFER_TRACE(bh, "write");
-	set_buffer_dirty(bh);
-	write_dirty_buffer(bh, write_op);
-}
-#endif
-
-/*
- * Revoke support for recovery.
- *
- * Recovery needs to be able to:
- *
- *  record all revoke records, including the tid of the latest instance
- *  of each revoke in the journal
- *
- *  check whether a given block in a given transaction should be replayed
- *  (ie. has not been revoked by a revoke record in that or a subsequent
- *  transaction)
- *
- *  empty the revoke table after recovery.
- */
-
-/*
- * First, setting revoke records.  We create a new revoke record for
- * every block ever revoked in the log as we scan it for recovery, and
- * we update the existing records if we find multiple revokes for a
- * single block.
- */
-
-int journal_set_revoke(journal_t *journal,
-		       unsigned int blocknr,
-		       tid_t sequence)
-{
-	struct jbd_revoke_record_s *record;
-
-	record = find_revoke_record(journal, blocknr);
-	if (record) {
-		/* If we have multiple occurrences, only record the
-		 * latest sequence number in the hashed record */
-		if (tid_gt(sequence, record->sequence))
-			record->sequence = sequence;
-		return 0;
-	}
-	return insert_revoke_hash(journal, blocknr, sequence);
-}
-
-/*
- * Test revoke records.  For a given block referenced in the log, has
- * that block been revoked?  A revoke record with a given transaction
- * sequence number revokes all blocks in that transaction and earlier
- * ones, but later transactions still need replayed.
- */
-
-int journal_test_revoke(journal_t *journal,
-			unsigned int blocknr,
-			tid_t sequence)
-{
-	struct jbd_revoke_record_s *record;
-
-	record = find_revoke_record(journal, blocknr);
-	if (!record)
-		return 0;
-	if (tid_gt(sequence, record->sequence))
-		return 0;
-	return 1;
-}
-
-/*
- * Finally, once recovery is over, we need to clear the revoke table so
- * that it can be reused by the running filesystem.
- */
-
-void journal_clear_revoke(journal_t *journal)
-{
-	int i;
-	struct list_head *hash_list;
-	struct jbd_revoke_record_s *record;
-	struct jbd_revoke_table_s *revoke;
-
-	revoke = journal->j_revoke;
-
-	for (i = 0; i < revoke->hash_size; i++) {
-		hash_list = &revoke->hash_table[i];
-		while (!list_empty(hash_list)) {
-			record = (struct jbd_revoke_record_s*) hash_list->next;
-			list_del(&record->hash);
-			kmem_cache_free(revoke_record_cache, record);
-		}
-	}
-}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
deleted file mode 100644
index 1695ba8334a2..000000000000
--- a/fs/jbd/transaction.c
+++ /dev/null
@@ -1,2237 +0,0 @@
-/*
- * linux/fs/jbd/transaction.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
- *
- * Copyright 1998 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Generic filesystem transaction handling code; part of the ext2fs
- * journaling system.
- *
- * This file manages transactions (compound commits managed by the
- * journaling code) and handles (individual atomic operations by the
- * filesystem).
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/hrtimer.h>
-
-static void __journal_temp_unlink_buffer(struct journal_head *jh);
-
-/*
- * get_transaction: obtain a new transaction_t object.
- *
- * Simply allocate and initialise a new transaction.  Create it in
- * RUNNING state and add it to the current journal (which should not
- * have an existing running transaction: we only make a new transaction
- * once we have started to commit the old one).
- *
- * Preconditions:
- *	The journal MUST be locked.  We don't perform atomic mallocs on the
- *	new transaction	and we can't block without protecting against other
- *	processes trying to touch the journal while it is in transition.
- *
- * Called under j_state_lock
- */
-
-static transaction_t *
-get_transaction(journal_t *journal, transaction_t *transaction)
-{
-	transaction->t_journal = journal;
-	transaction->t_state = T_RUNNING;
-	transaction->t_start_time = ktime_get();
-	transaction->t_tid = journal->j_transaction_sequence++;
-	transaction->t_expires = jiffies + journal->j_commit_interval;
-	spin_lock_init(&transaction->t_handle_lock);
-
-	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires =
-				round_jiffies_up(transaction->t_expires);
-	add_timer(&journal->j_commit_timer);
-
-	J_ASSERT(journal->j_running_transaction == NULL);
-	journal->j_running_transaction = transaction;
-
-	return transaction;
-}
-
-/*
- * Handle management.
- *
- * A handle_t is an object which represents a single atomic update to a
- * filesystem, and which tracks all of the modifications which form part
- * of that one update.
- */
-
-/*
- * start_this_handle: Given a handle, deal with any locking or stalling
- * needed to make sure that there is enough journal space for the handle
- * to begin.  Attach the handle to a transaction and set up the
- * transaction's buffer credits.
- */
-
-static int start_this_handle(journal_t *journal, handle_t *handle)
-{
-	transaction_t *transaction;
-	int needed;
-	int nblocks = handle->h_buffer_credits;
-	transaction_t *new_transaction = NULL;
-	int ret = 0;
-
-	if (nblocks > journal->j_max_transaction_buffers) {
-		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
-		       current->comm, nblocks,
-		       journal->j_max_transaction_buffers);
-		ret = -ENOSPC;
-		goto out;
-	}
-
-alloc_transaction:
-	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction),
-						GFP_NOFS|__GFP_NOFAIL);
-		if (!new_transaction) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
-
-	jbd_debug(3, "New handle %p going live.\n", handle);
-
-repeat:
-
-	/*
-	 * We need to hold j_state_lock until t_updates has been incremented,
-	 * for proper journal barrier handling
-	 */
-	spin_lock(&journal->j_state_lock);
-repeat_locked:
-	if (is_journal_aborted(journal) ||
-	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
-		spin_unlock(&journal->j_state_lock);
-		ret = -EROFS;
-		goto out;
-	}
-
-	/* Wait on the journal's transaction barrier if necessary */
-	if (journal->j_barrier_count) {
-		spin_unlock(&journal->j_state_lock);
-		wait_event(journal->j_wait_transaction_locked,
-				journal->j_barrier_count == 0);
-		goto repeat;
-	}
-
-	if (!journal->j_running_transaction) {
-		if (!new_transaction) {
-			spin_unlock(&journal->j_state_lock);
-			goto alloc_transaction;
-		}
-		get_transaction(journal, new_transaction);
-		new_transaction = NULL;
-	}
-
-	transaction = journal->j_running_transaction;
-
-	/*
-	 * If the current transaction is locked down for commit, wait for the
-	 * lock to be released.
-	 */
-	if (transaction->t_state == T_LOCKED) {
-		DEFINE_WAIT(wait);
-
-		prepare_to_wait(&journal->j_wait_transaction_locked,
-					&wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock(&journal->j_state_lock);
-		schedule();
-		finish_wait(&journal->j_wait_transaction_locked, &wait);
-		goto repeat;
-	}
-
-	/*
-	 * If there is not enough space left in the log to write all potential
-	 * buffers requested by this operation, we need to stall pending a log
-	 * checkpoint to free some more log space.
-	 */
-	spin_lock(&transaction->t_handle_lock);
-	needed = transaction->t_outstanding_credits + nblocks;
-
-	if (needed > journal->j_max_transaction_buffers) {
-		/*
-		 * If the current transaction is already too large, then start
-		 * to commit it: we can then go back and attach this handle to
-		 * a new transaction.
-		 */
-		DEFINE_WAIT(wait);
-
-		jbd_debug(2, "Handle %p starting new commit...\n", handle);
-		spin_unlock(&transaction->t_handle_lock);
-		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
-				TASK_UNINTERRUPTIBLE);
-		__log_start_commit(journal, transaction->t_tid);
-		spin_unlock(&journal->j_state_lock);
-		schedule();
-		finish_wait(&journal->j_wait_transaction_locked, &wait);
-		goto repeat;
-	}
-
-	/*
-	 * The commit code assumes that it can get enough log space
-	 * without forcing a checkpoint.  This is *critical* for
-	 * correctness: a checkpoint of a buffer which is also
-	 * associated with a committing transaction creates a deadlock,
-	 * so commit simply cannot force through checkpoints.
-	 *
-	 * We must therefore ensure the necessary space in the journal
-	 * *before* starting to dirty potentially checkpointed buffers
-	 * in the new transaction.
-	 *
-	 * The worst part is, any transaction currently committing can
-	 * reduce the free space arbitrarily.  Be careful to account for
-	 * those buffers when checkpointing.
-	 */
-
-	/*
-	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
-	 * a _lot_ of headroom: 1/4 of the journal plus the size of
-	 * the committing transaction.  Really, we only need to give it
-	 * committing_transaction->t_outstanding_credits plus "enough" for
-	 * the log control blocks.
-	 * Also, this test is inconsistent with the matching one in
-	 * journal_extend().
-	 */
-	if (__log_space_left(journal) < jbd_space_needed(journal)) {
-		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-		spin_unlock(&transaction->t_handle_lock);
-		__log_wait_for_space(journal);
-		goto repeat_locked;
-	}
-
-	/* OK, account for the buffers that this operation expects to
-	 * use and add the handle to the running transaction. */
-
-	handle->h_transaction = transaction;
-	transaction->t_outstanding_credits += nblocks;
-	transaction->t_updates++;
-	transaction->t_handle_count++;
-	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-		  handle, nblocks, transaction->t_outstanding_credits,
-		  __log_space_left(journal));
-	spin_unlock(&transaction->t_handle_lock);
-	spin_unlock(&journal->j_state_lock);
-
-	lock_map_acquire(&handle->h_lockdep_map);
-out:
-	if (unlikely(new_transaction))		/* It's usually NULL */
-		kfree(new_transaction);
-	return ret;
-}
-
-static struct lock_class_key jbd_handle_key;
-
-/* Allocate a new handle.  This should probably be in a slab... */
-static handle_t *new_handle(int nblocks)
-{
-	handle_t *handle = jbd_alloc_handle(GFP_NOFS);
-	if (!handle)
-		return NULL;
-	handle->h_buffer_credits = nblocks;
-	handle->h_ref = 1;
-
-	lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
-
-	return handle;
-}
-
-/**
- * handle_t *journal_start() - Obtain a new handle.
- * @journal: Journal to start transaction on.
- * @nblocks: number of block buffer we might modify
- *
- * We make sure that the transaction can guarantee at least nblocks of
- * modified buffers in the log.  We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
- *
- * Return a pointer to a newly allocated handle, or an ERR_PTR() value
- * on failure.
- */
-handle_t *journal_start(journal_t *journal, int nblocks)
-{
-	handle_t *handle = journal_current_handle();
-	int err;
-
-	if (!journal)
-		return ERR_PTR(-EROFS);
-
-	if (handle) {
-		J_ASSERT(handle->h_transaction->t_journal == journal);
-		handle->h_ref++;
-		return handle;
-	}
-
-	handle = new_handle(nblocks);
-	if (!handle)
-		return ERR_PTR(-ENOMEM);
-
-	current->journal_info = handle;
-
-	err = start_this_handle(journal, handle);
-	if (err < 0) {
-		jbd_free_handle(handle);
-		current->journal_info = NULL;
-		handle = ERR_PTR(err);
-	}
-	return handle;
-}
-
-/**
- * int journal_extend() - extend buffer credits.
- * @handle:  handle to 'extend'
- * @nblocks: nr blocks to try to extend by.
- *
- * Some transactions, such as large extends and truncates, can be done
- * atomically all at once or in several stages.  The operation requests
- * a credit for a number of buffer modications in advance, but can
- * extend its credit if it needs more.
- *
- * journal_extend tries to give the running handle more buffer credits.
- * It does not guarantee that allocation - this is a best-effort only.
- * The calling process MUST be able to deal cleanly with a failure to
- * extend here.
- *
- * Return 0 on success, non-zero on failure.
- *
- * return code < 0 implies an error
- * return code > 0 implies normal transaction-full status.
- */
-int journal_extend(handle_t *handle, int nblocks)
-{
-	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	int result;
-	int wanted;
-
-	result = -EIO;
-	if (is_handle_aborted(handle))
-		goto out;
-
-	result = 1;
-
-	spin_lock(&journal->j_state_lock);
-
-	/* Don't extend a locked-down transaction! */
-	if (handle->h_transaction->t_state != T_RUNNING) {
-		jbd_debug(3, "denied handle %p %d blocks: "
-			  "transaction not running\n", handle, nblocks);
-		goto error_out;
-	}
-
-	spin_lock(&transaction->t_handle_lock);
-	wanted = transaction->t_outstanding_credits + nblocks;
-
-	if (wanted > journal->j_max_transaction_buffers) {
-		jbd_debug(3, "denied handle %p %d blocks: "
-			  "transaction too large\n", handle, nblocks);
-		goto unlock;
-	}
-
-	if (wanted > __log_space_left(journal)) {
-		jbd_debug(3, "denied handle %p %d blocks: "
-			  "insufficient log space\n", handle, nblocks);
-		goto unlock;
-	}
-
-	handle->h_buffer_credits += nblocks;
-	transaction->t_outstanding_credits += nblocks;
-	result = 0;
-
-	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
-unlock:
-	spin_unlock(&transaction->t_handle_lock);
-error_out:
-	spin_unlock(&journal->j_state_lock);
-out:
-	return result;
-}
-
-
-/**
- * int journal_restart() - restart a handle.
- * @handle:  handle to restart
- * @nblocks: nr credits requested
- *
- * Restart a handle for a multi-transaction filesystem
- * operation.
- *
- * If the journal_extend() call above fails to grant new buffer credits
- * to a running handle, a call to journal_restart will commit the
- * handle's transaction so far and reattach the handle to a new
- * transaction capabable of guaranteeing the requested number of
- * credits.
- */
-
-int journal_restart(handle_t *handle, int nblocks)
-{
-	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	int ret;
-
-	/* If we've had an abort of any type, don't even think about
-	 * actually doing the restart! */
-	if (is_handle_aborted(handle))
-		return 0;
-
-	/*
-	 * First unlink the handle from its current transaction, and start the
-	 * commit on that.
-	 */
-	J_ASSERT(transaction->t_updates > 0);
-	J_ASSERT(journal_current_handle() == handle);
-
-	spin_lock(&journal->j_state_lock);
-	spin_lock(&transaction->t_handle_lock);
-	transaction->t_outstanding_credits -= handle->h_buffer_credits;
-	transaction->t_updates--;
-
-	if (!transaction->t_updates)
-		wake_up(&journal->j_wait_updates);
-	spin_unlock(&transaction->t_handle_lock);
-
-	jbd_debug(2, "restarting handle %p\n", handle);
-	__log_start_commit(journal, transaction->t_tid);
-	spin_unlock(&journal->j_state_lock);
-
-	lock_map_release(&handle->h_lockdep_map);
-	handle->h_buffer_credits = nblocks;
-	ret = start_this_handle(journal, handle);
-	return ret;
-}
-
-
-/**
- * void journal_lock_updates () - establish a transaction barrier.
- * @journal:  Journal to establish a barrier on.
- *
- * This locks out any further updates from being started, and blocks until all
- * existing updates have completed, returning only once the journal is in a
- * quiescent state with no updates running.
- *
- * We do not use simple mutex for synchronization as there are syscalls which
- * want to return with filesystem locked and that trips up lockdep. Also
- * hibernate needs to lock filesystem but locked mutex then blocks hibernation.
- * Since locking filesystem is rare operation, we use simple counter and
- * waitqueue for locking.
- */
-void journal_lock_updates(journal_t *journal)
-{
-	DEFINE_WAIT(wait);
-
-wait:
-	/* Wait for previous locked operation to finish */
-	wait_event(journal->j_wait_transaction_locked,
-		   journal->j_barrier_count == 0);
-
-	spin_lock(&journal->j_state_lock);
-	/*
-	 * Check reliably under the lock whether we are the ones winning the race
-	 * and locking the journal
-	 */
-	if (journal->j_barrier_count > 0) {
-		spin_unlock(&journal->j_state_lock);
-		goto wait;
-	}
-	++journal->j_barrier_count;
-
-	/* Wait until there are no running updates */
-	while (1) {
-		transaction_t *transaction = journal->j_running_transaction;
-
-		if (!transaction)
-			break;
-
-		spin_lock(&transaction->t_handle_lock);
-		if (!transaction->t_updates) {
-			spin_unlock(&transaction->t_handle_lock);
-			break;
-		}
-		prepare_to_wait(&journal->j_wait_updates, &wait,
-				TASK_UNINTERRUPTIBLE);
-		spin_unlock(&transaction->t_handle_lock);
-		spin_unlock(&journal->j_state_lock);
-		schedule();
-		finish_wait(&journal->j_wait_updates, &wait);
-		spin_lock(&journal->j_state_lock);
-	}
-	spin_unlock(&journal->j_state_lock);
-}
-
-/**
- * void journal_unlock_updates (journal_t* journal) - release barrier
- * @journal:  Journal to release the barrier on.
- *
- * Release a transaction barrier obtained with journal_lock_updates().
- */
-void journal_unlock_updates (journal_t *journal)
-{
-	J_ASSERT(journal->j_barrier_count != 0);
-
-	spin_lock(&journal->j_state_lock);
-	--journal->j_barrier_count;
-	spin_unlock(&journal->j_state_lock);
-	wake_up(&journal->j_wait_transaction_locked);
-}
-
-static void warn_dirty_buffer(struct buffer_head *bh)
-{
-	char b[BDEVNAME_SIZE];
-
-	printk(KERN_WARNING
-	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
-	       "There's a risk of filesystem corruption in case of system "
-	       "crash.\n",
-	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
-}
-
-/*
- * If the buffer is already part of the current transaction, then there
- * is nothing we need to do.  If it is already part of a prior
- * transaction which we are still committing to disk, then we need to
- * make sure that we do not overwrite the old copy: we do copy-out to
- * preserve the copy going to disk.  We also account the buffer against
- * the handle's metadata buffer credits (unless the buffer is already
- * part of the transaction, that is).
- *
- */
-static int
-do_get_write_access(handle_t *handle, struct journal_head *jh,
-			int force_copy)
-{
-	struct buffer_head *bh;
-	transaction_t *transaction;
-	journal_t *journal;
-	int error;
-	char *frozen_buffer = NULL;
-	int need_copy = 0;
-
-	if (is_handle_aborted(handle))
-		return -EROFS;
-
-	transaction = handle->h_transaction;
-	journal = transaction->t_journal;
-
-	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
-
-	JBUFFER_TRACE(jh, "entry");
-repeat:
-	bh = jh2bh(jh);
-
-	/* @@@ Need to check for errors here at some point. */
-
-	lock_buffer(bh);
-	jbd_lock_bh_state(bh);
-
-	/* We now hold the buffer lock so it is safe to query the buffer
-	 * state.  Is the buffer dirty?
-	 *
-	 * If so, there are two possibilities.  The buffer may be
-	 * non-journaled, and undergoing a quite legitimate writeback.
-	 * Otherwise, it is journaled, and we don't expect dirty buffers
-	 * in that state (the buffers should be marked JBD_Dirty
-	 * instead.)  So either the IO is being done under our own
-	 * control and this is a bug, or it's a third party IO such as
-	 * dump(8) (which may leave the buffer scheduled for read ---
-	 * ie. locked but not dirty) or tune2fs (which may actually have
-	 * the buffer dirtied, ugh.)  */
-
-	if (buffer_dirty(bh)) {
-		/*
-		 * First question: is this buffer already part of the current
-		 * transaction or the existing committing transaction?
-		 */
-		if (jh->b_transaction) {
-			J_ASSERT_JH(jh,
-				jh->b_transaction == transaction ||
-				jh->b_transaction ==
-					journal->j_committing_transaction);
-			if (jh->b_next_transaction)
-				J_ASSERT_JH(jh, jh->b_next_transaction ==
-							transaction);
-			warn_dirty_buffer(bh);
-		}
-		/*
-		 * In any case we need to clean the dirty flag and we must
-		 * do it under the buffer lock to be sure we don't race
-		 * with running write-out.
-		 */
-		JBUFFER_TRACE(jh, "Journalling dirty buffer");
-		clear_buffer_dirty(bh);
-		set_buffer_jbddirty(bh);
-	}
-
-	unlock_buffer(bh);
-
-	error = -EROFS;
-	if (is_handle_aborted(handle)) {
-		jbd_unlock_bh_state(bh);
-		goto out;
-	}
-	error = 0;
-
-	/*
-	 * The buffer is already part of this transaction if b_transaction or
-	 * b_next_transaction points to it
-	 */
-	if (jh->b_transaction == transaction ||
-	    jh->b_next_transaction == transaction)
-		goto done;
-
-	/*
-	 * this is the first time this transaction is touching this buffer,
-	 * reset the modified flag
-	 */
-	jh->b_modified = 0;
-
-	/*
-	 * If there is already a copy-out version of this buffer, then we don't
-	 * need to make another one
-	 */
-	if (jh->b_frozen_data) {
-		JBUFFER_TRACE(jh, "has frozen data");
-		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-		jh->b_next_transaction = transaction;
-		goto done;
-	}
-
-	/* Is there data here we need to preserve? */
-
-	if (jh->b_transaction && jh->b_transaction != transaction) {
-		JBUFFER_TRACE(jh, "owned by older transaction");
-		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-		J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_committing_transaction);
-
-		/* There is one case we have to be very careful about.
-		 * If the committing transaction is currently writing
-		 * this buffer out to disk and has NOT made a copy-out,
-		 * then we cannot modify the buffer contents at all
-		 * right now.  The essence of copy-out is that it is the
-		 * extra copy, not the primary copy, which gets
-		 * journaled.  If the primary copy is already going to
-		 * disk then we cannot do copy-out here. */
-
-		if (jh->b_jlist == BJ_Shadow) {
-			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
-			wait_queue_head_t *wqh;
-
-			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
-			JBUFFER_TRACE(jh, "on shadow: sleep");
-			jbd_unlock_bh_state(bh);
-			/* commit wakes up all shadow buffers after IO */
-			for ( ; ; ) {
-				prepare_to_wait(wqh, &wait.wait,
-						TASK_UNINTERRUPTIBLE);
-				if (jh->b_jlist != BJ_Shadow)
-					break;
-				schedule();
-			}
-			finish_wait(wqh, &wait.wait);
-			goto repeat;
-		}
-
-		/* Only do the copy if the currently-owning transaction
-		 * still needs it.  If it is on the Forget list, the
-		 * committing transaction is past that stage.  The
-		 * buffer had better remain locked during the kmalloc,
-		 * but that should be true --- we hold the journal lock
-		 * still and the buffer is already on the BUF_JOURNAL
-		 * list so won't be flushed.
-		 *
-		 * Subtle point, though: if this is a get_undo_access,
-		 * then we will be relying on the frozen_data to contain
-		 * the new value of the committed_data record after the
-		 * transaction, so we HAVE to force the frozen_data copy
-		 * in that case. */
-
-		if (jh->b_jlist != BJ_Forget || force_copy) {
-			JBUFFER_TRACE(jh, "generate frozen data");
-			if (!frozen_buffer) {
-				JBUFFER_TRACE(jh, "allocate memory for buffer");
-				jbd_unlock_bh_state(bh);
-				frozen_buffer =
-					jbd_alloc(jh2bh(jh)->b_size,
-							 GFP_NOFS);
-				if (!frozen_buffer) {
-					printk(KERN_ERR
-					       "%s: OOM for frozen_buffer\n",
-					       __func__);
-					JBUFFER_TRACE(jh, "oom!");
-					error = -ENOMEM;
-					jbd_lock_bh_state(bh);
-					goto done;
-				}
-				goto repeat;
-			}
-			jh->b_frozen_data = frozen_buffer;
-			frozen_buffer = NULL;
-			need_copy = 1;
-		}
-		jh->b_next_transaction = transaction;
-	}
-
-
-	/*
-	 * Finally, if the buffer is not journaled right now, we need to make
-	 * sure it doesn't get written to disk before the caller actually
-	 * commits the new data
-	 */
-	if (!jh->b_transaction) {
-		JBUFFER_TRACE(jh, "no transaction");
-		J_ASSERT_JH(jh, !jh->b_next_transaction);
-		JBUFFER_TRACE(jh, "file as BJ_Reserved");
-		spin_lock(&journal->j_list_lock);
-		__journal_file_buffer(jh, transaction, BJ_Reserved);
-		spin_unlock(&journal->j_list_lock);
-	}
-
-done:
-	if (need_copy) {
-		struct page *page;
-		int offset;
-		char *source;
-
-		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
-			    "Possible IO failure.\n");
-		page = jh2bh(jh)->b_page;
-		offset = offset_in_page(jh2bh(jh)->b_data);
-		source = kmap_atomic(page);
-		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
-		kunmap_atomic(source);
-	}
-	jbd_unlock_bh_state(bh);
-
-	/*
-	 * If we are about to journal a buffer, then any revoke pending on it is
-	 * no longer valid
-	 */
-	journal_cancel_revoke(handle, jh);
-
-out:
-	if (unlikely(frozen_buffer))	/* It's usually NULL */
-		jbd_free(frozen_buffer, bh->b_size);
-
-	JBUFFER_TRACE(jh, "exit");
-	return error;
-}
-
-/**
- * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
- * @handle: transaction to add buffer modifications to
- * @bh:     bh to be used for metadata writes
- *
- * Returns an error code or 0 on success.
- *
- * In full data journalling mode the buffer may be of type BJ_AsyncData,
- * because we're write()ing a buffer which is also part of a shared mapping.
- */
-
-int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
-{
-	struct journal_head *jh = journal_add_journal_head(bh);
-	int rc;
-
-	/* We do not want to get caught playing with fields which the
-	 * log thread also manipulates.  Make sure that the buffer
-	 * completes any outstanding IO before proceeding. */
-	rc = do_get_write_access(handle, jh, 0);
-	journal_put_journal_head(jh);
-	return rc;
-}
-
-
-/*
- * When the user wants to journal a newly created buffer_head
- * (ie. getblk() returned a new buffer and we are going to populate it
- * manually rather than reading off disk), then we need to keep the
- * buffer_head locked until it has been completely filled with new
- * data.  In this case, we should be able to make the assertion that
- * the bh is not already part of an existing transaction.
- *
- * The buffer should already be locked by the caller by this point.
- * There is no lock ranking violation: it was a newly created,
- * unlocked buffer beforehand. */
-
-/**
- * int journal_get_create_access () - notify intent to use newly created bh
- * @handle: transaction to new buffer to
- * @bh: new buffer.
- *
- * Call this if you create a new bh.
- */
-int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
-{
-	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	struct journal_head *jh = journal_add_journal_head(bh);
-	int err;
-
-	jbd_debug(5, "journal_head %p\n", jh);
-	err = -EROFS;
-	if (is_handle_aborted(handle))
-		goto out;
-	err = 0;
-
-	JBUFFER_TRACE(jh, "entry");
-	/*
-	 * The buffer may already belong to this transaction due to pre-zeroing
-	 * in the filesystem's new_block code.  It may also be on the previous,
-	 * committing transaction's lists, but it HAS to be in Forget state in
-	 * that case: the transaction must have deleted the buffer for it to be
-	 * reused here.
-	 */
-	jbd_lock_bh_state(bh);
-	spin_lock(&journal->j_list_lock);
-	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
-		jh->b_transaction == NULL ||
-		(jh->b_transaction == journal->j_committing_transaction &&
-			  jh->b_jlist == BJ_Forget)));
-
-	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
-
-	if (jh->b_transaction == NULL) {
-		/*
-		 * Previous journal_forget() could have left the buffer
-		 * with jbddirty bit set because it was being committed. When
-		 * the commit finished, we've filed the buffer for
-		 * checkpointing and marked it dirty. Now we are reallocating
-		 * the buffer so the transaction freeing it must have
-		 * committed and so it's safe to clear the dirty bit.
-		 */
-		clear_buffer_dirty(jh2bh(jh));
-
-		/* first access by this transaction */
-		jh->b_modified = 0;
-
-		JBUFFER_TRACE(jh, "file as BJ_Reserved");
-		__journal_file_buffer(jh, transaction, BJ_Reserved);
-	} else if (jh->b_transaction == journal->j_committing_transaction) {
-		/* first access by this transaction */
-		jh->b_modified = 0;
-
-		JBUFFER_TRACE(jh, "set next transaction");
-		jh->b_next_transaction = transaction;
-	}
-	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(bh);
-
-	/*
-	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
-	 * blocks which contain freed but then revoked metadata.  We need
-	 * to cancel the revoke in case we end up freeing it yet again
-	 * and the reallocating as data - this would cause a second revoke,
-	 * which hits an assertion error.
-	 */
-	JBUFFER_TRACE(jh, "cancelling revoke");
-	journal_cancel_revoke(handle, jh);
-out:
-	journal_put_journal_head(jh);
-	return err;
-}
-
-/**
- * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
- * @handle: transaction
- * @bh: buffer to undo
- *
- * Sometimes there is a need to distinguish between metadata which has
- * been committed to disk and that which has not.  The ext3fs code uses
- * this for freeing and allocating space, we have to make sure that we
- * do not reuse freed space until the deallocation has been committed,
- * since if we overwrote that space we would make the delete
- * un-rewindable in case of a crash.
- *
- * To deal with that, journal_get_undo_access requests write access to a
- * buffer for parts of non-rewindable operations such as delete
- * operations on the bitmaps.  The journaling code must keep a copy of
- * the buffer's contents prior to the undo_access call until such time
- * as we know that the buffer has definitely been committed to disk.
- *
- * We never need to know which transaction the committed data is part
- * of, buffers touched here are guaranteed to be dirtied later and so
- * will be committed to a new transaction in due course, at which point
- * we can discard the old committed data pointer.
- *
- * Returns error number or 0 on success.
- */
-int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
-{
-	int err;
-	struct journal_head *jh = journal_add_journal_head(bh);
-	char *committed_data = NULL;
-
-	JBUFFER_TRACE(jh, "entry");
-
-	/*
-	 * Do this first --- it can drop the journal lock, so we want to
-	 * make sure that obtaining the committed_data is done
-	 * atomically wrt. completion of any outstanding commits.
-	 */
-	err = do_get_write_access(handle, jh, 1);
-	if (err)
-		goto out;
-
-repeat:
-	if (!jh->b_committed_data) {
-		committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
-		if (!committed_data) {
-			printk(KERN_ERR "%s: No memory for committed data\n",
-				__func__);
-			err = -ENOMEM;
-			goto out;
-		}
-	}
-
-	jbd_lock_bh_state(bh);
-	if (!jh->b_committed_data) {
-		/* Copy out the current buffer contents into the
-		 * preserved, committed copy. */
-		JBUFFER_TRACE(jh, "generate b_committed data");
-		if (!committed_data) {
-			jbd_unlock_bh_state(bh);
-			goto repeat;
-		}
-
-		jh->b_committed_data = committed_data;
-		committed_data = NULL;
-		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
-	}
-	jbd_unlock_bh_state(bh);
-out:
-	journal_put_journal_head(jh);
-	if (unlikely(committed_data))
-		jbd_free(committed_data, bh->b_size);
-	return err;
-}
-
-/**
- * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * Description:
- * Mark a buffer as containing dirty data which needs to be flushed before
- * we can commit the current transaction.
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-	journal_t *journal = handle->h_transaction->t_journal;
-	int need_brelse = 0;
-	struct journal_head *jh;
-	int ret = 0;
-
-	if (is_handle_aborted(handle))
-		return ret;
-
-	jh = journal_add_journal_head(bh);
-	JBUFFER_TRACE(jh, "entry");
-
-	/*
-	 * The buffer could *already* be dirty.  Writeout can start
-	 * at any time.
-	 */
-	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
-	/*
-	 * What if the buffer is already part of a running transaction?
-	 *
-	 * There are two cases:
-	 * 1) It is part of the current running transaction.  Refile it,
-	 *    just in case we have allocated it as metadata, deallocated
-	 *    it, then reallocated it as data.
-	 * 2) It is part of the previous, still-committing transaction.
-	 *    If all we want to do is to guarantee that the buffer will be
-	 *    written to disk before this new transaction commits, then
-	 *    being sure that the *previous* transaction has this same
-	 *    property is sufficient for us!  Just leave it on its old
-	 *    transaction.
-	 *
-	 * In case (2), the buffer must not already exist as metadata
-	 * --- that would violate write ordering (a transaction is free
-	 * to write its data at any point, even before the previous
-	 * committing transaction has committed).  The caller must
-	 * never, ever allow this to happen: there's nothing we can do
-	 * about it in this layer.
-	 */
-	jbd_lock_bh_state(bh);
-	spin_lock(&journal->j_list_lock);
-
-	/* Now that we have bh_state locked, are we really still mapped? */
-	if (!buffer_mapped(bh)) {
-		JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
-		goto no_journal;
-	}
-
-	if (jh->b_transaction) {
-		JBUFFER_TRACE(jh, "has transaction");
-		if (jh->b_transaction != handle->h_transaction) {
-			JBUFFER_TRACE(jh, "belongs to older transaction");
-			J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_committing_transaction);
-
-			/* @@@ IS THIS TRUE  ? */
-			/*
-			 * Not any more.  Scenario: someone does a write()
-			 * in data=journal mode.  The buffer's transaction has
-			 * moved into commit.  Then someone does another
-			 * write() to the file.  We do the frozen data copyout
-			 * and set b_next_transaction to point to j_running_t.
-			 * And while we're in that state, someone does a
-			 * writepage() in an attempt to pageout the same area
-			 * of the file via a shared mapping.  At present that
-			 * calls journal_dirty_data(), and we get right here.
-			 * It may be too late to journal the data.  Simply
-			 * falling through to the next test will suffice: the
-			 * data will be dirty and wil be checkpointed.  The
-			 * ordering comments in the next comment block still
-			 * apply.
-			 */
-			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
-			/*
-			 * If we're journalling data, and this buffer was
-			 * subject to a write(), it could be metadata, forget
-			 * or shadow against the committing transaction.  Now,
-			 * someone has dirtied the same darn page via a mapping
-			 * and it is being writepage()'d.
-			 * We *could* just steal the page from commit, with some
-			 * fancy locking there.  Instead, we just skip it -
-			 * don't tie the page's buffers to the new transaction
-			 * at all.
-			 * Implication: if we crash before the writepage() data
-			 * is written into the filesystem, recovery will replay
-			 * the write() data.
-			 */
-			if (jh->b_jlist != BJ_None &&
-					jh->b_jlist != BJ_SyncData &&
-					jh->b_jlist != BJ_Locked) {
-				JBUFFER_TRACE(jh, "Not stealing");
-				goto no_journal;
-			}
-
-			/*
-			 * This buffer may be undergoing writeout in commit.  We
-			 * can't return from here and let the caller dirty it
-			 * again because that can cause the write-out loop in
-			 * commit to never terminate.
-			 */
-			if (buffer_dirty(bh)) {
-				get_bh(bh);
-				spin_unlock(&journal->j_list_lock);
-				jbd_unlock_bh_state(bh);
-				need_brelse = 1;
-				sync_dirty_buffer(bh);
-				jbd_lock_bh_state(bh);
-				spin_lock(&journal->j_list_lock);
-				/* Since we dropped the lock... */
-				if (!buffer_mapped(bh)) {
-					JBUFFER_TRACE(jh, "buffer got unmapped");
-					goto no_journal;
-				}
-				/* The buffer may become locked again at any
-				   time if it is redirtied */
-			}
-
-			/*
-			 * We cannot remove the buffer with io error from the
-			 * committing transaction, because otherwise it would
-			 * miss the error and the commit would not abort.
-			 */
-			if (unlikely(!buffer_uptodate(bh))) {
-				ret = -EIO;
-				goto no_journal;
-			}
-			/* We might have slept so buffer could be refiled now */
-			if (jh->b_transaction != NULL &&
-			    jh->b_transaction != handle->h_transaction) {
-				JBUFFER_TRACE(jh, "unfile from commit");
-				__journal_temp_unlink_buffer(jh);
-				/* It still points to the committing
-				 * transaction; move it to this one so
-				 * that the refile assert checks are
-				 * happy. */
-				jh->b_transaction = handle->h_transaction;
-			}
-			/* The buffer will be refiled below */
-
-		}
-		/*
-		 * Special case --- the buffer might actually have been
-		 * allocated and then immediately deallocated in the previous,
-		 * committing transaction, so might still be left on that
-		 * transaction's metadata lists.
-		 */
-		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
-			JBUFFER_TRACE(jh, "not on correct data list: unfile");
-			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
-			JBUFFER_TRACE(jh, "file as data");
-			__journal_file_buffer(jh, handle->h_transaction,
-						BJ_SyncData);
-		}
-	} else {
-		JBUFFER_TRACE(jh, "not on a transaction");
-		__journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
-	}
-no_journal:
-	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(bh);
-	if (need_brelse) {
-		BUFFER_TRACE(bh, "brelse");
-		__brelse(bh);
-	}
-	JBUFFER_TRACE(jh, "exit");
-	journal_put_journal_head(jh);
-	return ret;
-}
-
-/**
- * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
- * @handle: transaction to add buffer to.
- * @bh: buffer to mark
- *
- * Mark dirty metadata which needs to be journaled as part of the current
- * transaction.
- *
- * The buffer is placed on the transaction's metadata list and is marked
- * as belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * Special care needs to be taken if the buffer already belongs to the
- * current committing transaction (in which case we should have frozen
- * data present for that commit).  In that case, we don't relink the
- * buffer: that only gets done when the old transaction finally
- * completes its commit.
- */
-int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
-{
-	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	struct journal_head *jh = bh2jh(bh);
-
-	jbd_debug(5, "journal_head %p\n", jh);
-	JBUFFER_TRACE(jh, "entry");
-	if (is_handle_aborted(handle))
-		goto out;
-
-	jbd_lock_bh_state(bh);
-
-	if (jh->b_modified == 0) {
-		/*
-		 * This buffer's got modified and becoming part
-		 * of the transaction. This needs to be done
-		 * once a transaction -bzzz
-		 */
-		jh->b_modified = 1;
-		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
-		handle->h_buffer_credits--;
-	}
-
-	/*
-	 * fastpath, to avoid expensive locking.  If this buffer is already
-	 * on the running transaction's metadata list there is nothing to do.
-	 * Nobody can take it off again because there is a handle open.
-	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
-	 * result in this test being false, so we go in and take the locks.
-	 */
-	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
-		JBUFFER_TRACE(jh, "fastpath");
-		J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_running_transaction);
-		goto out_unlock_bh;
-	}
-
-	set_buffer_jbddirty(bh);
-
-	/*
-	 * Metadata already on the current transaction list doesn't
-	 * need to be filed.  Metadata on another transaction's list must
-	 * be committing, and will be refiled once the commit completes:
-	 * leave it alone for now.
-	 */
-	if (jh->b_transaction != transaction) {
-		JBUFFER_TRACE(jh, "already on other transaction");
-		J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_committing_transaction);
-		J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
-		/* And this case is illegal: we can't reuse another
-		 * transaction's data buffer, ever. */
-		goto out_unlock_bh;
-	}
-
-	/* That test should have eliminated the following case: */
-	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
-
-	JBUFFER_TRACE(jh, "file as BJ_Metadata");
-	spin_lock(&journal->j_list_lock);
-	__journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
-	spin_unlock(&journal->j_list_lock);
-out_unlock_bh:
-	jbd_unlock_bh_state(bh);
-out:
-	JBUFFER_TRACE(jh, "exit");
-	return 0;
-}
-
-/*
- * journal_release_buffer: undo a get_write_access without any buffer
- * updates, if the update decided in the end that it didn't need access.
- *
- */
-void
-journal_release_buffer(handle_t *handle, struct buffer_head *bh)
-{
-	BUFFER_TRACE(bh, "entry");
-}
-
-/**
- * void journal_forget() - bforget() for potentially-journaled buffers.
- * @handle: transaction handle
- * @bh:     bh to 'forget'
- *
- * We can only do the bforget if there are no commits pending against the
- * buffer.  If the buffer is dirty in the current running transaction we
- * can safely unlink it.
- *
- * bh may not be a journalled buffer at all - it may be a non-JBD
- * buffer which came off the hashtable.  Check for this.
- *
- * Decrements bh->b_count by one.
- *
- * Allow this call even if the handle has aborted --- it may be part of
- * the caller's cleanup after an abort.
- */
-int journal_forget (handle_t *handle, struct buffer_head *bh)
-{
-	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	struct journal_head *jh;
-	int drop_reserve = 0;
-	int err = 0;
-	int was_modified = 0;
-
-	BUFFER_TRACE(bh, "entry");
-
-	jbd_lock_bh_state(bh);
-	spin_lock(&journal->j_list_lock);
-
-	if (!buffer_jbd(bh))
-		goto not_jbd;
-	jh = bh2jh(bh);
-
-	/* Critical error: attempting to delete a bitmap buffer, maybe?
-	 * Don't do any jbd operations, and return an error. */
-	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
-			 "inconsistent data on disk")) {
-		err = -EIO;
-		goto not_jbd;
-	}
-
-	/* keep track of whether or not this transaction modified us */
-	was_modified = jh->b_modified;
-
-	/*
-	 * The buffer's going from the transaction, we must drop
-	 * all references -bzzz
-	 */
-	jh->b_modified = 0;
-
-	if (jh->b_transaction == handle->h_transaction) {
-		J_ASSERT_JH(jh, !jh->b_frozen_data);
-
-		/* If we are forgetting a buffer which is already part
-		 * of this transaction, then we can just drop it from
-		 * the transaction immediately. */
-		clear_buffer_dirty(bh);
-		clear_buffer_jbddirty(bh);
-
-		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
-
-		/*
-		 * we only want to drop a reference if this transaction
-		 * modified the buffer
-		 */
-		if (was_modified)
-			drop_reserve = 1;
-
-		/*
-		 * We are no longer going to journal this buffer.
-		 * However, the commit of this transaction is still
-		 * important to the buffer: the delete that we are now
-		 * processing might obsolete an old log entry, so by
-		 * committing, we can satisfy the buffer's checkpoint.
-		 *
-		 * So, if we have a checkpoint on the buffer, we should
-		 * now refile the buffer on our BJ_Forget list so that
-		 * we know to remove the checkpoint after we commit.
-		 */
-
-		if (jh->b_cp_transaction) {
-			__journal_temp_unlink_buffer(jh);
-			__journal_file_buffer(jh, transaction, BJ_Forget);
-		} else {
-			__journal_unfile_buffer(jh);
-			if (!buffer_jbd(bh)) {
-				spin_unlock(&journal->j_list_lock);
-				jbd_unlock_bh_state(bh);
-				__bforget(bh);
-				goto drop;
-			}
-		}
-	} else if (jh->b_transaction) {
-		J_ASSERT_JH(jh, (jh->b_transaction ==
-				 journal->j_committing_transaction));
-		/* However, if the buffer is still owned by a prior
-		 * (committing) transaction, we can't drop it yet... */
-		JBUFFER_TRACE(jh, "belongs to older transaction");
-		/* ... but we CAN drop it from the new transaction if we
-		 * have also modified it since the original commit. */
-
-		if (jh->b_next_transaction) {
-			J_ASSERT(jh->b_next_transaction == transaction);
-			jh->b_next_transaction = NULL;
-
-			/*
-			 * only drop a reference if this transaction modified
-			 * the buffer
-			 */
-			if (was_modified)
-				drop_reserve = 1;
-		}
-	}
-
-not_jbd:
-	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(bh);
-	__brelse(bh);
-drop:
-	if (drop_reserve) {
-		/* no need to reserve log space for this block -bzzz */
-		handle->h_buffer_credits++;
-	}
-	return err;
-}
-
-/**
- * int journal_stop() - complete a transaction
- * @handle: tranaction to complete.
- *
- * All done for a particular handle.
- *
- * There is not much action needed here.  We just return any remaining
- * buffer credits to the transaction and remove the handle.  The only
- * complication is that we need to start a commit operation if the
- * filesystem is marked for synchronous update.
- *
- * journal_stop itself will not usually return an error, but it may
- * do so in unusual circumstances.  In particular, expect it to
- * return -EIO if a journal_abort has been executed since the
- * transaction began.
- */
-int journal_stop(handle_t *handle)
-{
-	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	int err;
-	pid_t pid;
-
-	J_ASSERT(journal_current_handle() == handle);
-
-	if (is_handle_aborted(handle))
-		err = -EIO;
-	else {
-		J_ASSERT(transaction->t_updates > 0);
-		err = 0;
-	}
-
-	if (--handle->h_ref > 0) {
-		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
-			  handle->h_ref);
-		return err;
-	}
-
-	jbd_debug(4, "Handle %p going down\n", handle);
-
-	/*
-	 * Implement synchronous transaction batching.  If the handle
-	 * was synchronous, don't force a commit immediately.  Let's
-	 * yield and let another thread piggyback onto this transaction.
-	 * Keep doing that while new threads continue to arrive.
-	 * It doesn't cost much - we're about to run a commit and sleep
-	 * on IO anyway.  Speeds up many-threaded, many-dir operations
-	 * by 30x or more...
-	 *
-	 * We try and optimize the sleep time against what the underlying disk
-	 * can do, instead of having a static sleep time.  This is useful for
-	 * the case where our storage is so fast that it is more optimal to go
-	 * ahead and force a flush and wait for the transaction to be committed
-	 * than it is to wait for an arbitrary amount of time for new writers to
-	 * join the transaction.  We achieve this by measuring how long it takes
-	 * to commit a transaction, and compare it with how long this
-	 * transaction has been running, and if run time < commit time then we
-	 * sleep for the delta and commit.  This greatly helps super fast disks
-	 * that would see slowdowns as more threads started doing fsyncs.
-	 *
-	 * But don't do this if this process was the most recent one to
-	 * perform a synchronous write.  We do this to detect the case where a
-	 * single process is doing a stream of sync writes.  No point in waiting
-	 * for joiners in that case.
-	 */
-	pid = current->pid;
-	if (handle->h_sync && journal->j_last_sync_writer != pid) {
-		u64 commit_time, trans_time;
-
-		journal->j_last_sync_writer = pid;
-
-		spin_lock(&journal->j_state_lock);
-		commit_time = journal->j_average_commit_time;
-		spin_unlock(&journal->j_state_lock);
-
-		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
-						   transaction->t_start_time));
-
-		commit_time = min_t(u64, commit_time,
-				    1000*jiffies_to_usecs(1));
-
-		if (trans_time < commit_time) {
-			ktime_t expires = ktime_add_ns(ktime_get(),
-						       commit_time);
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
-		}
-	}
-
-	current->journal_info = NULL;
-	spin_lock(&journal->j_state_lock);
-	spin_lock(&transaction->t_handle_lock);
-	transaction->t_outstanding_credits -= handle->h_buffer_credits;
-	transaction->t_updates--;
-	if (!transaction->t_updates) {
-		wake_up(&journal->j_wait_updates);
-		if (journal->j_barrier_count)
-			wake_up(&journal->j_wait_transaction_locked);
-	}
-
-	/*
-	 * If the handle is marked SYNC, we need to set another commit
-	 * going!  We also want to force a commit if the current
-	 * transaction is occupying too much of the log, or if the
-	 * transaction is too old now.
-	 */
-	if (handle->h_sync ||
-			transaction->t_outstanding_credits >
-				journal->j_max_transaction_buffers ||
-			time_after_eq(jiffies, transaction->t_expires)) {
-		/* Do this even for aborted journals: an abort still
-		 * completes the commit thread, it just doesn't write
-		 * anything to disk. */
-		tid_t tid = transaction->t_tid;
-
-		spin_unlock(&transaction->t_handle_lock);
-		jbd_debug(2, "transaction too old, requesting commit for "
-					"handle %p\n", handle);
-		/* This is non-blocking */
-		__log_start_commit(journal, transaction->t_tid);
-		spin_unlock(&journal->j_state_lock);
-
-		/*
-		 * Special case: JFS_SYNC synchronous updates require us
-		 * to wait for the commit to complete.
-		 */
-		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
-			err = log_wait_commit(journal, tid);
-	} else {
-		spin_unlock(&transaction->t_handle_lock);
-		spin_unlock(&journal->j_state_lock);
-	}
-
-	lock_map_release(&handle->h_lockdep_map);
-
-	jbd_free_handle(handle);
-	return err;
-}
-
-/**
- * int journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk.  May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int journal_force_commit(journal_t *journal)
-{
-	handle_t *handle;
-	int ret;
-
-	handle = journal_start(journal, 1);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-	} else {
-		handle->h_sync = 1;
-		ret = journal_stop(handle);
-	}
-	return ret;
-}
-
-/*
- *
- * List management code snippets: various functions for manipulating the
- * transaction buffer lists.
- *
- */
-
-/*
- * Append a buffer to a transaction list, given the transaction's list head
- * pointer.
- *
- * j_list_lock is held.
- *
- * jbd_lock_bh_state(jh2bh(jh)) is held.
- */
-
-static inline void
-__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
-{
-	if (!*list) {
-		jh->b_tnext = jh->b_tprev = jh;
-		*list = jh;
-	} else {
-		/* Insert at the tail of the list to preserve order */
-		struct journal_head *first = *list, *last = first->b_tprev;
-		jh->b_tprev = last;
-		jh->b_tnext = first;
-		last->b_tnext = first->b_tprev = jh;
-	}
-}
-
-/*
- * Remove a buffer from a transaction list, given the transaction's list
- * head pointer.
- *
- * Called with j_list_lock held, and the journal may not be locked.
- *
- * jbd_lock_bh_state(jh2bh(jh)) is held.
- */
-
-static inline void
-__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
-{
-	if (*list == jh) {
-		*list = jh->b_tnext;
-		if (*list == jh)
-			*list = NULL;
-	}
-	jh->b_tprev->b_tnext = jh->b_tnext;
-	jh->b_tnext->b_tprev = jh->b_tprev;
-}
-
-/*
- * Remove a buffer from the appropriate transaction list.
- *
- * Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
- *
- * Called under j_list_lock.  The journal may not be locked.
- */
-static void __journal_temp_unlink_buffer(struct journal_head *jh)
-{
-	struct journal_head **list = NULL;
-	transaction_t *transaction;
-	struct buffer_head *bh = jh2bh(jh);
-
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
-	transaction = jh->b_transaction;
-	if (transaction)
-		assert_spin_locked(&transaction->t_journal->j_list_lock);
-
-	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
-	if (jh->b_jlist != BJ_None)
-		J_ASSERT_JH(jh, transaction != NULL);
-
-	switch (jh->b_jlist) {
-	case BJ_None:
-		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
-	case BJ_Metadata:
-		transaction->t_nr_buffers--;
-		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
-		list = &transaction->t_buffers;
-		break;
-	case BJ_Forget:
-		list = &transaction->t_forget;
-		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
-	case BJ_Shadow:
-		list = &transaction->t_shadow_list;
-		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
-		break;
-	case BJ_Reserved:
-		list = &transaction->t_reserved_list;
-		break;
-	case BJ_Locked:
-		list = &transaction->t_locked_list;
-		break;
-	}
-
-	__blist_del_buffer(list, jh);
-	jh->b_jlist = BJ_None;
-	if (test_clear_buffer_jbddirty(bh))
-		mark_buffer_dirty(bh);	/* Expose it to the VM */
-}
-
-/*
- * Remove buffer from all transactions.
- *
- * Called with bh_state lock and j_list_lock
- *
- * jh and bh may be already freed when this function returns.
- */
-void __journal_unfile_buffer(struct journal_head *jh)
-{
-	__journal_temp_unlink_buffer(jh);
-	jh->b_transaction = NULL;
-	journal_put_journal_head(jh);
-}
-
-void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
-{
-	struct buffer_head *bh = jh2bh(jh);
-
-	/* Get reference so that buffer cannot be freed before we unlock it */
-	get_bh(bh);
-	jbd_lock_bh_state(bh);
-	spin_lock(&journal->j_list_lock);
-	__journal_unfile_buffer(jh);
-	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(bh);
-	__brelse(bh);
-}
-
-/*
- * Called from journal_try_to_free_buffers().
- *
- * Called under jbd_lock_bh_state(bh)
- */
-static void
-__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
-{
-	struct journal_head *jh;
-
-	jh = bh2jh(bh);
-
-	if (buffer_locked(bh) || buffer_dirty(bh))
-		goto out;
-
-	if (jh->b_next_transaction != NULL)
-		goto out;
-
-	spin_lock(&journal->j_list_lock);
-	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
-		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
-			/* A written-back ordered data buffer */
-			JBUFFER_TRACE(jh, "release data");
-			__journal_unfile_buffer(jh);
-		}
-	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
-		/* written-back checkpointed metadata buffer */
-		if (jh->b_jlist == BJ_None) {
-			JBUFFER_TRACE(jh, "remove from checkpoint list");
-			__journal_remove_checkpoint(jh);
-		}
-	}
-	spin_unlock(&journal->j_list_lock);
-out:
-	return;
-}
-
-/**
- * int journal_try_to_free_buffers() - try to free page buffers.
- * @journal: journal for operation
- * @page: to try and free
- * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
- *
- *
- * For all the buffers on this page,
- * if they are fully written out ordered data, move them onto BUF_CLEAN
- * so try_to_free_buffers() can reap them.
- *
- * This function returns non-zero if we wish try_to_free_buffers()
- * to be called. We do this if the page is releasable by try_to_free_buffers().
- * We also do it if the page has locked or dirty buffers and the caller wants
- * us to perform sync or async writeout.
- *
- * This complicates JBD locking somewhat.  We aren't protected by the
- * BKL here.  We wish to remove the buffer from its committing or
- * running transaction's ->t_datalist via __journal_unfile_buffer.
- *
- * This may *change* the value of transaction_t->t_datalist, so anyone
- * who looks at t_datalist needs to lock against this function.
- *
- * Even worse, someone may be doing a journal_dirty_data on this
- * buffer.  So we need to lock against that.  journal_dirty_data()
- * will come out of the lock with the buffer dirty, which makes it
- * ineligible for release here.
- *
- * Who else is affected by this?  hmm...  Really the only contender
- * is do_get_write_access() - it could be looking at the buffer while
- * journal_try_to_free_buffer() is changing its state.  But that
- * cannot happen because we never reallocate freed data as metadata
- * while the data is part of a transaction.  Yes?
- *
- * Return 0 on failure, 1 on success
- */
-int journal_try_to_free_buffers(journal_t *journal,
-				struct page *page, gfp_t gfp_mask)
-{
-	struct buffer_head *head;
-	struct buffer_head *bh;
-	int ret = 0;
-
-	J_ASSERT(PageLocked(page));
-
-	head = page_buffers(page);
-	bh = head;
-	do {
-		struct journal_head *jh;
-
-		/*
-		 * We take our own ref against the journal_head here to avoid
-		 * having to add tons of locking around each instance of
-		 * journal_put_journal_head().
-		 */
-		jh = journal_grab_journal_head(bh);
-		if (!jh)
-			continue;
-
-		jbd_lock_bh_state(bh);
-		__journal_try_to_free_buffer(journal, bh);
-		journal_put_journal_head(jh);
-		jbd_unlock_bh_state(bh);
-		if (buffer_jbd(bh))
-			goto busy;
-	} while ((bh = bh->b_this_page) != head);
-
-	ret = try_to_free_buffers(page);
-
-busy:
-	return ret;
-}
-
-/*
- * This buffer is no longer needed.  If it is on an older transaction's
- * checkpoint list we need to record it on this transaction's forget list
- * to pin this buffer (and hence its checkpointing transaction) down until
- * this transaction commits.  If the buffer isn't on a checkpoint list, we
- * release it.
- * Returns non-zero if JBD no longer has an interest in the buffer.
- *
- * Called under j_list_lock.
- *
- * Called under jbd_lock_bh_state(bh).
- */
-static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
-{
-	int may_free = 1;
-	struct buffer_head *bh = jh2bh(jh);
-
-	if (jh->b_cp_transaction) {
-		JBUFFER_TRACE(jh, "on running+cp transaction");
-		__journal_temp_unlink_buffer(jh);
-		/*
-		 * We don't want to write the buffer anymore, clear the
-		 * bit so that we don't confuse checks in
-		 * __journal_file_buffer
-		 */
-		clear_buffer_dirty(bh);
-		__journal_file_buffer(jh, transaction, BJ_Forget);
-		may_free = 0;
-	} else {
-		JBUFFER_TRACE(jh, "on running transaction");
-		__journal_unfile_buffer(jh);
-	}
-	return may_free;
-}
-
-/*
- * journal_invalidatepage
- *
- * This code is tricky.  It has a number of cases to deal with.
- *
- * There are two invariants which this code relies on:
- *
- * i_size must be updated on disk before we start calling invalidatepage on the
- * data.
- *
- *  This is done in ext3 by defining an ext3_setattr method which
- *  updates i_size before truncate gets going.  By maintaining this
- *  invariant, we can be sure that it is safe to throw away any buffers
- *  attached to the current transaction: once the transaction commits,
- *  we know that the data will not be needed.
- *
- *  Note however that we can *not* throw away data belonging to the
- *  previous, committing transaction!
- *
- * Any disk blocks which *are* part of the previous, committing
- * transaction (and which therefore cannot be discarded immediately) are
- * not going to be reused in the new running transaction
- *
- *  The bitmap committed_data images guarantee this: any block which is
- *  allocated in one transaction and removed in the next will be marked
- *  as in-use in the committed_data bitmap, so cannot be reused until
- *  the next transaction to delete the block commits.  This means that
- *  leaving committing buffers dirty is quite safe: the disk blocks
- *  cannot be reallocated to a different file and so buffer aliasing is
- *  not possible.
- *
- *
- * The above applies mainly to ordered data mode.  In writeback mode we
- * don't make guarantees about the order in which data hits disk --- in
- * particular we don't guarantee that new dirty data is flushed before
- * transaction commit --- so it is always safe just to discard data
- * immediately in that mode.  --sct
- */
-
-/*
- * The journal_unmap_buffer helper function returns zero if the buffer
- * concerned remains pinned as an anonymous buffer belonging to an older
- * transaction.
- *
- * We're outside-transaction here.  Either or both of j_running_transaction
- * and j_committing_transaction may be NULL.
- */
-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
-				int partial_page)
-{
-	transaction_t *transaction;
-	struct journal_head *jh;
-	int may_free = 1;
-
-	BUFFER_TRACE(bh, "entry");
-
-retry:
-	/*
-	 * It is safe to proceed here without the j_list_lock because the
-	 * buffers cannot be stolen by try_to_free_buffers as long as we are
-	 * holding the page lock. --sct
-	 */
-
-	if (!buffer_jbd(bh))
-		goto zap_buffer_unlocked;
-
-	spin_lock(&journal->j_state_lock);
-	jbd_lock_bh_state(bh);
-	spin_lock(&journal->j_list_lock);
-
-	jh = journal_grab_journal_head(bh);
-	if (!jh)
-		goto zap_buffer_no_jh;
-
-	/*
-	 * We cannot remove the buffer from checkpoint lists until the
-	 * transaction adding inode to orphan list (let's call it T)
-	 * is committed.  Otherwise if the transaction changing the
-	 * buffer would be cleaned from the journal before T is
-	 * committed, a crash will cause that the correct contents of
-	 * the buffer will be lost.  On the other hand we have to
-	 * clear the buffer dirty bit at latest at the moment when the
-	 * transaction marking the buffer as freed in the filesystem
-	 * structures is committed because from that moment on the
-	 * block can be reallocated and used by a different page.
-	 * Since the block hasn't been freed yet but the inode has
-	 * already been added to orphan list, it is safe for us to add
-	 * the buffer to BJ_Forget list of the newest transaction.
-	 *
-	 * Also we have to clear buffer_mapped flag of a truncated buffer
-	 * because the buffer_head may be attached to the page straddling
-	 * i_size (can happen only when blocksize < pagesize) and thus the
-	 * buffer_head can be reused when the file is extended again. So we end
-	 * up keeping around invalidated buffers attached to transactions'
-	 * BJ_Forget list just to stop checkpointing code from cleaning up
-	 * the transaction this buffer was modified in.
-	 */
-	transaction = jh->b_transaction;
-	if (transaction == NULL) {
-		/* First case: not on any transaction.  If it
-		 * has no checkpoint link, then we can zap it:
-		 * it's a writeback-mode buffer so we don't care
-		 * if it hits disk safely. */
-		if (!jh->b_cp_transaction) {
-			JBUFFER_TRACE(jh, "not on any transaction: zap");
-			goto zap_buffer;
-		}
-
-		if (!buffer_dirty(bh)) {
-			/* bdflush has written it.  We can drop it now */
-			goto zap_buffer;
-		}
-
-		/* OK, it must be in the journal but still not
-		 * written fully to disk: it's metadata or
-		 * journaled data... */
-
-		if (journal->j_running_transaction) {
-			/* ... and once the current transaction has
-			 * committed, the buffer won't be needed any
-			 * longer. */
-			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
-			may_free = __dispose_buffer(jh,
-					journal->j_running_transaction);
-			goto zap_buffer;
-		} else {
-			/* There is no currently-running transaction. So the
-			 * orphan record which we wrote for this file must have
-			 * passed into commit.  We must attach this buffer to
-			 * the committing transaction, if it exists. */
-			if (journal->j_committing_transaction) {
-				JBUFFER_TRACE(jh, "give to committing trans");
-				may_free = __dispose_buffer(jh,
-					journal->j_committing_transaction);
-				goto zap_buffer;
-			} else {
-				/* The orphan record's transaction has
-				 * committed.  We can cleanse this buffer */
-				clear_buffer_jbddirty(bh);
-				goto zap_buffer;
-			}
-		}
-	} else if (transaction == journal->j_committing_transaction) {
-		JBUFFER_TRACE(jh, "on committing transaction");
-		if (jh->b_jlist == BJ_Locked) {
-			/*
-			 * The buffer is on the committing transaction's locked
-			 * list.  We have the buffer locked, so I/O has
-			 * completed.  So we can nail the buffer now.
-			 */
-			may_free = __dispose_buffer(jh, transaction);
-			goto zap_buffer;
-		}
-		/*
-		 * The buffer is committing, we simply cannot touch
-		 * it. If the page is straddling i_size we have to wait
-		 * for commit and try again.
-		 */
-		if (partial_page) {
-			tid_t tid = journal->j_committing_transaction->t_tid;
-
-			journal_put_journal_head(jh);
-			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
-			spin_unlock(&journal->j_state_lock);
-			unlock_buffer(bh);
-			log_wait_commit(journal, tid);
-			lock_buffer(bh);
-			goto retry;
-		}
-		/*
-		 * OK, buffer won't be reachable after truncate. We just set
-		 * j_next_transaction to the running transaction (if there is
-		 * one) and mark buffer as freed so that commit code knows it
-		 * should clear dirty bits when it is done with the buffer.
-		 */
-		set_buffer_freed(bh);
-		if (journal->j_running_transaction && buffer_jbddirty(bh))
-			jh->b_next_transaction = journal->j_running_transaction;
-		journal_put_journal_head(jh);
-		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
-		spin_unlock(&journal->j_state_lock);
-		return 0;
-	} else {
-		/* Good, the buffer belongs to the running transaction.
-		 * We are writing our own transaction's data, not any
-		 * previous one's, so it is safe to throw it away
-		 * (remember that we expect the filesystem to have set
-		 * i_size already for this truncate so recovery will not
-		 * expose the disk blocks we are discarding here.) */
-		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
-		JBUFFER_TRACE(jh, "on running transaction");
-		may_free = __dispose_buffer(jh, transaction);
-	}
-
-zap_buffer:
-	/*
-	 * This is tricky. Although the buffer is truncated, it may be reused
-	 * if blocksize < pagesize and it is attached to the page straddling
-	 * EOF. Since the buffer might have been added to BJ_Forget list of the
-	 * running transaction, journal_get_write_access() won't clear
-	 * b_modified and credit accounting gets confused. So clear b_modified
-	 * here. */
-	jh->b_modified = 0;
-	journal_put_journal_head(jh);
-zap_buffer_no_jh:
-	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(bh);
-	spin_unlock(&journal->j_state_lock);
-zap_buffer_unlocked:
-	clear_buffer_dirty(bh);
-	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
-	clear_buffer_mapped(bh);
-	clear_buffer_req(bh);
-	clear_buffer_new(bh);
-	bh->b_bdev = NULL;
-	return may_free;
-}
-
-/**
- * void journal_invalidatepage() - invalidate a journal page
- * @journal: journal to use for flush
- * @page:    page to flush
- * @offset:  offset of the range to invalidate
- * @length:  length of the range to invalidate
- *
- * Reap page buffers containing data in specified range in page.
- */
-void journal_invalidatepage(journal_t *journal,
-		      struct page *page,
-		      unsigned int offset,
-		      unsigned int length)
-{
-	struct buffer_head *head, *bh, *next;
-	unsigned int stop = offset + length;
-	unsigned int curr_off = 0;
-	int partial_page = (offset || length < PAGE_CACHE_SIZE);
-	int may_free = 1;
-
-	if (!PageLocked(page))
-		BUG();
-	if (!page_has_buffers(page))
-		return;
-
-	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
-
-	/* We will potentially be playing with lists other than just the
-	 * data lists (especially for journaled data mode), so be
-	 * cautious in our locking. */
-
-	head = bh = page_buffers(page);
-	do {
-		unsigned int next_off = curr_off + bh->b_size;
-		next = bh->b_this_page;
-
-		if (next_off > stop)
-			return;
-
-		if (offset <= curr_off) {
-			/* This block is wholly outside the truncation point */
-			lock_buffer(bh);
-			may_free &= journal_unmap_buffer(journal, bh,
-							 partial_page);
-			unlock_buffer(bh);
-		}
-		curr_off = next_off;
-		bh = next;
-
-	} while (bh != head);
-
-	if (!partial_page) {
-		if (may_free && try_to_free_buffers(page))
-			J_ASSERT(!page_has_buffers(page));
-	}
-}
-
-/*
- * File a buffer on the given transaction list.
- */
-void __journal_file_buffer(struct journal_head *jh,
-			transaction_t *transaction, int jlist)
-{
-	struct journal_head **list = NULL;
-	int was_dirty = 0;
-	struct buffer_head *bh = jh2bh(jh);
-
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
-	assert_spin_locked(&transaction->t_journal->j_list_lock);
-
-	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
-	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
-				jh->b_transaction == NULL);
-
-	if (jh->b_transaction && jh->b_jlist == jlist)
-		return;
-
-	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-	    jlist == BJ_Shadow || jlist == BJ_Forget) {
-		/*
-		 * For metadata buffers, we track dirty bit in buffer_jbddirty
-		 * instead of buffer_dirty. We should not see a dirty bit set
-		 * here because we clear it in do_get_write_access but e.g.
-		 * tune2fs can modify the sb and set the dirty bit at any time
-		 * so we try to gracefully handle that.
-		 */
-		if (buffer_dirty(bh))
-			warn_dirty_buffer(bh);
-		if (test_clear_buffer_dirty(bh) ||
-		    test_clear_buffer_jbddirty(bh))
-			was_dirty = 1;
-	}
-
-	if (jh->b_transaction)
-		__journal_temp_unlink_buffer(jh);
-	else
-		journal_grab_journal_head(bh);
-	jh->b_transaction = transaction;
-
-	switch (jlist) {
-	case BJ_None:
-		J_ASSERT_JH(jh, !jh->b_committed_data);
-		J_ASSERT_JH(jh, !jh->b_frozen_data);
-		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
-	case BJ_Metadata:
-		transaction->t_nr_buffers++;
-		list = &transaction->t_buffers;
-		break;
-	case BJ_Forget:
-		list = &transaction->t_forget;
-		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
-	case BJ_Shadow:
-		list = &transaction->t_shadow_list;
-		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
-		break;
-	case BJ_Reserved:
-		list = &transaction->t_reserved_list;
-		break;
-	case BJ_Locked:
-		list =  &transaction->t_locked_list;
-		break;
-	}
-
-	__blist_add_buffer(list, jh);
-	jh->b_jlist = jlist;
-
-	if (was_dirty)
-		set_buffer_jbddirty(bh);
-}
-
-void journal_file_buffer(struct journal_head *jh,
-				transaction_t *transaction, int jlist)
-{
-	jbd_lock_bh_state(jh2bh(jh));
-	spin_lock(&transaction->t_journal->j_list_lock);
-	__journal_file_buffer(jh, transaction, jlist);
-	spin_unlock(&transaction->t_journal->j_list_lock);
-	jbd_unlock_bh_state(jh2bh(jh));
-}
-
-/*
- * Remove a buffer from its current buffer list in preparation for
- * dropping it from its current transaction entirely.  If the buffer has
- * already started to be used by a subsequent transaction, refile the
- * buffer on that transaction's metadata list.
- *
- * Called under j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh))
- *
- * jh and bh may be already free when this function returns
- */
-void __journal_refile_buffer(struct journal_head *jh)
-{
-	int was_dirty, jlist;
-	struct buffer_head *bh = jh2bh(jh);
-
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
-	if (jh->b_transaction)
-		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
-
-	/* If the buffer is now unused, just drop it. */
-	if (jh->b_next_transaction == NULL) {
-		__journal_unfile_buffer(jh);
-		return;
-	}
-
-	/*
-	 * It has been modified by a later transaction: add it to the new
-	 * transaction's metadata list.
-	 */
-
-	was_dirty = test_clear_buffer_jbddirty(bh);
-	__journal_temp_unlink_buffer(jh);
-	/*
-	 * We set b_transaction here because b_next_transaction will inherit
-	 * our jh reference and thus __journal_file_buffer() must not take a
-	 * new one.
-	 */
-	jh->b_transaction = jh->b_next_transaction;
-	jh->b_next_transaction = NULL;
-	if (buffer_freed(bh))
-		jlist = BJ_Forget;
-	else if (jh->b_modified)
-		jlist = BJ_Metadata;
-	else
-		jlist = BJ_Reserved;
-	__journal_file_buffer(jh, jh->b_transaction, jlist);
-	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
-
-	if (was_dirty)
-		set_buffer_jbddirty(bh);
-}
-
-/*
- * __journal_refile_buffer() with necessary locking added. We take our bh
- * reference so that we can safely unlock bh.
- *
- * The jh and bh may be freed by this call.
- */
-void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
-{
-	struct buffer_head *bh = jh2bh(jh);
-
-	/* Get reference so that buffer cannot be freed before we unlock it */
-	get_bh(bh);
-	jbd_lock_bh_state(bh);
-	spin_lock(&journal->j_list_lock);
-	__journal_refile_buffer(jh);
-	jbd_unlock_bh_state(bh);
-	spin_unlock(&journal->j_list_lock);
-	__brelse(bh);
-}
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
deleted file mode 100644
index d32615280be9..000000000000
--- a/include/linux/jbd.h
+++ /dev/null
@@ -1,1047 +0,0 @@
-/*
- * linux/include/linux/jbd.h
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>
- *
- * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Definitions for transaction data structures for the buffer cache
- * filesystem journaling support.
- */
-
-#ifndef _LINUX_JBD_H
-#define _LINUX_JBD_H
-
-/* Allow this file to be included directly into e2fsprogs */
-#ifndef __KERNEL__
-#include "jfs_compat.h"
-#define JFS_DEBUG
-#define jfs_debug jbd_debug
-#else
-
-#include <linux/types.h>
-#include <linux/buffer_head.h>
-#include <linux/journal-head.h>
-#include <linux/stddef.h>
-#include <linux/mutex.h>
-#include <linux/timer.h>
-#include <linux/lockdep.h>
-#include <linux/slab.h>
-
-#define journal_oom_retry 1
-
-/*
- * Define JBD_PARANOID_IOFAIL to cause a kernel BUG() if ext3 finds
- * certain classes of error which can occur due to failed IOs.  Under
- * normal use we want ext3 to continue after such errors, because
- * hardware _can_ fail, but for debugging purposes when running tests on
- * known-good hardware we may want to trap these errors.
- */
-#undef JBD_PARANOID_IOFAIL
-
-/*
- * The default maximum commit age, in seconds.
- */
-#define JBD_DEFAULT_MAX_COMMIT_AGE 5
-
-#ifdef CONFIG_JBD_DEBUG
-/*
- * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
- * consistency checks.  By default we don't do this unless
- * CONFIG_JBD_DEBUG is on.
- */
-#define JBD_EXPENSIVE_CHECKING
-extern u8 journal_enable_debug;
-
-void __jbd_debug(int level, const char *file, const char *func,
-		 unsigned int line, const char *fmt, ...);
-
-#define jbd_debug(n, fmt, a...) \
-	__jbd_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
-#else
-#define jbd_debug(n, fmt, a...)    /**/
-#endif
-
-static inline void *jbd_alloc(size_t size, gfp_t flags)
-{
-	return (void *)__get_free_pages(flags, get_order(size));
-}
-
-static inline void jbd_free(void *ptr, size_t size)
-{
-	free_pages((unsigned long)ptr, get_order(size));
-}
-
-#define JFS_MIN_JOURNAL_BLOCKS 1024
-
-
-/**
- * typedef handle_t - The handle_t type represents a single atomic update being performed by some process.
- *
- * All filesystem modifications made by the process go
- * through this handle.  Recursive operations (such as quota operations)
- * are gathered into a single update.
- *
- * The buffer credits field is used to account for journaled buffers
- * being modified by the running process.  To ensure that there is
- * enough log space for all outstanding operations, we need to limit the
- * number of outstanding buffers possible at any time.  When the
- * operation completes, any buffer credits not used are credited back to
- * the transaction, so that at all times we know how many buffers the
- * outstanding updates on a transaction might possibly touch.
- *
- * This is an opaque datatype.
- **/
-typedef struct handle_s		handle_t;	/* Atomic operation type */
-
-
-/**
- * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem.
- *
- * journal_t is linked to from the fs superblock structure.
- *
- * We use the journal_t to keep track of all outstanding transaction
- * activity on the filesystem, and to manage the state of the log
- * writing process.
- *
- * This is an opaque datatype.
- **/
-typedef struct journal_s	journal_t;	/* Journal control structure */
-#endif
-
-/*
- * Internal structures used by the logging mechanism:
- */
-
-#define JFS_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */
-
-/*
- * On-disk structures
- */
-
-/*
- * Descriptor block types:
- */
-
-#define JFS_DESCRIPTOR_BLOCK	1
-#define JFS_COMMIT_BLOCK	2
-#define JFS_SUPERBLOCK_V1	3
-#define JFS_SUPERBLOCK_V2	4
-#define JFS_REVOKE_BLOCK	5
-
-/*
- * Standard header for all descriptor blocks:
- */
-typedef struct journal_header_s
-{
-	__be32		h_magic;
-	__be32		h_blocktype;
-	__be32		h_sequence;
-} journal_header_t;
-
-
-/*
- * The block tag: used to describe a single buffer in the journal
- */
-typedef struct journal_block_tag_s
-{
-	__be32		t_blocknr;	/* The on-disk block number */
-	__be32		t_flags;	/* See below */
-} journal_block_tag_t;
-
-/*
- * The revoke descriptor: used on disk to describe a series of blocks to
- * be revoked from the log
- */
-typedef struct journal_revoke_header_s
-{
-	journal_header_t r_header;
-	__be32		 r_count;	/* Count of bytes used in the block */
-} journal_revoke_header_t;
-
-
-/* Definitions for the journal tag flags word: */
-#define JFS_FLAG_ESCAPE		1	/* on-disk block is escaped */
-#define JFS_FLAG_SAME_UUID	2	/* block has same uuid as previous */
-#define JFS_FLAG_DELETED	4	/* block deleted by this transaction */
-#define JFS_FLAG_LAST_TAG	8	/* last tag in this descriptor block */
-
-
-/*
- * The journal superblock.  All fields are in big-endian byte order.
- */
-typedef struct journal_superblock_s
-{
-/* 0x0000 */
-	journal_header_t s_header;
-
-/* 0x000C */
-	/* Static information describing the journal */
-	__be32	s_blocksize;		/* journal device blocksize */
-	__be32	s_maxlen;		/* total blocks in journal file */
-	__be32	s_first;		/* first block of log information */
-
-/* 0x0018 */
-	/* Dynamic information describing the current state of the log */
-	__be32	s_sequence;		/* first commit ID expected in log */
-	__be32	s_start;		/* blocknr of start of log */
-
-/* 0x0020 */
-	/* Error value, as set by journal_abort(). */
-	__be32	s_errno;
-
-/* 0x0024 */
-	/* Remaining fields are only valid in a version-2 superblock */
-	__be32	s_feature_compat;	/* compatible feature set */
-	__be32	s_feature_incompat;	/* incompatible feature set */
-	__be32	s_feature_ro_compat;	/* readonly-compatible feature set */
-/* 0x0030 */
-	__u8	s_uuid[16];		/* 128-bit uuid for journal */
-
-/* 0x0040 */
-	__be32	s_nr_users;		/* Nr of filesystems sharing log */
-
-	__be32	s_dynsuper;		/* Blocknr of dynamic superblock copy*/
-
-/* 0x0048 */
-	__be32	s_max_transaction;	/* Limit of journal blocks per trans.*/
-	__be32	s_max_trans_data;	/* Limit of data blocks per trans. */
-
-/* 0x0050 */
-	__u32	s_padding[44];
-
-/* 0x0100 */
-	__u8	s_users[16*48];		/* ids of all fs'es sharing the log */
-/* 0x0400 */
-} journal_superblock_t;
-
-#define JFS_HAS_COMPAT_FEATURE(j,mask)					\
-	((j)->j_format_version >= 2 &&					\
-	 ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask))))
-#define JFS_HAS_RO_COMPAT_FEATURE(j,mask)				\
-	((j)->j_format_version >= 2 &&					\
-	 ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask))))
-#define JFS_HAS_INCOMPAT_FEATURE(j,mask)				\
-	((j)->j_format_version >= 2 &&					\
-	 ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
-
-#define JFS_FEATURE_INCOMPAT_REVOKE	0x00000001
-
-/* Features known to this kernel version: */
-#define JFS_KNOWN_COMPAT_FEATURES	0
-#define JFS_KNOWN_ROCOMPAT_FEATURES	0
-#define JFS_KNOWN_INCOMPAT_FEATURES	JFS_FEATURE_INCOMPAT_REVOKE
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/sched.h>
-
-enum jbd_state_bits {
-	BH_JBD			/* Has an attached ext3 journal_head */
-	  = BH_PrivateStart,
-	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
-	BH_Freed,		/* Has been freed (truncated) */
-	BH_Revoked,		/* Has been revoked from the log */
-	BH_RevokeValid,		/* Revoked flag is valid */
-	BH_JBDDirty,		/* Is dirty but journaled */
-	BH_State,		/* Pins most journal_head state */
-	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
-	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
-	BH_JBDPrivateStart,	/* First bit available for private use by FS */
-};
-
-BUFFER_FNS(JBD, jbd)
-BUFFER_FNS(JWrite, jwrite)
-BUFFER_FNS(JBDDirty, jbddirty)
-TAS_BUFFER_FNS(JBDDirty, jbddirty)
-BUFFER_FNS(Revoked, revoked)
-TAS_BUFFER_FNS(Revoked, revoked)
-BUFFER_FNS(RevokeValid, revokevalid)
-TAS_BUFFER_FNS(RevokeValid, revokevalid)
-BUFFER_FNS(Freed, freed)
-
-#include <linux/jbd_common.h>
-
-#define J_ASSERT(assert)	BUG_ON(!(assert))
-
-#define J_ASSERT_BH(bh, expr)	J_ASSERT(expr)
-#define J_ASSERT_JH(jh, expr)	J_ASSERT(expr)
-
-#if defined(JBD_PARANOID_IOFAIL)
-#define J_EXPECT(expr, why...)		J_ASSERT(expr)
-#define J_EXPECT_BH(bh, expr, why...)	J_ASSERT_BH(bh, expr)
-#define J_EXPECT_JH(jh, expr, why...)	J_ASSERT_JH(jh, expr)
-#else
-#define __journal_expect(expr, why...)					     \
-	({								     \
-		int val = (expr);					     \
-		if (!val) {						     \
-			printk(KERN_ERR					     \
-				"EXT3-fs unexpected failure: %s;\n",# expr); \
-			printk(KERN_ERR why "\n");			     \
-		}							     \
-		val;							     \
-	})
-#define J_EXPECT(expr, why...)		__journal_expect(expr, ## why)
-#define J_EXPECT_BH(bh, expr, why...)	__journal_expect(expr, ## why)
-#define J_EXPECT_JH(jh, expr, why...)	__journal_expect(expr, ## why)
-#endif
-
-struct jbd_revoke_table_s;
-
-/**
- * struct handle_s - this is the concrete type associated with handle_t.
- * @h_transaction: Which compound transaction is this update a part of?
- * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
- * @h_ref: Reference count on this handle
- * @h_err: Field for caller's use to track errors through large fs operations
- * @h_sync: flag for sync-on-close
- * @h_jdata: flag to force data journaling
- * @h_aborted: flag indicating fatal error on handle
- * @h_lockdep_map: lockdep info for debugging lock problems
- */
-struct handle_s
-{
-	/* Which compound transaction is this update a part of? */
-	transaction_t		*h_transaction;
-
-	/* Number of remaining buffers we are allowed to dirty: */
-	int			h_buffer_credits;
-
-	/* Reference count on this handle */
-	int			h_ref;
-
-	/* Field for caller's use to track errors through large fs */
-	/* operations */
-	int			h_err;
-
-	/* Flags [no locking] */
-	unsigned int	h_sync:		1;	/* sync-on-close */
-	unsigned int	h_jdata:	1;	/* force data journaling */
-	unsigned int	h_aborted:	1;	/* fatal error on handle */
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	h_lockdep_map;
-#endif
-};
-
-
-/* The transaction_t type is the guts of the journaling mechanism.  It
- * tracks a compound transaction through its various states:
- *
- * RUNNING:	accepting new updates
- * LOCKED:	Updates still running but we don't accept new ones
- * RUNDOWN:	Updates are tidying up but have finished requesting
- *		new buffers to modify (state not used for now)
- * FLUSH:       All updates complete, but we are still writing to disk
- * COMMIT:      All data on disk, writing commit record
- * FINISHED:	We still have to keep the transaction for checkpointing.
- *
- * The transaction keeps track of all of the buffers modified by a
- * running transaction, and all of the buffers committed but not yet
- * flushed to home for finished transactions.
- */
-
-/*
- * Lock ranking:
- *
- *    j_list_lock
- *      ->jbd_lock_bh_journal_head()	(This is "innermost")
- *
- *    j_state_lock
- *    ->jbd_lock_bh_state()
- *
- *    jbd_lock_bh_state()
- *    ->j_list_lock
- *
- *    j_state_lock
- *    ->t_handle_lock
- *
- *    j_state_lock
- *    ->j_list_lock			(journal_unmap_buffer)
- *
- */
-
-struct transaction_s
-{
-	/* Pointer to the journal for this transaction. [no locking] */
-	journal_t		*t_journal;
-
-	/* Sequence number for this transaction [no locking] */
-	tid_t			t_tid;
-
-	/*
-	 * Transaction's current state
-	 * [no locking - only kjournald alters this]
-	 * [j_list_lock] guards transition of a transaction into T_FINISHED
-	 * state and subsequent call of __journal_drop_transaction()
-	 * FIXME: needs barriers
-	 * KLUDGE: [use j_state_lock]
-	 */
-	enum {
-		T_RUNNING,
-		T_LOCKED,
-		T_FLUSH,
-		T_COMMIT,
-		T_COMMIT_RECORD,
-		T_FINISHED
-	}			t_state;
-
-	/*
-	 * Where in the log does this transaction's commit start? [no locking]
-	 */
-	unsigned int		t_log_start;
-
-	/* Number of buffers on the t_buffers list [j_list_lock] */
-	int			t_nr_buffers;
-
-	/*
-	 * Doubly-linked circular list of all buffers reserved but not yet
-	 * modified by this transaction [j_list_lock]
-	 */
-	struct journal_head	*t_reserved_list;
-
-	/*
-	 * Doubly-linked circular list of all buffers under writeout during
-	 * commit [j_list_lock]
-	 */
-	struct journal_head	*t_locked_list;
-
-	/*
-	 * Doubly-linked circular list of all metadata buffers owned by this
-	 * transaction [j_list_lock]
-	 */
-	struct journal_head	*t_buffers;
-
-	/*
-	 * Doubly-linked circular list of all data buffers still to be
-	 * flushed before this transaction can be committed [j_list_lock]
-	 */
-	struct journal_head	*t_sync_datalist;
-
-	/*
-	 * Doubly-linked circular list of all forget buffers (superseded
-	 * buffers which we can un-checkpoint once this transaction commits)
-	 * [j_list_lock]
-	 */
-	struct journal_head	*t_forget;
-
-	/*
-	 * Doubly-linked circular list of all buffers still to be flushed before
-	 * this transaction can be checkpointed. [j_list_lock]
-	 */
-	struct journal_head	*t_checkpoint_list;
-
-	/*
-	 * Doubly-linked circular list of all buffers submitted for IO while
-	 * checkpointing. [j_list_lock]
-	 */
-	struct journal_head	*t_checkpoint_io_list;
-
-	/*
-	 * Doubly-linked circular list of temporary buffers currently undergoing
-	 * IO in the log [j_list_lock]
-	 */
-	struct journal_head	*t_iobuf_list;
-
-	/*
-	 * Doubly-linked circular list of metadata buffers being shadowed by log
-	 * IO.  The IO buffers on the iobuf list and the shadow buffers on this
-	 * list match each other one for one at all times. [j_list_lock]
-	 */
-	struct journal_head	*t_shadow_list;
-
-	/*
-	 * Doubly-linked circular list of control buffers being written to the
-	 * log. [j_list_lock]
-	 */
-	struct journal_head	*t_log_list;
-
-	/*
-	 * Protects info related to handles
-	 */
-	spinlock_t		t_handle_lock;
-
-	/*
-	 * Number of outstanding updates running on this transaction
-	 * [t_handle_lock]
-	 */
-	int			t_updates;
-
-	/*
-	 * Number of buffers reserved for use by all handles in this transaction
-	 * handle but not yet modified. [t_handle_lock]
-	 */
-	int			t_outstanding_credits;
-
-	/*
-	 * Forward and backward links for the circular list of all transactions
-	 * awaiting checkpoint. [j_list_lock]
-	 */
-	transaction_t		*t_cpnext, *t_cpprev;
-
-	/*
-	 * When will the transaction expire (become due for commit), in jiffies?
-	 * [no locking]
-	 */
-	unsigned long		t_expires;
-
-	/*
-	 * When this transaction started, in nanoseconds [no locking]
-	 */
-	ktime_t			t_start_time;
-
-	/*
-	 * How many handles used this transaction? [t_handle_lock]
-	 */
-	int t_handle_count;
-};
-
-/**
- * struct journal_s - this is the concrete type associated with journal_t.
- * @j_flags:  General journaling state flags
- * @j_errno:  Is there an outstanding uncleared error on the journal (from a
- *     prior abort)?
- * @j_sb_buffer: First part of superblock buffer
- * @j_superblock: Second part of superblock buffer
- * @j_format_version: Version of the superblock format
- * @j_state_lock: Protect the various scalars in the journal
- * @j_barrier_count:  Number of processes waiting to create a barrier lock
- * @j_running_transaction: The current running transaction..
- * @j_committing_transaction: the transaction we are pushing to disk
- * @j_checkpoint_transactions: a linked circular list of all transactions
- *  waiting for checkpointing
- * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction
- *  to start committing, or for a barrier lock to be released
- * @j_wait_logspace: Wait queue for waiting for checkpointing to complete
- * @j_wait_done_commit: Wait queue for waiting for commit to complete
- * @j_wait_checkpoint:  Wait queue to trigger checkpointing
- * @j_wait_commit: Wait queue to trigger commit
- * @j_wait_updates: Wait queue to wait for updates to complete
- * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
- * @j_head: Journal head - identifies the first unused block in the journal
- * @j_tail: Journal tail - identifies the oldest still-used block in the
- *  journal.
- * @j_free: Journal free - how many free blocks are there in the journal?
- * @j_first: The block number of the first usable block
- * @j_last: The block number one beyond the last usable block
- * @j_dev: Device where we store the journal
- * @j_blocksize: blocksize for the location where we store the journal.
- * @j_blk_offset: starting block offset for into the device where we store the
- *     journal
- * @j_fs_dev: Device which holds the client fs.  For internal journal this will
- *     be equal to j_dev
- * @j_maxlen: Total maximum capacity of the journal region on disk.
- * @j_list_lock: Protects the buffer lists and internal buffer state.
- * @j_inode: Optional inode where we store the journal.  If present, all journal
- *     block numbers are mapped into this inode via bmap().
- * @j_tail_sequence:  Sequence number of the oldest transaction in the log
- * @j_transaction_sequence: Sequence number of the next transaction to grant
- * @j_commit_sequence: Sequence number of the most recently committed
- *  transaction
- * @j_commit_request: Sequence number of the most recent transaction wanting
- *     commit
- * @j_commit_waited: Sequence number of the most recent transaction someone
- *     is waiting for to commit.
- * @j_uuid: Uuid of client object.
- * @j_task: Pointer to the current commit thread for this journal
- * @j_max_transaction_buffers:  Maximum number of metadata buffers to allow in a
- *     single compound commit transaction
- * @j_commit_interval: What is the maximum transaction lifetime before we begin
- *  a commit?
- * @j_commit_timer:  The timer used to wakeup the commit thread
- * @j_revoke_lock: Protect the revoke table
- * @j_revoke: The revoke table - maintains the list of revoked blocks in the
- *     current transaction.
- * @j_revoke_table: alternate revoke tables for j_revoke
- * @j_wbuf: array of buffer_heads for journal_commit_transaction
- * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
- *	number that will fit in j_blocksize
- * @j_last_sync_writer: most recent pid which did a synchronous write
- * @j_average_commit_time: the average amount of time in nanoseconds it
- *	takes to commit a transaction to the disk.
- * @j_private: An opaque pointer to fs-private information.
- */
-
-struct journal_s
-{
-	/* General journaling state flags [j_state_lock] */
-	unsigned long		j_flags;
-
-	/*
-	 * Is there an outstanding uncleared error on the journal (from a prior
-	 * abort)? [j_state_lock]
-	 */
-	int			j_errno;
-
-	/* The superblock buffer */
-	struct buffer_head	*j_sb_buffer;
-	journal_superblock_t	*j_superblock;
-
-	/* Version of the superblock format */
-	int			j_format_version;
-
-	/*
-	 * Protect the various scalars in the journal
-	 */
-	spinlock_t		j_state_lock;
-
-	/*
-	 * Number of processes waiting to create a barrier lock [j_state_lock]
-	 */
-	int			j_barrier_count;
-
-	/*
-	 * Transactions: The current running transaction...
-	 * [j_state_lock] [caller holding open handle]
-	 */
-	transaction_t		*j_running_transaction;
-
-	/*
-	 * the transaction we are pushing to disk
-	 * [j_state_lock] [caller holding open handle]
-	 */
-	transaction_t		*j_committing_transaction;
-
-	/*
-	 * ... and a linked circular list of all transactions waiting for
-	 * checkpointing. [j_list_lock]
-	 */
-	transaction_t		*j_checkpoint_transactions;
-
-	/*
-	 * Wait queue for waiting for a locked transaction to start committing,
-	 * or for a barrier lock to be released
-	 */
-	wait_queue_head_t	j_wait_transaction_locked;
-
-	/* Wait queue for waiting for checkpointing to complete */
-	wait_queue_head_t	j_wait_logspace;
-
-	/* Wait queue for waiting for commit to complete */
-	wait_queue_head_t	j_wait_done_commit;
-
-	/* Wait queue to trigger checkpointing */
-	wait_queue_head_t	j_wait_checkpoint;
-
-	/* Wait queue to trigger commit */
-	wait_queue_head_t	j_wait_commit;
-
-	/* Wait queue to wait for updates to complete */
-	wait_queue_head_t	j_wait_updates;
-
-	/* Semaphore for locking against concurrent checkpoints */
-	struct mutex		j_checkpoint_mutex;
-
-	/*
-	 * Journal head: identifies the first unused block in the journal.
-	 * [j_state_lock]
-	 */
-	unsigned int		j_head;
-
-	/*
-	 * Journal tail: identifies the oldest still-used block in the journal.
-	 * [j_state_lock]
-	 */
-	unsigned int		j_tail;
-
-	/*
-	 * Journal free: how many free blocks are there in the journal?
-	 * [j_state_lock]
-	 */
-	unsigned int		j_free;
-
-	/*
-	 * Journal start and end: the block numbers of the first usable block
-	 * and one beyond the last usable block in the journal. [j_state_lock]
-	 */
-	unsigned int		j_first;
-	unsigned int		j_last;
-
-	/*
-	 * Device, blocksize and starting block offset for the location where we
-	 * store the journal.
-	 */
-	struct block_device	*j_dev;
-	int			j_blocksize;
-	unsigned int		j_blk_offset;
-
-	/*
-	 * Device which holds the client fs.  For internal journal this will be
-	 * equal to j_dev.
-	 */
-	struct block_device	*j_fs_dev;
-
-	/* Total maximum capacity of the journal region on disk. */
-	unsigned int		j_maxlen;
-
-	/*
-	 * Protects the buffer lists and internal buffer state.
-	 */
-	spinlock_t		j_list_lock;
-
-	/* Optional inode where we store the journal.  If present, all */
-	/* journal block numbers are mapped into this inode via */
-	/* bmap(). */
-	struct inode		*j_inode;
-
-	/*
-	 * Sequence number of the oldest transaction in the log [j_state_lock]
-	 */
-	tid_t			j_tail_sequence;
-
-	/*
-	 * Sequence number of the next transaction to grant [j_state_lock]
-	 */
-	tid_t			j_transaction_sequence;
-
-	/*
-	 * Sequence number of the most recently committed transaction
-	 * [j_state_lock].
-	 */
-	tid_t			j_commit_sequence;
-
-	/*
-	 * Sequence number of the most recent transaction wanting commit
-	 * [j_state_lock]
-	 */
-	tid_t			j_commit_request;
-
-	/*
-	 * Sequence number of the most recent transaction someone is waiting
-	 * for to commit.
-	 * [j_state_lock]
-	 */
-	tid_t                   j_commit_waited;
-
-	/*
-	 * Journal uuid: identifies the object (filesystem, LVM volume etc)
-	 * backed by this journal.  This will eventually be replaced by an array
-	 * of uuids, allowing us to index multiple devices within a single
-	 * journal and to perform atomic updates across them.
-	 */
-	__u8			j_uuid[16];
-
-	/* Pointer to the current commit thread for this journal */
-	struct task_struct	*j_task;
-
-	/*
-	 * Maximum number of metadata buffers to allow in a single compound
-	 * commit transaction
-	 */
-	int			j_max_transaction_buffers;
-
-	/*
-	 * What is the maximum transaction lifetime before we begin a commit?
-	 */
-	unsigned long		j_commit_interval;
-
-	/* The timer used to wakeup the commit thread: */
-	struct timer_list	j_commit_timer;
-
-	/*
-	 * The revoke table: maintains the list of revoked blocks in the
-	 * current transaction.  [j_revoke_lock]
-	 */
-	spinlock_t		j_revoke_lock;
-	struct jbd_revoke_table_s *j_revoke;
-	struct jbd_revoke_table_s *j_revoke_table[2];
-
-	/*
-	 * array of bhs for journal_commit_transaction
-	 */
-	struct buffer_head	**j_wbuf;
-	int			j_wbufsize;
-
-	/*
-	 * this is the pid of the last person to run a synchronous operation
-	 * through the journal.
-	 */
-	pid_t			j_last_sync_writer;
-
-	/*
-	 * the average amount of time in nanoseconds it takes to commit a
-	 * transaction to the disk.  [j_state_lock]
-	 */
-	u64			j_average_commit_time;
-
-	/*
-	 * An opaque pointer to fs-private information.  ext3 puts its
-	 * superblock pointer here
-	 */
-	void *j_private;
-};
-
-/*
- * Journal flag definitions
- */
-#define JFS_UNMOUNT	0x001	/* Journal thread is being destroyed */
-#define JFS_ABORT	0x002	/* Journaling has been aborted for errors. */
-#define JFS_ACK_ERR	0x004	/* The errno in the sb has been acked */
-#define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
-#define JFS_LOADED	0x010	/* The journal superblock has been loaded */
-#define JFS_BARRIER	0x020	/* Use IDE barriers */
-#define JFS_ABORT_ON_SYNCDATA_ERR	0x040  /* Abort the journal on file
-						* data write error in ordered
-						* mode */
-
-/*
- * Function declarations for the journaling transaction and buffer
- * management
- */
-
-/* Filing buffers */
-extern void journal_unfile_buffer(journal_t *, struct journal_head *);
-extern void __journal_unfile_buffer(struct journal_head *);
-extern void __journal_refile_buffer(struct journal_head *);
-extern void journal_refile_buffer(journal_t *, struct journal_head *);
-extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
-extern void __journal_free_buffer(struct journal_head *bh);
-extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
-extern void __journal_clean_data_list(transaction_t *transaction);
-
-/* Log buffer allocation */
-extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
-int journal_next_log_block(journal_t *, unsigned int *);
-
-/* Commit management */
-extern void journal_commit_transaction(journal_t *);
-
-/* Checkpoint list management */
-int __journal_clean_checkpoint_list(journal_t *journal);
-int __journal_remove_checkpoint(struct journal_head *);
-void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
-
-/* Buffer IO */
-extern int
-journal_write_metadata_buffer(transaction_t	  *transaction,
-			      struct journal_head  *jh_in,
-			      struct journal_head **jh_out,
-			      unsigned int blocknr);
-
-/* Transaction locking */
-extern void		__wait_on_journal (journal_t *);
-
-/*
- * Journal locking.
- *
- * We need to lock the journal during transaction state changes so that nobody
- * ever tries to take a handle on the running transaction while we are in the
- * middle of moving it to the commit phase.  j_state_lock does this.
- *
- * Note that the locking is completely interrupt unsafe.  We never touch
- * journal structures from interrupts.
- */
-
-static inline handle_t *journal_current_handle(void)
-{
-	return current->journal_info;
-}
-
-/* The journaling code user interface:
- *
- * Create and destroy handles
- * Register buffer modifications against the current transaction.
- */
-
-extern handle_t *journal_start(journal_t *, int nblocks);
-extern int	 journal_restart (handle_t *, int nblocks);
-extern int	 journal_extend (handle_t *, int nblocks);
-extern int	 journal_get_write_access(handle_t *, struct buffer_head *);
-extern int	 journal_get_create_access (handle_t *, struct buffer_head *);
-extern int	 journal_get_undo_access(handle_t *, struct buffer_head *);
-extern int	 journal_dirty_data (handle_t *, struct buffer_head *);
-extern int	 journal_dirty_metadata (handle_t *, struct buffer_head *);
-extern void	 journal_release_buffer (handle_t *, struct buffer_head *);
-extern int	 journal_forget (handle_t *, struct buffer_head *);
-extern void	 journal_sync_buffer (struct buffer_head *);
-extern void	 journal_invalidatepage(journal_t *,
-				struct page *, unsigned int, unsigned int);
-extern int	 journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
-extern int	 journal_stop(handle_t *);
-extern int	 journal_flush (journal_t *);
-extern void	 journal_lock_updates (journal_t *);
-extern void	 journal_unlock_updates (journal_t *);
-
-extern journal_t * journal_init_dev(struct block_device *bdev,
-				struct block_device *fs_dev,
-				int start, int len, int bsize);
-extern journal_t * journal_init_inode (struct inode *);
-extern int	   journal_update_format (journal_t *);
-extern int	   journal_check_used_features
-		   (journal_t *, unsigned long, unsigned long, unsigned long);
-extern int	   journal_check_available_features
-		   (journal_t *, unsigned long, unsigned long, unsigned long);
-extern int	   journal_set_features
-		   (journal_t *, unsigned long, unsigned long, unsigned long);
-extern int	   journal_create     (journal_t *);
-extern int	   journal_load       (journal_t *journal);
-extern int	   journal_destroy    (journal_t *);
-extern int	   journal_recover    (journal_t *journal);
-extern int	   journal_wipe       (journal_t *, int);
-extern int	   journal_skip_recovery	(journal_t *);
-extern void	   journal_update_sb_log_tail	(journal_t *, tid_t, unsigned int,
-						 int);
-extern void	   journal_abort      (journal_t *, int);
-extern int	   journal_errno      (journal_t *);
-extern void	   journal_ack_err    (journal_t *);
-extern int	   journal_clear_err  (journal_t *);
-extern int	   journal_bmap(journal_t *, unsigned int, unsigned int *);
-extern int	   journal_force_commit(journal_t *);
-
-/*
- * journal_head management
- */
-struct journal_head *journal_add_journal_head(struct buffer_head *bh);
-struct journal_head *journal_grab_journal_head(struct buffer_head *bh);
-void journal_put_journal_head(struct journal_head *jh);
-
-/*
- * handle management
- */
-extern struct kmem_cache *jbd_handle_cache;
-
-static inline handle_t *jbd_alloc_handle(gfp_t gfp_flags)
-{
-	return kmem_cache_zalloc(jbd_handle_cache, gfp_flags);
-}
-
-static inline void jbd_free_handle(handle_t *handle)
-{
-	kmem_cache_free(jbd_handle_cache, handle);
-}
-
-/* Primary revoke support */
-#define JOURNAL_REVOKE_DEFAULT_HASH 256
-extern int	   journal_init_revoke(journal_t *, int);
-extern void	   journal_destroy_revoke_caches(void);
-extern int	   journal_init_revoke_caches(void);
-
-extern void	   journal_destroy_revoke(journal_t *);
-extern int	   journal_revoke (handle_t *,
-				unsigned int, struct buffer_head *);
-extern int	   journal_cancel_revoke(handle_t *, struct journal_head *);
-extern void	   journal_write_revoke_records(journal_t *,
-						transaction_t *, int);
-
-/* Recovery revoke support */
-extern int	journal_set_revoke(journal_t *, unsigned int, tid_t);
-extern int	journal_test_revoke(journal_t *, unsigned int, tid_t);
-extern void	journal_clear_revoke(journal_t *);
-extern void	journal_switch_revoke_table(journal_t *journal);
-extern void	journal_clear_buffer_revoked_flags(journal_t *journal);
-
-/*
- * The log thread user interface:
- *
- * Request space in the current transaction, and force transaction commit
- * transitions on demand.
- */
-
-int __log_space_left(journal_t *); /* Called with journal locked */
-int log_start_commit(journal_t *journal, tid_t tid);
-int __log_start_commit(journal_t *journal, tid_t tid);
-int journal_start_commit(journal_t *journal, tid_t *tid);
-int journal_force_commit_nested(journal_t *journal);
-int log_wait_commit(journal_t *journal, tid_t tid);
-int log_do_checkpoint(journal_t *journal);
-int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
-
-void __log_wait_for_space(journal_t *journal);
-extern void	__journal_drop_transaction(journal_t *, transaction_t *);
-extern int	cleanup_journal_tail(journal_t *);
-
-/*
- * is_journal_abort
- *
- * Simple test wrapper function to test the JFS_ABORT state flag.  This
- * bit, when set, indicates that we have had a fatal error somewhere,
- * either inside the journaling layer or indicated to us by the client
- * (eg. ext3), and that we and should not commit any further
- * transactions.
- */
-
-static inline int is_journal_aborted(journal_t *journal)
-{
-	return journal->j_flags & JFS_ABORT;
-}
-
-static inline int is_handle_aborted(handle_t *handle)
-{
-	if (handle->h_aborted)
-		return 1;
-	return is_journal_aborted(handle->h_transaction->t_journal);
-}
-
-static inline void journal_abort_handle(handle_t *handle)
-{
-	handle->h_aborted = 1;
-}
-
-#endif /* __KERNEL__   */
-
-/* Comparison functions for transaction IDs: perform comparisons using
- * modulo arithmetic so that they work over sequence number wraps. */
-
-static inline int tid_gt(tid_t x, tid_t y)
-{
-	int difference = (x - y);
-	return (difference > 0);
-}
-
-static inline int tid_geq(tid_t x, tid_t y)
-{
-	int difference = (x - y);
-	return (difference >= 0);
-}
-
-extern int journal_blocks_per_page(struct inode *inode);
-
-/*
- * Return the minimum number of blocks which must be free in the journal
- * before a new transaction may be started.  Must be called under j_state_lock.
- */
-static inline int jbd_space_needed(journal_t *journal)
-{
-	int nblocks = journal->j_max_transaction_buffers;
-	if (journal->j_committing_transaction)
-		nblocks += journal->j_committing_transaction->
-					t_outstanding_credits;
-	return nblocks;
-}
-
-/*
- * Definitions which augment the buffer_head layer
- */
-
-/* journaling buffer types */
-#define BJ_None		0	/* Not journaled */
-#define BJ_SyncData	1	/* Normal data: flush before commit */
-#define BJ_Metadata	2	/* Normal journaled metadata */
-#define BJ_Forget	3	/* Buffer superseded by this transaction */
-#define BJ_IO		4	/* Buffer is for temporary IO use */
-#define BJ_Shadow	5	/* Buffer contents being shadowed to the log */
-#define BJ_LogCtl	6	/* Buffer contains log descriptors */
-#define BJ_Reserved	7	/* Buffer is reserved for access by journal */
-#define BJ_Locked	8	/* Locked for I/O during commit */
-#define BJ_Types	9
-
-extern int jbd_blocks_per_page(struct inode *inode);
-
-#ifdef __KERNEL__
-
-#define buffer_trace_init(bh)	do {} while (0)
-#define print_buffer_fields(bh)	do {} while (0)
-#define print_buffer_trace(bh)	do {} while (0)
-#define BUFFER_TRACE(bh, info)	do {} while (0)
-#define BUFFER_TRACE2(bh, bh2, info)	do {} while (0)
-#define JBUFFER_TRACE(jh, info)	do {} while (0)
-
-#endif	/* __KERNEL__ */
-
-#endif	/* _LINUX_JBD_H */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index edb640ae9a94..ad4b28647298 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -29,6 +29,7 @@
 #include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/slab.h>
+#include <linux/bit_spinlock.h>
 #include <crypto/hash.h>
 #endif
 
@@ -336,7 +337,45 @@ BUFFER_FNS(Freed, freed)
 BUFFER_FNS(Shadow, shadow)
 BUFFER_FNS(Verified, verified)
 
-#include <linux/jbd_common.h>
+static inline struct buffer_head *jh2bh(struct journal_head *jh)
+{
+	return jh->b_bh;
+}
+
+static inline struct journal_head *bh2jh(struct buffer_head *bh)
+{
+	return bh->b_private;
+}
+
+static inline void jbd_lock_bh_state(struct buffer_head *bh)
+{
+	bit_spin_lock(BH_State, &bh->b_state);
+}
+
+static inline int jbd_trylock_bh_state(struct buffer_head *bh)
+{
+	return bit_spin_trylock(BH_State, &bh->b_state);
+}
+
+static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
+{
+	return bit_spin_is_locked(BH_State, &bh->b_state);
+}
+
+static inline void jbd_unlock_bh_state(struct buffer_head *bh)
+{
+	bit_spin_unlock(BH_State, &bh->b_state);
+}
+
+static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
+{
+	bit_spin_lock(BH_JournalHead, &bh->b_state);
+}
+
+static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
+{
+	bit_spin_unlock(BH_JournalHead, &bh->b_state);
+}
 
 #define J_ASSERT(assert)	BUG_ON(!(assert))
 
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
deleted file mode 100644
index 3dc53432355f..000000000000
--- a/include/linux/jbd_common.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef _LINUX_JBD_STATE_H
-#define _LINUX_JBD_STATE_H
-
-#include <linux/bit_spinlock.h>
-
-static inline struct buffer_head *jh2bh(struct journal_head *jh)
-{
-	return jh->b_bh;
-}
-
-static inline struct journal_head *bh2jh(struct buffer_head *bh)
-{
-	return bh->b_private;
-}
-
-static inline void jbd_lock_bh_state(struct buffer_head *bh)
-{
-	bit_spin_lock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_trylock_bh_state(struct buffer_head *bh)
-{
-	return bit_spin_trylock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
-{
-	return bit_spin_is_locked(BH_State, &bh->b_state);
-}
-
-static inline void jbd_unlock_bh_state(struct buffer_head *bh)
-{
-	bit_spin_unlock(BH_State, &bh->b_state);
-}
-
-static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
-{
-	bit_spin_lock(BH_JournalHead, &bh->b_state);
-}
-
-static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
-{
-	bit_spin_unlock(BH_JournalHead, &bh->b_state);
-}
-
-#endif
diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h
deleted file mode 100644
index fc733d28117a..000000000000
--- a/include/trace/events/ext3.h
+++ /dev/null
@@ -1,866 +0,0 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM ext3
-
-#if !defined(_TRACE_EXT3_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_EXT3_H
-
-#include <linux/tracepoint.h>
-
-TRACE_EVENT(ext3_free_inode,
-	TP_PROTO(struct inode *inode),
-
-	TP_ARGS(inode),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	umode_t, mode			)
-		__field(	uid_t,	uid			)
-		__field(	gid_t,	gid			)
-		__field(	blkcnt_t, blocks		)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->mode	= inode->i_mode;
-		__entry->uid	= i_uid_read(inode);
-		__entry->gid	= i_gid_read(inode);
-		__entry->blocks	= inode->i_blocks;
-	),
-
-	TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->mode, __entry->uid, __entry->gid,
-		  (unsigned long) __entry->blocks)
-);
-
-TRACE_EVENT(ext3_request_inode,
-	TP_PROTO(struct inode *dir, int mode),
-
-	TP_ARGS(dir, mode),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	dir			)
-		__field(	umode_t, mode			)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= dir->i_sb->s_dev;
-		__entry->dir	= dir->i_ino;
-		__entry->mode	= mode;
-	),
-
-	TP_printk("dev %d,%d dir %lu mode 0%o",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->dir, __entry->mode)
-);
-
-TRACE_EVENT(ext3_allocate_inode,
-	TP_PROTO(struct inode *inode, struct inode *dir, int mode),
-
-	TP_ARGS(inode, dir, mode),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	ino_t,	dir			)
-		__field(	umode_t, mode			)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->dir	= dir->i_ino;
-		__entry->mode	= mode;
-	),
-
-	TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  (unsigned long) __entry->dir, __entry->mode)
-);
-
-TRACE_EVENT(ext3_evict_inode,
-	TP_PROTO(struct inode *inode),
-
-	TP_ARGS(inode),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	int,	nlink			)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->nlink	= inode->i_nlink;
-	),
-
-	TP_printk("dev %d,%d ino %lu nlink %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->nlink)
-);
-
-TRACE_EVENT(ext3_drop_inode,
-	TP_PROTO(struct inode *inode, int drop),
-
-	TP_ARGS(inode, drop),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	int,	drop			)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->drop	= drop;
-	),
-
-	TP_printk("dev %d,%d ino %lu drop %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->drop)
-);
-
-TRACE_EVENT(ext3_mark_inode_dirty,
-	TP_PROTO(struct inode *inode, unsigned long IP),
-
-	TP_ARGS(inode, IP),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(unsigned long,	ip			)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->ip	= IP;
-	),
-
-	TP_printk("dev %d,%d ino %lu caller %pS",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, (void *)__entry->ip)
-);
-
-TRACE_EVENT(ext3_write_begin,
-	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-		 unsigned int flags),
-
-	TP_ARGS(inode, pos, len, flags),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	loff_t,	pos			)
-		__field(	unsigned int, len		)
-		__field(	unsigned int, flags		)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->pos	= pos;
-		__entry->len	= len;
-		__entry->flags	= flags;
-	),
-
-	TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  (unsigned long long) __entry->pos, __entry->len,
-		  __entry->flags)
-);
-
-DECLARE_EVENT_CLASS(ext3__write_end,
-	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-			unsigned int copied),
-
-	TP_ARGS(inode, pos, len, copied),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	loff_t,	pos			)
-		__field(	unsigned int, len		)
-		__field(	unsigned int, copied		)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->pos	= pos;
-		__entry->len	= len;
-		__entry->copied	= copied;
-	),
-
-	TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  (unsigned long long) __entry->pos, __entry->len,
-		  __entry->copied)
-);
-
-DEFINE_EVENT(ext3__write_end, ext3_ordered_write_end,
-
-	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-		 unsigned int copied),
-
-	TP_ARGS(inode, pos, len, copied)
-);
-
-DEFINE_EVENT(ext3__write_end, ext3_writeback_write_end,
-
-	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-		 unsigned int copied),
-
-	TP_ARGS(inode, pos, len, copied)
-);
-
-DEFINE_EVENT(ext3__write_end, ext3_journalled_write_end,
-
-	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-		 unsigned int copied),
-
-	TP_ARGS(inode, pos, len, copied)
-);
-
-DECLARE_EVENT_CLASS(ext3__page_op,
-	TP_PROTO(struct page *page),
-
-	TP_ARGS(page),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	pgoff_t, index			)
-
-	),
-
-	TP_fast_assign(
-		__entry->index	= page->index;
-		__entry->ino	= page->mapping->host->i_ino;
-		__entry->dev	= page->mapping->host->i_sb->s_dev;
-	),
-
-	TP_printk("dev %d,%d ino %lu page_index %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->index)
-);
-
-DEFINE_EVENT(ext3__page_op, ext3_ordered_writepage,
-
-	TP_PROTO(struct page *page),
-
-	TP_ARGS(page)
-);
-
-DEFINE_EVENT(ext3__page_op, ext3_writeback_writepage,
-
-	TP_PROTO(struct page *page),
-
-	TP_ARGS(page)
-);
-
-DEFINE_EVENT(ext3__page_op, ext3_journalled_writepage,
-
-	TP_PROTO(struct page *page),
-
-	TP_ARGS(page)
-);
-
-DEFINE_EVENT(ext3__page_op, ext3_readpage,
-
-	TP_PROTO(struct page *page),
-
-	TP_ARGS(page)
-);
-
-DEFINE_EVENT(ext3__page_op, ext3_releasepage,
-
-	TP_PROTO(struct page *page),
-
-	TP_ARGS(page)
-);
-
-TRACE_EVENT(ext3_invalidatepage,
-	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
-
-	TP_ARGS(page, offset, length),
-
-	TP_STRUCT__entry(
-		__field(	pgoff_t, index			)
-		__field(	unsigned int, offset		)
-		__field(	unsigned int, length		)
-		__field(	ino_t,	ino			)
-		__field(	dev_t,	dev			)
-
-	),
-
-	TP_fast_assign(
-		__entry->index	= page->index;
-		__entry->offset	= offset;
-		__entry->length	= length;
-		__entry->ino	= page->mapping->host->i_ino;
-		__entry->dev	= page->mapping->host->i_sb->s_dev;
-	),
-
-	TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->index, __entry->offset, __entry->length)
-);
-
-TRACE_EVENT(ext3_discard_blocks,
-	TP_PROTO(struct super_block *sb, unsigned long blk,
-			unsigned long count),
-
-	TP_ARGS(sb, blk, count),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,		dev		)
-		__field(	unsigned long,	blk		)
-		__field(	unsigned long,	count		)
-
-	),
-
-	TP_fast_assign(
-		__entry->dev	= sb->s_dev;
-		__entry->blk	= blk;
-		__entry->count	= count;
-	),
-
-	TP_printk("dev %d,%d blk %lu count %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->blk, __entry->count)
-);
-
-TRACE_EVENT(ext3_request_blocks,
-	TP_PROTO(struct inode *inode, unsigned long goal,
-		 unsigned long count),
-
-	TP_ARGS(inode, goal, count),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	unsigned long, count		)
-		__field(	unsigned long,	goal		)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->count	= count;
-		__entry->goal	= goal;
-	),
-
-	TP_printk("dev %d,%d ino %lu count %lu goal %lu ",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->count, __entry->goal)
-);
-
-TRACE_EVENT(ext3_allocate_blocks,
-	TP_PROTO(struct inode *inode, unsigned long goal,
-		 unsigned long count, unsigned long block),
-
-	TP_ARGS(inode, goal, count, block),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	unsigned long,	block		)
-		__field(	unsigned long, count		)
-		__field(	unsigned long,	goal		)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->block	= block;
-		__entry->count	= count;
-		__entry->goal	= goal;
-	),
-
-	TP_printk("dev %d,%d ino %lu count %lu block %lu goal %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		   __entry->count, __entry->block,
-		  __entry->goal)
-);
-
-TRACE_EVENT(ext3_free_blocks,
-	TP_PROTO(struct inode *inode, unsigned long block,
-		 unsigned long count),
-
-	TP_ARGS(inode, block, count),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	umode_t, mode			)
-		__field(	unsigned long,	block		)
-		__field(	unsigned long,	count		)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= inode->i_sb->s_dev;
-		__entry->ino		= inode->i_ino;
-		__entry->mode		= inode->i_mode;
-		__entry->block		= block;
-		__entry->count		= count;
-	),
-
-	TP_printk("dev %d,%d ino %lu mode 0%o block %lu count %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->mode, __entry->block, __entry->count)
-);
-
-TRACE_EVENT(ext3_sync_file_enter,
-	TP_PROTO(struct file *file, int datasync),
-
-	TP_ARGS(file, datasync),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	ino_t,	parent			)
-		__field(	int,	datasync		)
-	),
-
-	TP_fast_assign(
-		struct dentry *dentry = file->f_path.dentry;
-
-		__entry->dev		= d_inode(dentry)->i_sb->s_dev;
-		__entry->ino		= d_inode(dentry)->i_ino;
-		__entry->datasync	= datasync;
-		__entry->parent		= d_inode(dentry->d_parent)->i_ino;
-	),
-
-	TP_printk("dev %d,%d ino %lu parent %ld datasync %d ",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  (unsigned long) __entry->parent, __entry->datasync)
-);
-
-TRACE_EVENT(ext3_sync_file_exit,
-	TP_PROTO(struct inode *inode, int ret),
-
-	TP_ARGS(inode, ret),
-
-	TP_STRUCT__entry(
-		__field(	int,	ret			)
-		__field(	ino_t,	ino			)
-		__field(	dev_t,	dev			)
-	),
-
-	TP_fast_assign(
-		__entry->ret		= ret;
-		__entry->ino		= inode->i_ino;
-		__entry->dev		= inode->i_sb->s_dev;
-	),
-
-	TP_printk("dev %d,%d ino %lu ret %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->ret)
-);
-
-TRACE_EVENT(ext3_sync_fs,
-	TP_PROTO(struct super_block *sb, int wait),
-
-	TP_ARGS(sb, wait),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	int,	wait			)
-
-	),
-
-	TP_fast_assign(
-		__entry->dev	= sb->s_dev;
-		__entry->wait	= wait;
-	),
-
-	TP_printk("dev %d,%d wait %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->wait)
-);
-
-TRACE_EVENT(ext3_rsv_window_add,
-	TP_PROTO(struct super_block *sb,
-		 struct ext3_reserve_window_node *rsv_node),
-
-	TP_ARGS(sb, rsv_node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-		__field(	dev_t,	dev			)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= sb->s_dev;
-		__entry->start	= rsv_node->rsv_window._rsv_start;
-		__entry->end	= rsv_node->rsv_window._rsv_end;
-	),
-
-	TP_printk("dev %d,%d start %lu end %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->start, __entry->end)
-);
-
-TRACE_EVENT(ext3_discard_reservation,
-	TP_PROTO(struct inode *inode,
-		 struct ext3_reserve_window_node *rsv_node),
-
-	TP_ARGS(inode, rsv_node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-		__field(	ino_t,	ino			)
-		__field(	dev_t,	dev			)
-	),
-
-	TP_fast_assign(
-		__entry->start	= rsv_node->rsv_window._rsv_start;
-		__entry->end	= rsv_node->rsv_window._rsv_end;
-		__entry->ino	= inode->i_ino;
-		__entry->dev	= inode->i_sb->s_dev;
-	),
-
-	TP_printk("dev %d,%d ino %lu start %lu end %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long)__entry->ino, __entry->start,
-		  __entry->end)
-);
-
-TRACE_EVENT(ext3_alloc_new_reservation,
-	TP_PROTO(struct super_block *sb, unsigned long goal),
-
-	TP_ARGS(sb, goal),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	unsigned long,	goal		)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= sb->s_dev;
-		__entry->goal	= goal;
-	),
-
-	TP_printk("dev %d,%d goal %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->goal)
-);
-
-TRACE_EVENT(ext3_reserved,
-	TP_PROTO(struct super_block *sb, unsigned long block,
-		 struct ext3_reserve_window_node *rsv_node),
-
-	TP_ARGS(sb, block, rsv_node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	block		)
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-		__field(	dev_t,	dev			)
-	),
-
-	TP_fast_assign(
-		__entry->block	= block;
-		__entry->start	= rsv_node->rsv_window._rsv_start;
-		__entry->end	= rsv_node->rsv_window._rsv_end;
-		__entry->dev	= sb->s_dev;
-	),
-
-	TP_printk("dev %d,%d block %lu, start %lu end %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->block, __entry->start, __entry->end)
-);
-
-TRACE_EVENT(ext3_forget,
-	TP_PROTO(struct inode *inode, int is_metadata, unsigned long block),
-
-	TP_ARGS(inode, is_metadata, block),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	umode_t, mode			)
-		__field(	int,	is_metadata		)
-		__field(	unsigned long,	block		)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->ino	= inode->i_ino;
-		__entry->mode	= inode->i_mode;
-		__entry->is_metadata = is_metadata;
-		__entry->block	= block;
-	),
-
-	TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->mode, __entry->is_metadata, __entry->block)
-);
-
-TRACE_EVENT(ext3_read_block_bitmap,
-	TP_PROTO(struct super_block *sb, unsigned int group),
-
-	TP_ARGS(sb, group),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	__u32,	group			)
-
-	),
-
-	TP_fast_assign(
-		__entry->dev	= sb->s_dev;
-		__entry->group	= group;
-	),
-
-	TP_printk("dev %d,%d group %u",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->group)
-);
-
-TRACE_EVENT(ext3_direct_IO_enter,
-	TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw),
-
-	TP_ARGS(inode, offset, len, rw),
-
-	TP_STRUCT__entry(
-		__field(	ino_t,	ino			)
-		__field(	dev_t,	dev			)
-		__field(	loff_t,	pos			)
-		__field(	unsigned long,	len		)
-		__field(	int,	rw			)
-	),
-
-	TP_fast_assign(
-		__entry->ino	= inode->i_ino;
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->pos	= offset;
-		__entry->len	= len;
-		__entry->rw	= rw;
-	),
-
-	TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  (unsigned long long) __entry->pos, __entry->len,
-		  __entry->rw)
-);
-
-TRACE_EVENT(ext3_direct_IO_exit,
-	TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
-		 int rw, int ret),
-
-	TP_ARGS(inode, offset, len, rw, ret),
-
-	TP_STRUCT__entry(
-		__field(	ino_t,	ino			)
-		__field(	dev_t,	dev			)
-		__field(	loff_t,	pos			)
-		__field(	unsigned long,	len		)
-		__field(	int,	rw			)
-		__field(	int,	ret			)
-	),
-
-	TP_fast_assign(
-		__entry->ino	= inode->i_ino;
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->pos	= offset;
-		__entry->len	= len;
-		__entry->rw	= rw;
-		__entry->ret	= ret;
-	),
-
-	TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  (unsigned long long) __entry->pos, __entry->len,
-		  __entry->rw, __entry->ret)
-);
-
-TRACE_EVENT(ext3_unlink_enter,
-	TP_PROTO(struct inode *parent, struct dentry *dentry),
-
-	TP_ARGS(parent, dentry),
-
-	TP_STRUCT__entry(
-		__field(	ino_t,	parent			)
-		__field(	ino_t,	ino			)
-		__field(	loff_t,	size			)
-		__field(	dev_t,	dev			)
-	),
-
-	TP_fast_assign(
-		__entry->parent		= parent->i_ino;
-		__entry->ino		= d_inode(dentry)->i_ino;
-		__entry->size		= d_inode(dentry)->i_size;
-		__entry->dev		= d_inode(dentry)->i_sb->s_dev;
-	),
-
-	TP_printk("dev %d,%d ino %lu size %lld parent %ld",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  (unsigned long long)__entry->size,
-		  (unsigned long) __entry->parent)
-);
-
-TRACE_EVENT(ext3_unlink_exit,
-	TP_PROTO(struct dentry *dentry, int ret),
-
-	TP_ARGS(dentry, ret),
-
-	TP_STRUCT__entry(
-		__field(	ino_t,	ino			)
-		__field(	dev_t,	dev			)
-		__field(	int,	ret			)
-	),
-
-	TP_fast_assign(
-		__entry->ino		= d_inode(dentry)->i_ino;
-		__entry->dev		= d_inode(dentry)->i_sb->s_dev;
-		__entry->ret		= ret;
-	),
-
-	TP_printk("dev %d,%d ino %lu ret %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->ret)
-);
-
-DECLARE_EVENT_CLASS(ext3__truncate,
-	TP_PROTO(struct inode *inode),
-
-	TP_ARGS(inode),
-
-	TP_STRUCT__entry(
-		__field(	ino_t,		ino		)
-		__field(	dev_t,		dev		)
-		__field(	blkcnt_t,	blocks		)
-	),
-
-	TP_fast_assign(
-		__entry->ino    = inode->i_ino;
-		__entry->dev    = inode->i_sb->s_dev;
-		__entry->blocks	= inode->i_blocks;
-	),
-
-	TP_printk("dev %d,%d ino %lu blocks %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, (unsigned long) __entry->blocks)
-);
-
-DEFINE_EVENT(ext3__truncate, ext3_truncate_enter,
-
-	TP_PROTO(struct inode *inode),
-
-	TP_ARGS(inode)
-);
-
-DEFINE_EVENT(ext3__truncate, ext3_truncate_exit,
-
-	TP_PROTO(struct inode *inode),
-
-	TP_ARGS(inode)
-);
-
-TRACE_EVENT(ext3_get_blocks_enter,
-	TP_PROTO(struct inode *inode, unsigned long lblk,
-		 unsigned long len, int create),
-
-	TP_ARGS(inode, lblk, len, create),
-
-	TP_STRUCT__entry(
-		__field(	ino_t,		ino		)
-		__field(	dev_t,		dev		)
-		__field(	unsigned long,	lblk		)
-		__field(	unsigned long,	len		)
-		__field(	int,		create		)
-	),
-
-	TP_fast_assign(
-		__entry->ino    = inode->i_ino;
-		__entry->dev    = inode->i_sb->s_dev;
-		__entry->lblk	= lblk;
-		__entry->len	= len;
-		__entry->create	= create;
-	),
-
-	TP_printk("dev %d,%d ino %lu lblk %lu len %lu create %u",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->lblk, __entry->len, __entry->create)
-);
-
-TRACE_EVENT(ext3_get_blocks_exit,
-	TP_PROTO(struct inode *inode, unsigned long lblk,
-		 unsigned long pblk, unsigned long len, int ret),
-
-	TP_ARGS(inode, lblk, pblk, len, ret),
-
-	TP_STRUCT__entry(
-		__field(	ino_t,		ino		)
-		__field(	dev_t,		dev		)
-		__field(	unsigned long,	lblk		)
-		__field(	unsigned long,	pblk		)
-		__field(	unsigned long,	len		)
-		__field(	int,		ret		)
-	),
-
-	TP_fast_assign(
-		__entry->ino    = inode->i_ino;
-		__entry->dev    = inode->i_sb->s_dev;
-		__entry->lblk	= lblk;
-		__entry->pblk	= pblk;
-		__entry->len	= len;
-		__entry->ret	= ret;
-	),
-
-	TP_printk("dev %d,%d ino %lu lblk %lu pblk %lu len %lu ret %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		   __entry->lblk, __entry->pblk,
-		  __entry->len, __entry->ret)
-);
-
-TRACE_EVENT(ext3_load_inode,
-	TP_PROTO(struct inode *inode),
-
-	TP_ARGS(inode),
-
-	TP_STRUCT__entry(
-		__field(	ino_t,	ino		)
-		__field(	dev_t,	dev		)
-	),
-
-	TP_fast_assign(
-		__entry->ino		= inode->i_ino;
-		__entry->dev		= inode->i_sb->s_dev;
-	),
-
-	TP_printk("dev %d,%d ino %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino)
-);
-
-#endif /* _TRACE_EXT3_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h
deleted file mode 100644
index da6f2591c25e..000000000000
--- a/include/trace/events/jbd.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM jbd
-
-#if !defined(_TRACE_JBD_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_JBD_H
-
-#include <linux/jbd.h>
-#include <linux/tracepoint.h>
-
-TRACE_EVENT(jbd_checkpoint,
-
-	TP_PROTO(journal_t *journal, int result),
-
-	TP_ARGS(journal, result),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	int,	result			)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->result		= result;
-	),
-
-	TP_printk("dev %d,%d result %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->result)
-);
-
-DECLARE_EVENT_CLASS(jbd_commit,
-
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-
-	TP_ARGS(journal, commit_transaction),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	int,	transaction		)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->transaction	= commit_transaction->t_tid;
-	),
-
-	TP_printk("dev %d,%d transaction %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->transaction)
-);
-
-DEFINE_EVENT(jbd_commit, jbd_start_commit,
-
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-
-	TP_ARGS(journal, commit_transaction)
-);
-
-DEFINE_EVENT(jbd_commit, jbd_commit_locking,
-
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-
-	TP_ARGS(journal, commit_transaction)
-);
-
-DEFINE_EVENT(jbd_commit, jbd_commit_flushing,
-
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-
-	TP_ARGS(journal, commit_transaction)
-);
-
-DEFINE_EVENT(jbd_commit, jbd_commit_logging,
-
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-
-	TP_ARGS(journal, commit_transaction)
-);
-
-TRACE_EVENT(jbd_drop_transaction,
-
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-
-	TP_ARGS(journal, commit_transaction),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	int,	transaction		)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->transaction	= commit_transaction->t_tid;
-	),
-
-	TP_printk("dev %d,%d transaction %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->transaction)
-);
-
-TRACE_EVENT(jbd_end_commit,
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-
-	TP_ARGS(journal, commit_transaction),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	int,	transaction		)
-		__field(	int,	head			)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->transaction	= commit_transaction->t_tid;
-		__entry->head		= journal->j_tail_sequence;
-	),
-
-	TP_printk("dev %d,%d transaction %d head %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->transaction, __entry->head)
-);
-
-TRACE_EVENT(jbd_do_submit_data,
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-
-	TP_ARGS(journal, commit_transaction),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	int,	transaction		)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->transaction	= commit_transaction->t_tid;
-	),
-
-	TP_printk("dev %d,%d transaction %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		   __entry->transaction)
-);
-
-TRACE_EVENT(jbd_cleanup_journal_tail,
-
-	TP_PROTO(journal_t *journal, tid_t first_tid,
-		 unsigned long block_nr, unsigned long freed),
-
-	TP_ARGS(journal, first_tid, block_nr, freed),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	tid_t,	tail_sequence		)
-		__field(	tid_t,	first_tid		)
-		__field(unsigned long,	block_nr		)
-		__field(unsigned long,	freed			)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->tail_sequence	= journal->j_tail_sequence;
-		__entry->first_tid	= first_tid;
-		__entry->block_nr	= block_nr;
-		__entry->freed		= freed;
-	),
-
-	TP_printk("dev %d,%d from %u to %u offset %lu freed %lu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->tail_sequence, __entry->first_tid,
-		  __entry->block_nr, __entry->freed)
-);
-
-TRACE_EVENT(journal_write_superblock,
-	TP_PROTO(journal_t *journal, int write_op),
-
-	TP_ARGS(journal, write_op),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	int,	write_op		)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->write_op	= write_op;
-	),
-
-	TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev),
-		  MINOR(__entry->dev), __entry->write_op)
-);
-
-#endif /* _TRACE_JBD_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
-- 
cgit v1.2.3-70-g09d2


From a3ad0a9da863fa554fc17fa8345a07adcdd27d3c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 18 Jun 2015 17:19:14 +0200
Subject: block: Remove forced page bouncing under IO

JBD layer wrote back data buffers without setting PageWriteback bit.
Thus standard mechanism for guaranteeing stable pages under IO did not
work. Since JBD is gone now and there is no other user of the
functionality, just remove it.

Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 block/bounce.c            | 31 ++++---------------------------
 include/linux/blk_types.h |  5 ++---
 2 files changed, 6 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/block/bounce.c b/block/bounce.c
index b17311227c12..31cad13a0c9d 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -176,26 +176,8 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
 	__bounce_end_io_read(bio, isa_page_pool, err);
 }
 
-#ifdef CONFIG_NEED_BOUNCE_POOL
-static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
-{
-	if (bio_data_dir(bio) != WRITE)
-		return 0;
-
-	if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
-		return 0;
-
-	return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
-}
-#else
-static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
-{
-	return 0;
-}
-#endif /* CONFIG_NEED_BOUNCE_POOL */
-
 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
-			       mempool_t *pool, int force)
+			       mempool_t *pool)
 {
 	struct bio *bio;
 	int rw = bio_data_dir(*bio_orig);
@@ -203,8 +185,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	struct bvec_iter iter;
 	unsigned i;
 
-	if (force)
-		goto bounce;
 	bio_for_each_segment(from, *bio_orig, iter)
 		if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
 			goto bounce;
@@ -216,7 +196,7 @@ bounce:
 	bio_for_each_segment_all(to, bio, i) {
 		struct page *page = to->bv_page;
 
-		if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
+		if (page_to_pfn(page) <= queue_bounce_pfn(q))
 			continue;
 
 		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
@@ -254,7 +234,6 @@ bounce:
 
 void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 {
-	int must_bounce;
 	mempool_t *pool;
 
 	/*
@@ -263,15 +242,13 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	if (!bio_has_data(*bio_orig))
 		return;
 
-	must_bounce = must_snapshot_stable_pages(q, *bio_orig);
-
 	/*
 	 * for non-isa bounce case, just check if the bounce pfn is equal
 	 * to or bigger than the highest pfn in the system -- in that case,
 	 * don't waste time iterating over bio segments
 	 */
 	if (!(q->bounce_gfp & GFP_DMA)) {
-		if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
+		if (queue_bounce_pfn(q) >= blk_max_pfn)
 			return;
 		pool = page_pool;
 	} else {
@@ -282,7 +259,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	/*
 	 * slow path
 	 */
-	__blk_queue_bounce(q, bio_orig, pool, must_bounce);
+	__blk_queue_bounce(q, bio_orig, pool);
 }
 
 EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 7303b3405520..89fd49184b48 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -118,9 +118,8 @@ struct bio {
 #define BIO_USER_MAPPED 4	/* contains user pages */
 #define BIO_NULL_MAPPED 5	/* contains invalid user pages */
 #define BIO_QUIET	6	/* Make BIO Quiet */
-#define BIO_SNAP_STABLE	7	/* bio data must be snapshotted during write */
-#define BIO_CHAIN	8	/* chained bio, ->bi_remaining in effect */
-#define BIO_REFFED	9	/* bio has elevated ->bi_cnt */
+#define BIO_CHAIN	7	/* chained bio, ->bi_remaining in effect */
+#define BIO_REFFED	8	/* bio has elevated ->bi_cnt */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
-- 
cgit v1.2.3-70-g09d2


From bc27381edbeb654d819b7e1464091c456a0d3e64 Mon Sep 17 00:00:00 2001
From: Giuseppe Barba <giuseppe.barba@st.com>
Date: Tue, 21 Jul 2015 10:35:41 +0200
Subject: iio: st-sensors: add configuration for WhoAmI address

This patch permits to configure the WhoAmI register address
because some device could have not a standard address for
this register.

Signed-off-by: Giuseppe Barba <giuseppe.barba@st.com>
Reviewed-by: Denis Ciocca <denis.ciocca@st.com>
Acked-by: Denis Ciocca <denis.ciocca@st.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/accel/st_accel_core.c               |  5 +++
 drivers/iio/common/st_sensors/st_sensors_core.c | 49 ++++++++++++-------------
 drivers/iio/gyro/st_gyro_core.c                 |  3 ++
 drivers/iio/magnetometer/st_magn_core.c         |  3 ++
 drivers/iio/pressure/st_pressure_core.c         |  3 ++
 include/linux/iio/common/st_sensors.h           |  2 +
 6 files changed, 39 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c
index 4002e6410444..12b42f6e70ab 100644
--- a/drivers/iio/accel/st_accel_core.c
+++ b/drivers/iio/accel/st_accel_core.c
@@ -226,6 +226,7 @@ static const struct iio_chan_spec st_accel_16bit_channels[] = {
 static const struct st_sensor_settings st_accel_sensors_settings[] = {
 	{
 		.wai = ST_ACCEL_1_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LIS3DH_ACCEL_DEV_NAME,
 			[1] = LSM303DLHC_ACCEL_DEV_NAME,
@@ -297,6 +298,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 	},
 	{
 		.wai = ST_ACCEL_2_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LIS331DLH_ACCEL_DEV_NAME,
 			[1] = LSM303DL_ACCEL_DEV_NAME,
@@ -359,6 +361,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 	},
 	{
 		.wai = ST_ACCEL_3_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LSM330_ACCEL_DEV_NAME,
 		},
@@ -437,6 +440,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 	},
 	{
 		.wai = ST_ACCEL_4_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LIS3LV02DL_ACCEL_DEV_NAME,
 		},
@@ -494,6 +498,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 	},
 	{
 		.wai = ST_ACCEL_5_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LIS331DL_ACCEL_DEV_NAME,
 		},
diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c
index 8086cbcff87d..d44bf1680859 100644
--- a/drivers/iio/common/st_sensors/st_sensors_core.c
+++ b/drivers/iio/common/st_sensors/st_sensors_core.c
@@ -479,46 +479,43 @@ int st_sensors_check_device_support(struct iio_dev *indio_dev,
 			int num_sensors_list,
 			const struct st_sensor_settings *sensor_settings)
 {
-	u8 wai;
 	int i, n, err;
+	u8 wai;
 	struct st_sensor_data *sdata = iio_priv(indio_dev);
 
-	err = sdata->tf->read_byte(&sdata->tb, sdata->dev,
-					ST_SENSORS_DEFAULT_WAI_ADDRESS, &wai);
-	if (err < 0) {
-		dev_err(&indio_dev->dev, "failed to read Who-Am-I register.\n");
-		goto read_wai_error;
-	}
-
 	for (i = 0; i < num_sensors_list; i++) {
-		if (sensor_settings[i].wai == wai)
+		for (n = 0; n < ST_SENSORS_MAX_4WAI; n++) {
+			if (strcmp(indio_dev->name,
+				sensor_settings[i].sensors_supported[n]) == 0) {
+				break;
+			}
+		}
+		if (n < ST_SENSORS_MAX_4WAI)
 			break;
 	}
-	if (i == num_sensors_list)
-		goto device_not_supported;
+	if (i == num_sensors_list) {
+		dev_err(&indio_dev->dev, "device name %s not recognized.\n",
+							indio_dev->name);
+		return -ENODEV;
+	}
 
-	for (n = 0; n < ARRAY_SIZE(sensor_settings[i].sensors_supported); n++) {
-		if (strcmp(indio_dev->name,
-				&sensor_settings[i].sensors_supported[n][0]) == 0)
-			break;
+	err = sdata->tf->read_byte(&sdata->tb, sdata->dev,
+					sensor_settings[i].wai_addr, &wai);
+	if (err < 0) {
+		dev_err(&indio_dev->dev, "failed to read Who-Am-I register.\n");
+		return err;
 	}
-	if (n == ARRAY_SIZE(sensor_settings[i].sensors_supported)) {
-		dev_err(&indio_dev->dev, "device name \"%s\" and WhoAmI (0x%02x) mismatch",
-			indio_dev->name, wai);
-		goto sensor_name_mismatch;
+
+	if (sensor_settings[i].wai != wai) {
+		dev_err(&indio_dev->dev, "%s: WhoAmI mismatch (0x%x).\n",
+						indio_dev->name, wai);
+		return -EINVAL;
 	}
 
 	sdata->sensor_settings =
 			(struct st_sensor_settings *)&sensor_settings[i];
 
 	return i;
-
-device_not_supported:
-	dev_err(&indio_dev->dev, "device not supported: WhoAmI (0x%x).\n", wai);
-sensor_name_mismatch:
-	err = -ENODEV;
-read_wai_error:
-	return err;
 }
 EXPORT_SYMBOL(st_sensors_check_device_support);
 
diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c
index ffe96642b6d0..4b993a5bc9a1 100644
--- a/drivers/iio/gyro/st_gyro_core.c
+++ b/drivers/iio/gyro/st_gyro_core.c
@@ -131,6 +131,7 @@ static const struct iio_chan_spec st_gyro_16bit_channels[] = {
 static const struct st_sensor_settings st_gyro_sensors_settings[] = {
 	{
 		.wai = ST_GYRO_1_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = L3G4200D_GYRO_DEV_NAME,
 			[1] = LSM330DL_GYRO_DEV_NAME,
@@ -190,6 +191,7 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = {
 	},
 	{
 		.wai = ST_GYRO_2_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = L3GD20_GYRO_DEV_NAME,
 			[1] = LSM330D_GYRO_DEV_NAME,
@@ -252,6 +254,7 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = {
 	},
 	{
 		.wai = ST_GYRO_3_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = L3GD20_GYRO_DEV_NAME,
 		},
diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c
index b4bcfb790f49..8d7d3a172874 100644
--- a/drivers/iio/magnetometer/st_magn_core.c
+++ b/drivers/iio/magnetometer/st_magn_core.c
@@ -192,6 +192,7 @@ static const struct iio_chan_spec st_magn_2_16bit_channels[] = {
 static const struct st_sensor_settings st_magn_sensors_settings[] = {
 	{
 		.wai = 0, /* This sensor has no valid WhoAmI report 0 */
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LSM303DLH_MAGN_DEV_NAME,
 		},
@@ -268,6 +269,7 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = {
 	},
 	{
 		.wai = ST_MAGN_1_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LSM303DLHC_MAGN_DEV_NAME,
 			[1] = LSM303DLM_MAGN_DEV_NAME,
@@ -346,6 +348,7 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = {
 	},
 	{
 		.wai = ST_MAGN_2_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LIS3MDL_MAGN_DEV_NAME,
 		},
diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c
index e881fa6291e9..eb41d2b92c24 100644
--- a/drivers/iio/pressure/st_pressure_core.c
+++ b/drivers/iio/pressure/st_pressure_core.c
@@ -178,6 +178,7 @@ static const struct iio_chan_spec st_press_lps001wp_channels[] = {
 static const struct st_sensor_settings st_press_sensors_settings[] = {
 	{
 		.wai = ST_PRESS_LPS331AP_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LPS331AP_PRESS_DEV_NAME,
 		},
@@ -225,6 +226,7 @@ static const struct st_sensor_settings st_press_sensors_settings[] = {
 	},
 	{
 		.wai = ST_PRESS_LPS001WP_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LPS001WP_PRESS_DEV_NAME,
 		},
@@ -260,6 +262,7 @@ static const struct st_sensor_settings st_press_sensors_settings[] = {
 	},
 	{
 		.wai = ST_PRESS_LPS25H_WAI_EXP,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
 		.sensors_supported = {
 			[0] = LPS25H_PRESS_DEV_NAME,
 		},
diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index 2c476acb87d9..3c17cd7fdf06 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -166,6 +166,7 @@ struct st_sensor_transfer_function {
 /**
  * struct st_sensor_settings - ST specific sensor settings
  * @wai: Contents of WhoAmI register.
+ * @wai_addr: The address of WhoAmI register.
  * @sensors_supported: List of supported sensors by struct itself.
  * @ch: IIO channels for the sensor.
  * @odr: Output data rate register and ODR list available.
@@ -179,6 +180,7 @@ struct st_sensor_transfer_function {
  */
 struct st_sensor_settings {
 	u8 wai;
+	u8 wai_addr;
 	char sensors_supported[ST_SENSORS_MAX_4WAI][ST_SENSORS_MAX_NAME];
 	struct iio_chan_spec *ch;
 	int num_ch;
-- 
cgit v1.2.3-70-g09d2


From b6830f6df8914faae9561bb245860c21af9b9e9b Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 27 Jun 2015 09:19:00 -0400
Subject: serial: 8250: Split base port operations from universal driver

Refactor base port operations into new file; 8250_port.c.

Legacy irq handling, RSA port support, port storage for universal
driver, driver definition, module parameters and linkage remain in
8250_core.c

The source file split and resulting modules is diagrammed below:

  8250_core.c ====>   8250_core.c __
              \                     \
               \                     +-- 8250.ko (alias 8250_core)
                \     8250_pnp.c  __/     (universal driver)
                 \
                  =>  8250_port.c __
                                    \
                                     +-- 8250_base.ko
                      8250_dma.c  __/     (port operations)

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250.h      |   11 +
 drivers/tty/serial/8250/8250_core.c | 3260 +++--------------------------------
 drivers/tty/serial/8250/8250_port.c | 2898 +++++++++++++++++++++++++++++++
 drivers/tty/serial/8250/Makefile    |    5 +-
 include/linux/serial_8250.h         |    5 +
 5 files changed, 3116 insertions(+), 3063 deletions(-)
 create mode 100644 drivers/tty/serial/8250/8250_port.c

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index c43f74c53cd9..dd233108ec07 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -211,3 +211,14 @@ static inline int ns16550a_goto_highspeed(struct uart_8250_port *up)
 	}
 	return 1;
 }
+
+static inline int serial_index(struct uart_port *port)
+{
+	return port->minor - 64;
+}
+
+#if 0
+#define DEBUG_INTR(fmt...)	printk(fmt)
+#else
+#define DEBUG_INTR(fmt...)	do { } while (0)
+#endif
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 37fff12dd4d0..cfbb9d728e31 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -1,25 +1,23 @@
 /*
- *  Driver for 8250/16550-type serial ports
+ *  Universal/legacy driver for 8250/16550-type serial ports
  *
  *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
  *
  *  Copyright (C) 2001 Russell King.
  *
+ *  Supports: ISA-compatible 8250/16550 ports
+ *	      PNP 8250/16550 ports
+ *	      early_serial_setup() ports
+ *	      userspace-configurable "phantom" ports
+ *	      "serial8250" platform devices
+ *	      serial8250_register_8250_port() ports
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
- *
- * A note about mapbase / membase
- *
- *  mapbase is the physical address of the IO port.
- *  membase is an 'ioremapped' cookie.
  */
 
-#if defined(CONFIG_SERIAL_8250_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
-#define SUPPORT_SYSRQ
-#endif
-
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/ioport.h>
@@ -58,33 +56,10 @@ static unsigned int nr_uarts = CONFIG_SERIAL_8250_RUNTIME_UARTS;
 
 static struct uart_driver serial8250_reg;
 
-static int serial_index(struct uart_port *port)
-{
-	return port->minor - 64;
-}
-
 static unsigned int skip_txen_test; /* force skip of txen test at init time */
 
-/*
- * Debugging.
- */
-#if 0
-#define DEBUG_AUTOCONF(fmt...)	printk(fmt)
-#else
-#define DEBUG_AUTOCONF(fmt...)	do { } while (0)
-#endif
-
-#if 0
-#define DEBUG_INTR(fmt...)	printk(fmt)
-#else
-#define DEBUG_INTR(fmt...)	do { } while (0)
-#endif
-
 #define PASS_LIMIT	512
 
-#define BOTH_EMPTY 	(UART_LSR_TEMT | UART_LSR_THRE)
-
-
 #include <asm/serial.h>
 /*
  * SERIAL_PORT_DFNS tells us about built-in ports that have no
@@ -120,2695 +95,268 @@ static struct hlist_head irq_lists[NR_IRQ_HASH];
 static DEFINE_MUTEX(hash_mutex);	/* Used to walk the hash */
 
 /*
- * Here we define the default xmit fifo size used for each type of UART.
+ * This is the serial driver's interrupt routine.
+ *
+ * Arjan thinks the old way was overly complex, so it got simplified.
+ * Alan disagrees, saying that need the complexity to handle the weird
+ * nature of ISA shared interrupts.  (This is a special exception.)
+ *
+ * In order to handle ISA shared interrupts properly, we need to check
+ * that all ports have been serviced, and therefore the ISA interrupt
+ * line has been de-asserted.
+ *
+ * This means we need to loop through all ports. checking that they
+ * don't have an interrupt pending.
  */
-static const struct serial8250_config uart_config[] = {
-	[PORT_UNKNOWN] = {
-		.name		= "unknown",
-		.fifo_size	= 1,
-		.tx_loadsz	= 1,
-	},
-	[PORT_8250] = {
-		.name		= "8250",
-		.fifo_size	= 1,
-		.tx_loadsz	= 1,
-	},
-	[PORT_16450] = {
-		.name		= "16450",
-		.fifo_size	= 1,
-		.tx_loadsz	= 1,
-	},
-	[PORT_16550] = {
-		.name		= "16550",
-		.fifo_size	= 1,
-		.tx_loadsz	= 1,
-	},
-	[PORT_16550A] = {
-		.name		= "16550A",
-		.fifo_size	= 16,
-		.tx_loadsz	= 16,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.rxtrig_bytes	= {1, 4, 8, 14},
-		.flags		= UART_CAP_FIFO,
-	},
-	[PORT_CIRRUS] = {
-		.name		= "Cirrus",
-		.fifo_size	= 1,
-		.tx_loadsz	= 1,
-	},
-	[PORT_16650] = {
-		.name		= "ST16650",
-		.fifo_size	= 1,
-		.tx_loadsz	= 1,
-		.flags		= UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
-	},
-	[PORT_16650V2] = {
-		.name		= "ST16650V2",
-		.fifo_size	= 32,
-		.tx_loadsz	= 16,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
-				  UART_FCR_T_TRIG_00,
-		.rxtrig_bytes	= {8, 16, 24, 28},
-		.flags		= UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
-	},
-	[PORT_16750] = {
-		.name		= "TI16750",
-		.fifo_size	= 64,
-		.tx_loadsz	= 64,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
-				  UART_FCR7_64BYTE,
-		.rxtrig_bytes	= {1, 16, 32, 56},
-		.flags		= UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE,
-	},
-	[PORT_STARTECH] = {
-		.name		= "Startech",
-		.fifo_size	= 1,
-		.tx_loadsz	= 1,
-	},
-	[PORT_16C950] = {
-		.name		= "16C950/954",
-		.fifo_size	= 128,
-		.tx_loadsz	= 128,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		/* UART_CAP_EFR breaks billionon CF bluetooth card. */
-		.flags		= UART_CAP_FIFO | UART_CAP_SLEEP,
-	},
-	[PORT_16654] = {
-		.name		= "ST16654",
-		.fifo_size	= 64,
-		.tx_loadsz	= 32,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
-				  UART_FCR_T_TRIG_10,
-		.rxtrig_bytes	= {8, 16, 56, 60},
-		.flags		= UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
-	},
-	[PORT_16850] = {
-		.name		= "XR16850",
-		.fifo_size	= 128,
-		.tx_loadsz	= 128,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
-	},
-	[PORT_RSA] = {
-		.name		= "RSA",
-		.fifo_size	= 2048,
-		.tx_loadsz	= 2048,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11,
-		.flags		= UART_CAP_FIFO,
-	},
-	[PORT_NS16550A] = {
-		.name		= "NS16550A",
-		.fifo_size	= 16,
-		.tx_loadsz	= 16,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO | UART_NATSEMI,
-	},
-	[PORT_XSCALE] = {
-		.name		= "XScale",
-		.fifo_size	= 32,
-		.tx_loadsz	= 32,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE,
-	},
-	[PORT_OCTEON] = {
-		.name		= "OCTEON",
-		.fifo_size	= 64,
-		.tx_loadsz	= 64,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO,
-	},
-	[PORT_AR7] = {
-		.name		= "AR7",
-		.fifo_size	= 16,
-		.tx_loadsz	= 16,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00,
-		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
-	},
-	[PORT_U6_16550A] = {
-		.name		= "U6_16550A",
-		.fifo_size	= 64,
-		.tx_loadsz	= 64,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
-	},
-	[PORT_TEGRA] = {
-		.name		= "Tegra",
-		.fifo_size	= 32,
-		.tx_loadsz	= 8,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
-				  UART_FCR_T_TRIG_01,
-		.rxtrig_bytes	= {1, 4, 8, 14},
-		.flags		= UART_CAP_FIFO | UART_CAP_RTOIE,
-	},
-	[PORT_XR17D15X] = {
-		.name		= "XR17D15X",
-		.fifo_size	= 64,
-		.tx_loadsz	= 64,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR |
-				  UART_CAP_SLEEP,
-	},
-	[PORT_XR17V35X] = {
-		.name		= "XR17V35X",
-		.fifo_size	= 256,
-		.tx_loadsz	= 256,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 |
-				  UART_FCR_T_TRIG_11,
-		.flags		= UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR |
-				  UART_CAP_SLEEP,
-	},
-	[PORT_LPC3220] = {
-		.name		= "LPC3220",
-		.fifo_size	= 64,
-		.tx_loadsz	= 32,
-		.fcr		= UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO |
-				  UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00,
-		.flags		= UART_CAP_FIFO,
-	},
-	[PORT_BRCM_TRUMANAGE] = {
-		.name		= "TruManage",
-		.fifo_size	= 1,
-		.tx_loadsz	= 1024,
-		.flags		= UART_CAP_HFIFO,
-	},
-	[PORT_8250_CIR] = {
-		.name		= "CIR port"
-	},
-	[PORT_ALTR_16550_F32] = {
-		.name		= "Altera 16550 FIFO32",
-		.fifo_size	= 32,
-		.tx_loadsz	= 32,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
-	},
-	[PORT_ALTR_16550_F64] = {
-		.name		= "Altera 16550 FIFO64",
-		.fifo_size	= 64,
-		.tx_loadsz	= 64,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
-	},
-	[PORT_ALTR_16550_F128] = {
-		.name		= "Altera 16550 FIFO128",
-		.fifo_size	= 128,
-		.tx_loadsz	= 128,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
-		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
-	},
-/* tx_loadsz is set to 63-bytes instead of 64-bytes to implement
-workaround of errata A-008006 which states that tx_loadsz should  be
-configured less than Maximum supported fifo bytes */
-	[PORT_16550A_FSL64] = {
-		.name		= "16550A_FSL64",
-		.fifo_size	= 64,
-		.tx_loadsz	= 63,
-		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
-				  UART_FCR7_64BYTE,
-		.flags		= UART_CAP_FIFO,
-	},
-};
-
-/* Uart divisor latch read */
-static int default_serial_dl_read(struct uart_8250_port *up)
-{
-	return serial_in(up, UART_DLL) | serial_in(up, UART_DLM) << 8;
-}
-
-/* Uart divisor latch write */
-static void default_serial_dl_write(struct uart_8250_port *up, int value)
-{
-	serial_out(up, UART_DLL, value & 0xff);
-	serial_out(up, UART_DLM, value >> 8 & 0xff);
-}
-
-#if defined(CONFIG_MIPS_ALCHEMY) || defined(CONFIG_SERIAL_8250_RT288X)
-
-/* Au1x00/RT288x UART hardware has a weird register layout */
-static const s8 au_io_in_map[8] = {
-	 0,	/* UART_RX  */
-	 2,	/* UART_IER */
-	 3,	/* UART_IIR */
-	 5,	/* UART_LCR */
-	 6,	/* UART_MCR */
-	 7,	/* UART_LSR */
-	 8,	/* UART_MSR */
-	-1,	/* UART_SCR (unmapped) */
-};
-
-static const s8 au_io_out_map[8] = {
-	 1,	/* UART_TX  */
-	 2,	/* UART_IER */
-	 4,	/* UART_FCR */
-	 5,	/* UART_LCR */
-	 6,	/* UART_MCR */
-	-1,	/* UART_LSR (unmapped) */
-	-1,	/* UART_MSR (unmapped) */
-	-1,	/* UART_SCR (unmapped) */
-};
-
-static unsigned int au_serial_in(struct uart_port *p, int offset)
-{
-	if (offset >= ARRAY_SIZE(au_io_in_map))
-		return UINT_MAX;
-	offset = au_io_in_map[offset];
-	if (offset < 0)
-		return UINT_MAX;
-	return __raw_readl(p->membase + (offset << p->regshift));
-}
-
-static void au_serial_out(struct uart_port *p, int offset, int value)
-{
-	if (offset >= ARRAY_SIZE(au_io_out_map))
-		return;
-	offset = au_io_out_map[offset];
-	if (offset < 0)
-		return;
-	__raw_writel(value, p->membase + (offset << p->regshift));
-}
-
-/* Au1x00 haven't got a standard divisor latch */
-static int au_serial_dl_read(struct uart_8250_port *up)
+static irqreturn_t serial8250_interrupt(int irq, void *dev_id)
 {
-	return __raw_readl(up->port.membase + 0x28);
-}
+	struct irq_info *i = dev_id;
+	struct list_head *l, *end = NULL;
+	int pass_counter = 0, handled = 0;
 
-static void au_serial_dl_write(struct uart_8250_port *up, int value)
-{
-	__raw_writel(value, up->port.membase + 0x28);
-}
+	DEBUG_INTR("serial8250_interrupt(%d)...", irq);
 
-#endif
+	spin_lock(&i->lock);
 
-static unsigned int hub6_serial_in(struct uart_port *p, int offset)
-{
-	offset = offset << p->regshift;
-	outb(p->hub6 - 1 + offset, p->iobase);
-	return inb(p->iobase + 1);
-}
+	l = i->head;
+	do {
+		struct uart_8250_port *up;
+		struct uart_port *port;
 
-static void hub6_serial_out(struct uart_port *p, int offset, int value)
-{
-	offset = offset << p->regshift;
-	outb(p->hub6 - 1 + offset, p->iobase);
-	outb(value, p->iobase + 1);
-}
+		up = list_entry(l, struct uart_8250_port, list);
+		port = &up->port;
 
-static unsigned int mem_serial_in(struct uart_port *p, int offset)
-{
-	offset = offset << p->regshift;
-	return readb(p->membase + offset);
-}
+		if (port->handle_irq(port)) {
+			handled = 1;
+			end = NULL;
+		} else if (end == NULL)
+			end = l;
 
-static void mem_serial_out(struct uart_port *p, int offset, int value)
-{
-	offset = offset << p->regshift;
-	writeb(value, p->membase + offset);
-}
+		l = l->next;
 
-static void mem32_serial_out(struct uart_port *p, int offset, int value)
-{
-	offset = offset << p->regshift;
-	writel(value, p->membase + offset);
-}
+		if (l == i->head && pass_counter++ > PASS_LIMIT) {
+			/* If we hit this, we're dead. */
+			printk_ratelimited(KERN_ERR
+				"serial8250: too much work for irq%d\n", irq);
+			break;
+		}
+	} while (l != end);
 
-static unsigned int mem32_serial_in(struct uart_port *p, int offset)
-{
-	offset = offset << p->regshift;
-	return readl(p->membase + offset);
-}
+	spin_unlock(&i->lock);
 
-static void mem32be_serial_out(struct uart_port *p, int offset, int value)
-{
-	offset = offset << p->regshift;
-	iowrite32be(value, p->membase + offset);
-}
+	DEBUG_INTR("end.\n");
 
-static unsigned int mem32be_serial_in(struct uart_port *p, int offset)
-{
-	offset = offset << p->regshift;
-	return ioread32be(p->membase + offset);
+	return IRQ_RETVAL(handled);
 }
 
-static unsigned int io_serial_in(struct uart_port *p, int offset)
+/*
+ * To support ISA shared interrupts, we need to have one interrupt
+ * handler that ensures that the IRQ line has been deasserted
+ * before returning.  Failing to do this will result in the IRQ
+ * line being stuck active, and, since ISA irqs are edge triggered,
+ * no more IRQs will be seen.
+ */
+static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up)
 {
-	offset = offset << p->regshift;
-	return inb(p->iobase + offset);
-}
+	spin_lock_irq(&i->lock);
 
-static void io_serial_out(struct uart_port *p, int offset, int value)
-{
-	offset = offset << p->regshift;
-	outb(value, p->iobase + offset);
+	if (!list_empty(i->head)) {
+		if (i->head == &up->list)
+			i->head = i->head->next;
+		list_del(&up->list);
+	} else {
+		BUG_ON(i->head != &up->list);
+		i->head = NULL;
+	}
+	spin_unlock_irq(&i->lock);
+	/* List empty so throw away the hash node */
+	if (i->head == NULL) {
+		hlist_del(&i->node);
+		kfree(i);
+	}
 }
 
-static int serial8250_default_handle_irq(struct uart_port *port);
-static int exar_handle_irq(struct uart_port *port);
-
-static void set_io_from_upio(struct uart_port *p)
+static int serial_link_irq_chain(struct uart_8250_port *up)
 {
-	struct uart_8250_port *up = up_to_u8250p(p);
-
-	up->dl_read = default_serial_dl_read;
-	up->dl_write = default_serial_dl_write;
+	struct hlist_head *h;
+	struct hlist_node *n;
+	struct irq_info *i;
+	int ret, irq_flags = up->port.flags & UPF_SHARE_IRQ ? IRQF_SHARED : 0;
 
-	switch (p->iotype) {
-	case UPIO_HUB6:
-		p->serial_in = hub6_serial_in;
-		p->serial_out = hub6_serial_out;
-		break;
+	mutex_lock(&hash_mutex);
 
-	case UPIO_MEM:
-		p->serial_in = mem_serial_in;
-		p->serial_out = mem_serial_out;
-		break;
+	h = &irq_lists[up->port.irq % NR_IRQ_HASH];
 
-	case UPIO_MEM32:
-		p->serial_in = mem32_serial_in;
-		p->serial_out = mem32_serial_out;
-		break;
+	hlist_for_each(n, h) {
+		i = hlist_entry(n, struct irq_info, node);
+		if (i->irq == up->port.irq)
+			break;
+	}
 
-	case UPIO_MEM32BE:
-		p->serial_in = mem32be_serial_in;
-		p->serial_out = mem32be_serial_out;
-		break;
+	if (n == NULL) {
+		i = kzalloc(sizeof(struct irq_info), GFP_KERNEL);
+		if (i == NULL) {
+			mutex_unlock(&hash_mutex);
+			return -ENOMEM;
+		}
+		spin_lock_init(&i->lock);
+		i->irq = up->port.irq;
+		hlist_add_head(&i->node, h);
+	}
+	mutex_unlock(&hash_mutex);
 
-#if defined(CONFIG_MIPS_ALCHEMY) || defined(CONFIG_SERIAL_8250_RT288X)
-	case UPIO_AU:
-		p->serial_in = au_serial_in;
-		p->serial_out = au_serial_out;
-		up->dl_read = au_serial_dl_read;
-		up->dl_write = au_serial_dl_write;
-		break;
-#endif
+	spin_lock_irq(&i->lock);
 
-	default:
-		p->serial_in = io_serial_in;
-		p->serial_out = io_serial_out;
-		break;
-	}
-	/* Remember loaded iotype */
-	up->cur_iotype = p->iotype;
-	p->handle_irq = serial8250_default_handle_irq;
-}
+	if (i->head) {
+		list_add(&up->list, i->head);
+		spin_unlock_irq(&i->lock);
 
-static void
-serial_port_out_sync(struct uart_port *p, int offset, int value)
-{
-	switch (p->iotype) {
-	case UPIO_MEM:
-	case UPIO_MEM32:
-	case UPIO_MEM32BE:
-	case UPIO_AU:
-		p->serial_out(p, offset, value);
-		p->serial_in(p, UART_LCR);	/* safe, no side-effects */
-		break;
-	default:
-		p->serial_out(p, offset, value);
+		ret = 0;
+	} else {
+		INIT_LIST_HEAD(&up->list);
+		i->head = &up->list;
+		spin_unlock_irq(&i->lock);
+		irq_flags |= up->port.irqflags;
+		ret = request_irq(up->port.irq, serial8250_interrupt,
+				  irq_flags, "serial", i);
+		if (ret < 0)
+			serial_do_unlink(i, up);
 	}
-}
 
-/*
- * For the 16C950
- */
-static void serial_icr_write(struct uart_8250_port *up, int offset, int value)
-{
-	serial_out(up, UART_SCR, offset);
-	serial_out(up, UART_ICR, value);
+	return ret;
 }
 
-static unsigned int serial_icr_read(struct uart_8250_port *up, int offset)
+static void serial_unlink_irq_chain(struct uart_8250_port *up)
 {
-	unsigned int value;
+	/*
+	 * yes, some broken gcc emit "warning: 'i' may be used uninitialized"
+	 * but no, we are not going to take a patch that assigns NULL below.
+	 */
+	struct irq_info *i;
+	struct hlist_node *n;
+	struct hlist_head *h;
 
-	serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD);
-	serial_out(up, UART_SCR, offset);
-	value = serial_in(up, UART_ICR);
-	serial_icr_write(up, UART_ACR, up->acr);
+	mutex_lock(&hash_mutex);
 
-	return value;
-}
+	h = &irq_lists[up->port.irq % NR_IRQ_HASH];
 
-/*
- * FIFO support.
- */
-static void serial8250_clear_fifos(struct uart_8250_port *p)
-{
-	if (p->capabilities & UART_CAP_FIFO) {
-		serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO);
-		serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO |
-			       UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
-		serial_out(p, UART_FCR, 0);
+	hlist_for_each(n, h) {
+		i = hlist_entry(n, struct irq_info, node);
+		if (i->irq == up->port.irq)
+			break;
 	}
-}
 
-void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p)
-{
-	serial8250_clear_fifos(p);
-	serial_out(p, UART_FCR, p->fcr);
-}
-EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos);
+	BUG_ON(n == NULL);
+	BUG_ON(i->head == NULL);
 
-void serial8250_rpm_get(struct uart_8250_port *p)
-{
-	if (!(p->capabilities & UART_CAP_RPM))
-		return;
-	pm_runtime_get_sync(p->port.dev);
-}
-EXPORT_SYMBOL_GPL(serial8250_rpm_get);
+	if (list_empty(i->head))
+		free_irq(up->port.irq, i);
 
-void serial8250_rpm_put(struct uart_8250_port *p)
-{
-	if (!(p->capabilities & UART_CAP_RPM))
-		return;
-	pm_runtime_mark_last_busy(p->port.dev);
-	pm_runtime_put_autosuspend(p->port.dev);
+	serial_do_unlink(i, up);
+	mutex_unlock(&hash_mutex);
 }
-EXPORT_SYMBOL_GPL(serial8250_rpm_put);
 
 /*
- * These two wrappers ensure that enable_runtime_pm_tx() can be called more than
- * once and disable_runtime_pm_tx() will still disable RPM because the fifo is
- * empty and the HW can idle again.
+ * This function is used to handle ports that do not have an
+ * interrupt.  This doesn't work very well for 16450's, but gives
+ * barely passable results for a 16550A.  (Although at the expense
+ * of much CPU overhead).
  */
-static void serial8250_rpm_get_tx(struct uart_8250_port *p)
+static void serial8250_timeout(unsigned long data)
 {
-	unsigned char rpm_active;
-
-	if (!(p->capabilities & UART_CAP_RPM))
-		return;
+	struct uart_8250_port *up = (struct uart_8250_port *)data;
 
-	rpm_active = xchg(&p->rpm_tx_active, 1);
-	if (rpm_active)
-		return;
-	pm_runtime_get_sync(p->port.dev);
+	up->port.handle_irq(&up->port);
+	mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port));
 }
 
-static void serial8250_rpm_put_tx(struct uart_8250_port *p)
+static void serial8250_backup_timeout(unsigned long data)
 {
-	unsigned char rpm_active;
+	struct uart_8250_port *up = (struct uart_8250_port *)data;
+	unsigned int iir, ier = 0, lsr;
+	unsigned long flags;
 
-	if (!(p->capabilities & UART_CAP_RPM))
-		return;
-
-	rpm_active = xchg(&p->rpm_tx_active, 0);
-	if (!rpm_active)
-		return;
-	pm_runtime_mark_last_busy(p->port.dev);
-	pm_runtime_put_autosuspend(p->port.dev);
-}
-
-/*
- * IER sleep support.  UARTs which have EFRs need the "extended
- * capability" bit enabled.  Note that on XR16C850s, we need to
- * reset LCR to write to IER.
- */
-static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
-{
-	unsigned char lcr = 0, efr = 0;
-	/*
-	 * Exar UARTs have a SLEEP register that enables or disables
-	 * each UART to enter sleep mode separately.  On the XR17V35x the
-	 * register is accessible to each UART at the UART_EXAR_SLEEP
-	 * offset but the UART channel may only write to the corresponding
-	 * bit.
-	 */
-	serial8250_rpm_get(p);
-	if ((p->port.type == PORT_XR17V35X) ||
-	   (p->port.type == PORT_XR17D15X)) {
-		serial_out(p, UART_EXAR_SLEEP, sleep ? 0xff : 0);
-		goto out;
-	}
-
-	if (p->capabilities & UART_CAP_SLEEP) {
-		if (p->capabilities & UART_CAP_EFR) {
-			lcr = serial_in(p, UART_LCR);
-			efr = serial_in(p, UART_EFR);
-			serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
-			serial_out(p, UART_EFR, UART_EFR_ECB);
-			serial_out(p, UART_LCR, 0);
-		}
-		serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);
-		if (p->capabilities & UART_CAP_EFR) {
-			serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
-			serial_out(p, UART_EFR, efr);
-			serial_out(p, UART_LCR, lcr);
-		}
-	}
-out:
-	serial8250_rpm_put(p);
-}
-
-#ifdef CONFIG_SERIAL_8250_RSA
-/*
- * Attempts to turn on the RSA FIFO.  Returns zero on failure.
- * We set the port uart clock rate if we succeed.
- */
-static int __enable_rsa(struct uart_8250_port *up)
-{
-	unsigned char mode;
-	int result;
-
-	mode = serial_in(up, UART_RSA_MSR);
-	result = mode & UART_RSA_MSR_FIFO;
-
-	if (!result) {
-		serial_out(up, UART_RSA_MSR, mode | UART_RSA_MSR_FIFO);
-		mode = serial_in(up, UART_RSA_MSR);
-		result = mode & UART_RSA_MSR_FIFO;
-	}
-
-	if (result)
-		up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16;
-
-	return result;
-}
-
-static void enable_rsa(struct uart_8250_port *up)
-{
-	if (up->port.type == PORT_RSA) {
-		if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) {
-			spin_lock_irq(&up->port.lock);
-			__enable_rsa(up);
-			spin_unlock_irq(&up->port.lock);
-		}
-		if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16)
-			serial_out(up, UART_RSA_FRR, 0);
-	}
-}
-
-/*
- * Attempts to turn off the RSA FIFO.  Returns zero on failure.
- * It is unknown why interrupts were disabled in here.  However,
- * the caller is expected to preserve this behaviour by grabbing
- * the spinlock before calling this function.
- */
-static void disable_rsa(struct uart_8250_port *up)
-{
-	unsigned char mode;
-	int result;
-
-	if (up->port.type == PORT_RSA &&
-	    up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) {
-		spin_lock_irq(&up->port.lock);
-
-		mode = serial_in(up, UART_RSA_MSR);
-		result = !(mode & UART_RSA_MSR_FIFO);
-
-		if (!result) {
-			serial_out(up, UART_RSA_MSR, mode & ~UART_RSA_MSR_FIFO);
-			mode = serial_in(up, UART_RSA_MSR);
-			result = !(mode & UART_RSA_MSR_FIFO);
-		}
-
-		if (result)
-			up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16;
-		spin_unlock_irq(&up->port.lock);
-	}
-}
-#endif /* CONFIG_SERIAL_8250_RSA */
-
-/*
- * This is a quickie test to see how big the FIFO is.
- * It doesn't work at all the time, more's the pity.
- */
-static int size_fifo(struct uart_8250_port *up)
-{
-	unsigned char old_fcr, old_mcr, old_lcr;
-	unsigned short old_dl;
-	int count;
-
-	old_lcr = serial_in(up, UART_LCR);
-	serial_out(up, UART_LCR, 0);
-	old_fcr = serial_in(up, UART_FCR);
-	old_mcr = serial_in(up, UART_MCR);
-	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
-		    UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
-	serial_out(up, UART_MCR, UART_MCR_LOOP);
-	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
-	old_dl = serial_dl_read(up);
-	serial_dl_write(up, 0x0001);
-	serial_out(up, UART_LCR, 0x03);
-	for (count = 0; count < 256; count++)
-		serial_out(up, UART_TX, count);
-	mdelay(20);/* FIXME - schedule_timeout */
-	for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) &&
-	     (count < 256); count++)
-		serial_in(up, UART_RX);
-	serial_out(up, UART_FCR, old_fcr);
-	serial_out(up, UART_MCR, old_mcr);
-	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
-	serial_dl_write(up, old_dl);
-	serial_out(up, UART_LCR, old_lcr);
-
-	return count;
-}
-
-/*
- * Read UART ID using the divisor method - set DLL and DLM to zero
- * and the revision will be in DLL and device type in DLM.  We
- * preserve the device state across this.
- */
-static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p)
-{
-	unsigned char old_dll, old_dlm, old_lcr;
-	unsigned int id;
-
-	old_lcr = serial_in(p, UART_LCR);
-	serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A);
-
-	old_dll = serial_in(p, UART_DLL);
-	old_dlm = serial_in(p, UART_DLM);
-
-	serial_out(p, UART_DLL, 0);
-	serial_out(p, UART_DLM, 0);
-
-	id = serial_in(p, UART_DLL) | serial_in(p, UART_DLM) << 8;
-
-	serial_out(p, UART_DLL, old_dll);
-	serial_out(p, UART_DLM, old_dlm);
-	serial_out(p, UART_LCR, old_lcr);
-
-	return id;
-}
-
-/*
- * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's.
- * When this function is called we know it is at least a StarTech
- * 16650 V2, but it might be one of several StarTech UARTs, or one of
- * its clones.  (We treat the broken original StarTech 16650 V1 as a
- * 16550, and why not?  Startech doesn't seem to even acknowledge its
- * existence.)
- *
- * What evil have men's minds wrought...
- */
-static void autoconfig_has_efr(struct uart_8250_port *up)
-{
-	unsigned int id1, id2, id3, rev;
-
-	/*
-	 * Everything with an EFR has SLEEP
-	 */
-	up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP;
-
-	/*
-	 * First we check to see if it's an Oxford Semiconductor UART.
-	 *
-	 * If we have to do this here because some non-National
-	 * Semiconductor clone chips lock up if you try writing to the
-	 * LSR register (which serial_icr_read does)
-	 */
-
-	/*
-	 * Check for Oxford Semiconductor 16C950.
-	 *
-	 * EFR [4] must be set else this test fails.
-	 *
-	 * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca)
-	 * claims that it's needed for 952 dual UART's (which are not
-	 * recommended for new designs).
-	 */
-	up->acr = 0;
-	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
-	serial_out(up, UART_EFR, UART_EFR_ECB);
-	serial_out(up, UART_LCR, 0x00);
-	id1 = serial_icr_read(up, UART_ID1);
-	id2 = serial_icr_read(up, UART_ID2);
-	id3 = serial_icr_read(up, UART_ID3);
-	rev = serial_icr_read(up, UART_REV);
-
-	DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev);
-
-	if (id1 == 0x16 && id2 == 0xC9 &&
-	    (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) {
-		up->port.type = PORT_16C950;
-
-		/*
-		 * Enable work around for the Oxford Semiconductor 952 rev B
-		 * chip which causes it to seriously miscalculate baud rates
-		 * when DLL is 0.
-		 */
-		if (id3 == 0x52 && rev == 0x01)
-			up->bugs |= UART_BUG_QUOT;
-		return;
-	}
-
-	/*
-	 * We check for a XR16C850 by setting DLL and DLM to 0, and then
-	 * reading back DLL and DLM.  The chip type depends on the DLM
-	 * value read back:
-	 *  0x10 - XR16C850 and the DLL contains the chip revision.
-	 *  0x12 - XR16C2850.
-	 *  0x14 - XR16C854.
-	 */
-	id1 = autoconfig_read_divisor_id(up);
-	DEBUG_AUTOCONF("850id=%04x ", id1);
-
-	id2 = id1 >> 8;
-	if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) {
-		up->port.type = PORT_16850;
-		return;
-	}
-
-	/*
-	 * It wasn't an XR16C850.
-	 *
-	 * We distinguish between the '654 and the '650 by counting
-	 * how many bytes are in the FIFO.  I'm using this for now,
-	 * since that's the technique that was sent to me in the
-	 * serial driver update, but I'm not convinced this works.
-	 * I've had problems doing this in the past.  -TYT
-	 */
-	if (size_fifo(up) == 64)
-		up->port.type = PORT_16654;
-	else
-		up->port.type = PORT_16650V2;
-}
-
-/*
- * We detected a chip without a FIFO.  Only two fall into
- * this category - the original 8250 and the 16450.  The
- * 16450 has a scratch register (accessible with LCR=0)
- */
-static void autoconfig_8250(struct uart_8250_port *up)
-{
-	unsigned char scratch, status1, status2;
-
-	up->port.type = PORT_8250;
-
-	scratch = serial_in(up, UART_SCR);
-	serial_out(up, UART_SCR, 0xa5);
-	status1 = serial_in(up, UART_SCR);
-	serial_out(up, UART_SCR, 0x5a);
-	status2 = serial_in(up, UART_SCR);
-	serial_out(up, UART_SCR, scratch);
-
-	if (status1 == 0xa5 && status2 == 0x5a)
-		up->port.type = PORT_16450;
-}
-
-static int broken_efr(struct uart_8250_port *up)
-{
-	/*
-	 * Exar ST16C2550 "A2" devices incorrectly detect as
-	 * having an EFR, and report an ID of 0x0201.  See
-	 * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html
-	 */
-	if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16)
-		return 1;
-
-	return 0;
-}
-
-/*
- * We know that the chip has FIFOs.  Does it have an EFR?  The
- * EFR is located in the same register position as the IIR and
- * we know the top two bits of the IIR are currently set.  The
- * EFR should contain zero.  Try to read the EFR.
- */
-static void autoconfig_16550a(struct uart_8250_port *up)
-{
-	unsigned char status1, status2;
-	unsigned int iersave;
-
-	up->port.type = PORT_16550A;
-	up->capabilities |= UART_CAP_FIFO;
-
-	/*
-	 * XR17V35x UARTs have an extra divisor register, DLD
-	 * that gets enabled with when DLAB is set which will
-	 * cause the device to incorrectly match and assign
-	 * port type to PORT_16650.  The EFR for this UART is
-	 * found at offset 0x09. Instead check the Deice ID (DVID)
-	 * register for a 2, 4 or 8 port UART.
-	 */
-	if (up->port.flags & UPF_EXAR_EFR) {
-		status1 = serial_in(up, UART_EXAR_DVID);
-		if (status1 == 0x82 || status1 == 0x84 || status1 == 0x88) {
-			DEBUG_AUTOCONF("Exar XR17V35x ");
-			up->port.type = PORT_XR17V35X;
-			up->capabilities |= UART_CAP_AFE | UART_CAP_EFR |
-						UART_CAP_SLEEP;
-
-			return;
-		}
-
-	}
-
-	/*
-	 * Check for presence of the EFR when DLAB is set.
-	 * Only ST16C650V1 UARTs pass this test.
-	 */
-	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
-	if (serial_in(up, UART_EFR) == 0) {
-		serial_out(up, UART_EFR, 0xA8);
-		if (serial_in(up, UART_EFR) != 0) {
-			DEBUG_AUTOCONF("EFRv1 ");
-			up->port.type = PORT_16650;
-			up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP;
-		} else {
-			serial_out(up, UART_LCR, 0);
-			serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
-				   UART_FCR7_64BYTE);
-			status1 = serial_in(up, UART_IIR) >> 5;
-			serial_out(up, UART_FCR, 0);
-			serial_out(up, UART_LCR, 0);
-
-			if (status1 == 7)
-				up->port.type = PORT_16550A_FSL64;
-			else
-				DEBUG_AUTOCONF("Motorola 8xxx DUART ");
-		}
-		serial_out(up, UART_EFR, 0);
-		return;
-	}
-
-	/*
-	 * Maybe it requires 0xbf to be written to the LCR.
-	 * (other ST16C650V2 UARTs, TI16C752A, etc)
-	 */
-	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
-	if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) {
-		DEBUG_AUTOCONF("EFRv2 ");
-		autoconfig_has_efr(up);
-		return;
-	}
-
-	/*
-	 * Check for a National Semiconductor SuperIO chip.
-	 * Attempt to switch to bank 2, read the value of the LOOP bit
-	 * from EXCR1. Switch back to bank 0, change it in MCR. Then
-	 * switch back to bank 2, read it from EXCR1 again and check
-	 * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2
-	 */
-	serial_out(up, UART_LCR, 0);
-	status1 = serial_in(up, UART_MCR);
-	serial_out(up, UART_LCR, 0xE0);
-	status2 = serial_in(up, 0x02); /* EXCR1 */
-
-	if (!((status2 ^ status1) & UART_MCR_LOOP)) {
-		serial_out(up, UART_LCR, 0);
-		serial_out(up, UART_MCR, status1 ^ UART_MCR_LOOP);
-		serial_out(up, UART_LCR, 0xE0);
-		status2 = serial_in(up, 0x02); /* EXCR1 */
-		serial_out(up, UART_LCR, 0);
-		serial_out(up, UART_MCR, status1);
-
-		if ((status2 ^ status1) & UART_MCR_LOOP) {
-			unsigned short quot;
-
-			serial_out(up, UART_LCR, 0xE0);
-
-			quot = serial_dl_read(up);
-			quot <<= 3;
-
-			if (ns16550a_goto_highspeed(up))
-				serial_dl_write(up, quot);
-
-			serial_out(up, UART_LCR, 0);
-
-			up->port.uartclk = 921600*16;
-			up->port.type = PORT_NS16550A;
-			up->capabilities |= UART_NATSEMI;
-			return;
-		}
-	}
-
-	/*
-	 * No EFR.  Try to detect a TI16750, which only sets bit 5 of
-	 * the IIR when 64 byte FIFO mode is enabled when DLAB is set.
-	 * Try setting it with and without DLAB set.  Cheap clones
-	 * set bit 5 without DLAB set.
-	 */
-	serial_out(up, UART_LCR, 0);
-	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE);
-	status1 = serial_in(up, UART_IIR) >> 5;
-	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);
-	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
-	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE);
-	status2 = serial_in(up, UART_IIR) >> 5;
-	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);
-	serial_out(up, UART_LCR, 0);
-
-	DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2);
-
-	if (status1 == 6 && status2 == 7) {
-		up->port.type = PORT_16750;
-		up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP;
-		return;
-	}
-
-	/*
-	 * Try writing and reading the UART_IER_UUE bit (b6).
-	 * If it works, this is probably one of the Xscale platform's
-	 * internal UARTs.
-	 * We're going to explicitly set the UUE bit to 0 before
-	 * trying to write and read a 1 just to make sure it's not
-	 * already a 1 and maybe locked there before we even start start.
-	 */
-	iersave = serial_in(up, UART_IER);
-	serial_out(up, UART_IER, iersave & ~UART_IER_UUE);
-	if (!(serial_in(up, UART_IER) & UART_IER_UUE)) {
-		/*
-		 * OK it's in a known zero state, try writing and reading
-		 * without disturbing the current state of the other bits.
-		 */
-		serial_out(up, UART_IER, iersave | UART_IER_UUE);
-		if (serial_in(up, UART_IER) & UART_IER_UUE) {
-			/*
-			 * It's an Xscale.
-			 * We'll leave the UART_IER_UUE bit set to 1 (enabled).
-			 */
-			DEBUG_AUTOCONF("Xscale ");
-			up->port.type = PORT_XSCALE;
-			up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE;
-			return;
-		}
-	} else {
-		/*
-		 * If we got here we couldn't force the IER_UUE bit to 0.
-		 * Log it and continue.
-		 */
-		DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 ");
-	}
-	serial_out(up, UART_IER, iersave);
-
-	/*
-	 * Exar uarts have EFR in a weird location
-	 */
-	if (up->port.flags & UPF_EXAR_EFR) {
-		DEBUG_AUTOCONF("Exar XR17D15x ");
-		up->port.type = PORT_XR17D15X;
-		up->capabilities |= UART_CAP_AFE | UART_CAP_EFR |
-				    UART_CAP_SLEEP;
-
-		return;
-	}
-
-	/*
-	 * We distinguish between 16550A and U6 16550A by counting
-	 * how many bytes are in the FIFO.
-	 */
-	if (up->port.type == PORT_16550A && size_fifo(up) == 64) {
-		up->port.type = PORT_U6_16550A;
-		up->capabilities |= UART_CAP_AFE;
-	}
-}
-
-/*
- * This routine is called by rs_init() to initialize a specific serial
- * port.  It determines what type of UART chip this serial port is
- * using: 8250, 16450, 16550, 16550A.  The important question is
- * whether or not this UART is a 16550A or not, since this will
- * determine whether or not we can use its FIFO features or not.
- */
-static void autoconfig(struct uart_8250_port *up)
-{
-	unsigned char status1, scratch, scratch2, scratch3;
-	unsigned char save_lcr, save_mcr;
-	struct uart_port *port = &up->port;
-	unsigned long flags;
-	unsigned int old_capabilities;
-
-	if (!port->iobase && !port->mapbase && !port->membase)
-		return;
-
-	DEBUG_AUTOCONF("ttyS%d: autoconf (0x%04lx, 0x%p): ",
-		       serial_index(port), port->iobase, port->membase);
-
-	/*
-	 * We really do need global IRQs disabled here - we're going to
-	 * be frobbing the chips IRQ enable register to see if it exists.
-	 */
-	spin_lock_irqsave(&port->lock, flags);
-
-	up->capabilities = 0;
-	up->bugs = 0;
-
-	if (!(port->flags & UPF_BUGGY_UART)) {
-		/*
-		 * Do a simple existence test first; if we fail this,
-		 * there's no point trying anything else.
-		 *
-		 * 0x80 is used as a nonsense port to prevent against
-		 * false positives due to ISA bus float.  The
-		 * assumption is that 0x80 is a non-existent port;
-		 * which should be safe since include/asm/io.h also
-		 * makes this assumption.
-		 *
-		 * Note: this is safe as long as MCR bit 4 is clear
-		 * and the device is in "PC" mode.
-		 */
-		scratch = serial_in(up, UART_IER);
-		serial_out(up, UART_IER, 0);
-#ifdef __i386__
-		outb(0xff, 0x080);
-#endif
-		/*
-		 * Mask out IER[7:4] bits for test as some UARTs (e.g. TL
-		 * 16C754B) allow only to modify them if an EFR bit is set.
-		 */
-		scratch2 = serial_in(up, UART_IER) & 0x0f;
-		serial_out(up, UART_IER, 0x0F);
-#ifdef __i386__
-		outb(0, 0x080);
-#endif
-		scratch3 = serial_in(up, UART_IER) & 0x0f;
-		serial_out(up, UART_IER, scratch);
-		if (scratch2 != 0 || scratch3 != 0x0F) {
-			/*
-			 * We failed; there's nothing here
-			 */
-			spin_unlock_irqrestore(&port->lock, flags);
-			DEBUG_AUTOCONF("IER test failed (%02x, %02x) ",
-				       scratch2, scratch3);
-			goto out;
-		}
-	}
-
-	save_mcr = serial_in(up, UART_MCR);
-	save_lcr = serial_in(up, UART_LCR);
-
-	/*
-	 * Check to see if a UART is really there.  Certain broken
-	 * internal modems based on the Rockwell chipset fail this
-	 * test, because they apparently don't implement the loopback
-	 * test mode.  So this test is skipped on the COM 1 through
-	 * COM 4 ports.  This *should* be safe, since no board
-	 * manufacturer would be stupid enough to design a board
-	 * that conflicts with COM 1-4 --- we hope!
-	 */
-	if (!(port->flags & UPF_SKIP_TEST)) {
-		serial_out(up, UART_MCR, UART_MCR_LOOP | 0x0A);
-		status1 = serial_in(up, UART_MSR) & 0xF0;
-		serial_out(up, UART_MCR, save_mcr);
-		if (status1 != 0x90) {
-			spin_unlock_irqrestore(&port->lock, flags);
-			DEBUG_AUTOCONF("LOOP test failed (%02x) ",
-				       status1);
-			goto out;
-		}
-	}
-
-	/*
-	 * We're pretty sure there's a port here.  Lets find out what
-	 * type of port it is.  The IIR top two bits allows us to find
-	 * out if it's 8250 or 16450, 16550, 16550A or later.  This
-	 * determines what we test for next.
-	 *
-	 * We also initialise the EFR (if any) to zero for later.  The
-	 * EFR occupies the same register location as the FCR and IIR.
-	 */
-	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
-	serial_out(up, UART_EFR, 0);
-	serial_out(up, UART_LCR, 0);
-
-	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);
-	scratch = serial_in(up, UART_IIR) >> 6;
-
-	switch (scratch) {
-	case 0:
-		autoconfig_8250(up);
-		break;
-	case 1:
-		port->type = PORT_UNKNOWN;
-		break;
-	case 2:
-		port->type = PORT_16550;
-		break;
-	case 3:
-		autoconfig_16550a(up);
-		break;
-	}
-
-#ifdef CONFIG_SERIAL_8250_RSA
-	/*
-	 * Only probe for RSA ports if we got the region.
-	 */
-	if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA &&
-	    __enable_rsa(up))
-		port->type = PORT_RSA;
-#endif
-
-	serial_out(up, UART_LCR, save_lcr);
-
-	port->fifosize = uart_config[up->port.type].fifo_size;
-	old_capabilities = up->capabilities;
-	up->capabilities = uart_config[port->type].flags;
-	up->tx_loadsz = uart_config[port->type].tx_loadsz;
-
-	if (port->type == PORT_UNKNOWN)
-		goto out_lock;
-
-	/*
-	 * Reset the UART.
-	 */
-#ifdef CONFIG_SERIAL_8250_RSA
-	if (port->type == PORT_RSA)
-		serial_out(up, UART_RSA_FRR, 0);
-#endif
-	serial_out(up, UART_MCR, save_mcr);
-	serial8250_clear_fifos(up);
-	serial_in(up, UART_RX);
-	if (up->capabilities & UART_CAP_UUE)
-		serial_out(up, UART_IER, UART_IER_UUE);
-	else
-		serial_out(up, UART_IER, 0);
-
-out_lock:
-	spin_unlock_irqrestore(&port->lock, flags);
-	if (up->capabilities != old_capabilities) {
-		printk(KERN_WARNING
-		       "ttyS%d: detected caps %08x should be %08x\n",
-		       serial_index(port), old_capabilities,
-		       up->capabilities);
-	}
-out:
-	DEBUG_AUTOCONF("iir=%d ", scratch);
-	DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name);
-}
-
-static void autoconfig_irq(struct uart_8250_port *up)
-{
-	struct uart_port *port = &up->port;
-	unsigned char save_mcr, save_ier;
-	unsigned char save_ICP = 0;
-	unsigned int ICP = 0;
-	unsigned long irqs;
-	int irq;
-
-	if (port->flags & UPF_FOURPORT) {
-		ICP = (port->iobase & 0xfe0) | 0x1f;
-		save_ICP = inb_p(ICP);
-		outb_p(0x80, ICP);
-		inb_p(ICP);
-	}
-
-	/* forget possible initially masked and pending IRQ */
-	probe_irq_off(probe_irq_on());
-	save_mcr = serial_in(up, UART_MCR);
-	save_ier = serial_in(up, UART_IER);
-	serial_out(up, UART_MCR, UART_MCR_OUT1 | UART_MCR_OUT2);
-
-	irqs = probe_irq_on();
-	serial_out(up, UART_MCR, 0);
-	udelay(10);
-	if (port->flags & UPF_FOURPORT) {
-		serial_out(up, UART_MCR,
-			    UART_MCR_DTR | UART_MCR_RTS);
-	} else {
-		serial_out(up, UART_MCR,
-			    UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2);
-	}
-	serial_out(up, UART_IER, 0x0f);	/* enable all intrs */
-	serial_in(up, UART_LSR);
-	serial_in(up, UART_RX);
-	serial_in(up, UART_IIR);
-	serial_in(up, UART_MSR);
-	serial_out(up, UART_TX, 0xFF);
-	udelay(20);
-	irq = probe_irq_off(irqs);
-
-	serial_out(up, UART_MCR, save_mcr);
-	serial_out(up, UART_IER, save_ier);
-
-	if (port->flags & UPF_FOURPORT)
-		outb_p(save_ICP, ICP);
-
-	port->irq = (irq > 0) ? irq : 0;
-}
-
-static inline void __stop_tx(struct uart_8250_port *p)
-{
-	if (p->ier & UART_IER_THRI) {
-		p->ier &= ~UART_IER_THRI;
-		serial_out(p, UART_IER, p->ier);
-		serial8250_rpm_put_tx(p);
-	}
-}
-
-static void serial8250_stop_tx(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-
-	serial8250_rpm_get(up);
-	__stop_tx(up);
-
-	/*
-	 * We really want to stop the transmitter from sending.
-	 */
-	if (port->type == PORT_16C950) {
-		up->acr |= UART_ACR_TXDIS;
-		serial_icr_write(up, UART_ACR, up->acr);
-	}
-	serial8250_rpm_put(up);
-}
-
-static void serial8250_start_tx(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-
-	serial8250_rpm_get_tx(up);
-
-	if (up->dma && !up->dma->tx_dma(up))
-		return;
-
-	if (!(up->ier & UART_IER_THRI)) {
-		up->ier |= UART_IER_THRI;
-		serial_port_out(port, UART_IER, up->ier);
-
-		if (up->bugs & UART_BUG_TXEN) {
-			unsigned char lsr;
-			lsr = serial_in(up, UART_LSR);
-			up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
-			if (lsr & UART_LSR_THRE)
-				serial8250_tx_chars(up);
-		}
-	}
-
-	/*
-	 * Re-enable the transmitter if we disabled it.
-	 */
-	if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) {
-		up->acr &= ~UART_ACR_TXDIS;
-		serial_icr_write(up, UART_ACR, up->acr);
-	}
-}
-
-static void serial8250_throttle(struct uart_port *port)
-{
-	port->throttle(port);
-}
-
-static void serial8250_unthrottle(struct uart_port *port)
-{
-	port->unthrottle(port);
-}
-
-static void serial8250_stop_rx(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-
-	serial8250_rpm_get(up);
-
-	up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
-	up->port.read_status_mask &= ~UART_LSR_DR;
-	serial_port_out(port, UART_IER, up->ier);
-
-	serial8250_rpm_put(up);
-}
-
-static void serial8250_disable_ms(struct uart_port *port)
-{
-	struct uart_8250_port *up =
-		container_of(port, struct uart_8250_port, port);
-
-	/* no MSR capabilities */
-	if (up->bugs & UART_BUG_NOMSR)
-		return;
-
-	up->ier &= ~UART_IER_MSI;
-	serial_port_out(port, UART_IER, up->ier);
-}
-
-static void serial8250_enable_ms(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-
-	/* no MSR capabilities */
-	if (up->bugs & UART_BUG_NOMSR)
-		return;
-
-	up->ier |= UART_IER_MSI;
-
-	serial8250_rpm_get(up);
-	serial_port_out(port, UART_IER, up->ier);
-	serial8250_rpm_put(up);
-}
-
-/*
- * serial8250_rx_chars: processes according to the passed in LSR
- * value, and returns the remaining LSR bits not handled
- * by this Rx routine.
- */
-unsigned char
-serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr)
-{
-	struct uart_port *port = &up->port;
-	unsigned char ch;
-	int max_count = 256;
-	char flag;
-
-	do {
-		if (likely(lsr & UART_LSR_DR))
-			ch = serial_in(up, UART_RX);
-		else
-			/*
-			 * Intel 82571 has a Serial Over Lan device that will
-			 * set UART_LSR_BI without setting UART_LSR_DR when
-			 * it receives a break. To avoid reading from the
-			 * receive buffer without UART_LSR_DR bit set, we
-			 * just force the read character to be 0
-			 */
-			ch = 0;
-
-		flag = TTY_NORMAL;
-		port->icount.rx++;
-
-		lsr |= up->lsr_saved_flags;
-		up->lsr_saved_flags = 0;
-
-		if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) {
-			if (lsr & UART_LSR_BI) {
-				lsr &= ~(UART_LSR_FE | UART_LSR_PE);
-				port->icount.brk++;
-				/*
-				 * We do the SysRQ and SAK checking
-				 * here because otherwise the break
-				 * may get masked by ignore_status_mask
-				 * or read_status_mask.
-				 */
-				if (uart_handle_break(port))
-					goto ignore_char;
-			} else if (lsr & UART_LSR_PE)
-				port->icount.parity++;
-			else if (lsr & UART_LSR_FE)
-				port->icount.frame++;
-			if (lsr & UART_LSR_OE)
-				port->icount.overrun++;
-
-			/*
-			 * Mask off conditions which should be ignored.
-			 */
-			lsr &= port->read_status_mask;
-
-			if (lsr & UART_LSR_BI) {
-				DEBUG_INTR("handling break....");
-				flag = TTY_BREAK;
-			} else if (lsr & UART_LSR_PE)
-				flag = TTY_PARITY;
-			else if (lsr & UART_LSR_FE)
-				flag = TTY_FRAME;
-		}
-		if (uart_handle_sysrq_char(port, ch))
-			goto ignore_char;
-
-		uart_insert_char(port, lsr, UART_LSR_OE, ch, flag);
-
-ignore_char:
-		lsr = serial_in(up, UART_LSR);
-	} while ((lsr & (UART_LSR_DR | UART_LSR_BI)) && (--max_count > 0));
-	spin_unlock(&port->lock);
-	tty_flip_buffer_push(&port->state->port);
-	spin_lock(&port->lock);
-	return lsr;
-}
-EXPORT_SYMBOL_GPL(serial8250_rx_chars);
-
-void serial8250_tx_chars(struct uart_8250_port *up)
-{
-	struct uart_port *port = &up->port;
-	struct circ_buf *xmit = &port->state->xmit;
-	int count;
-
-	if (port->x_char) {
-		serial_out(up, UART_TX, port->x_char);
-		port->icount.tx++;
-		port->x_char = 0;
-		return;
-	}
-	if (uart_tx_stopped(port)) {
-		serial8250_stop_tx(port);
-		return;
-	}
-	if (uart_circ_empty(xmit)) {
-		__stop_tx(up);
-		return;
-	}
-
-	count = up->tx_loadsz;
-	do {
-		serial_out(up, UART_TX, xmit->buf[xmit->tail]);
-		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
-		port->icount.tx++;
-		if (uart_circ_empty(xmit))
-			break;
-		if (up->capabilities & UART_CAP_HFIFO) {
-			if ((serial_port_in(port, UART_LSR) & BOTH_EMPTY) !=
-			    BOTH_EMPTY)
-				break;
-		}
-	} while (--count > 0);
-
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
-		uart_write_wakeup(port);
-
-	DEBUG_INTR("THRE...");
-
-	/*
-	 * With RPM enabled, we have to wait until the FIFO is empty before the
-	 * HW can go idle. So we get here once again with empty FIFO and disable
-	 * the interrupt and RPM in __stop_tx()
-	 */
-	if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM))
-		__stop_tx(up);
-}
-EXPORT_SYMBOL_GPL(serial8250_tx_chars);
-
-/* Caller holds uart port lock */
-unsigned int serial8250_modem_status(struct uart_8250_port *up)
-{
-	struct uart_port *port = &up->port;
-	unsigned int status = serial_in(up, UART_MSR);
-
-	status |= up->msr_saved_flags;
-	up->msr_saved_flags = 0;
-	if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI &&
-	    port->state != NULL) {
-		if (status & UART_MSR_TERI)
-			port->icount.rng++;
-		if (status & UART_MSR_DDSR)
-			port->icount.dsr++;
-		if (status & UART_MSR_DDCD)
-			uart_handle_dcd_change(port, status & UART_MSR_DCD);
-		if (status & UART_MSR_DCTS)
-			uart_handle_cts_change(port, status & UART_MSR_CTS);
-
-		wake_up_interruptible(&port->state->port.delta_msr_wait);
-	}
-
-	return status;
-}
-EXPORT_SYMBOL_GPL(serial8250_modem_status);
-
-/*
- * This handles the interrupt from one port.
- */
-int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
-{
-	unsigned char status;
-	unsigned long flags;
-	struct uart_8250_port *up = up_to_u8250p(port);
-	int dma_err = 0;
-
-	if (iir & UART_IIR_NO_INT)
-		return 0;
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	status = serial_port_in(port, UART_LSR);
-
-	DEBUG_INTR("status = %x...", status);
-
-	if (status & (UART_LSR_DR | UART_LSR_BI)) {
-		if (up->dma)
-			dma_err = up->dma->rx_dma(up, iir);
-
-		if (!up->dma || dma_err)
-			status = serial8250_rx_chars(up, status);
-	}
-	serial8250_modem_status(up);
-	if ((!up->dma || (up->dma && up->dma->tx_err)) &&
-	    (status & UART_LSR_THRE))
-		serial8250_tx_chars(up);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-	return 1;
-}
-EXPORT_SYMBOL_GPL(serial8250_handle_irq);
-
-static int serial8250_default_handle_irq(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned int iir;
-	int ret;
-
-	serial8250_rpm_get(up);
-
-	iir = serial_port_in(port, UART_IIR);
-	ret = serial8250_handle_irq(port, iir);
-
-	serial8250_rpm_put(up);
-	return ret;
-}
-
-/*
- * These Exar UARTs have an extra interrupt indicator that could
- * fire for a few unimplemented interrupts.  One of which is a
- * wakeup event when coming out of sleep.  Put this here just
- * to be on the safe side that these interrupts don't go unhandled.
- */
-static int exar_handle_irq(struct uart_port *port)
-{
-	unsigned char int0, int1, int2, int3;
-	unsigned int iir = serial_port_in(port, UART_IIR);
-	int ret;
-
-	ret = serial8250_handle_irq(port, iir);
-
-	if ((port->type == PORT_XR17V35X) ||
-	   (port->type == PORT_XR17D15X)) {
-		int0 = serial_port_in(port, 0x80);
-		int1 = serial_port_in(port, 0x81);
-		int2 = serial_port_in(port, 0x82);
-		int3 = serial_port_in(port, 0x83);
-	}
-
-	return ret;
-}
-
-/*
- * This is the serial driver's interrupt routine.
- *
- * Arjan thinks the old way was overly complex, so it got simplified.
- * Alan disagrees, saying that need the complexity to handle the weird
- * nature of ISA shared interrupts.  (This is a special exception.)
- *
- * In order to handle ISA shared interrupts properly, we need to check
- * that all ports have been serviced, and therefore the ISA interrupt
- * line has been de-asserted.
- *
- * This means we need to loop through all ports. checking that they
- * don't have an interrupt pending.
- */
-static irqreturn_t serial8250_interrupt(int irq, void *dev_id)
-{
-	struct irq_info *i = dev_id;
-	struct list_head *l, *end = NULL;
-	int pass_counter = 0, handled = 0;
-
-	DEBUG_INTR("serial8250_interrupt(%d)...", irq);
-
-	spin_lock(&i->lock);
-
-	l = i->head;
-	do {
-		struct uart_8250_port *up;
-		struct uart_port *port;
-
-		up = list_entry(l, struct uart_8250_port, list);
-		port = &up->port;
-
-		if (port->handle_irq(port)) {
-			handled = 1;
-			end = NULL;
-		} else if (end == NULL)
-			end = l;
-
-		l = l->next;
-
-		if (l == i->head && pass_counter++ > PASS_LIMIT) {
-			/* If we hit this, we're dead. */
-			printk_ratelimited(KERN_ERR
-				"serial8250: too much work for irq%d\n", irq);
-			break;
-		}
-	} while (l != end);
-
-	spin_unlock(&i->lock);
-
-	DEBUG_INTR("end.\n");
-
-	return IRQ_RETVAL(handled);
-}
-
-/*
- * To support ISA shared interrupts, we need to have one interrupt
- * handler that ensures that the IRQ line has been deasserted
- * before returning.  Failing to do this will result in the IRQ
- * line being stuck active, and, since ISA irqs are edge triggered,
- * no more IRQs will be seen.
- */
-static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up)
-{
-	spin_lock_irq(&i->lock);
-
-	if (!list_empty(i->head)) {
-		if (i->head == &up->list)
-			i->head = i->head->next;
-		list_del(&up->list);
-	} else {
-		BUG_ON(i->head != &up->list);
-		i->head = NULL;
-	}
-	spin_unlock_irq(&i->lock);
-	/* List empty so throw away the hash node */
-	if (i->head == NULL) {
-		hlist_del(&i->node);
-		kfree(i);
-	}
-}
-
-static int serial_link_irq_chain(struct uart_8250_port *up)
-{
-	struct hlist_head *h;
-	struct hlist_node *n;
-	struct irq_info *i;
-	int ret, irq_flags = up->port.flags & UPF_SHARE_IRQ ? IRQF_SHARED : 0;
-
-	mutex_lock(&hash_mutex);
-
-	h = &irq_lists[up->port.irq % NR_IRQ_HASH];
-
-	hlist_for_each(n, h) {
-		i = hlist_entry(n, struct irq_info, node);
-		if (i->irq == up->port.irq)
-			break;
-	}
-
-	if (n == NULL) {
-		i = kzalloc(sizeof(struct irq_info), GFP_KERNEL);
-		if (i == NULL) {
-			mutex_unlock(&hash_mutex);
-			return -ENOMEM;
-		}
-		spin_lock_init(&i->lock);
-		i->irq = up->port.irq;
-		hlist_add_head(&i->node, h);
-	}
-	mutex_unlock(&hash_mutex);
-
-	spin_lock_irq(&i->lock);
-
-	if (i->head) {
-		list_add(&up->list, i->head);
-		spin_unlock_irq(&i->lock);
-
-		ret = 0;
-	} else {
-		INIT_LIST_HEAD(&up->list);
-		i->head = &up->list;
-		spin_unlock_irq(&i->lock);
-		irq_flags |= up->port.irqflags;
-		ret = request_irq(up->port.irq, serial8250_interrupt,
-				  irq_flags, "serial", i);
-		if (ret < 0)
-			serial_do_unlink(i, up);
-	}
-
-	return ret;
-}
-
-static void serial_unlink_irq_chain(struct uart_8250_port *up)
-{
-	/*
-	 * yes, some broken gcc emit "warning: 'i' may be used uninitialized"
-	 * but no, we are not going to take a patch that assigns NULL below.
-	 */
-	struct irq_info *i;
-	struct hlist_node *n;
-	struct hlist_head *h;
-
-	mutex_lock(&hash_mutex);
-
-	h = &irq_lists[up->port.irq % NR_IRQ_HASH];
-
-	hlist_for_each(n, h) {
-		i = hlist_entry(n, struct irq_info, node);
-		if (i->irq == up->port.irq)
-			break;
-	}
-
-	BUG_ON(n == NULL);
-	BUG_ON(i->head == NULL);
-
-	if (list_empty(i->head))
-		free_irq(up->port.irq, i);
-
-	serial_do_unlink(i, up);
-	mutex_unlock(&hash_mutex);
-}
-
-/*
- * This function is used to handle ports that do not have an
- * interrupt.  This doesn't work very well for 16450's, but gives
- * barely passable results for a 16550A.  (Although at the expense
- * of much CPU overhead).
- */
-static void serial8250_timeout(unsigned long data)
-{
-	struct uart_8250_port *up = (struct uart_8250_port *)data;
-
-	up->port.handle_irq(&up->port);
-	mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port));
-}
-
-static void serial8250_backup_timeout(unsigned long data)
-{
-	struct uart_8250_port *up = (struct uart_8250_port *)data;
-	unsigned int iir, ier = 0, lsr;
-	unsigned long flags;
-
-	spin_lock_irqsave(&up->port.lock, flags);
-
-	/*
-	 * Must disable interrupts or else we risk racing with the interrupt
-	 * based handler.
-	 */
-	if (up->port.irq) {
-		ier = serial_in(up, UART_IER);
-		serial_out(up, UART_IER, 0);
-	}
-
-	iir = serial_in(up, UART_IIR);
-
-	/*
-	 * This should be a safe test for anyone who doesn't trust the
-	 * IIR bits on their UART, but it's specifically designed for
-	 * the "Diva" UART used on the management processor on many HP
-	 * ia64 and parisc boxes.
-	 */
-	lsr = serial_in(up, UART_LSR);
-	up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
-	if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) &&
-	    (!uart_circ_empty(&up->port.state->xmit) || up->port.x_char) &&
-	    (lsr & UART_LSR_THRE)) {
-		iir &= ~(UART_IIR_ID | UART_IIR_NO_INT);
-		iir |= UART_IIR_THRI;
-	}
-
-	if (!(iir & UART_IIR_NO_INT))
-		serial8250_tx_chars(up);
-
-	if (up->port.irq)
-		serial_out(up, UART_IER, ier);
-
-	spin_unlock_irqrestore(&up->port.lock, flags);
-
-	/* Standard timer interval plus 0.2s to keep the port running */
-	mod_timer(&up->timer,
-		jiffies + uart_poll_timeout(&up->port) + HZ / 5);
-}
-
-static int univ8250_setup_irq(struct uart_8250_port *up)
-{
-	struct uart_port *port = &up->port;
-	int retval = 0;
-
-	/*
-	 * The above check will only give an accurate result the first time
-	 * the port is opened so this value needs to be preserved.
-	 */
-	if (up->bugs & UART_BUG_THRE) {
-		pr_debug("ttyS%d - using backup timer\n", serial_index(port));
-
-		up->timer.function = serial8250_backup_timeout;
-		up->timer.data = (unsigned long)up;
-		mod_timer(&up->timer, jiffies +
-			  uart_poll_timeout(port) + HZ / 5);
-	}
-
-	/*
-	 * If the "interrupt" for this port doesn't correspond with any
-	 * hardware interrupt, we use a timer-based system.  The original
-	 * driver used to do this with IRQ0.
-	 */
-	if (!port->irq) {
-		up->timer.data = (unsigned long)up;
-		mod_timer(&up->timer, jiffies + uart_poll_timeout(port));
-	} else
-		retval = serial_link_irq_chain(up);
-
-	return retval;
-}
-
-static void univ8250_release_irq(struct uart_8250_port *up)
-{
-	struct uart_port *port = &up->port;
-
-	del_timer_sync(&up->timer);
-	up->timer.function = serial8250_timeout;
-	if (port->irq)
-		serial_unlink_irq_chain(up);
-}
-
-static unsigned int serial8250_tx_empty(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned long flags;
-	unsigned int lsr;
-
-	serial8250_rpm_get(up);
-
-	spin_lock_irqsave(&port->lock, flags);
-	lsr = serial_port_in(port, UART_LSR);
-	up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
-	spin_unlock_irqrestore(&port->lock, flags);
-
-	serial8250_rpm_put(up);
-
-	return (lsr & BOTH_EMPTY) == BOTH_EMPTY ? TIOCSER_TEMT : 0;
-}
-
-static unsigned int serial8250_get_mctrl(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned int status;
-	unsigned int ret;
-
-	serial8250_rpm_get(up);
-	status = serial8250_modem_status(up);
-	serial8250_rpm_put(up);
-
-	ret = 0;
-	if (status & UART_MSR_DCD)
-		ret |= TIOCM_CAR;
-	if (status & UART_MSR_RI)
-		ret |= TIOCM_RNG;
-	if (status & UART_MSR_DSR)
-		ret |= TIOCM_DSR;
-	if (status & UART_MSR_CTS)
-		ret |= TIOCM_CTS;
-	return ret;
-}
-
-void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned char mcr = 0;
-
-	if (mctrl & TIOCM_RTS)
-		mcr |= UART_MCR_RTS;
-	if (mctrl & TIOCM_DTR)
-		mcr |= UART_MCR_DTR;
-	if (mctrl & TIOCM_OUT1)
-		mcr |= UART_MCR_OUT1;
-	if (mctrl & TIOCM_OUT2)
-		mcr |= UART_MCR_OUT2;
-	if (mctrl & TIOCM_LOOP)
-		mcr |= UART_MCR_LOOP;
-
-	mcr = (mcr & up->mcr_mask) | up->mcr_force | up->mcr;
-
-	serial_port_out(port, UART_MCR, mcr);
-}
-EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl);
-
-static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
-{
-	if (port->set_mctrl)
-		port->set_mctrl(port, mctrl);
-	else
-		serial8250_do_set_mctrl(port, mctrl);
-}
-
-static void serial8250_break_ctl(struct uart_port *port, int break_state)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned long flags;
-
-	serial8250_rpm_get(up);
-	spin_lock_irqsave(&port->lock, flags);
-	if (break_state == -1)
-		up->lcr |= UART_LCR_SBC;
-	else
-		up->lcr &= ~UART_LCR_SBC;
-	serial_port_out(port, UART_LCR, up->lcr);
-	spin_unlock_irqrestore(&port->lock, flags);
-	serial8250_rpm_put(up);
-}
-
-/*
- *	Wait for transmitter & holding register to empty
- */
-static void wait_for_xmitr(struct uart_8250_port *up, int bits)
-{
-	unsigned int status, tmout = 10000;
-
-	/* Wait up to 10ms for the character(s) to be sent. */
-	for (;;) {
-		status = serial_in(up, UART_LSR);
-
-		up->lsr_saved_flags |= status & LSR_SAVE_FLAGS;
-
-		if ((status & bits) == bits)
-			break;
-		if (--tmout == 0)
-			break;
-		udelay(1);
-	}
-
-	/* Wait up to 1s for flow control if necessary */
-	if (up->port.flags & UPF_CONS_FLOW) {
-		unsigned int tmout;
-		for (tmout = 1000000; tmout; tmout--) {
-			unsigned int msr = serial_in(up, UART_MSR);
-			up->msr_saved_flags |= msr & MSR_SAVE_FLAGS;
-			if (msr & UART_MSR_CTS)
-				break;
-			udelay(1);
-			touch_nmi_watchdog();
-		}
-	}
-}
-
-#ifdef CONFIG_CONSOLE_POLL
-/*
- * Console polling routines for writing and reading from the uart while
- * in an interrupt or debug context.
- */
-
-static int serial8250_get_poll_char(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned char lsr;
-	int status;
-
-	serial8250_rpm_get(up);
-
-	lsr = serial_port_in(port, UART_LSR);
-
-	if (!(lsr & UART_LSR_DR)) {
-		status = NO_POLL_CHAR;
-		goto out;
-	}
-
-	status = serial_port_in(port, UART_RX);
-out:
-	serial8250_rpm_put(up);
-	return status;
-}
-
-
-static void serial8250_put_poll_char(struct uart_port *port,
-			 unsigned char c)
-{
-	unsigned int ier;
-	struct uart_8250_port *up = up_to_u8250p(port);
-
-	serial8250_rpm_get(up);
-	/*
-	 *	First save the IER then disable the interrupts
-	 */
-	ier = serial_port_in(port, UART_IER);
-	if (up->capabilities & UART_CAP_UUE)
-		serial_port_out(port, UART_IER, UART_IER_UUE);
-	else
-		serial_port_out(port, UART_IER, 0);
-
-	wait_for_xmitr(up, BOTH_EMPTY);
-	/*
-	 *	Send the character out.
-	 */
-	serial_port_out(port, UART_TX, c);
-
-	/*
-	 *	Finally, wait for transmitter to become empty
-	 *	and restore the IER
-	 */
-	wait_for_xmitr(up, BOTH_EMPTY);
-	serial_port_out(port, UART_IER, ier);
-	serial8250_rpm_put(up);
-}
-
-#endif /* CONFIG_CONSOLE_POLL */
-
-int serial8250_do_startup(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned long flags;
-	unsigned char lsr, iir;
-	int retval;
-
-	if (port->type == PORT_8250_CIR)
-		return -ENODEV;
-
-	if (!port->fifosize)
-		port->fifosize = uart_config[port->type].fifo_size;
-	if (!up->tx_loadsz)
-		up->tx_loadsz = uart_config[port->type].tx_loadsz;
-	if (!up->capabilities)
-		up->capabilities = uart_config[port->type].flags;
-	up->mcr = 0;
-
-	if (port->iotype != up->cur_iotype)
-		set_io_from_upio(port);
-
-	serial8250_rpm_get(up);
-	if (port->type == PORT_16C950) {
-		/* Wake up and initialize UART */
-		up->acr = 0;
-		serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
-		serial_port_out(port, UART_EFR, UART_EFR_ECB);
-		serial_port_out(port, UART_IER, 0);
-		serial_port_out(port, UART_LCR, 0);
-		serial_icr_write(up, UART_CSR, 0); /* Reset the UART */
-		serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
-		serial_port_out(port, UART_EFR, UART_EFR_ECB);
-		serial_port_out(port, UART_LCR, 0);
-	}
-
-#ifdef CONFIG_SERIAL_8250_RSA
-	/*
-	 * If this is an RSA port, see if we can kick it up to the
-	 * higher speed clock.
-	 */
-	enable_rsa(up);
-#endif
-	/*
-	 * Clear the FIFO buffers and disable them.
-	 * (they will be reenabled in set_termios())
-	 */
-	serial8250_clear_fifos(up);
-
-	/*
-	 * Clear the interrupt registers.
-	 */
-	serial_port_in(port, UART_LSR);
-	serial_port_in(port, UART_RX);
-	serial_port_in(port, UART_IIR);
-	serial_port_in(port, UART_MSR);
-
-	/*
-	 * At this point, there's no way the LSR could still be 0xff;
-	 * if it is, then bail out, because there's likely no UART
-	 * here.
-	 */
-	if (!(port->flags & UPF_BUGGY_UART) &&
-	    (serial_port_in(port, UART_LSR) == 0xff)) {
-		printk_ratelimited(KERN_INFO "ttyS%d: LSR safety check engaged!\n",
-				   serial_index(port));
-		retval = -ENODEV;
-		goto out;
-	}
-
-	/*
-	 * For a XR16C850, we need to set the trigger levels
-	 */
-	if (port->type == PORT_16850) {
-		unsigned char fctr;
-
-		serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
-
-		fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX);
-		serial_port_out(port, UART_FCTR,
-				fctr | UART_FCTR_TRGD | UART_FCTR_RX);
-		serial_port_out(port, UART_TRG, UART_TRG_96);
-		serial_port_out(port, UART_FCTR,
-				fctr | UART_FCTR_TRGD | UART_FCTR_TX);
-		serial_port_out(port, UART_TRG, UART_TRG_96);
-
-		serial_port_out(port, UART_LCR, 0);
-	}
-
-	if (port->irq) {
-		unsigned char iir1;
-		/*
-		 * Test for UARTs that do not reassert THRE when the
-		 * transmitter is idle and the interrupt has already
-		 * been cleared.  Real 16550s should always reassert
-		 * this interrupt whenever the transmitter is idle and
-		 * the interrupt is enabled.  Delays are necessary to
-		 * allow register changes to become visible.
-		 */
-		spin_lock_irqsave(&port->lock, flags);
-		if (up->port.irqflags & IRQF_SHARED)
-			disable_irq_nosync(port->irq);
-
-		wait_for_xmitr(up, UART_LSR_THRE);
-		serial_port_out_sync(port, UART_IER, UART_IER_THRI);
-		udelay(1); /* allow THRE to set */
-		iir1 = serial_port_in(port, UART_IIR);
-		serial_port_out(port, UART_IER, 0);
-		serial_port_out_sync(port, UART_IER, UART_IER_THRI);
-		udelay(1); /* allow a working UART time to re-assert THRE */
-		iir = serial_port_in(port, UART_IIR);
-		serial_port_out(port, UART_IER, 0);
-
-		if (port->irqflags & IRQF_SHARED)
-			enable_irq(port->irq);
-		spin_unlock_irqrestore(&port->lock, flags);
-
-		/*
-		 * If the interrupt is not reasserted, or we otherwise
-		 * don't trust the iir, setup a timer to kick the UART
-		 * on a regular basis.
-		 */
-		if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) ||
-		    up->port.flags & UPF_BUG_THRE) {
-			up->bugs |= UART_BUG_THRE;
-		}
-	}
-
-	retval = up->ops->setup_irq(up);
-	if (retval)
-		goto out;
-
-	/*
-	 * Now, initialize the UART
-	 */
-	serial_port_out(port, UART_LCR, UART_LCR_WLEN8);
-
-	spin_lock_irqsave(&port->lock, flags);
-	if (up->port.flags & UPF_FOURPORT) {
-		if (!up->port.irq)
-			up->port.mctrl |= TIOCM_OUT1;
-	} else
-		/*
-		 * Most PC uarts need OUT2 raised to enable interrupts.
-		 */
-		if (port->irq)
-			up->port.mctrl |= TIOCM_OUT2;
-
-	serial8250_set_mctrl(port, port->mctrl);
-
-	/* Serial over Lan (SoL) hack:
-	   Intel 8257x Gigabit ethernet chips have a
-	   16550 emulation, to be used for Serial Over Lan.
-	   Those chips take a longer time than a normal
-	   serial device to signalize that a transmission
-	   data was queued. Due to that, the above test generally
-	   fails. One solution would be to delay the reading of
-	   iir. However, this is not reliable, since the timeout
-	   is variable. So, let's just don't test if we receive
-	   TX irq. This way, we'll never enable UART_BUG_TXEN.
-	 */
-	if (up->port.flags & UPF_NO_TXEN_TEST)
-		goto dont_test_tx_en;
-
-	/*
-	 * Do a quick test to see if we receive an
-	 * interrupt when we enable the TX irq.
-	 */
-	serial_port_out(port, UART_IER, UART_IER_THRI);
-	lsr = serial_port_in(port, UART_LSR);
-	iir = serial_port_in(port, UART_IIR);
-	serial_port_out(port, UART_IER, 0);
-
-	if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) {
-		if (!(up->bugs & UART_BUG_TXEN)) {
-			up->bugs |= UART_BUG_TXEN;
-			pr_debug("ttyS%d - enabling bad tx status workarounds\n",
-				 serial_index(port));
-		}
-	} else {
-		up->bugs &= ~UART_BUG_TXEN;
-	}
-
-dont_test_tx_en:
-	spin_unlock_irqrestore(&port->lock, flags);
-
-	/*
-	 * Clear the interrupt registers again for luck, and clear the
-	 * saved flags to avoid getting false values from polling
-	 * routines or the previous session.
-	 */
-	serial_port_in(port, UART_LSR);
-	serial_port_in(port, UART_RX);
-	serial_port_in(port, UART_IIR);
-	serial_port_in(port, UART_MSR);
-	up->lsr_saved_flags = 0;
-	up->msr_saved_flags = 0;
-
-	/*
-	 * Request DMA channels for both RX and TX.
-	 */
-	if (up->dma) {
-		retval = serial8250_request_dma(up);
-		if (retval) {
-			pr_warn_ratelimited("ttyS%d - failed to request DMA\n",
-					    serial_index(port));
-			up->dma = NULL;
-		}
-	}
-
-	/*
-	 * Finally, enable interrupts.  Note: Modem status interrupts
-	 * are set via set_termios(), which will be occurring imminently
-	 * anyway, so we don't enable them here.
-	 */
-	up->ier = UART_IER_RLSI | UART_IER_RDI;
-	serial_port_out(port, UART_IER, up->ier);
-
-	if (port->flags & UPF_FOURPORT) {
-		unsigned int icp;
-		/*
-		 * Enable interrupts on the AST Fourport board
-		 */
-		icp = (port->iobase & 0xfe0) | 0x01f;
-		outb_p(0x80, icp);
-		inb_p(icp);
-	}
-	retval = 0;
-out:
-	serial8250_rpm_put(up);
-	return retval;
-}
-EXPORT_SYMBOL_GPL(serial8250_do_startup);
-
-static int serial8250_startup(struct uart_port *port)
-{
-	if (port->startup)
-		return port->startup(port);
-	return serial8250_do_startup(port);
-}
-
-void serial8250_do_shutdown(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned long flags;
-
-	serial8250_rpm_get(up);
-	/*
-	 * Disable interrupts from this port
-	 */
-	up->ier = 0;
-	serial_port_out(port, UART_IER, 0);
-
-	if (up->dma)
-		serial8250_release_dma(up);
-
-	spin_lock_irqsave(&port->lock, flags);
-	if (port->flags & UPF_FOURPORT) {
-		/* reset interrupts on the AST Fourport board */
-		inb((port->iobase & 0xfe0) | 0x1f);
-		port->mctrl |= TIOCM_OUT1;
-	} else
-		port->mctrl &= ~TIOCM_OUT2;
-
-	serial8250_set_mctrl(port, port->mctrl);
-	spin_unlock_irqrestore(&port->lock, flags);
-
-	/*
-	 * Disable break condition and FIFOs
-	 */
-	serial_port_out(port, UART_LCR,
-			serial_port_in(port, UART_LCR) & ~UART_LCR_SBC);
-	serial8250_clear_fifos(up);
-
-#ifdef CONFIG_SERIAL_8250_RSA
-	/*
-	 * Reset the RSA board back to 115kbps compat mode.
-	 */
-	disable_rsa(up);
-#endif
-
-	/*
-	 * Read data port to reset things, and then unlink from
-	 * the IRQ chain.
-	 */
-	serial_port_in(port, UART_RX);
-	serial8250_rpm_put(up);
-
-	up->ops->release_irq(up);
-}
-EXPORT_SYMBOL_GPL(serial8250_do_shutdown);
-
-static void serial8250_shutdown(struct uart_port *port)
-{
-	if (port->shutdown)
-		port->shutdown(port);
-	else
-		serial8250_do_shutdown(port);
-}
-
-/*
- * XR17V35x UARTs have an extra fractional divisor register (DLD)
- * Calculate divisor with extra 4-bit fractional portion
- */
-static unsigned int xr17v35x_get_divisor(struct uart_8250_port *up,
-					 unsigned int baud,
-					 unsigned int *frac)
-{
-	struct uart_port *port = &up->port;
-	unsigned int quot_16;
-
-	quot_16 = DIV_ROUND_CLOSEST(port->uartclk, baud);
-	*frac = quot_16 & 0x0f;
-
-	return quot_16 >> 4;
-}
-
-static unsigned int serial8250_get_divisor(struct uart_8250_port *up,
-					   unsigned int baud,
-					   unsigned int *frac)
-{
-	struct uart_port *port = &up->port;
-	unsigned int quot;
-
-	/*
-	 * Handle magic divisors for baud rates above baud_base on
-	 * SMSC SuperIO chips.
-	 *
-	 */
-	if ((port->flags & UPF_MAGIC_MULTIPLIER) &&
-	    baud == (port->uartclk/4))
-		quot = 0x8001;
-	else if ((port->flags & UPF_MAGIC_MULTIPLIER) &&
-		 baud == (port->uartclk/8))
-		quot = 0x8002;
-	else if (up->port.type == PORT_XR17V35X)
-		quot = xr17v35x_get_divisor(up, baud, frac);
-	else
-		quot = uart_get_divisor(port, baud);
-
-	/*
-	 * Oxford Semi 952 rev B workaround
-	 */
-	if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0)
-		quot++;
-
-	return quot;
-}
-
-static unsigned char serial8250_compute_lcr(struct uart_8250_port *up,
-					    tcflag_t c_cflag)
-{
-	unsigned char cval;
-
-	switch (c_cflag & CSIZE) {
-	case CS5:
-		cval = UART_LCR_WLEN5;
-		break;
-	case CS6:
-		cval = UART_LCR_WLEN6;
-		break;
-	case CS7:
-		cval = UART_LCR_WLEN7;
-		break;
-	default:
-	case CS8:
-		cval = UART_LCR_WLEN8;
-		break;
-	}
-
-	if (c_cflag & CSTOPB)
-		cval |= UART_LCR_STOP;
-	if (c_cflag & PARENB) {
-		cval |= UART_LCR_PARITY;
-		if (up->bugs & UART_BUG_PARITY)
-			up->fifo_bug = true;
-	}
-	if (!(c_cflag & PARODD))
-		cval |= UART_LCR_EPAR;
-#ifdef CMSPAR
-	if (c_cflag & CMSPAR)
-		cval |= UART_LCR_SPAR;
-#endif
-
-	return cval;
-}
-
-static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
-			    unsigned int quot, unsigned int quot_frac)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-
-	/* Workaround to enable 115200 baud on OMAP1510 internal ports */
-	if (is_omap1510_8250(up)) {
-		if (baud == 115200) {
-			quot = 1;
-			serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1);
-		} else
-			serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0);
-	}
-
-	/*
-	 * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2,
-	 * otherwise just set DLAB
-	 */
-	if (up->capabilities & UART_NATSEMI)
-		serial_port_out(port, UART_LCR, 0xe0);
-	else
-		serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB);
-
-	serial_dl_write(up, quot);
-
-	/* XR17V35x UARTs have an extra fractional divisor register (DLD) */
-	if (up->port.type == PORT_XR17V35X)
-		serial_port_out(port, 0x2, quot_frac);
-}
-
-void
-serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
-		          struct ktermios *old)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned char cval;
-	unsigned long flags;
-	unsigned int baud, quot, frac = 0;
-
-	cval = serial8250_compute_lcr(up, termios->c_cflag);
-
-	/*
-	 * Ask the core to calculate the divisor for us.
-	 */
-	baud = uart_get_baud_rate(port, termios, old,
-				  port->uartclk / 16 / 0xffff,
-				  port->uartclk / 16);
-	quot = serial8250_get_divisor(up, baud, &frac);
-
-	/*
-	 * Ok, we're now changing the port state.  Do it with
-	 * interrupts disabled.
-	 */
-	serial8250_rpm_get(up);
-	spin_lock_irqsave(&port->lock, flags);
-
-	up->lcr = cval;					/* Save computed LCR */
-
-	if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) {
-		/* NOTE: If fifo_bug is not set, a user can set RX_trigger. */
-		if ((baud < 2400 && !up->dma) || up->fifo_bug) {
-			up->fcr &= ~UART_FCR_TRIGGER_MASK;
-			up->fcr |= UART_FCR_TRIGGER_1;
-		}
-	}
-
-	/*
-	 * MCR-based auto flow control.  When AFE is enabled, RTS will be
-	 * deasserted when the receive FIFO contains more characters than
-	 * the trigger, or the MCR RTS bit is cleared.  In the case where
-	 * the remote UART is not using CTS auto flow control, we must
-	 * have sufficient FIFO entries for the latency of the remote
-	 * UART to respond.  IOW, at least 32 bytes of FIFO.
-	 */
-	if (up->capabilities & UART_CAP_AFE && port->fifosize >= 32) {
-		up->mcr &= ~UART_MCR_AFE;
-		if (termios->c_cflag & CRTSCTS)
-			up->mcr |= UART_MCR_AFE;
-	}
-
-	/*
-	 * Update the per-port timeout.
-	 */
-	uart_update_timeout(port, termios->c_cflag, baud);
-
-	port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR;
-	if (termios->c_iflag & INPCK)
-		port->read_status_mask |= UART_LSR_FE | UART_LSR_PE;
-	if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK))
-		port->read_status_mask |= UART_LSR_BI;
-
-	/*
-	 * Characteres to ignore
-	 */
-	port->ignore_status_mask = 0;
-	if (termios->c_iflag & IGNPAR)
-		port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE;
-	if (termios->c_iflag & IGNBRK) {
-		port->ignore_status_mask |= UART_LSR_BI;
-		/*
-		 * If we're ignoring parity and break indicators,
-		 * ignore overruns too (for real raw support).
-		 */
-		if (termios->c_iflag & IGNPAR)
-			port->ignore_status_mask |= UART_LSR_OE;
-	}
-
-	/*
-	 * ignore all characters if CREAD is not set
-	 */
-	if ((termios->c_cflag & CREAD) == 0)
-		port->ignore_status_mask |= UART_LSR_DR;
-
-	/*
-	 * CTS flow control flag and modem status interrupts
-	 */
-	up->ier &= ~UART_IER_MSI;
-	if (!(up->bugs & UART_BUG_NOMSR) &&
-			UART_ENABLE_MS(&up->port, termios->c_cflag))
-		up->ier |= UART_IER_MSI;
-	if (up->capabilities & UART_CAP_UUE)
-		up->ier |= UART_IER_UUE;
-	if (up->capabilities & UART_CAP_RTOIE)
-		up->ier |= UART_IER_RTOIE;
-
-	serial_port_out(port, UART_IER, up->ier);
-
-	if (up->capabilities & UART_CAP_EFR) {
-		unsigned char efr = 0;
-		/*
-		 * TI16C752/Startech hardware flow control.  FIXME:
-		 * - TI16C752 requires control thresholds to be set.
-		 * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled.
-		 */
-		if (termios->c_cflag & CRTSCTS)
-			efr |= UART_EFR_CTS;
-
-		serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
-		if (port->flags & UPF_EXAR_EFR)
-			serial_port_out(port, UART_XR_EFR, efr);
-		else
-			serial_port_out(port, UART_EFR, efr);
-	}
-
-	serial8250_set_divisor(port, baud, quot, frac);
+	spin_lock_irqsave(&up->port.lock, flags);
 
 	/*
-	 * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR
-	 * is written without DLAB set, this mode will be disabled.
+	 * Must disable interrupts or else we risk racing with the interrupt
+	 * based handler.
 	 */
-	if (port->type == PORT_16750)
-		serial_port_out(port, UART_FCR, up->fcr);
-
-	serial_port_out(port, UART_LCR, up->lcr);	/* reset DLAB */
-	if (port->type != PORT_16750) {
-		/* emulated UARTs (Lucent Venus 167x) need two steps */
-		if (up->fcr & UART_FCR_ENABLE_FIFO)
-			serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO);
-		serial_port_out(port, UART_FCR, up->fcr);	/* set fcr */
-	}
-	serial8250_set_mctrl(port, port->mctrl);
-	spin_unlock_irqrestore(&port->lock, flags);
-	serial8250_rpm_put(up);
-
-	/* Don't rewrite B0 */
-	if (tty_termios_baud_rate(termios))
-		tty_termios_encode_baud_rate(termios, baud, baud);
-}
-EXPORT_SYMBOL(serial8250_do_set_termios);
-
-static void
-serial8250_set_termios(struct uart_port *port, struct ktermios *termios,
-		       struct ktermios *old)
-{
-	if (port->set_termios)
-		port->set_termios(port, termios, old);
-	else
-		serial8250_do_set_termios(port, termios, old);
-}
-
-static void
-serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios)
-{
-	if (termios->c_line == N_PPS) {
-		port->flags |= UPF_HARDPPS_CD;
-		spin_lock_irq(&port->lock);
-		serial8250_enable_ms(port);
-		spin_unlock_irq(&port->lock);
-	} else {
-		port->flags &= ~UPF_HARDPPS_CD;
-		if (!UART_ENABLE_MS(port, termios->c_cflag)) {
-			spin_lock_irq(&port->lock);
-			serial8250_disable_ms(port);
-			spin_unlock_irq(&port->lock);
-		}
+	if (up->port.irq) {
+		ier = serial_in(up, UART_IER);
+		serial_out(up, UART_IER, 0);
 	}
-}
 
+	iir = serial_in(up, UART_IIR);
 
-void serial8250_do_pm(struct uart_port *port, unsigned int state,
-		      unsigned int oldstate)
-{
-	struct uart_8250_port *p = up_to_u8250p(port);
+	/*
+	 * This should be a safe test for anyone who doesn't trust the
+	 * IIR bits on their UART, but it's specifically designed for
+	 * the "Diva" UART used on the management processor on many HP
+	 * ia64 and parisc boxes.
+	 */
+	lsr = serial_in(up, UART_LSR);
+	up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
+	if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) &&
+	    (!uart_circ_empty(&up->port.state->xmit) || up->port.x_char) &&
+	    (lsr & UART_LSR_THRE)) {
+		iir &= ~(UART_IIR_ID | UART_IIR_NO_INT);
+		iir |= UART_IIR_THRI;
+	}
 
-	serial8250_set_sleep(p, state != 0);
-}
-EXPORT_SYMBOL(serial8250_do_pm);
+	if (!(iir & UART_IIR_NO_INT))
+		serial8250_tx_chars(up);
 
-static void
-serial8250_pm(struct uart_port *port, unsigned int state,
-	      unsigned int oldstate)
-{
-	if (port->pm)
-		port->pm(port, state, oldstate);
-	else
-		serial8250_do_pm(port, state, oldstate);
-}
+	if (up->port.irq)
+		serial_out(up, UART_IER, ier);
 
-static unsigned int serial8250_port_size(struct uart_8250_port *pt)
-{
-	if (pt->port.mapsize)
-		return pt->port.mapsize;
-	if (pt->port.iotype == UPIO_AU) {
-		if (pt->port.type == PORT_RT2880)
-			return 0x100;
-		return 0x1000;
-	}
-	if (is_omap1_8250(pt))
-		return 0x16 << pt->port.regshift;
+	spin_unlock_irqrestore(&up->port.lock, flags);
 
-	return 8 << pt->port.regshift;
+	/* Standard timer interval plus 0.2s to keep the port running */
+	mod_timer(&up->timer,
+		jiffies + uart_poll_timeout(&up->port) + HZ / 5);
 }
 
-/*
- * Resource handling.
- */
-static int serial8250_request_std_resource(struct uart_8250_port *up)
+static int univ8250_setup_irq(struct uart_8250_port *up)
 {
-	unsigned int size = serial8250_port_size(up);
 	struct uart_port *port = &up->port;
-	int ret = 0;
+	int retval = 0;
 
-	switch (port->iotype) {
-	case UPIO_AU:
-	case UPIO_TSI:
-	case UPIO_MEM32:
-	case UPIO_MEM32BE:
-	case UPIO_MEM:
-		if (!port->mapbase)
-			break;
+	/*
+	 * The above check will only give an accurate result the first time
+	 * the port is opened so this value needs to be preserved.
+	 */
+	if (up->bugs & UART_BUG_THRE) {
+		pr_debug("ttyS%d - using backup timer\n", serial_index(port));
 
-		if (!request_mem_region(port->mapbase, size, "serial")) {
-			ret = -EBUSY;
-			break;
-		}
+		up->timer.function = serial8250_backup_timeout;
+		up->timer.data = (unsigned long)up;
+		mod_timer(&up->timer, jiffies +
+			  uart_poll_timeout(port) + HZ / 5);
+	}
 
-		if (port->flags & UPF_IOREMAP) {
-			port->membase = ioremap_nocache(port->mapbase, size);
-			if (!port->membase) {
-				release_mem_region(port->mapbase, size);
-				ret = -ENOMEM;
-			}
-		}
-		break;
+	/*
+	 * If the "interrupt" for this port doesn't correspond with any
+	 * hardware interrupt, we use a timer-based system.  The original
+	 * driver used to do this with IRQ0.
+	 */
+	if (!port->irq) {
+		up->timer.data = (unsigned long)up;
+		mod_timer(&up->timer, jiffies + uart_poll_timeout(port));
+	} else
+		retval = serial_link_irq_chain(up);
 
-	case UPIO_HUB6:
-	case UPIO_PORT:
-		if (!request_region(port->iobase, size, "serial"))
-			ret = -EBUSY;
-		break;
-	}
-	return ret;
+	return retval;
 }
 
-static void serial8250_release_std_resource(struct uart_8250_port *up)
+static void univ8250_release_irq(struct uart_8250_port *up)
 {
-	unsigned int size = serial8250_port_size(up);
 	struct uart_port *port = &up->port;
 
-	switch (port->iotype) {
-	case UPIO_AU:
-	case UPIO_TSI:
-	case UPIO_MEM32:
-	case UPIO_MEM32BE:
-	case UPIO_MEM:
-		if (!port->mapbase)
-			break;
-
-		if (port->flags & UPF_IOREMAP) {
-			iounmap(port->membase);
-			port->membase = NULL;
-		}
-
-		release_mem_region(port->mapbase, size);
-		break;
-
-	case UPIO_HUB6:
-	case UPIO_PORT:
-		release_region(port->iobase, size);
-		break;
-	}
+	del_timer_sync(&up->timer);
+	up->timer.function = serial8250_timeout;
+	if (port->irq)
+		serial_unlink_irq_chain(up);
 }
 
 #ifdef CONFIG_SERIAL_8250_RSA
@@ -2848,259 +396,6 @@ static void serial8250_release_rsa_resource(struct uart_8250_port *up)
 }
 #endif
 
-static void serial8250_release_port(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-
-	serial8250_release_std_resource(up);
-}
-
-static int serial8250_request_port(struct uart_port *port)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	int ret;
-
-	if (port->type == PORT_8250_CIR)
-		return -ENODEV;
-
-	ret = serial8250_request_std_resource(up);
-
-	return ret;
-}
-
-static int fcr_get_rxtrig_bytes(struct uart_8250_port *up)
-{
-	const struct serial8250_config *conf_type = &uart_config[up->port.type];
-	unsigned char bytes;
-
-	bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)];
-
-	return bytes ? bytes : -EOPNOTSUPP;
-}
-
-static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes)
-{
-	const struct serial8250_config *conf_type = &uart_config[up->port.type];
-	int i;
-
-	if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)])
-		return -EOPNOTSUPP;
-
-	for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) {
-		if (bytes < conf_type->rxtrig_bytes[i])
-			/* Use the nearest lower value */
-			return (--i) << UART_FCR_R_TRIG_SHIFT;
-	}
-
-	return UART_FCR_R_TRIG_11;
-}
-
-static int do_get_rxtrig(struct tty_port *port)
-{
-	struct uart_state *state = container_of(port, struct uart_state, port);
-	struct uart_port *uport = state->uart_port;
-	struct uart_8250_port *up =
-		container_of(uport, struct uart_8250_port, port);
-
-	if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1)
-		return -EINVAL;
-
-	return fcr_get_rxtrig_bytes(up);
-}
-
-static int do_serial8250_get_rxtrig(struct tty_port *port)
-{
-	int rxtrig_bytes;
-
-	mutex_lock(&port->mutex);
-	rxtrig_bytes = do_get_rxtrig(port);
-	mutex_unlock(&port->mutex);
-
-	return rxtrig_bytes;
-}
-
-static ssize_t serial8250_get_attr_rx_trig_bytes(struct device *dev,
-	struct device_attribute *attr, char *buf)
-{
-	struct tty_port *port = dev_get_drvdata(dev);
-	int rxtrig_bytes;
-
-	rxtrig_bytes = do_serial8250_get_rxtrig(port);
-	if (rxtrig_bytes < 0)
-		return rxtrig_bytes;
-
-	return snprintf(buf, PAGE_SIZE, "%d\n", rxtrig_bytes);
-}
-
-static int do_set_rxtrig(struct tty_port *port, unsigned char bytes)
-{
-	struct uart_state *state = container_of(port, struct uart_state, port);
-	struct uart_port *uport = state->uart_port;
-	struct uart_8250_port *up =
-		container_of(uport, struct uart_8250_port, port);
-	int rxtrig;
-
-	if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1 ||
-	    up->fifo_bug)
-		return -EINVAL;
-
-	rxtrig = bytes_to_fcr_rxtrig(up, bytes);
-	if (rxtrig < 0)
-		return rxtrig;
-
-	serial8250_clear_fifos(up);
-	up->fcr &= ~UART_FCR_TRIGGER_MASK;
-	up->fcr |= (unsigned char)rxtrig;
-	serial_out(up, UART_FCR, up->fcr);
-	return 0;
-}
-
-static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes)
-{
-	int ret;
-
-	mutex_lock(&port->mutex);
-	ret = do_set_rxtrig(port, bytes);
-	mutex_unlock(&port->mutex);
-
-	return ret;
-}
-
-static ssize_t serial8250_set_attr_rx_trig_bytes(struct device *dev,
-	struct device_attribute *attr, const char *buf, size_t count)
-{
-	struct tty_port *port = dev_get_drvdata(dev);
-	unsigned char bytes;
-	int ret;
-
-	if (!count)
-		return -EINVAL;
-
-	ret = kstrtou8(buf, 10, &bytes);
-	if (ret < 0)
-		return ret;
-
-	ret = do_serial8250_set_rxtrig(port, bytes);
-	if (ret < 0)
-		return ret;
-
-	return count;
-}
-
-static DEVICE_ATTR(rx_trig_bytes, S_IRUSR | S_IWUSR | S_IRGRP,
-		   serial8250_get_attr_rx_trig_bytes,
-		   serial8250_set_attr_rx_trig_bytes);
-
-static struct attribute *serial8250_dev_attrs[] = {
-	&dev_attr_rx_trig_bytes.attr,
-	NULL,
-	};
-
-static struct attribute_group serial8250_dev_attr_group = {
-	.attrs = serial8250_dev_attrs,
-	};
-
-static void register_dev_spec_attr_grp(struct uart_8250_port *up)
-{
-	const struct serial8250_config *conf_type = &uart_config[up->port.type];
-
-	if (conf_type->rxtrig_bytes[0])
-		up->port.attr_group = &serial8250_dev_attr_group;
-}
-
-static void serial8250_config_port(struct uart_port *port, int flags)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-	int ret;
-
-	if (port->type == PORT_8250_CIR)
-		return;
-
-	/*
-	 * Find the region that we can probe for.  This in turn
-	 * tells us whether we can probe for the type of port.
-	 */
-	ret = serial8250_request_std_resource(up);
-	if (ret < 0)
-		return;
-
-	if (port->iotype != up->cur_iotype)
-		set_io_from_upio(port);
-
-	if (flags & UART_CONFIG_TYPE)
-		autoconfig(up);
-
-	/* if access method is AU, it is a 16550 with a quirk */
-	if (port->type == PORT_16550A && port->iotype == UPIO_AU)
-		up->bugs |= UART_BUG_NOMSR;
-
-	/* HW bugs may trigger IRQ while IIR == NO_INT */
-	if (port->type == PORT_TEGRA)
-		up->bugs |= UART_BUG_NOMSR;
-
-	if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ)
-		autoconfig_irq(up);
-
-	if (port->type == PORT_UNKNOWN)
-		serial8250_release_std_resource(up);
-
-	/* Fixme: probably not the best place for this */
-	if ((port->type == PORT_XR17V35X) ||
-	   (port->type == PORT_XR17D15X))
-		port->handle_irq = exar_handle_irq;
-
-	register_dev_spec_attr_grp(up);
-	up->fcr = uart_config[up->port.type].fcr;
-}
-
-static int
-serial8250_verify_port(struct uart_port *port, struct serial_struct *ser)
-{
-	if (ser->irq >= nr_irqs || ser->irq < 0 ||
-	    ser->baud_base < 9600 || ser->type < PORT_UNKNOWN ||
-	    ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS ||
-	    ser->type == PORT_STARTECH)
-		return -EINVAL;
-	return 0;
-}
-
-static const char *
-serial8250_type(struct uart_port *port)
-{
-	int type = port->type;
-
-	if (type >= ARRAY_SIZE(uart_config))
-		type = 0;
-	return uart_config[type].name;
-}
-
-static const struct uart_ops serial8250_pops = {
-	.tx_empty	= serial8250_tx_empty,
-	.set_mctrl	= serial8250_set_mctrl,
-	.get_mctrl	= serial8250_get_mctrl,
-	.stop_tx	= serial8250_stop_tx,
-	.start_tx	= serial8250_start_tx,
-	.throttle	= serial8250_throttle,
-	.unthrottle	= serial8250_unthrottle,
-	.stop_rx	= serial8250_stop_rx,
-	.enable_ms	= serial8250_enable_ms,
-	.break_ctl	= serial8250_break_ctl,
-	.startup	= serial8250_startup,
-	.shutdown	= serial8250_shutdown,
-	.set_termios	= serial8250_set_termios,
-	.set_ldisc	= serial8250_set_ldisc,
-	.pm		= serial8250_pm,
-	.type		= serial8250_type,
-	.release_port	= serial8250_release_port,
-	.request_port	= serial8250_request_port,
-	.config_port	= serial8250_config_port,
-	.verify_port	= serial8250_verify_port,
-#ifdef CONFIG_CONSOLE_POLL
-	.poll_get_char = serial8250_get_poll_char,
-	.poll_put_char = serial8250_put_poll_char,
-#endif
-};
-
 static const struct uart_ops *base_ops;
 static struct uart_ops univ8250_port_ops;
 
@@ -3139,42 +434,6 @@ void serial8250_set_isa_configurator(
 }
 EXPORT_SYMBOL(serial8250_set_isa_configurator);
 
-static void serial8250_init_port(struct uart_8250_port *up)
-{
-	struct uart_port *port = &up->port;
-
-	spin_lock_init(&port->lock);
-	port->ops = &serial8250_pops;
-
-	up->cur_iotype = 0xFF;
-}
-
-static void serial8250_set_defaults(struct uart_8250_port *up)
-{
-	struct uart_port *port = &up->port;
-
-	if (up->port.flags & UPF_FIXED_TYPE) {
-		unsigned int type = up->port.type;
-
-		if (!up->port.fifosize)
-			up->port.fifosize = uart_config[type].fifo_size;
-		if (!up->tx_loadsz)
-			up->tx_loadsz = uart_config[type].tx_loadsz;
-		if (!up->capabilities)
-			up->capabilities = uart_config[type].flags;
-	}
-
-	set_io_from_upio(port);
-
-	/* default dma handlers */
-	if (up->dma) {
-		if (!up->dma->tx_dma)
-			up->dma->tx_dma = serial8250_tx_dma;
-		if (!up->dma->rx_dma)
-			up->dma->rx_dma = serial8250_rx_dma;
-	}
-}
-
 #ifdef CONFIG_SERIAL_8250_RSA
 
 static void univ8250_config_port(struct uart_port *port, int flags)
@@ -3324,94 +583,6 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev)
 
 #ifdef CONFIG_SERIAL_8250_CONSOLE
 
-static void serial8250_console_putchar(struct uart_port *port, int ch)
-{
-	struct uart_8250_port *up = up_to_u8250p(port);
-
-	wait_for_xmitr(up, UART_LSR_THRE);
-	serial_port_out(port, UART_TX, ch);
-}
-
-/*
- *	Print a string to the serial port trying not to disturb
- *	any possible real use of the port...
- *
- *	The console_lock must be held when we get here.
- */
-static void serial8250_console_write(struct uart_8250_port *up, const char *s,
-				     unsigned int count)
-{
-	struct uart_port *port = &up->port;
-	unsigned long flags;
-	unsigned int ier;
-	int locked = 1;
-
-	touch_nmi_watchdog();
-
-	serial8250_rpm_get(up);
-
-	if (port->sysrq)
-		locked = 0;
-	else if (oops_in_progress)
-		locked = spin_trylock_irqsave(&port->lock, flags);
-	else
-		spin_lock_irqsave(&port->lock, flags);
-
-	/*
-	 *	First save the IER then disable the interrupts
-	 */
-	ier = serial_port_in(port, UART_IER);
-
-	if (up->capabilities & UART_CAP_UUE)
-		serial_port_out(port, UART_IER, UART_IER_UUE);
-	else
-		serial_port_out(port, UART_IER, 0);
-
-	/* check scratch reg to see if port powered off during system sleep */
-	if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
-		struct ktermios termios;
-		unsigned int baud, quot, frac = 0;
-
-		termios.c_cflag = port->cons->cflag;
-		if (port->state->port.tty && termios.c_cflag == 0)
-			termios.c_cflag = port->state->port.tty->termios.c_cflag;
-
-		baud = uart_get_baud_rate(port, &termios, NULL,
-					  port->uartclk / 16 / 0xffff,
-					  port->uartclk / 16);
-		quot = serial8250_get_divisor(up, baud, &frac);
-
-		serial8250_set_divisor(port, baud, quot, frac);
-		serial_port_out(port, UART_LCR, up->lcr);
-		serial_port_out(port, UART_MCR, UART_MCR_DTR | UART_MCR_RTS);
-
-		up->canary = 0;
-	}
-
-	uart_console_write(port, s, count, serial8250_console_putchar);
-
-	/*
-	 *	Finally, wait for transmitter to become empty
-	 *	and restore the IER
-	 */
-	wait_for_xmitr(up, BOTH_EMPTY);
-	serial_port_out(port, UART_IER, ier);
-
-	/*
-	 *	The receive handling will happen properly because the
-	 *	receive ready bit will still be set; it is not cleared
-	 *	on read.  However, modem control will not, we must
-	 *	call it if we have saved something in the saved flags
-	 *	while processing with interrupts off.
-	 */
-	if (up->msr_saved_flags)
-		serial8250_modem_status(up);
-
-	if (locked)
-		spin_unlock_irqrestore(&port->lock, flags);
-	serial8250_rpm_put(up);
-}
-
 static void univ8250_console_write(struct console *co, const char *s,
 				   unsigned int count)
 {
@@ -3420,39 +591,6 @@ static void univ8250_console_write(struct console *co, const char *s,
 	serial8250_console_write(up, s, count);
 }
 
-static unsigned int probe_baud(struct uart_port *port)
-{
-	unsigned char lcr, dll, dlm;
-	unsigned int quot;
-
-	lcr = serial_port_in(port, UART_LCR);
-	serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB);
-	dll = serial_port_in(port, UART_DLL);
-	dlm = serial_port_in(port, UART_DLM);
-	serial_port_out(port, UART_LCR, lcr);
-
-	quot = (dlm << 8) | dll;
-	return (port->uartclk / 16) / quot;
-}
-
-static int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
-{
-	int baud = 9600;
-	int bits = 8;
-	int parity = 'n';
-	int flow = 'n';
-
-	if (!port->iobase && !port->membase)
-		return -ENODEV;
-
-	if (options)
-		uart_parse_options(options, &baud, &parity, &bits, &flow);
-	else if (probe)
-		baud = probe_baud(port);
-
-	return uart_set_options(port, port->cons, baud, parity, bits, flow);
-}
-
 static int univ8250_console_setup(struct console *co, char *options)
 {
 	struct uart_port *port;
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
new file mode 100644
index 000000000000..4dc143eee086
--- /dev/null
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -0,0 +1,2898 @@
+/*
+ *  Base port operations for 8250/16550-type serial ports
+ *
+ *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
+ *  Split from 8250_core.c, Copyright (C) 2001 Russell King.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * A note about mapbase / membase
+ *
+ *  mapbase is the physical address of the IO port.
+ *  membase is an 'ioremapped' cookie.
+ */
+
+#if defined(CONFIG_SERIAL_8250_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/sysrq.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/tty.h>
+#include <linux/ratelimit.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/serial_8250.h>
+#include <linux/nmi.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/pm_runtime.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+
+#include "8250.h"
+
+/*
+ * Debugging.
+ */
+#if 0
+#define DEBUG_AUTOCONF(fmt...)	printk(fmt)
+#else
+#define DEBUG_AUTOCONF(fmt...)	do { } while (0)
+#endif
+
+#define BOTH_EMPTY 	(UART_LSR_TEMT | UART_LSR_THRE)
+
+/*
+ * Here we define the default xmit fifo size used for each type of UART.
+ */
+static const struct serial8250_config uart_config[] = {
+	[PORT_UNKNOWN] = {
+		.name		= "unknown",
+		.fifo_size	= 1,
+		.tx_loadsz	= 1,
+	},
+	[PORT_8250] = {
+		.name		= "8250",
+		.fifo_size	= 1,
+		.tx_loadsz	= 1,
+	},
+	[PORT_16450] = {
+		.name		= "16450",
+		.fifo_size	= 1,
+		.tx_loadsz	= 1,
+	},
+	[PORT_16550] = {
+		.name		= "16550",
+		.fifo_size	= 1,
+		.tx_loadsz	= 1,
+	},
+	[PORT_16550A] = {
+		.name		= "16550A",
+		.fifo_size	= 16,
+		.tx_loadsz	= 16,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.rxtrig_bytes	= {1, 4, 8, 14},
+		.flags		= UART_CAP_FIFO,
+	},
+	[PORT_CIRRUS] = {
+		.name		= "Cirrus",
+		.fifo_size	= 1,
+		.tx_loadsz	= 1,
+	},
+	[PORT_16650] = {
+		.name		= "ST16650",
+		.fifo_size	= 1,
+		.tx_loadsz	= 1,
+		.flags		= UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
+	},
+	[PORT_16650V2] = {
+		.name		= "ST16650V2",
+		.fifo_size	= 32,
+		.tx_loadsz	= 16,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
+				  UART_FCR_T_TRIG_00,
+		.rxtrig_bytes	= {8, 16, 24, 28},
+		.flags		= UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
+	},
+	[PORT_16750] = {
+		.name		= "TI16750",
+		.fifo_size	= 64,
+		.tx_loadsz	= 64,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
+				  UART_FCR7_64BYTE,
+		.rxtrig_bytes	= {1, 16, 32, 56},
+		.flags		= UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE,
+	},
+	[PORT_STARTECH] = {
+		.name		= "Startech",
+		.fifo_size	= 1,
+		.tx_loadsz	= 1,
+	},
+	[PORT_16C950] = {
+		.name		= "16C950/954",
+		.fifo_size	= 128,
+		.tx_loadsz	= 128,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		/* UART_CAP_EFR breaks billionon CF bluetooth card. */
+		.flags		= UART_CAP_FIFO | UART_CAP_SLEEP,
+	},
+	[PORT_16654] = {
+		.name		= "ST16654",
+		.fifo_size	= 64,
+		.tx_loadsz	= 32,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
+				  UART_FCR_T_TRIG_10,
+		.rxtrig_bytes	= {8, 16, 56, 60},
+		.flags		= UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
+	},
+	[PORT_16850] = {
+		.name		= "XR16850",
+		.fifo_size	= 128,
+		.tx_loadsz	= 128,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
+	},
+	[PORT_RSA] = {
+		.name		= "RSA",
+		.fifo_size	= 2048,
+		.tx_loadsz	= 2048,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11,
+		.flags		= UART_CAP_FIFO,
+	},
+	[PORT_NS16550A] = {
+		.name		= "NS16550A",
+		.fifo_size	= 16,
+		.tx_loadsz	= 16,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_NATSEMI,
+	},
+	[PORT_XSCALE] = {
+		.name		= "XScale",
+		.fifo_size	= 32,
+		.tx_loadsz	= 32,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE,
+	},
+	[PORT_OCTEON] = {
+		.name		= "OCTEON",
+		.fifo_size	= 64,
+		.tx_loadsz	= 64,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO,
+	},
+	[PORT_AR7] = {
+		.name		= "AR7",
+		.fifo_size	= 16,
+		.tx_loadsz	= 16,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
+	},
+	[PORT_U6_16550A] = {
+		.name		= "U6_16550A",
+		.fifo_size	= 64,
+		.tx_loadsz	= 64,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
+	},
+	[PORT_TEGRA] = {
+		.name		= "Tegra",
+		.fifo_size	= 32,
+		.tx_loadsz	= 8,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
+				  UART_FCR_T_TRIG_01,
+		.rxtrig_bytes	= {1, 4, 8, 14},
+		.flags		= UART_CAP_FIFO | UART_CAP_RTOIE,
+	},
+	[PORT_XR17D15X] = {
+		.name		= "XR17D15X",
+		.fifo_size	= 64,
+		.tx_loadsz	= 64,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR |
+				  UART_CAP_SLEEP,
+	},
+	[PORT_XR17V35X] = {
+		.name		= "XR17V35X",
+		.fifo_size	= 256,
+		.tx_loadsz	= 256,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 |
+				  UART_FCR_T_TRIG_11,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR |
+				  UART_CAP_SLEEP,
+	},
+	[PORT_LPC3220] = {
+		.name		= "LPC3220",
+		.fifo_size	= 64,
+		.tx_loadsz	= 32,
+		.fcr		= UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO |
+				  UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00,
+		.flags		= UART_CAP_FIFO,
+	},
+	[PORT_BRCM_TRUMANAGE] = {
+		.name		= "TruManage",
+		.fifo_size	= 1,
+		.tx_loadsz	= 1024,
+		.flags		= UART_CAP_HFIFO,
+	},
+	[PORT_8250_CIR] = {
+		.name		= "CIR port"
+	},
+	[PORT_ALTR_16550_F32] = {
+		.name		= "Altera 16550 FIFO32",
+		.fifo_size	= 32,
+		.tx_loadsz	= 32,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
+	},
+	[PORT_ALTR_16550_F64] = {
+		.name		= "Altera 16550 FIFO64",
+		.fifo_size	= 64,
+		.tx_loadsz	= 64,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
+	},
+	[PORT_ALTR_16550_F128] = {
+		.name		= "Altera 16550 FIFO128",
+		.fifo_size	= 128,
+		.tx_loadsz	= 128,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
+	},
+/* tx_loadsz is set to 63-bytes instead of 64-bytes to implement
+workaround of errata A-008006 which states that tx_loadsz should  be
+configured less than Maximum supported fifo bytes */
+	[PORT_16550A_FSL64] = {
+		.name		= "16550A_FSL64",
+		.fifo_size	= 64,
+		.tx_loadsz	= 63,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
+				  UART_FCR7_64BYTE,
+		.flags		= UART_CAP_FIFO,
+	},
+};
+
+/* Uart divisor latch read */
+static int default_serial_dl_read(struct uart_8250_port *up)
+{
+	return serial_in(up, UART_DLL) | serial_in(up, UART_DLM) << 8;
+}
+
+/* Uart divisor latch write */
+static void default_serial_dl_write(struct uart_8250_port *up, int value)
+{
+	serial_out(up, UART_DLL, value & 0xff);
+	serial_out(up, UART_DLM, value >> 8 & 0xff);
+}
+
+#if defined(CONFIG_MIPS_ALCHEMY) || defined(CONFIG_SERIAL_8250_RT288X)
+
+/* Au1x00/RT288x UART hardware has a weird register layout */
+static const s8 au_io_in_map[8] = {
+	 0,	/* UART_RX  */
+	 2,	/* UART_IER */
+	 3,	/* UART_IIR */
+	 5,	/* UART_LCR */
+	 6,	/* UART_MCR */
+	 7,	/* UART_LSR */
+	 8,	/* UART_MSR */
+	-1,	/* UART_SCR (unmapped) */
+};
+
+static const s8 au_io_out_map[8] = {
+	 1,	/* UART_TX  */
+	 2,	/* UART_IER */
+	 4,	/* UART_FCR */
+	 5,	/* UART_LCR */
+	 6,	/* UART_MCR */
+	-1,	/* UART_LSR (unmapped) */
+	-1,	/* UART_MSR (unmapped) */
+	-1,	/* UART_SCR (unmapped) */
+};
+
+static unsigned int au_serial_in(struct uart_port *p, int offset)
+{
+	if (offset >= ARRAY_SIZE(au_io_in_map))
+		return UINT_MAX;
+	offset = au_io_in_map[offset];
+	if (offset < 0)
+		return UINT_MAX;
+	return __raw_readl(p->membase + (offset << p->regshift));
+}
+
+static void au_serial_out(struct uart_port *p, int offset, int value)
+{
+	if (offset >= ARRAY_SIZE(au_io_out_map))
+		return;
+	offset = au_io_out_map[offset];
+	if (offset < 0)
+		return;
+	__raw_writel(value, p->membase + (offset << p->regshift));
+}
+
+/* Au1x00 haven't got a standard divisor latch */
+static int au_serial_dl_read(struct uart_8250_port *up)
+{
+	return __raw_readl(up->port.membase + 0x28);
+}
+
+static void au_serial_dl_write(struct uart_8250_port *up, int value)
+{
+	__raw_writel(value, up->port.membase + 0x28);
+}
+
+#endif
+
+static unsigned int hub6_serial_in(struct uart_port *p, int offset)
+{
+	offset = offset << p->regshift;
+	outb(p->hub6 - 1 + offset, p->iobase);
+	return inb(p->iobase + 1);
+}
+
+static void hub6_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = offset << p->regshift;
+	outb(p->hub6 - 1 + offset, p->iobase);
+	outb(value, p->iobase + 1);
+}
+
+static unsigned int mem_serial_in(struct uart_port *p, int offset)
+{
+	offset = offset << p->regshift;
+	return readb(p->membase + offset);
+}
+
+static void mem_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = offset << p->regshift;
+	writeb(value, p->membase + offset);
+}
+
+static void mem32_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = offset << p->regshift;
+	writel(value, p->membase + offset);
+}
+
+static unsigned int mem32_serial_in(struct uart_port *p, int offset)
+{
+	offset = offset << p->regshift;
+	return readl(p->membase + offset);
+}
+
+static void mem32be_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = offset << p->regshift;
+	iowrite32be(value, p->membase + offset);
+}
+
+static unsigned int mem32be_serial_in(struct uart_port *p, int offset)
+{
+	offset = offset << p->regshift;
+	return ioread32be(p->membase + offset);
+}
+
+static unsigned int io_serial_in(struct uart_port *p, int offset)
+{
+	offset = offset << p->regshift;
+	return inb(p->iobase + offset);
+}
+
+static void io_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = offset << p->regshift;
+	outb(value, p->iobase + offset);
+}
+
+static int serial8250_default_handle_irq(struct uart_port *port);
+static int exar_handle_irq(struct uart_port *port);
+
+static void set_io_from_upio(struct uart_port *p)
+{
+	struct uart_8250_port *up = up_to_u8250p(p);
+
+	up->dl_read = default_serial_dl_read;
+	up->dl_write = default_serial_dl_write;
+
+	switch (p->iotype) {
+	case UPIO_HUB6:
+		p->serial_in = hub6_serial_in;
+		p->serial_out = hub6_serial_out;
+		break;
+
+	case UPIO_MEM:
+		p->serial_in = mem_serial_in;
+		p->serial_out = mem_serial_out;
+		break;
+
+	case UPIO_MEM32:
+		p->serial_in = mem32_serial_in;
+		p->serial_out = mem32_serial_out;
+		break;
+
+	case UPIO_MEM32BE:
+		p->serial_in = mem32be_serial_in;
+		p->serial_out = mem32be_serial_out;
+		break;
+
+#if defined(CONFIG_MIPS_ALCHEMY) || defined(CONFIG_SERIAL_8250_RT288X)
+	case UPIO_AU:
+		p->serial_in = au_serial_in;
+		p->serial_out = au_serial_out;
+		up->dl_read = au_serial_dl_read;
+		up->dl_write = au_serial_dl_write;
+		break;
+#endif
+
+	default:
+		p->serial_in = io_serial_in;
+		p->serial_out = io_serial_out;
+		break;
+	}
+	/* Remember loaded iotype */
+	up->cur_iotype = p->iotype;
+	p->handle_irq = serial8250_default_handle_irq;
+}
+
+static void
+serial_port_out_sync(struct uart_port *p, int offset, int value)
+{
+	switch (p->iotype) {
+	case UPIO_MEM:
+	case UPIO_MEM32:
+	case UPIO_MEM32BE:
+	case UPIO_AU:
+		p->serial_out(p, offset, value);
+		p->serial_in(p, UART_LCR);	/* safe, no side-effects */
+		break;
+	default:
+		p->serial_out(p, offset, value);
+	}
+}
+
+/*
+ * For the 16C950
+ */
+static void serial_icr_write(struct uart_8250_port *up, int offset, int value)
+{
+	serial_out(up, UART_SCR, offset);
+	serial_out(up, UART_ICR, value);
+}
+
+static unsigned int serial_icr_read(struct uart_8250_port *up, int offset)
+{
+	unsigned int value;
+
+	serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD);
+	serial_out(up, UART_SCR, offset);
+	value = serial_in(up, UART_ICR);
+	serial_icr_write(up, UART_ACR, up->acr);
+
+	return value;
+}
+
+/*
+ * FIFO support.
+ */
+static void serial8250_clear_fifos(struct uart_8250_port *p)
+{
+	if (p->capabilities & UART_CAP_FIFO) {
+		serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO);
+		serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO |
+			       UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
+		serial_out(p, UART_FCR, 0);
+	}
+}
+
+void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p)
+{
+	serial8250_clear_fifos(p);
+	serial_out(p, UART_FCR, p->fcr);
+}
+EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos);
+
+void serial8250_rpm_get(struct uart_8250_port *p)
+{
+	if (!(p->capabilities & UART_CAP_RPM))
+		return;
+	pm_runtime_get_sync(p->port.dev);
+}
+EXPORT_SYMBOL_GPL(serial8250_rpm_get);
+
+void serial8250_rpm_put(struct uart_8250_port *p)
+{
+	if (!(p->capabilities & UART_CAP_RPM))
+		return;
+	pm_runtime_mark_last_busy(p->port.dev);
+	pm_runtime_put_autosuspend(p->port.dev);
+}
+EXPORT_SYMBOL_GPL(serial8250_rpm_put);
+
+/*
+ * These two wrappers ensure that enable_runtime_pm_tx() can be called more than
+ * once and disable_runtime_pm_tx() will still disable RPM because the fifo is
+ * empty and the HW can idle again.
+ */
+static void serial8250_rpm_get_tx(struct uart_8250_port *p)
+{
+	unsigned char rpm_active;
+
+	if (!(p->capabilities & UART_CAP_RPM))
+		return;
+
+	rpm_active = xchg(&p->rpm_tx_active, 1);
+	if (rpm_active)
+		return;
+	pm_runtime_get_sync(p->port.dev);
+}
+
+static void serial8250_rpm_put_tx(struct uart_8250_port *p)
+{
+	unsigned char rpm_active;
+
+	if (!(p->capabilities & UART_CAP_RPM))
+		return;
+
+	rpm_active = xchg(&p->rpm_tx_active, 0);
+	if (!rpm_active)
+		return;
+	pm_runtime_mark_last_busy(p->port.dev);
+	pm_runtime_put_autosuspend(p->port.dev);
+}
+
+/*
+ * IER sleep support.  UARTs which have EFRs need the "extended
+ * capability" bit enabled.  Note that on XR16C850s, we need to
+ * reset LCR to write to IER.
+ */
+static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
+{
+	unsigned char lcr = 0, efr = 0;
+	/*
+	 * Exar UARTs have a SLEEP register that enables or disables
+	 * each UART to enter sleep mode separately.  On the XR17V35x the
+	 * register is accessible to each UART at the UART_EXAR_SLEEP
+	 * offset but the UART channel may only write to the corresponding
+	 * bit.
+	 */
+	serial8250_rpm_get(p);
+	if ((p->port.type == PORT_XR17V35X) ||
+	   (p->port.type == PORT_XR17D15X)) {
+		serial_out(p, UART_EXAR_SLEEP, sleep ? 0xff : 0);
+		goto out;
+	}
+
+	if (p->capabilities & UART_CAP_SLEEP) {
+		if (p->capabilities & UART_CAP_EFR) {
+			lcr = serial_in(p, UART_LCR);
+			efr = serial_in(p, UART_EFR);
+			serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
+			serial_out(p, UART_EFR, UART_EFR_ECB);
+			serial_out(p, UART_LCR, 0);
+		}
+		serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);
+		if (p->capabilities & UART_CAP_EFR) {
+			serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
+			serial_out(p, UART_EFR, efr);
+			serial_out(p, UART_LCR, lcr);
+		}
+	}
+out:
+	serial8250_rpm_put(p);
+}
+
+#ifdef CONFIG_SERIAL_8250_RSA
+/*
+ * Attempts to turn on the RSA FIFO.  Returns zero on failure.
+ * We set the port uart clock rate if we succeed.
+ */
+static int __enable_rsa(struct uart_8250_port *up)
+{
+	unsigned char mode;
+	int result;
+
+	mode = serial_in(up, UART_RSA_MSR);
+	result = mode & UART_RSA_MSR_FIFO;
+
+	if (!result) {
+		serial_out(up, UART_RSA_MSR, mode | UART_RSA_MSR_FIFO);
+		mode = serial_in(up, UART_RSA_MSR);
+		result = mode & UART_RSA_MSR_FIFO;
+	}
+
+	if (result)
+		up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16;
+
+	return result;
+}
+
+static void enable_rsa(struct uart_8250_port *up)
+{
+	if (up->port.type == PORT_RSA) {
+		if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) {
+			spin_lock_irq(&up->port.lock);
+			__enable_rsa(up);
+			spin_unlock_irq(&up->port.lock);
+		}
+		if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16)
+			serial_out(up, UART_RSA_FRR, 0);
+	}
+}
+
+/*
+ * Attempts to turn off the RSA FIFO.  Returns zero on failure.
+ * It is unknown why interrupts were disabled in here.  However,
+ * the caller is expected to preserve this behaviour by grabbing
+ * the spinlock before calling this function.
+ */
+static void disable_rsa(struct uart_8250_port *up)
+{
+	unsigned char mode;
+	int result;
+
+	if (up->port.type == PORT_RSA &&
+	    up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) {
+		spin_lock_irq(&up->port.lock);
+
+		mode = serial_in(up, UART_RSA_MSR);
+		result = !(mode & UART_RSA_MSR_FIFO);
+
+		if (!result) {
+			serial_out(up, UART_RSA_MSR, mode & ~UART_RSA_MSR_FIFO);
+			mode = serial_in(up, UART_RSA_MSR);
+			result = !(mode & UART_RSA_MSR_FIFO);
+		}
+
+		if (result)
+			up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16;
+		spin_unlock_irq(&up->port.lock);
+	}
+}
+#endif /* CONFIG_SERIAL_8250_RSA */
+
+/*
+ * This is a quickie test to see how big the FIFO is.
+ * It doesn't work at all the time, more's the pity.
+ */
+static int size_fifo(struct uart_8250_port *up)
+{
+	unsigned char old_fcr, old_mcr, old_lcr;
+	unsigned short old_dl;
+	int count;
+
+	old_lcr = serial_in(up, UART_LCR);
+	serial_out(up, UART_LCR, 0);
+	old_fcr = serial_in(up, UART_FCR);
+	old_mcr = serial_in(up, UART_MCR);
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
+		    UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
+	serial_out(up, UART_MCR, UART_MCR_LOOP);
+	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
+	old_dl = serial_dl_read(up);
+	serial_dl_write(up, 0x0001);
+	serial_out(up, UART_LCR, 0x03);
+	for (count = 0; count < 256; count++)
+		serial_out(up, UART_TX, count);
+	mdelay(20);/* FIXME - schedule_timeout */
+	for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) &&
+	     (count < 256); count++)
+		serial_in(up, UART_RX);
+	serial_out(up, UART_FCR, old_fcr);
+	serial_out(up, UART_MCR, old_mcr);
+	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
+	serial_dl_write(up, old_dl);
+	serial_out(up, UART_LCR, old_lcr);
+
+	return count;
+}
+
+/*
+ * Read UART ID using the divisor method - set DLL and DLM to zero
+ * and the revision will be in DLL and device type in DLM.  We
+ * preserve the device state across this.
+ */
+static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p)
+{
+	unsigned char old_dll, old_dlm, old_lcr;
+	unsigned int id;
+
+	old_lcr = serial_in(p, UART_LCR);
+	serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A);
+
+	old_dll = serial_in(p, UART_DLL);
+	old_dlm = serial_in(p, UART_DLM);
+
+	serial_out(p, UART_DLL, 0);
+	serial_out(p, UART_DLM, 0);
+
+	id = serial_in(p, UART_DLL) | serial_in(p, UART_DLM) << 8;
+
+	serial_out(p, UART_DLL, old_dll);
+	serial_out(p, UART_DLM, old_dlm);
+	serial_out(p, UART_LCR, old_lcr);
+
+	return id;
+}
+
+/*
+ * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's.
+ * When this function is called we know it is at least a StarTech
+ * 16650 V2, but it might be one of several StarTech UARTs, or one of
+ * its clones.  (We treat the broken original StarTech 16650 V1 as a
+ * 16550, and why not?  Startech doesn't seem to even acknowledge its
+ * existence.)
+ *
+ * What evil have men's minds wrought...
+ */
+static void autoconfig_has_efr(struct uart_8250_port *up)
+{
+	unsigned int id1, id2, id3, rev;
+
+	/*
+	 * Everything with an EFR has SLEEP
+	 */
+	up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP;
+
+	/*
+	 * First we check to see if it's an Oxford Semiconductor UART.
+	 *
+	 * If we have to do this here because some non-National
+	 * Semiconductor clone chips lock up if you try writing to the
+	 * LSR register (which serial_icr_read does)
+	 */
+
+	/*
+	 * Check for Oxford Semiconductor 16C950.
+	 *
+	 * EFR [4] must be set else this test fails.
+	 *
+	 * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca)
+	 * claims that it's needed for 952 dual UART's (which are not
+	 * recommended for new designs).
+	 */
+	up->acr = 0;
+	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
+	serial_out(up, UART_EFR, UART_EFR_ECB);
+	serial_out(up, UART_LCR, 0x00);
+	id1 = serial_icr_read(up, UART_ID1);
+	id2 = serial_icr_read(up, UART_ID2);
+	id3 = serial_icr_read(up, UART_ID3);
+	rev = serial_icr_read(up, UART_REV);
+
+	DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev);
+
+	if (id1 == 0x16 && id2 == 0xC9 &&
+	    (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) {
+		up->port.type = PORT_16C950;
+
+		/*
+		 * Enable work around for the Oxford Semiconductor 952 rev B
+		 * chip which causes it to seriously miscalculate baud rates
+		 * when DLL is 0.
+		 */
+		if (id3 == 0x52 && rev == 0x01)
+			up->bugs |= UART_BUG_QUOT;
+		return;
+	}
+
+	/*
+	 * We check for a XR16C850 by setting DLL and DLM to 0, and then
+	 * reading back DLL and DLM.  The chip type depends on the DLM
+	 * value read back:
+	 *  0x10 - XR16C850 and the DLL contains the chip revision.
+	 *  0x12 - XR16C2850.
+	 *  0x14 - XR16C854.
+	 */
+	id1 = autoconfig_read_divisor_id(up);
+	DEBUG_AUTOCONF("850id=%04x ", id1);
+
+	id2 = id1 >> 8;
+	if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) {
+		up->port.type = PORT_16850;
+		return;
+	}
+
+	/*
+	 * It wasn't an XR16C850.
+	 *
+	 * We distinguish between the '654 and the '650 by counting
+	 * how many bytes are in the FIFO.  I'm using this for now,
+	 * since that's the technique that was sent to me in the
+	 * serial driver update, but I'm not convinced this works.
+	 * I've had problems doing this in the past.  -TYT
+	 */
+	if (size_fifo(up) == 64)
+		up->port.type = PORT_16654;
+	else
+		up->port.type = PORT_16650V2;
+}
+
+/*
+ * We detected a chip without a FIFO.  Only two fall into
+ * this category - the original 8250 and the 16450.  The
+ * 16450 has a scratch register (accessible with LCR=0)
+ */
+static void autoconfig_8250(struct uart_8250_port *up)
+{
+	unsigned char scratch, status1, status2;
+
+	up->port.type = PORT_8250;
+
+	scratch = serial_in(up, UART_SCR);
+	serial_out(up, UART_SCR, 0xa5);
+	status1 = serial_in(up, UART_SCR);
+	serial_out(up, UART_SCR, 0x5a);
+	status2 = serial_in(up, UART_SCR);
+	serial_out(up, UART_SCR, scratch);
+
+	if (status1 == 0xa5 && status2 == 0x5a)
+		up->port.type = PORT_16450;
+}
+
+static int broken_efr(struct uart_8250_port *up)
+{
+	/*
+	 * Exar ST16C2550 "A2" devices incorrectly detect as
+	 * having an EFR, and report an ID of 0x0201.  See
+	 * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html
+	 */
+	if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * We know that the chip has FIFOs.  Does it have an EFR?  The
+ * EFR is located in the same register position as the IIR and
+ * we know the top two bits of the IIR are currently set.  The
+ * EFR should contain zero.  Try to read the EFR.
+ */
+static void autoconfig_16550a(struct uart_8250_port *up)
+{
+	unsigned char status1, status2;
+	unsigned int iersave;
+
+	up->port.type = PORT_16550A;
+	up->capabilities |= UART_CAP_FIFO;
+
+	/*
+	 * XR17V35x UARTs have an extra divisor register, DLD
+	 * that gets enabled with when DLAB is set which will
+	 * cause the device to incorrectly match and assign
+	 * port type to PORT_16650.  The EFR for this UART is
+	 * found at offset 0x09. Instead check the Deice ID (DVID)
+	 * register for a 2, 4 or 8 port UART.
+	 */
+	if (up->port.flags & UPF_EXAR_EFR) {
+		status1 = serial_in(up, UART_EXAR_DVID);
+		if (status1 == 0x82 || status1 == 0x84 || status1 == 0x88) {
+			DEBUG_AUTOCONF("Exar XR17V35x ");
+			up->port.type = PORT_XR17V35X;
+			up->capabilities |= UART_CAP_AFE | UART_CAP_EFR |
+						UART_CAP_SLEEP;
+
+			return;
+		}
+
+	}
+
+	/*
+	 * Check for presence of the EFR when DLAB is set.
+	 * Only ST16C650V1 UARTs pass this test.
+	 */
+	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
+	if (serial_in(up, UART_EFR) == 0) {
+		serial_out(up, UART_EFR, 0xA8);
+		if (serial_in(up, UART_EFR) != 0) {
+			DEBUG_AUTOCONF("EFRv1 ");
+			up->port.type = PORT_16650;
+			up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP;
+		} else {
+			serial_out(up, UART_LCR, 0);
+			serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
+				   UART_FCR7_64BYTE);
+			status1 = serial_in(up, UART_IIR) >> 5;
+			serial_out(up, UART_FCR, 0);
+			serial_out(up, UART_LCR, 0);
+
+			if (status1 == 7)
+				up->port.type = PORT_16550A_FSL64;
+			else
+				DEBUG_AUTOCONF("Motorola 8xxx DUART ");
+		}
+		serial_out(up, UART_EFR, 0);
+		return;
+	}
+
+	/*
+	 * Maybe it requires 0xbf to be written to the LCR.
+	 * (other ST16C650V2 UARTs, TI16C752A, etc)
+	 */
+	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
+	if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) {
+		DEBUG_AUTOCONF("EFRv2 ");
+		autoconfig_has_efr(up);
+		return;
+	}
+
+	/*
+	 * Check for a National Semiconductor SuperIO chip.
+	 * Attempt to switch to bank 2, read the value of the LOOP bit
+	 * from EXCR1. Switch back to bank 0, change it in MCR. Then
+	 * switch back to bank 2, read it from EXCR1 again and check
+	 * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2
+	 */
+	serial_out(up, UART_LCR, 0);
+	status1 = serial_in(up, UART_MCR);
+	serial_out(up, UART_LCR, 0xE0);
+	status2 = serial_in(up, 0x02); /* EXCR1 */
+
+	if (!((status2 ^ status1) & UART_MCR_LOOP)) {
+		serial_out(up, UART_LCR, 0);
+		serial_out(up, UART_MCR, status1 ^ UART_MCR_LOOP);
+		serial_out(up, UART_LCR, 0xE0);
+		status2 = serial_in(up, 0x02); /* EXCR1 */
+		serial_out(up, UART_LCR, 0);
+		serial_out(up, UART_MCR, status1);
+
+		if ((status2 ^ status1) & UART_MCR_LOOP) {
+			unsigned short quot;
+
+			serial_out(up, UART_LCR, 0xE0);
+
+			quot = serial_dl_read(up);
+			quot <<= 3;
+
+			if (ns16550a_goto_highspeed(up))
+				serial_dl_write(up, quot);
+
+			serial_out(up, UART_LCR, 0);
+
+			up->port.uartclk = 921600*16;
+			up->port.type = PORT_NS16550A;
+			up->capabilities |= UART_NATSEMI;
+			return;
+		}
+	}
+
+	/*
+	 * No EFR.  Try to detect a TI16750, which only sets bit 5 of
+	 * the IIR when 64 byte FIFO mode is enabled when DLAB is set.
+	 * Try setting it with and without DLAB set.  Cheap clones
+	 * set bit 5 without DLAB set.
+	 */
+	serial_out(up, UART_LCR, 0);
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE);
+	status1 = serial_in(up, UART_IIR) >> 5;
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);
+	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE);
+	status2 = serial_in(up, UART_IIR) >> 5;
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);
+	serial_out(up, UART_LCR, 0);
+
+	DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2);
+
+	if (status1 == 6 && status2 == 7) {
+		up->port.type = PORT_16750;
+		up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP;
+		return;
+	}
+
+	/*
+	 * Try writing and reading the UART_IER_UUE bit (b6).
+	 * If it works, this is probably one of the Xscale platform's
+	 * internal UARTs.
+	 * We're going to explicitly set the UUE bit to 0 before
+	 * trying to write and read a 1 just to make sure it's not
+	 * already a 1 and maybe locked there before we even start start.
+	 */
+	iersave = serial_in(up, UART_IER);
+	serial_out(up, UART_IER, iersave & ~UART_IER_UUE);
+	if (!(serial_in(up, UART_IER) & UART_IER_UUE)) {
+		/*
+		 * OK it's in a known zero state, try writing and reading
+		 * without disturbing the current state of the other bits.
+		 */
+		serial_out(up, UART_IER, iersave | UART_IER_UUE);
+		if (serial_in(up, UART_IER) & UART_IER_UUE) {
+			/*
+			 * It's an Xscale.
+			 * We'll leave the UART_IER_UUE bit set to 1 (enabled).
+			 */
+			DEBUG_AUTOCONF("Xscale ");
+			up->port.type = PORT_XSCALE;
+			up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE;
+			return;
+		}
+	} else {
+		/*
+		 * If we got here we couldn't force the IER_UUE bit to 0.
+		 * Log it and continue.
+		 */
+		DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 ");
+	}
+	serial_out(up, UART_IER, iersave);
+
+	/*
+	 * Exar uarts have EFR in a weird location
+	 */
+	if (up->port.flags & UPF_EXAR_EFR) {
+		DEBUG_AUTOCONF("Exar XR17D15x ");
+		up->port.type = PORT_XR17D15X;
+		up->capabilities |= UART_CAP_AFE | UART_CAP_EFR |
+				    UART_CAP_SLEEP;
+
+		return;
+	}
+
+	/*
+	 * We distinguish between 16550A and U6 16550A by counting
+	 * how many bytes are in the FIFO.
+	 */
+	if (up->port.type == PORT_16550A && size_fifo(up) == 64) {
+		up->port.type = PORT_U6_16550A;
+		up->capabilities |= UART_CAP_AFE;
+	}
+}
+
+/*
+ * This routine is called by rs_init() to initialize a specific serial
+ * port.  It determines what type of UART chip this serial port is
+ * using: 8250, 16450, 16550, 16550A.  The important question is
+ * whether or not this UART is a 16550A or not, since this will
+ * determine whether or not we can use its FIFO features or not.
+ */
+static void autoconfig(struct uart_8250_port *up)
+{
+	unsigned char status1, scratch, scratch2, scratch3;
+	unsigned char save_lcr, save_mcr;
+	struct uart_port *port = &up->port;
+	unsigned long flags;
+	unsigned int old_capabilities;
+
+	if (!port->iobase && !port->mapbase && !port->membase)
+		return;
+
+	DEBUG_AUTOCONF("ttyS%d: autoconf (0x%04lx, 0x%p): ",
+		       serial_index(port), port->iobase, port->membase);
+
+	/*
+	 * We really do need global IRQs disabled here - we're going to
+	 * be frobbing the chips IRQ enable register to see if it exists.
+	 */
+	spin_lock_irqsave(&port->lock, flags);
+
+	up->capabilities = 0;
+	up->bugs = 0;
+
+	if (!(port->flags & UPF_BUGGY_UART)) {
+		/*
+		 * Do a simple existence test first; if we fail this,
+		 * there's no point trying anything else.
+		 *
+		 * 0x80 is used as a nonsense port to prevent against
+		 * false positives due to ISA bus float.  The
+		 * assumption is that 0x80 is a non-existent port;
+		 * which should be safe since include/asm/io.h also
+		 * makes this assumption.
+		 *
+		 * Note: this is safe as long as MCR bit 4 is clear
+		 * and the device is in "PC" mode.
+		 */
+		scratch = serial_in(up, UART_IER);
+		serial_out(up, UART_IER, 0);
+#ifdef __i386__
+		outb(0xff, 0x080);
+#endif
+		/*
+		 * Mask out IER[7:4] bits for test as some UARTs (e.g. TL
+		 * 16C754B) allow only to modify them if an EFR bit is set.
+		 */
+		scratch2 = serial_in(up, UART_IER) & 0x0f;
+		serial_out(up, UART_IER, 0x0F);
+#ifdef __i386__
+		outb(0, 0x080);
+#endif
+		scratch3 = serial_in(up, UART_IER) & 0x0f;
+		serial_out(up, UART_IER, scratch);
+		if (scratch2 != 0 || scratch3 != 0x0F) {
+			/*
+			 * We failed; there's nothing here
+			 */
+			spin_unlock_irqrestore(&port->lock, flags);
+			DEBUG_AUTOCONF("IER test failed (%02x, %02x) ",
+				       scratch2, scratch3);
+			goto out;
+		}
+	}
+
+	save_mcr = serial_in(up, UART_MCR);
+	save_lcr = serial_in(up, UART_LCR);
+
+	/*
+	 * Check to see if a UART is really there.  Certain broken
+	 * internal modems based on the Rockwell chipset fail this
+	 * test, because they apparently don't implement the loopback
+	 * test mode.  So this test is skipped on the COM 1 through
+	 * COM 4 ports.  This *should* be safe, since no board
+	 * manufacturer would be stupid enough to design a board
+	 * that conflicts with COM 1-4 --- we hope!
+	 */
+	if (!(port->flags & UPF_SKIP_TEST)) {
+		serial_out(up, UART_MCR, UART_MCR_LOOP | 0x0A);
+		status1 = serial_in(up, UART_MSR) & 0xF0;
+		serial_out(up, UART_MCR, save_mcr);
+		if (status1 != 0x90) {
+			spin_unlock_irqrestore(&port->lock, flags);
+			DEBUG_AUTOCONF("LOOP test failed (%02x) ",
+				       status1);
+			goto out;
+		}
+	}
+
+	/*
+	 * We're pretty sure there's a port here.  Lets find out what
+	 * type of port it is.  The IIR top two bits allows us to find
+	 * out if it's 8250 or 16450, 16550, 16550A or later.  This
+	 * determines what we test for next.
+	 *
+	 * We also initialise the EFR (if any) to zero for later.  The
+	 * EFR occupies the same register location as the FCR and IIR.
+	 */
+	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
+	serial_out(up, UART_EFR, 0);
+	serial_out(up, UART_LCR, 0);
+
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);
+	scratch = serial_in(up, UART_IIR) >> 6;
+
+	switch (scratch) {
+	case 0:
+		autoconfig_8250(up);
+		break;
+	case 1:
+		port->type = PORT_UNKNOWN;
+		break;
+	case 2:
+		port->type = PORT_16550;
+		break;
+	case 3:
+		autoconfig_16550a(up);
+		break;
+	}
+
+#ifdef CONFIG_SERIAL_8250_RSA
+	/*
+	 * Only probe for RSA ports if we got the region.
+	 */
+	if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA &&
+	    __enable_rsa(up))
+		port->type = PORT_RSA;
+#endif
+
+	serial_out(up, UART_LCR, save_lcr);
+
+	port->fifosize = uart_config[up->port.type].fifo_size;
+	old_capabilities = up->capabilities;
+	up->capabilities = uart_config[port->type].flags;
+	up->tx_loadsz = uart_config[port->type].tx_loadsz;
+
+	if (port->type == PORT_UNKNOWN)
+		goto out_lock;
+
+	/*
+	 * Reset the UART.
+	 */
+#ifdef CONFIG_SERIAL_8250_RSA
+	if (port->type == PORT_RSA)
+		serial_out(up, UART_RSA_FRR, 0);
+#endif
+	serial_out(up, UART_MCR, save_mcr);
+	serial8250_clear_fifos(up);
+	serial_in(up, UART_RX);
+	if (up->capabilities & UART_CAP_UUE)
+		serial_out(up, UART_IER, UART_IER_UUE);
+	else
+		serial_out(up, UART_IER, 0);
+
+out_lock:
+	spin_unlock_irqrestore(&port->lock, flags);
+	if (up->capabilities != old_capabilities) {
+		printk(KERN_WARNING
+		       "ttyS%d: detected caps %08x should be %08x\n",
+		       serial_index(port), old_capabilities,
+		       up->capabilities);
+	}
+out:
+	DEBUG_AUTOCONF("iir=%d ", scratch);
+	DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name);
+}
+
+static void autoconfig_irq(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+	unsigned char save_mcr, save_ier;
+	unsigned char save_ICP = 0;
+	unsigned int ICP = 0;
+	unsigned long irqs;
+	int irq;
+
+	if (port->flags & UPF_FOURPORT) {
+		ICP = (port->iobase & 0xfe0) | 0x1f;
+		save_ICP = inb_p(ICP);
+		outb_p(0x80, ICP);
+		inb_p(ICP);
+	}
+
+	/* forget possible initially masked and pending IRQ */
+	probe_irq_off(probe_irq_on());
+	save_mcr = serial_in(up, UART_MCR);
+	save_ier = serial_in(up, UART_IER);
+	serial_out(up, UART_MCR, UART_MCR_OUT1 | UART_MCR_OUT2);
+
+	irqs = probe_irq_on();
+	serial_out(up, UART_MCR, 0);
+	udelay(10);
+	if (port->flags & UPF_FOURPORT) {
+		serial_out(up, UART_MCR,
+			    UART_MCR_DTR | UART_MCR_RTS);
+	} else {
+		serial_out(up, UART_MCR,
+			    UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2);
+	}
+	serial_out(up, UART_IER, 0x0f);	/* enable all intrs */
+	serial_in(up, UART_LSR);
+	serial_in(up, UART_RX);
+	serial_in(up, UART_IIR);
+	serial_in(up, UART_MSR);
+	serial_out(up, UART_TX, 0xFF);
+	udelay(20);
+	irq = probe_irq_off(irqs);
+
+	serial_out(up, UART_MCR, save_mcr);
+	serial_out(up, UART_IER, save_ier);
+
+	if (port->flags & UPF_FOURPORT)
+		outb_p(save_ICP, ICP);
+
+	port->irq = (irq > 0) ? irq : 0;
+}
+
+static inline void __stop_tx(struct uart_8250_port *p)
+{
+	if (p->ier & UART_IER_THRI) {
+		p->ier &= ~UART_IER_THRI;
+		serial_out(p, UART_IER, p->ier);
+		serial8250_rpm_put_tx(p);
+	}
+}
+
+static void serial8250_stop_tx(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	serial8250_rpm_get(up);
+	__stop_tx(up);
+
+	/*
+	 * We really want to stop the transmitter from sending.
+	 */
+	if (port->type == PORT_16C950) {
+		up->acr |= UART_ACR_TXDIS;
+		serial_icr_write(up, UART_ACR, up->acr);
+	}
+	serial8250_rpm_put(up);
+}
+
+static void serial8250_start_tx(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	serial8250_rpm_get_tx(up);
+
+	if (up->dma && !up->dma->tx_dma(up))
+		return;
+
+	if (!(up->ier & UART_IER_THRI)) {
+		up->ier |= UART_IER_THRI;
+		serial_port_out(port, UART_IER, up->ier);
+
+		if (up->bugs & UART_BUG_TXEN) {
+			unsigned char lsr;
+			lsr = serial_in(up, UART_LSR);
+			up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
+			if (lsr & UART_LSR_THRE)
+				serial8250_tx_chars(up);
+		}
+	}
+
+	/*
+	 * Re-enable the transmitter if we disabled it.
+	 */
+	if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) {
+		up->acr &= ~UART_ACR_TXDIS;
+		serial_icr_write(up, UART_ACR, up->acr);
+	}
+}
+
+static void serial8250_throttle(struct uart_port *port)
+{
+	port->throttle(port);
+}
+
+static void serial8250_unthrottle(struct uart_port *port)
+{
+	port->unthrottle(port);
+}
+
+static void serial8250_stop_rx(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	serial8250_rpm_get(up);
+
+	up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
+	up->port.read_status_mask &= ~UART_LSR_DR;
+	serial_port_out(port, UART_IER, up->ier);
+
+	serial8250_rpm_put(up);
+}
+
+static void serial8250_disable_ms(struct uart_port *port)
+{
+	struct uart_8250_port *up =
+		container_of(port, struct uart_8250_port, port);
+
+	/* no MSR capabilities */
+	if (up->bugs & UART_BUG_NOMSR)
+		return;
+
+	up->ier &= ~UART_IER_MSI;
+	serial_port_out(port, UART_IER, up->ier);
+}
+
+static void serial8250_enable_ms(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	/* no MSR capabilities */
+	if (up->bugs & UART_BUG_NOMSR)
+		return;
+
+	up->ier |= UART_IER_MSI;
+
+	serial8250_rpm_get(up);
+	serial_port_out(port, UART_IER, up->ier);
+	serial8250_rpm_put(up);
+}
+
+/*
+ * serial8250_rx_chars: processes according to the passed in LSR
+ * value, and returns the remaining LSR bits not handled
+ * by this Rx routine.
+ */
+unsigned char
+serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr)
+{
+	struct uart_port *port = &up->port;
+	unsigned char ch;
+	int max_count = 256;
+	char flag;
+
+	do {
+		if (likely(lsr & UART_LSR_DR))
+			ch = serial_in(up, UART_RX);
+		else
+			/*
+			 * Intel 82571 has a Serial Over Lan device that will
+			 * set UART_LSR_BI without setting UART_LSR_DR when
+			 * it receives a break. To avoid reading from the
+			 * receive buffer without UART_LSR_DR bit set, we
+			 * just force the read character to be 0
+			 */
+			ch = 0;
+
+		flag = TTY_NORMAL;
+		port->icount.rx++;
+
+		lsr |= up->lsr_saved_flags;
+		up->lsr_saved_flags = 0;
+
+		if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) {
+			if (lsr & UART_LSR_BI) {
+				lsr &= ~(UART_LSR_FE | UART_LSR_PE);
+				port->icount.brk++;
+				/*
+				 * We do the SysRQ and SAK checking
+				 * here because otherwise the break
+				 * may get masked by ignore_status_mask
+				 * or read_status_mask.
+				 */
+				if (uart_handle_break(port))
+					goto ignore_char;
+			} else if (lsr & UART_LSR_PE)
+				port->icount.parity++;
+			else if (lsr & UART_LSR_FE)
+				port->icount.frame++;
+			if (lsr & UART_LSR_OE)
+				port->icount.overrun++;
+
+			/*
+			 * Mask off conditions which should be ignored.
+			 */
+			lsr &= port->read_status_mask;
+
+			if (lsr & UART_LSR_BI) {
+				DEBUG_INTR("handling break....");
+				flag = TTY_BREAK;
+			} else if (lsr & UART_LSR_PE)
+				flag = TTY_PARITY;
+			else if (lsr & UART_LSR_FE)
+				flag = TTY_FRAME;
+		}
+		if (uart_handle_sysrq_char(port, ch))
+			goto ignore_char;
+
+		uart_insert_char(port, lsr, UART_LSR_OE, ch, flag);
+
+ignore_char:
+		lsr = serial_in(up, UART_LSR);
+	} while ((lsr & (UART_LSR_DR | UART_LSR_BI)) && (--max_count > 0));
+	spin_unlock(&port->lock);
+	tty_flip_buffer_push(&port->state->port);
+	spin_lock(&port->lock);
+	return lsr;
+}
+EXPORT_SYMBOL_GPL(serial8250_rx_chars);
+
+void serial8250_tx_chars(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+	struct circ_buf *xmit = &port->state->xmit;
+	int count;
+
+	if (port->x_char) {
+		serial_out(up, UART_TX, port->x_char);
+		port->icount.tx++;
+		port->x_char = 0;
+		return;
+	}
+	if (uart_tx_stopped(port)) {
+		serial8250_stop_tx(port);
+		return;
+	}
+	if (uart_circ_empty(xmit)) {
+		__stop_tx(up);
+		return;
+	}
+
+	count = up->tx_loadsz;
+	do {
+		serial_out(up, UART_TX, xmit->buf[xmit->tail]);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+		if (uart_circ_empty(xmit))
+			break;
+		if (up->capabilities & UART_CAP_HFIFO) {
+			if ((serial_port_in(port, UART_LSR) & BOTH_EMPTY) !=
+			    BOTH_EMPTY)
+				break;
+		}
+	} while (--count > 0);
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	DEBUG_INTR("THRE...");
+
+	/*
+	 * With RPM enabled, we have to wait until the FIFO is empty before the
+	 * HW can go idle. So we get here once again with empty FIFO and disable
+	 * the interrupt and RPM in __stop_tx()
+	 */
+	if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM))
+		__stop_tx(up);
+}
+EXPORT_SYMBOL_GPL(serial8250_tx_chars);
+
+/* Caller holds uart port lock */
+unsigned int serial8250_modem_status(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+	unsigned int status = serial_in(up, UART_MSR);
+
+	status |= up->msr_saved_flags;
+	up->msr_saved_flags = 0;
+	if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI &&
+	    port->state != NULL) {
+		if (status & UART_MSR_TERI)
+			port->icount.rng++;
+		if (status & UART_MSR_DDSR)
+			port->icount.dsr++;
+		if (status & UART_MSR_DDCD)
+			uart_handle_dcd_change(port, status & UART_MSR_DCD);
+		if (status & UART_MSR_DCTS)
+			uart_handle_cts_change(port, status & UART_MSR_CTS);
+
+		wake_up_interruptible(&port->state->port.delta_msr_wait);
+	}
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(serial8250_modem_status);
+
+/*
+ * This handles the interrupt from one port.
+ */
+int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
+{
+	unsigned char status;
+	unsigned long flags;
+	struct uart_8250_port *up = up_to_u8250p(port);
+	int dma_err = 0;
+
+	if (iir & UART_IIR_NO_INT)
+		return 0;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	status = serial_port_in(port, UART_LSR);
+
+	DEBUG_INTR("status = %x...", status);
+
+	if (status & (UART_LSR_DR | UART_LSR_BI)) {
+		if (up->dma)
+			dma_err = up->dma->rx_dma(up, iir);
+
+		if (!up->dma || dma_err)
+			status = serial8250_rx_chars(up, status);
+	}
+	serial8250_modem_status(up);
+	if ((!up->dma || (up->dma && up->dma->tx_err)) &&
+	    (status & UART_LSR_THRE))
+		serial8250_tx_chars(up);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(serial8250_handle_irq);
+
+static int serial8250_default_handle_irq(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned int iir;
+	int ret;
+
+	serial8250_rpm_get(up);
+
+	iir = serial_port_in(port, UART_IIR);
+	ret = serial8250_handle_irq(port, iir);
+
+	serial8250_rpm_put(up);
+	return ret;
+}
+
+/*
+ * These Exar UARTs have an extra interrupt indicator that could
+ * fire for a few unimplemented interrupts.  One of which is a
+ * wakeup event when coming out of sleep.  Put this here just
+ * to be on the safe side that these interrupts don't go unhandled.
+ */
+static int exar_handle_irq(struct uart_port *port)
+{
+	unsigned char int0, int1, int2, int3;
+	unsigned int iir = serial_port_in(port, UART_IIR);
+	int ret;
+
+	ret = serial8250_handle_irq(port, iir);
+
+	if ((port->type == PORT_XR17V35X) ||
+	   (port->type == PORT_XR17D15X)) {
+		int0 = serial_port_in(port, 0x80);
+		int1 = serial_port_in(port, 0x81);
+		int2 = serial_port_in(port, 0x82);
+		int3 = serial_port_in(port, 0x83);
+	}
+
+	return ret;
+}
+
+static unsigned int serial8250_tx_empty(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned long flags;
+	unsigned int lsr;
+
+	serial8250_rpm_get(up);
+
+	spin_lock_irqsave(&port->lock, flags);
+	lsr = serial_port_in(port, UART_LSR);
+	up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	serial8250_rpm_put(up);
+
+	return (lsr & BOTH_EMPTY) == BOTH_EMPTY ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int serial8250_get_mctrl(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned int status;
+	unsigned int ret;
+
+	serial8250_rpm_get(up);
+	status = serial8250_modem_status(up);
+	serial8250_rpm_put(up);
+
+	ret = 0;
+	if (status & UART_MSR_DCD)
+		ret |= TIOCM_CAR;
+	if (status & UART_MSR_RI)
+		ret |= TIOCM_RNG;
+	if (status & UART_MSR_DSR)
+		ret |= TIOCM_DSR;
+	if (status & UART_MSR_CTS)
+		ret |= TIOCM_CTS;
+	return ret;
+}
+
+void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned char mcr = 0;
+
+	if (mctrl & TIOCM_RTS)
+		mcr |= UART_MCR_RTS;
+	if (mctrl & TIOCM_DTR)
+		mcr |= UART_MCR_DTR;
+	if (mctrl & TIOCM_OUT1)
+		mcr |= UART_MCR_OUT1;
+	if (mctrl & TIOCM_OUT2)
+		mcr |= UART_MCR_OUT2;
+	if (mctrl & TIOCM_LOOP)
+		mcr |= UART_MCR_LOOP;
+
+	mcr = (mcr & up->mcr_mask) | up->mcr_force | up->mcr;
+
+	serial_port_out(port, UART_MCR, mcr);
+}
+EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl);
+
+static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	if (port->set_mctrl)
+		port->set_mctrl(port, mctrl);
+	else
+		serial8250_do_set_mctrl(port, mctrl);
+}
+
+static void serial8250_break_ctl(struct uart_port *port, int break_state)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned long flags;
+
+	serial8250_rpm_get(up);
+	spin_lock_irqsave(&port->lock, flags);
+	if (break_state == -1)
+		up->lcr |= UART_LCR_SBC;
+	else
+		up->lcr &= ~UART_LCR_SBC;
+	serial_port_out(port, UART_LCR, up->lcr);
+	spin_unlock_irqrestore(&port->lock, flags);
+	serial8250_rpm_put(up);
+}
+
+/*
+ *	Wait for transmitter & holding register to empty
+ */
+static void wait_for_xmitr(struct uart_8250_port *up, int bits)
+{
+	unsigned int status, tmout = 10000;
+
+	/* Wait up to 10ms for the character(s) to be sent. */
+	for (;;) {
+		status = serial_in(up, UART_LSR);
+
+		up->lsr_saved_flags |= status & LSR_SAVE_FLAGS;
+
+		if ((status & bits) == bits)
+			break;
+		if (--tmout == 0)
+			break;
+		udelay(1);
+	}
+
+	/* Wait up to 1s for flow control if necessary */
+	if (up->port.flags & UPF_CONS_FLOW) {
+		unsigned int tmout;
+		for (tmout = 1000000; tmout; tmout--) {
+			unsigned int msr = serial_in(up, UART_MSR);
+			up->msr_saved_flags |= msr & MSR_SAVE_FLAGS;
+			if (msr & UART_MSR_CTS)
+				break;
+			udelay(1);
+			touch_nmi_watchdog();
+		}
+	}
+}
+
+#ifdef CONFIG_CONSOLE_POLL
+/*
+ * Console polling routines for writing and reading from the uart while
+ * in an interrupt or debug context.
+ */
+
+static int serial8250_get_poll_char(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned char lsr;
+	int status;
+
+	serial8250_rpm_get(up);
+
+	lsr = serial_port_in(port, UART_LSR);
+
+	if (!(lsr & UART_LSR_DR)) {
+		status = NO_POLL_CHAR;
+		goto out;
+	}
+
+	status = serial_port_in(port, UART_RX);
+out:
+	serial8250_rpm_put(up);
+	return status;
+}
+
+
+static void serial8250_put_poll_char(struct uart_port *port,
+			 unsigned char c)
+{
+	unsigned int ier;
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	serial8250_rpm_get(up);
+	/*
+	 *	First save the IER then disable the interrupts
+	 */
+	ier = serial_port_in(port, UART_IER);
+	if (up->capabilities & UART_CAP_UUE)
+		serial_port_out(port, UART_IER, UART_IER_UUE);
+	else
+		serial_port_out(port, UART_IER, 0);
+
+	wait_for_xmitr(up, BOTH_EMPTY);
+	/*
+	 *	Send the character out.
+	 */
+	serial_port_out(port, UART_TX, c);
+
+	/*
+	 *	Finally, wait for transmitter to become empty
+	 *	and restore the IER
+	 */
+	wait_for_xmitr(up, BOTH_EMPTY);
+	serial_port_out(port, UART_IER, ier);
+	serial8250_rpm_put(up);
+}
+
+#endif /* CONFIG_CONSOLE_POLL */
+
+int serial8250_do_startup(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned long flags;
+	unsigned char lsr, iir;
+	int retval;
+
+	if (port->type == PORT_8250_CIR)
+		return -ENODEV;
+
+	if (!port->fifosize)
+		port->fifosize = uart_config[port->type].fifo_size;
+	if (!up->tx_loadsz)
+		up->tx_loadsz = uart_config[port->type].tx_loadsz;
+	if (!up->capabilities)
+		up->capabilities = uart_config[port->type].flags;
+	up->mcr = 0;
+
+	if (port->iotype != up->cur_iotype)
+		set_io_from_upio(port);
+
+	serial8250_rpm_get(up);
+	if (port->type == PORT_16C950) {
+		/* Wake up and initialize UART */
+		up->acr = 0;
+		serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
+		serial_port_out(port, UART_EFR, UART_EFR_ECB);
+		serial_port_out(port, UART_IER, 0);
+		serial_port_out(port, UART_LCR, 0);
+		serial_icr_write(up, UART_CSR, 0); /* Reset the UART */
+		serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
+		serial_port_out(port, UART_EFR, UART_EFR_ECB);
+		serial_port_out(port, UART_LCR, 0);
+	}
+
+#ifdef CONFIG_SERIAL_8250_RSA
+	/*
+	 * If this is an RSA port, see if we can kick it up to the
+	 * higher speed clock.
+	 */
+	enable_rsa(up);
+#endif
+	/*
+	 * Clear the FIFO buffers and disable them.
+	 * (they will be reenabled in set_termios())
+	 */
+	serial8250_clear_fifos(up);
+
+	/*
+	 * Clear the interrupt registers.
+	 */
+	serial_port_in(port, UART_LSR);
+	serial_port_in(port, UART_RX);
+	serial_port_in(port, UART_IIR);
+	serial_port_in(port, UART_MSR);
+
+	/*
+	 * At this point, there's no way the LSR could still be 0xff;
+	 * if it is, then bail out, because there's likely no UART
+	 * here.
+	 */
+	if (!(port->flags & UPF_BUGGY_UART) &&
+	    (serial_port_in(port, UART_LSR) == 0xff)) {
+		printk_ratelimited(KERN_INFO "ttyS%d: LSR safety check engaged!\n",
+				   serial_index(port));
+		retval = -ENODEV;
+		goto out;
+	}
+
+	/*
+	 * For a XR16C850, we need to set the trigger levels
+	 */
+	if (port->type == PORT_16850) {
+		unsigned char fctr;
+
+		serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
+
+		fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX);
+		serial_port_out(port, UART_FCTR,
+				fctr | UART_FCTR_TRGD | UART_FCTR_RX);
+		serial_port_out(port, UART_TRG, UART_TRG_96);
+		serial_port_out(port, UART_FCTR,
+				fctr | UART_FCTR_TRGD | UART_FCTR_TX);
+		serial_port_out(port, UART_TRG, UART_TRG_96);
+
+		serial_port_out(port, UART_LCR, 0);
+	}
+
+	if (port->irq) {
+		unsigned char iir1;
+		/*
+		 * Test for UARTs that do not reassert THRE when the
+		 * transmitter is idle and the interrupt has already
+		 * been cleared.  Real 16550s should always reassert
+		 * this interrupt whenever the transmitter is idle and
+		 * the interrupt is enabled.  Delays are necessary to
+		 * allow register changes to become visible.
+		 */
+		spin_lock_irqsave(&port->lock, flags);
+		if (up->port.irqflags & IRQF_SHARED)
+			disable_irq_nosync(port->irq);
+
+		wait_for_xmitr(up, UART_LSR_THRE);
+		serial_port_out_sync(port, UART_IER, UART_IER_THRI);
+		udelay(1); /* allow THRE to set */
+		iir1 = serial_port_in(port, UART_IIR);
+		serial_port_out(port, UART_IER, 0);
+		serial_port_out_sync(port, UART_IER, UART_IER_THRI);
+		udelay(1); /* allow a working UART time to re-assert THRE */
+		iir = serial_port_in(port, UART_IIR);
+		serial_port_out(port, UART_IER, 0);
+
+		if (port->irqflags & IRQF_SHARED)
+			enable_irq(port->irq);
+		spin_unlock_irqrestore(&port->lock, flags);
+
+		/*
+		 * If the interrupt is not reasserted, or we otherwise
+		 * don't trust the iir, setup a timer to kick the UART
+		 * on a regular basis.
+		 */
+		if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) ||
+		    up->port.flags & UPF_BUG_THRE) {
+			up->bugs |= UART_BUG_THRE;
+		}
+	}
+
+	retval = up->ops->setup_irq(up);
+	if (retval)
+		goto out;
+
+	/*
+	 * Now, initialize the UART
+	 */
+	serial_port_out(port, UART_LCR, UART_LCR_WLEN8);
+
+	spin_lock_irqsave(&port->lock, flags);
+	if (up->port.flags & UPF_FOURPORT) {
+		if (!up->port.irq)
+			up->port.mctrl |= TIOCM_OUT1;
+	} else
+		/*
+		 * Most PC uarts need OUT2 raised to enable interrupts.
+		 */
+		if (port->irq)
+			up->port.mctrl |= TIOCM_OUT2;
+
+	serial8250_set_mctrl(port, port->mctrl);
+
+	/* Serial over Lan (SoL) hack:
+	   Intel 8257x Gigabit ethernet chips have a
+	   16550 emulation, to be used for Serial Over Lan.
+	   Those chips take a longer time than a normal
+	   serial device to signalize that a transmission
+	   data was queued. Due to that, the above test generally
+	   fails. One solution would be to delay the reading of
+	   iir. However, this is not reliable, since the timeout
+	   is variable. So, let's just don't test if we receive
+	   TX irq. This way, we'll never enable UART_BUG_TXEN.
+	 */
+	if (up->port.flags & UPF_NO_TXEN_TEST)
+		goto dont_test_tx_en;
+
+	/*
+	 * Do a quick test to see if we receive an
+	 * interrupt when we enable the TX irq.
+	 */
+	serial_port_out(port, UART_IER, UART_IER_THRI);
+	lsr = serial_port_in(port, UART_LSR);
+	iir = serial_port_in(port, UART_IIR);
+	serial_port_out(port, UART_IER, 0);
+
+	if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) {
+		if (!(up->bugs & UART_BUG_TXEN)) {
+			up->bugs |= UART_BUG_TXEN;
+			pr_debug("ttyS%d - enabling bad tx status workarounds\n",
+				 serial_index(port));
+		}
+	} else {
+		up->bugs &= ~UART_BUG_TXEN;
+	}
+
+dont_test_tx_en:
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	/*
+	 * Clear the interrupt registers again for luck, and clear the
+	 * saved flags to avoid getting false values from polling
+	 * routines or the previous session.
+	 */
+	serial_port_in(port, UART_LSR);
+	serial_port_in(port, UART_RX);
+	serial_port_in(port, UART_IIR);
+	serial_port_in(port, UART_MSR);
+	up->lsr_saved_flags = 0;
+	up->msr_saved_flags = 0;
+
+	/*
+	 * Request DMA channels for both RX and TX.
+	 */
+	if (up->dma) {
+		retval = serial8250_request_dma(up);
+		if (retval) {
+			pr_warn_ratelimited("ttyS%d - failed to request DMA\n",
+					    serial_index(port));
+			up->dma = NULL;
+		}
+	}
+
+	/*
+	 * Finally, enable interrupts.  Note: Modem status interrupts
+	 * are set via set_termios(), which will be occurring imminently
+	 * anyway, so we don't enable them here.
+	 */
+	up->ier = UART_IER_RLSI | UART_IER_RDI;
+	serial_port_out(port, UART_IER, up->ier);
+
+	if (port->flags & UPF_FOURPORT) {
+		unsigned int icp;
+		/*
+		 * Enable interrupts on the AST Fourport board
+		 */
+		icp = (port->iobase & 0xfe0) | 0x01f;
+		outb_p(0x80, icp);
+		inb_p(icp);
+	}
+	retval = 0;
+out:
+	serial8250_rpm_put(up);
+	return retval;
+}
+EXPORT_SYMBOL_GPL(serial8250_do_startup);
+
+static int serial8250_startup(struct uart_port *port)
+{
+	if (port->startup)
+		return port->startup(port);
+	return serial8250_do_startup(port);
+}
+
+void serial8250_do_shutdown(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned long flags;
+
+	serial8250_rpm_get(up);
+	/*
+	 * Disable interrupts from this port
+	 */
+	up->ier = 0;
+	serial_port_out(port, UART_IER, 0);
+
+	if (up->dma)
+		serial8250_release_dma(up);
+
+	spin_lock_irqsave(&port->lock, flags);
+	if (port->flags & UPF_FOURPORT) {
+		/* reset interrupts on the AST Fourport board */
+		inb((port->iobase & 0xfe0) | 0x1f);
+		port->mctrl |= TIOCM_OUT1;
+	} else
+		port->mctrl &= ~TIOCM_OUT2;
+
+	serial8250_set_mctrl(port, port->mctrl);
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	/*
+	 * Disable break condition and FIFOs
+	 */
+	serial_port_out(port, UART_LCR,
+			serial_port_in(port, UART_LCR) & ~UART_LCR_SBC);
+	serial8250_clear_fifos(up);
+
+#ifdef CONFIG_SERIAL_8250_RSA
+	/*
+	 * Reset the RSA board back to 115kbps compat mode.
+	 */
+	disable_rsa(up);
+#endif
+
+	/*
+	 * Read data port to reset things, and then unlink from
+	 * the IRQ chain.
+	 */
+	serial_port_in(port, UART_RX);
+	serial8250_rpm_put(up);
+
+	up->ops->release_irq(up);
+}
+EXPORT_SYMBOL_GPL(serial8250_do_shutdown);
+
+static void serial8250_shutdown(struct uart_port *port)
+{
+	if (port->shutdown)
+		port->shutdown(port);
+	else
+		serial8250_do_shutdown(port);
+}
+
+/*
+ * XR17V35x UARTs have an extra fractional divisor register (DLD)
+ * Calculate divisor with extra 4-bit fractional portion
+ */
+static unsigned int xr17v35x_get_divisor(struct uart_8250_port *up,
+					 unsigned int baud,
+					 unsigned int *frac)
+{
+	struct uart_port *port = &up->port;
+	unsigned int quot_16;
+
+	quot_16 = DIV_ROUND_CLOSEST(port->uartclk, baud);
+	*frac = quot_16 & 0x0f;
+
+	return quot_16 >> 4;
+}
+
+static unsigned int serial8250_get_divisor(struct uart_8250_port *up,
+					   unsigned int baud,
+					   unsigned int *frac)
+{
+	struct uart_port *port = &up->port;
+	unsigned int quot;
+
+	/*
+	 * Handle magic divisors for baud rates above baud_base on
+	 * SMSC SuperIO chips.
+	 *
+	 */
+	if ((port->flags & UPF_MAGIC_MULTIPLIER) &&
+	    baud == (port->uartclk/4))
+		quot = 0x8001;
+	else if ((port->flags & UPF_MAGIC_MULTIPLIER) &&
+		 baud == (port->uartclk/8))
+		quot = 0x8002;
+	else if (up->port.type == PORT_XR17V35X)
+		quot = xr17v35x_get_divisor(up, baud, frac);
+	else
+		quot = uart_get_divisor(port, baud);
+
+	/*
+	 * Oxford Semi 952 rev B workaround
+	 */
+	if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0)
+		quot++;
+
+	return quot;
+}
+
+static unsigned char serial8250_compute_lcr(struct uart_8250_port *up,
+					    tcflag_t c_cflag)
+{
+	unsigned char cval;
+
+	switch (c_cflag & CSIZE) {
+	case CS5:
+		cval = UART_LCR_WLEN5;
+		break;
+	case CS6:
+		cval = UART_LCR_WLEN6;
+		break;
+	case CS7:
+		cval = UART_LCR_WLEN7;
+		break;
+	default:
+	case CS8:
+		cval = UART_LCR_WLEN8;
+		break;
+	}
+
+	if (c_cflag & CSTOPB)
+		cval |= UART_LCR_STOP;
+	if (c_cflag & PARENB) {
+		cval |= UART_LCR_PARITY;
+		if (up->bugs & UART_BUG_PARITY)
+			up->fifo_bug = true;
+	}
+	if (!(c_cflag & PARODD))
+		cval |= UART_LCR_EPAR;
+#ifdef CMSPAR
+	if (c_cflag & CMSPAR)
+		cval |= UART_LCR_SPAR;
+#endif
+
+	return cval;
+}
+
+static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
+			    unsigned int quot, unsigned int quot_frac)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	/* Workaround to enable 115200 baud on OMAP1510 internal ports */
+	if (is_omap1510_8250(up)) {
+		if (baud == 115200) {
+			quot = 1;
+			serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1);
+		} else
+			serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0);
+	}
+
+	/*
+	 * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2,
+	 * otherwise just set DLAB
+	 */
+	if (up->capabilities & UART_NATSEMI)
+		serial_port_out(port, UART_LCR, 0xe0);
+	else
+		serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB);
+
+	serial_dl_write(up, quot);
+
+	/* XR17V35x UARTs have an extra fractional divisor register (DLD) */
+	if (up->port.type == PORT_XR17V35X)
+		serial_port_out(port, 0x2, quot_frac);
+}
+
+void
+serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
+		          struct ktermios *old)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	unsigned char cval;
+	unsigned long flags;
+	unsigned int baud, quot, frac = 0;
+
+	cval = serial8250_compute_lcr(up, termios->c_cflag);
+
+	/*
+	 * Ask the core to calculate the divisor for us.
+	 */
+	baud = uart_get_baud_rate(port, termios, old,
+				  port->uartclk / 16 / 0xffff,
+				  port->uartclk / 16);
+	quot = serial8250_get_divisor(up, baud, &frac);
+
+	/*
+	 * Ok, we're now changing the port state.  Do it with
+	 * interrupts disabled.
+	 */
+	serial8250_rpm_get(up);
+	spin_lock_irqsave(&port->lock, flags);
+
+	up->lcr = cval;					/* Save computed LCR */
+
+	if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) {
+		/* NOTE: If fifo_bug is not set, a user can set RX_trigger. */
+		if ((baud < 2400 && !up->dma) || up->fifo_bug) {
+			up->fcr &= ~UART_FCR_TRIGGER_MASK;
+			up->fcr |= UART_FCR_TRIGGER_1;
+		}
+	}
+
+	/*
+	 * MCR-based auto flow control.  When AFE is enabled, RTS will be
+	 * deasserted when the receive FIFO contains more characters than
+	 * the trigger, or the MCR RTS bit is cleared.  In the case where
+	 * the remote UART is not using CTS auto flow control, we must
+	 * have sufficient FIFO entries for the latency of the remote
+	 * UART to respond.  IOW, at least 32 bytes of FIFO.
+	 */
+	if (up->capabilities & UART_CAP_AFE && port->fifosize >= 32) {
+		up->mcr &= ~UART_MCR_AFE;
+		if (termios->c_cflag & CRTSCTS)
+			up->mcr |= UART_MCR_AFE;
+	}
+
+	/*
+	 * Update the per-port timeout.
+	 */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR;
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= UART_LSR_FE | UART_LSR_PE;
+	if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK))
+		port->read_status_mask |= UART_LSR_BI;
+
+	/*
+	 * Characteres to ignore
+	 */
+	port->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE;
+	if (termios->c_iflag & IGNBRK) {
+		port->ignore_status_mask |= UART_LSR_BI;
+		/*
+		 * If we're ignoring parity and break indicators,
+		 * ignore overruns too (for real raw support).
+		 */
+		if (termios->c_iflag & IGNPAR)
+			port->ignore_status_mask |= UART_LSR_OE;
+	}
+
+	/*
+	 * ignore all characters if CREAD is not set
+	 */
+	if ((termios->c_cflag & CREAD) == 0)
+		port->ignore_status_mask |= UART_LSR_DR;
+
+	/*
+	 * CTS flow control flag and modem status interrupts
+	 */
+	up->ier &= ~UART_IER_MSI;
+	if (!(up->bugs & UART_BUG_NOMSR) &&
+			UART_ENABLE_MS(&up->port, termios->c_cflag))
+		up->ier |= UART_IER_MSI;
+	if (up->capabilities & UART_CAP_UUE)
+		up->ier |= UART_IER_UUE;
+	if (up->capabilities & UART_CAP_RTOIE)
+		up->ier |= UART_IER_RTOIE;
+
+	serial_port_out(port, UART_IER, up->ier);
+
+	if (up->capabilities & UART_CAP_EFR) {
+		unsigned char efr = 0;
+		/*
+		 * TI16C752/Startech hardware flow control.  FIXME:
+		 * - TI16C752 requires control thresholds to be set.
+		 * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled.
+		 */
+		if (termios->c_cflag & CRTSCTS)
+			efr |= UART_EFR_CTS;
+
+		serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
+		if (port->flags & UPF_EXAR_EFR)
+			serial_port_out(port, UART_XR_EFR, efr);
+		else
+			serial_port_out(port, UART_EFR, efr);
+	}
+
+	serial8250_set_divisor(port, baud, quot, frac);
+
+	/*
+	 * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR
+	 * is written without DLAB set, this mode will be disabled.
+	 */
+	if (port->type == PORT_16750)
+		serial_port_out(port, UART_FCR, up->fcr);
+
+	serial_port_out(port, UART_LCR, up->lcr);	/* reset DLAB */
+	if (port->type != PORT_16750) {
+		/* emulated UARTs (Lucent Venus 167x) need two steps */
+		if (up->fcr & UART_FCR_ENABLE_FIFO)
+			serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO);
+		serial_port_out(port, UART_FCR, up->fcr);	/* set fcr */
+	}
+	serial8250_set_mctrl(port, port->mctrl);
+	spin_unlock_irqrestore(&port->lock, flags);
+	serial8250_rpm_put(up);
+
+	/* Don't rewrite B0 */
+	if (tty_termios_baud_rate(termios))
+		tty_termios_encode_baud_rate(termios, baud, baud);
+}
+EXPORT_SYMBOL(serial8250_do_set_termios);
+
+static void
+serial8250_set_termios(struct uart_port *port, struct ktermios *termios,
+		       struct ktermios *old)
+{
+	if (port->set_termios)
+		port->set_termios(port, termios, old);
+	else
+		serial8250_do_set_termios(port, termios, old);
+}
+
+static void
+serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios)
+{
+	if (termios->c_line == N_PPS) {
+		port->flags |= UPF_HARDPPS_CD;
+		spin_lock_irq(&port->lock);
+		serial8250_enable_ms(port);
+		spin_unlock_irq(&port->lock);
+	} else {
+		port->flags &= ~UPF_HARDPPS_CD;
+		if (!UART_ENABLE_MS(port, termios->c_cflag)) {
+			spin_lock_irq(&port->lock);
+			serial8250_disable_ms(port);
+			spin_unlock_irq(&port->lock);
+		}
+	}
+}
+
+
+void serial8250_do_pm(struct uart_port *port, unsigned int state,
+		      unsigned int oldstate)
+{
+	struct uart_8250_port *p = up_to_u8250p(port);
+
+	serial8250_set_sleep(p, state != 0);
+}
+EXPORT_SYMBOL(serial8250_do_pm);
+
+static void
+serial8250_pm(struct uart_port *port, unsigned int state,
+	      unsigned int oldstate)
+{
+	if (port->pm)
+		port->pm(port, state, oldstate);
+	else
+		serial8250_do_pm(port, state, oldstate);
+}
+
+static unsigned int serial8250_port_size(struct uart_8250_port *pt)
+{
+	if (pt->port.mapsize)
+		return pt->port.mapsize;
+	if (pt->port.iotype == UPIO_AU) {
+		if (pt->port.type == PORT_RT2880)
+			return 0x100;
+		return 0x1000;
+	}
+	if (is_omap1_8250(pt))
+		return 0x16 << pt->port.regshift;
+
+	return 8 << pt->port.regshift;
+}
+
+/*
+ * Resource handling.
+ */
+static int serial8250_request_std_resource(struct uart_8250_port *up)
+{
+	unsigned int size = serial8250_port_size(up);
+	struct uart_port *port = &up->port;
+	int ret = 0;
+
+	switch (port->iotype) {
+	case UPIO_AU:
+	case UPIO_TSI:
+	case UPIO_MEM32:
+	case UPIO_MEM32BE:
+	case UPIO_MEM:
+		if (!port->mapbase)
+			break;
+
+		if (!request_mem_region(port->mapbase, size, "serial")) {
+			ret = -EBUSY;
+			break;
+		}
+
+		if (port->flags & UPF_IOREMAP) {
+			port->membase = ioremap_nocache(port->mapbase, size);
+			if (!port->membase) {
+				release_mem_region(port->mapbase, size);
+				ret = -ENOMEM;
+			}
+		}
+		break;
+
+	case UPIO_HUB6:
+	case UPIO_PORT:
+		if (!request_region(port->iobase, size, "serial"))
+			ret = -EBUSY;
+		break;
+	}
+	return ret;
+}
+
+static void serial8250_release_std_resource(struct uart_8250_port *up)
+{
+	unsigned int size = serial8250_port_size(up);
+	struct uart_port *port = &up->port;
+
+	switch (port->iotype) {
+	case UPIO_AU:
+	case UPIO_TSI:
+	case UPIO_MEM32:
+	case UPIO_MEM32BE:
+	case UPIO_MEM:
+		if (!port->mapbase)
+			break;
+
+		if (port->flags & UPF_IOREMAP) {
+			iounmap(port->membase);
+			port->membase = NULL;
+		}
+
+		release_mem_region(port->mapbase, size);
+		break;
+
+	case UPIO_HUB6:
+	case UPIO_PORT:
+		release_region(port->iobase, size);
+		break;
+	}
+}
+
+static void serial8250_release_port(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	serial8250_release_std_resource(up);
+}
+
+static int serial8250_request_port(struct uart_port *port)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	int ret;
+
+	if (port->type == PORT_8250_CIR)
+		return -ENODEV;
+
+	ret = serial8250_request_std_resource(up);
+
+	return ret;
+}
+
+static int fcr_get_rxtrig_bytes(struct uart_8250_port *up)
+{
+	const struct serial8250_config *conf_type = &uart_config[up->port.type];
+	unsigned char bytes;
+
+	bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)];
+
+	return bytes ? bytes : -EOPNOTSUPP;
+}
+
+static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes)
+{
+	const struct serial8250_config *conf_type = &uart_config[up->port.type];
+	int i;
+
+	if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)])
+		return -EOPNOTSUPP;
+
+	for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) {
+		if (bytes < conf_type->rxtrig_bytes[i])
+			/* Use the nearest lower value */
+			return (--i) << UART_FCR_R_TRIG_SHIFT;
+	}
+
+	return UART_FCR_R_TRIG_11;
+}
+
+static int do_get_rxtrig(struct tty_port *port)
+{
+	struct uart_state *state = container_of(port, struct uart_state, port);
+	struct uart_port *uport = state->uart_port;
+	struct uart_8250_port *up =
+		container_of(uport, struct uart_8250_port, port);
+
+	if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1)
+		return -EINVAL;
+
+	return fcr_get_rxtrig_bytes(up);
+}
+
+static int do_serial8250_get_rxtrig(struct tty_port *port)
+{
+	int rxtrig_bytes;
+
+	mutex_lock(&port->mutex);
+	rxtrig_bytes = do_get_rxtrig(port);
+	mutex_unlock(&port->mutex);
+
+	return rxtrig_bytes;
+}
+
+static ssize_t serial8250_get_attr_rx_trig_bytes(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	struct tty_port *port = dev_get_drvdata(dev);
+	int rxtrig_bytes;
+
+	rxtrig_bytes = do_serial8250_get_rxtrig(port);
+	if (rxtrig_bytes < 0)
+		return rxtrig_bytes;
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", rxtrig_bytes);
+}
+
+static int do_set_rxtrig(struct tty_port *port, unsigned char bytes)
+{
+	struct uart_state *state = container_of(port, struct uart_state, port);
+	struct uart_port *uport = state->uart_port;
+	struct uart_8250_port *up =
+		container_of(uport, struct uart_8250_port, port);
+	int rxtrig;
+
+	if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1 ||
+	    up->fifo_bug)
+		return -EINVAL;
+
+	rxtrig = bytes_to_fcr_rxtrig(up, bytes);
+	if (rxtrig < 0)
+		return rxtrig;
+
+	serial8250_clear_fifos(up);
+	up->fcr &= ~UART_FCR_TRIGGER_MASK;
+	up->fcr |= (unsigned char)rxtrig;
+	serial_out(up, UART_FCR, up->fcr);
+	return 0;
+}
+
+static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes)
+{
+	int ret;
+
+	mutex_lock(&port->mutex);
+	ret = do_set_rxtrig(port, bytes);
+	mutex_unlock(&port->mutex);
+
+	return ret;
+}
+
+static ssize_t serial8250_set_attr_rx_trig_bytes(struct device *dev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct tty_port *port = dev_get_drvdata(dev);
+	unsigned char bytes;
+	int ret;
+
+	if (!count)
+		return -EINVAL;
+
+	ret = kstrtou8(buf, 10, &bytes);
+	if (ret < 0)
+		return ret;
+
+	ret = do_serial8250_set_rxtrig(port, bytes);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR(rx_trig_bytes, S_IRUSR | S_IWUSR | S_IRGRP,
+		   serial8250_get_attr_rx_trig_bytes,
+		   serial8250_set_attr_rx_trig_bytes);
+
+static struct attribute *serial8250_dev_attrs[] = {
+	&dev_attr_rx_trig_bytes.attr,
+	NULL,
+	};
+
+static struct attribute_group serial8250_dev_attr_group = {
+	.attrs = serial8250_dev_attrs,
+	};
+
+static void register_dev_spec_attr_grp(struct uart_8250_port *up)
+{
+	const struct serial8250_config *conf_type = &uart_config[up->port.type];
+
+	if (conf_type->rxtrig_bytes[0])
+		up->port.attr_group = &serial8250_dev_attr_group;
+}
+
+static void serial8250_config_port(struct uart_port *port, int flags)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+	int ret;
+
+	if (port->type == PORT_8250_CIR)
+		return;
+
+	/*
+	 * Find the region that we can probe for.  This in turn
+	 * tells us whether we can probe for the type of port.
+	 */
+	ret = serial8250_request_std_resource(up);
+	if (ret < 0)
+		return;
+
+	if (port->iotype != up->cur_iotype)
+		set_io_from_upio(port);
+
+	if (flags & UART_CONFIG_TYPE)
+		autoconfig(up);
+
+	/* if access method is AU, it is a 16550 with a quirk */
+	if (port->type == PORT_16550A && port->iotype == UPIO_AU)
+		up->bugs |= UART_BUG_NOMSR;
+
+	/* HW bugs may trigger IRQ while IIR == NO_INT */
+	if (port->type == PORT_TEGRA)
+		up->bugs |= UART_BUG_NOMSR;
+
+	if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ)
+		autoconfig_irq(up);
+
+	if (port->type == PORT_UNKNOWN)
+		serial8250_release_std_resource(up);
+
+	/* Fixme: probably not the best place for this */
+	if ((port->type == PORT_XR17V35X) ||
+	   (port->type == PORT_XR17D15X))
+		port->handle_irq = exar_handle_irq;
+
+	register_dev_spec_attr_grp(up);
+	up->fcr = uart_config[up->port.type].fcr;
+}
+
+static int
+serial8250_verify_port(struct uart_port *port, struct serial_struct *ser)
+{
+	if (ser->irq >= nr_irqs || ser->irq < 0 ||
+	    ser->baud_base < 9600 || ser->type < PORT_UNKNOWN ||
+	    ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS ||
+	    ser->type == PORT_STARTECH)
+		return -EINVAL;
+	return 0;
+}
+
+static const char *
+serial8250_type(struct uart_port *port)
+{
+	int type = port->type;
+
+	if (type >= ARRAY_SIZE(uart_config))
+		type = 0;
+	return uart_config[type].name;
+}
+
+static const struct uart_ops serial8250_pops = {
+	.tx_empty	= serial8250_tx_empty,
+	.set_mctrl	= serial8250_set_mctrl,
+	.get_mctrl	= serial8250_get_mctrl,
+	.stop_tx	= serial8250_stop_tx,
+	.start_tx	= serial8250_start_tx,
+	.throttle	= serial8250_throttle,
+	.unthrottle	= serial8250_unthrottle,
+	.stop_rx	= serial8250_stop_rx,
+	.enable_ms	= serial8250_enable_ms,
+	.break_ctl	= serial8250_break_ctl,
+	.startup	= serial8250_startup,
+	.shutdown	= serial8250_shutdown,
+	.set_termios	= serial8250_set_termios,
+	.set_ldisc	= serial8250_set_ldisc,
+	.pm		= serial8250_pm,
+	.type		= serial8250_type,
+	.release_port	= serial8250_release_port,
+	.request_port	= serial8250_request_port,
+	.config_port	= serial8250_config_port,
+	.verify_port	= serial8250_verify_port,
+#ifdef CONFIG_CONSOLE_POLL
+	.poll_get_char = serial8250_get_poll_char,
+	.poll_put_char = serial8250_put_poll_char,
+#endif
+};
+
+void serial8250_init_port(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+
+	spin_lock_init(&port->lock);
+	port->ops = &serial8250_pops;
+
+	up->cur_iotype = 0xFF;
+}
+EXPORT_SYMBOL_GPL(serial8250_init_port);
+
+void serial8250_set_defaults(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+
+	if (up->port.flags & UPF_FIXED_TYPE) {
+		unsigned int type = up->port.type;
+
+		if (!up->port.fifosize)
+			up->port.fifosize = uart_config[type].fifo_size;
+		if (!up->tx_loadsz)
+			up->tx_loadsz = uart_config[type].tx_loadsz;
+		if (!up->capabilities)
+			up->capabilities = uart_config[type].flags;
+	}
+
+	set_io_from_upio(port);
+
+	/* default dma handlers */
+	if (up->dma) {
+		if (!up->dma->tx_dma)
+			up->dma->tx_dma = serial8250_tx_dma;
+		if (!up->dma->rx_dma)
+			up->dma->rx_dma = serial8250_rx_dma;
+	}
+}
+EXPORT_SYMBOL_GPL(serial8250_set_defaults);
+
+#ifdef CONFIG_SERIAL_8250_CONSOLE
+
+static void serial8250_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	wait_for_xmitr(up, UART_LSR_THRE);
+	serial_port_out(port, UART_TX, ch);
+}
+
+/*
+ *	Print a string to the serial port trying not to disturb
+ *	any possible real use of the port...
+ *
+ *	The console_lock must be held when we get here.
+ */
+void serial8250_console_write(struct uart_8250_port *up, const char *s,
+			      unsigned int count)
+{
+	struct uart_port *port = &up->port;
+	unsigned long flags;
+	unsigned int ier;
+	int locked = 1;
+
+	touch_nmi_watchdog();
+
+	serial8250_rpm_get(up);
+
+	if (port->sysrq)
+		locked = 0;
+	else if (oops_in_progress)
+		locked = spin_trylock_irqsave(&port->lock, flags);
+	else
+		spin_lock_irqsave(&port->lock, flags);
+
+	/*
+	 *	First save the IER then disable the interrupts
+	 */
+	ier = serial_port_in(port, UART_IER);
+
+	if (up->capabilities & UART_CAP_UUE)
+		serial_port_out(port, UART_IER, UART_IER_UUE);
+	else
+		serial_port_out(port, UART_IER, 0);
+
+	/* check scratch reg to see if port powered off during system sleep */
+	if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
+		struct ktermios termios;
+		unsigned int baud, quot, frac = 0;
+
+		termios.c_cflag = port->cons->cflag;
+		if (port->state->port.tty && termios.c_cflag == 0)
+			termios.c_cflag = port->state->port.tty->termios.c_cflag;
+
+		baud = uart_get_baud_rate(port, &termios, NULL,
+					  port->uartclk / 16 / 0xffff,
+					  port->uartclk / 16);
+		quot = serial8250_get_divisor(up, baud, &frac);
+
+		serial8250_set_divisor(port, baud, quot, frac);
+		serial_port_out(port, UART_LCR, up->lcr);
+		serial_port_out(port, UART_MCR, UART_MCR_DTR | UART_MCR_RTS);
+
+		up->canary = 0;
+	}
+
+	uart_console_write(port, s, count, serial8250_console_putchar);
+
+	/*
+	 *	Finally, wait for transmitter to become empty
+	 *	and restore the IER
+	 */
+	wait_for_xmitr(up, BOTH_EMPTY);
+	serial_port_out(port, UART_IER, ier);
+
+	/*
+	 *	The receive handling will happen properly because the
+	 *	receive ready bit will still be set; it is not cleared
+	 *	on read.  However, modem control will not, we must
+	 *	call it if we have saved something in the saved flags
+	 *	while processing with interrupts off.
+	 */
+	if (up->msr_saved_flags)
+		serial8250_modem_status(up);
+
+	if (locked)
+		spin_unlock_irqrestore(&port->lock, flags);
+	serial8250_rpm_put(up);
+}
+
+static unsigned int probe_baud(struct uart_port *port)
+{
+	unsigned char lcr, dll, dlm;
+	unsigned int quot;
+
+	lcr = serial_port_in(port, UART_LCR);
+	serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB);
+	dll = serial_port_in(port, UART_DLL);
+	dlm = serial_port_in(port, UART_DLM);
+	serial_port_out(port, UART_LCR, lcr);
+
+	quot = (dlm << 8) | dll;
+	return (port->uartclk / 16) / quot;
+}
+
+int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
+{
+	int baud = 9600;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	if (!port->iobase && !port->membase)
+		return -ENODEV;
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+	else if (probe)
+		baud = probe_baud(port);
+
+	return uart_set_options(port, port->cons, baud, parity, bits, flow);
+}
+
+#endif /* CONFIG_SERIAL_8250_CONSOLE */
diff --git a/drivers/tty/serial/8250/Makefile b/drivers/tty/serial/8250/Makefile
index 706295913c34..39c6d2277570 100644
--- a/drivers/tty/serial/8250/Makefile
+++ b/drivers/tty/serial/8250/Makefile
@@ -2,10 +2,11 @@
 # Makefile for the 8250 serial device drivers.
 #
 
-obj-$(CONFIG_SERIAL_8250)		+= 8250.o
+obj-$(CONFIG_SERIAL_8250)		+= 8250.o 8250_base.o
 8250-y					:= 8250_core.o
 8250-$(CONFIG_SERIAL_8250_PNP)		+= 8250_pnp.o
-8250-$(CONFIG_SERIAL_8250_DMA)		+= 8250_dma.o
+8250_base-y				:= 8250_port.o
+8250_base-$(CONFIG_SERIAL_8250_DMA)	+= 8250_dma.o
 obj-$(CONFIG_SERIAL_8250_GSC)		+= 8250_gsc.o
 obj-$(CONFIG_SERIAL_8250_PCI)		+= 8250_pci.o
 obj-$(CONFIG_SERIAL_8250_HP300)		+= 8250_hp300.o
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index ba82c07feb95..7f156bde38d9 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -152,6 +152,11 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir);
 unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr);
 void serial8250_tx_chars(struct uart_8250_port *up);
 unsigned int serial8250_modem_status(struct uart_8250_port *up);
+void serial8250_init_port(struct uart_8250_port *up);
+void serial8250_set_defaults(struct uart_8250_port *up);
+void serial8250_console_write(struct uart_8250_port *up, const char *s,
+			      unsigned int count);
+int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
 
 extern void serial8250_set_isa_configurator(void (*v)
 					(int port, struct uart_port *up,
-- 
cgit v1.2.3-70-g09d2


From f3fb7ef3981abdca871d65e8c7d9a827225eb2ba Mon Sep 17 00:00:00 2001
From: Vineet Gupta <Vineet.Gupta1@synopsys.com>
Date: Thu, 9 Jul 2015 11:50:38 +0530
Subject: tty/early: make serial8250_early_{in,out} static again

Commit ed71871bed719 ("tty/8250_early: Turn serial_in/serial_out into
weak symbols") made these routines weak to allow platform specific
Big endian override

However recent updates to core, specifically ebc5e20082 ("serial:
of_serial: Support big-endian register accesses") and 6e63be3fee14
("serial: earlycon: Add support for big-endian MMIO accesses") means
that round about way to overide the early serial accessors is no longer
needed.

Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Kevin Cernekee <cernekee@gmail.com>
Acked-by: Noam Camus <noamc@ezchip.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_early.c | 4 ++--
 include/linux/serial_8250.h          | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c
index 771dda29a0f8..faed05f25bc2 100644
--- a/drivers/tty/serial/8250/8250_early.c
+++ b/drivers/tty/serial/8250/8250_early.c
@@ -35,7 +35,7 @@
 #include <asm/io.h>
 #include <asm/serial.h>
 
-unsigned int __weak __init serial8250_early_in(struct uart_port *port, int offset)
+static unsigned int __init serial8250_early_in(struct uart_port *port, int offset)
 {
 	switch (port->iotype) {
 	case UPIO_MEM:
@@ -51,7 +51,7 @@ unsigned int __weak __init serial8250_early_in(struct uart_port *port, int offse
 	}
 }
 
-void __weak __init serial8250_early_out(struct uart_port *port, int offset, int value)
+static void __init serial8250_early_out(struct uart_port *port, int offset, int value)
 {
 	switch (port->iotype) {
 	case UPIO_MEM:
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 7f156bde38d9..faa0e0370ce7 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -136,8 +136,6 @@ void serial8250_resume_port(int line);
 
 extern int early_serial_setup(struct uart_port *port);
 
-extern unsigned int serial8250_early_in(struct uart_port *port, int offset);
-extern void serial8250_early_out(struct uart_port *port, int offset, int value);
 extern int early_serial8250_setup(struct earlycon_device *device,
 					 const char *options);
 extern void serial8250_do_set_termios(struct uart_port *port,
-- 
cgit v1.2.3-70-g09d2


From 3fad386014ddc34513647a3e49d9fc9db2990cbc Mon Sep 17 00:00:00 2001
From: Cyrille Pitchen <cyrille.pitchen@atmel.com>
Date: Thu, 2 Jul 2015 15:18:10 +0200
Subject: tty/serial: at91: fix some macro definitions to fit coding style

This patch updates macro definitions in atmel_serial.h to fit the
80 column rule.

Please note that some deprecated comments such as "[AT91SAM9261 only]"
are removed as the corresponding bits also exist in some later chips.

Signed-off-by: Cyrille Pitchen <cyrille.pitchen@atmel.com>
Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/atmel_serial.h | 204 +++++++++++++++++++++----------------------
 1 file changed, 102 insertions(+), 102 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/atmel_serial.h b/include/linux/atmel_serial.h
index 00beddf6be20..c384c21d65f0 100644
--- a/include/linux/atmel_serial.h
+++ b/include/linux/atmel_serial.h
@@ -16,115 +16,115 @@
 #ifndef ATMEL_SERIAL_H
 #define ATMEL_SERIAL_H
 
-#define ATMEL_US_CR		0x00			/* Control Register */
-#define		ATMEL_US_RSTRX		(1 <<  2)		/* Reset Receiver */
-#define		ATMEL_US_RSTTX		(1 <<  3)		/* Reset Transmitter */
-#define		ATMEL_US_RXEN		(1 <<  4)		/* Receiver Enable */
-#define		ATMEL_US_RXDIS		(1 <<  5)		/* Receiver Disable */
-#define		ATMEL_US_TXEN		(1 <<  6)		/* Transmitter Enable */
-#define		ATMEL_US_TXDIS		(1 <<  7)		/* Transmitter Disable */
-#define		ATMEL_US_RSTSTA		(1 <<  8)		/* Reset Status Bits */
-#define		ATMEL_US_STTBRK		(1 <<  9)		/* Start Break */
-#define		ATMEL_US_STPBRK		(1 << 10)		/* Stop Break */
-#define		ATMEL_US_STTTO		(1 << 11)		/* Start Time-out */
-#define		ATMEL_US_SENDA		(1 << 12)		/* Send Address */
-#define		ATMEL_US_RSTIT		(1 << 13)		/* Reset Iterations */
-#define		ATMEL_US_RSTNACK	(1 << 14)		/* Reset Non Acknowledge */
-#define		ATMEL_US_RETTO		(1 << 15)		/* Rearm Time-out */
-#define		ATMEL_US_DTREN		(1 << 16)		/* Data Terminal Ready Enable [AT91RM9200 only] */
-#define		ATMEL_US_DTRDIS		(1 << 17)		/* Data Terminal Ready Disable [AT91RM9200 only] */
-#define		ATMEL_US_RTSEN		(1 << 18)		/* Request To Send Enable */
-#define		ATMEL_US_RTSDIS		(1 << 19)		/* Request To Send Disable */
+#define ATMEL_US_CR		0x00	/* Control Register */
+#define	ATMEL_US_RSTRX		BIT(2)	/* Reset Receiver */
+#define	ATMEL_US_RSTTX		BIT(3)	/* Reset Transmitter */
+#define	ATMEL_US_RXEN		BIT(4)	/* Receiver Enable */
+#define	ATMEL_US_RXDIS		BIT(5)	/* Receiver Disable */
+#define	ATMEL_US_TXEN		BIT(6)	/* Transmitter Enable */
+#define	ATMEL_US_TXDIS		BIT(7)	/* Transmitter Disable */
+#define	ATMEL_US_RSTSTA		BIT(8)	/* Reset Status Bits */
+#define	ATMEL_US_STTBRK		BIT(9)	/* Start Break */
+#define	ATMEL_US_STPBRK		BIT(10)	/* Stop Break */
+#define	ATMEL_US_STTTO		BIT(11)	/* Start Time-out */
+#define	ATMEL_US_SENDA		BIT(12)	/* Send Address */
+#define	ATMEL_US_RSTIT		BIT(13)	/* Reset Iterations */
+#define	ATMEL_US_RSTNACK	BIT(14)	/* Reset Non Acknowledge */
+#define	ATMEL_US_RETTO		BIT(15)	/* Rearm Time-out */
+#define	ATMEL_US_DTREN		BIT(16)	/* Data Terminal Ready Enable */
+#define	ATMEL_US_DTRDIS		BIT(17)	/* Data Terminal Ready Disable */
+#define	ATMEL_US_RTSEN		BIT(18)	/* Request To Send Enable */
+#define	ATMEL_US_RTSDIS		BIT(19)	/* Request To Send Disable */
 
-#define ATMEL_US_MR		0x04			/* Mode Register */
-#define		ATMEL_US_USMODE		(0xf <<  0)		/* Mode of the USART */
-#define			ATMEL_US_USMODE_NORMAL		0
-#define			ATMEL_US_USMODE_RS485		1
-#define			ATMEL_US_USMODE_HWHS		2
-#define			ATMEL_US_USMODE_MODEM		3
-#define			ATMEL_US_USMODE_ISO7816_T0	4
-#define			ATMEL_US_USMODE_ISO7816_T1	6
-#define			ATMEL_US_USMODE_IRDA		8
-#define		ATMEL_US_USCLKS		(3   <<  4)		/* Clock Selection */
-#define			ATMEL_US_USCLKS_MCK		(0 <<  4)
-#define			ATMEL_US_USCLKS_MCK_DIV8	(1 <<  4)
-#define			ATMEL_US_USCLKS_SCK		(3 <<  4)
-#define		ATMEL_US_CHRL		(3   <<  6)		/* Character Length */
-#define			ATMEL_US_CHRL_5			(0 <<  6)
-#define			ATMEL_US_CHRL_6			(1 <<  6)
-#define			ATMEL_US_CHRL_7			(2 <<  6)
-#define			ATMEL_US_CHRL_8			(3 <<  6)
-#define		ATMEL_US_SYNC		(1 <<  8)		/* Synchronous Mode Select */
-#define		ATMEL_US_PAR		(7 <<  9)		/* Parity Type */
-#define			ATMEL_US_PAR_EVEN		(0 <<  9)
-#define			ATMEL_US_PAR_ODD		(1 <<  9)
-#define			ATMEL_US_PAR_SPACE		(2 <<  9)
-#define			ATMEL_US_PAR_MARK		(3 <<  9)
-#define			ATMEL_US_PAR_NONE		(4 <<  9)
-#define			ATMEL_US_PAR_MULTI_DROP		(6 <<  9)
-#define		ATMEL_US_NBSTOP		(3 << 12)		/* Number of Stop Bits */
-#define			ATMEL_US_NBSTOP_1		(0 << 12)
-#define			ATMEL_US_NBSTOP_1_5		(1 << 12)
-#define			ATMEL_US_NBSTOP_2		(2 << 12)
-#define		ATMEL_US_CHMODE		(3 << 14)		/* Channel Mode */
-#define			ATMEL_US_CHMODE_NORMAL		(0 << 14)
-#define			ATMEL_US_CHMODE_ECHO		(1 << 14)
-#define			ATMEL_US_CHMODE_LOC_LOOP	(2 << 14)
-#define			ATMEL_US_CHMODE_REM_LOOP	(3 << 14)
-#define		ATMEL_US_MSBF		(1 << 16)		/* Bit Order */
-#define		ATMEL_US_MODE9		(1 << 17)		/* 9-bit Character Length */
-#define		ATMEL_US_CLKO		(1 << 18)		/* Clock Output Select */
-#define		ATMEL_US_OVER		(1 << 19)		/* Oversampling Mode */
-#define		ATMEL_US_INACK		(1 << 20)		/* Inhibit Non Acknowledge */
-#define		ATMEL_US_DSNACK		(1 << 21)		/* Disable Successive NACK */
-#define		ATMEL_US_MAX_ITER	(7 << 24)		/* Max Iterations */
-#define		ATMEL_US_FILTER		(1 << 28)		/* Infrared Receive Line Filter */
+#define ATMEL_US_MR		0x04	/* Mode Register */
+#define	ATMEL_US_USMODE		GENMASK(3, 0)	/* Mode of the USART */
+#define		ATMEL_US_USMODE_NORMAL		0
+#define		ATMEL_US_USMODE_RS485		1
+#define		ATMEL_US_USMODE_HWHS		2
+#define		ATMEL_US_USMODE_MODEM		3
+#define		ATMEL_US_USMODE_ISO7816_T0	4
+#define		ATMEL_US_USMODE_ISO7816_T1	6
+#define		ATMEL_US_USMODE_IRDA		8
+#define	ATMEL_US_USCLKS		GENMASK(5, 4)	/* Clock Selection */
+#define		ATMEL_US_USCLKS_MCK		(0 <<  4)
+#define		ATMEL_US_USCLKS_MCK_DIV8	(1 <<  4)
+#define		ATMEL_US_USCLKS_SCK		(3 <<  4)
+#define	ATMEL_US_CHRL		GENMASK(7, 6)	/* Character Length */
+#define		ATMEL_US_CHRL_5			(0 <<  6)
+#define		ATMEL_US_CHRL_6			(1 <<  6)
+#define		ATMEL_US_CHRL_7			(2 <<  6)
+#define		ATMEL_US_CHRL_8			(3 <<  6)
+#define	ATMEL_US_SYNC		BIT(8)		/* Synchronous Mode Select */
+#define	ATMEL_US_PAR		GENMASK(11, 9)	/* Parity Type */
+#define		ATMEL_US_PAR_EVEN		(0 <<  9)
+#define		ATMEL_US_PAR_ODD		(1 <<  9)
+#define		ATMEL_US_PAR_SPACE		(2 <<  9)
+#define		ATMEL_US_PAR_MARK		(3 <<  9)
+#define		ATMEL_US_PAR_NONE		(4 <<  9)
+#define		ATMEL_US_PAR_MULTI_DROP		(6 <<  9)
+#define	ATMEL_US_NBSTOP		GENMASK(13, 12)	/* Number of Stop Bits */
+#define		ATMEL_US_NBSTOP_1		(0 << 12)
+#define		ATMEL_US_NBSTOP_1_5		(1 << 12)
+#define		ATMEL_US_NBSTOP_2		(2 << 12)
+#define	ATMEL_US_CHMODE		GENMASK(15, 14)	/* Channel Mode */
+#define		ATMEL_US_CHMODE_NORMAL		(0 << 14)
+#define		ATMEL_US_CHMODE_ECHO		(1 << 14)
+#define		ATMEL_US_CHMODE_LOC_LOOP	(2 << 14)
+#define		ATMEL_US_CHMODE_REM_LOOP	(3 << 14)
+#define	ATMEL_US_MSBF		BIT(16)	/* Bit Order */
+#define	ATMEL_US_MODE9		BIT(17)	/* 9-bit Character Length */
+#define	ATMEL_US_CLKO		BIT(18)	/* Clock Output Select */
+#define	ATMEL_US_OVER		BIT(19)	/* Oversampling Mode */
+#define	ATMEL_US_INACK		BIT(20)	/* Inhibit Non Acknowledge */
+#define	ATMEL_US_DSNACK		BIT(21)	/* Disable Successive NACK */
+#define	ATMEL_US_MAX_ITER	GENMASK(26, 24)	/* Max Iterations */
+#define	ATMEL_US_FILTER		BIT(28)	/* Infrared Receive Line Filter */
 
-#define ATMEL_US_IER		0x08			/* Interrupt Enable Register */
-#define		ATMEL_US_RXRDY		(1 <<  0)		/* Receiver Ready */
-#define		ATMEL_US_TXRDY		(1 <<  1)		/* Transmitter Ready */
-#define		ATMEL_US_RXBRK		(1 <<  2)		/* Break Received / End of Break */
-#define		ATMEL_US_ENDRX		(1 <<  3)		/* End of Receiver Transfer */
-#define		ATMEL_US_ENDTX		(1 <<  4)		/* End of Transmitter Transfer */
-#define		ATMEL_US_OVRE		(1 <<  5)		/* Overrun Error */
-#define		ATMEL_US_FRAME		(1 <<  6)		/* Framing Error */
-#define		ATMEL_US_PARE		(1 <<  7)		/* Parity Error */
-#define		ATMEL_US_TIMEOUT	(1 <<  8)		/* Receiver Time-out */
-#define		ATMEL_US_TXEMPTY	(1 <<  9)		/* Transmitter Empty */
-#define		ATMEL_US_ITERATION	(1 << 10)		/* Max number of Repetitions Reached */
-#define		ATMEL_US_TXBUFE		(1 << 11)		/* Transmission Buffer Empty */
-#define		ATMEL_US_RXBUFF		(1 << 12)		/* Reception Buffer Full */
-#define		ATMEL_US_NACK		(1 << 13)		/* Non Acknowledge */
-#define		ATMEL_US_RIIC		(1 << 16)		/* Ring Indicator Input Change [AT91RM9200 only] */
-#define		ATMEL_US_DSRIC		(1 << 17)		/* Data Set Ready Input Change [AT91RM9200 only] */
-#define		ATMEL_US_DCDIC		(1 << 18)		/* Data Carrier Detect Input Change [AT91RM9200 only] */
-#define		ATMEL_US_CTSIC		(1 << 19)		/* Clear to Send Input Change */
-#define		ATMEL_US_RI		(1 << 20)		/* RI */
-#define		ATMEL_US_DSR		(1 << 21)		/* DSR */
-#define		ATMEL_US_DCD		(1 << 22)		/* DCD */
-#define		ATMEL_US_CTS		(1 << 23)		/* CTS */
+#define ATMEL_US_IER		0x08	/* Interrupt Enable Register */
+#define	ATMEL_US_RXRDY		BIT(0)	/* Receiver Ready */
+#define	ATMEL_US_TXRDY		BIT(1)	/* Transmitter Ready */
+#define	ATMEL_US_RXBRK		BIT(2)	/* Break Received / End of Break */
+#define	ATMEL_US_ENDRX		BIT(3)	/* End of Receiver Transfer */
+#define	ATMEL_US_ENDTX		BIT(4)	/* End of Transmitter Transfer */
+#define	ATMEL_US_OVRE		BIT(5)	/* Overrun Error */
+#define	ATMEL_US_FRAME		BIT(6)	/* Framing Error */
+#define	ATMEL_US_PARE		BIT(7)	/* Parity Error */
+#define	ATMEL_US_TIMEOUT	BIT(8)	/* Receiver Time-out */
+#define	ATMEL_US_TXEMPTY	BIT(9)	/* Transmitter Empty */
+#define	ATMEL_US_ITERATION	BIT(10)	/* Max number of Repetitions Reached */
+#define	ATMEL_US_TXBUFE		BIT(11)	/* Transmission Buffer Empty */
+#define	ATMEL_US_RXBUFF		BIT(12)	/* Reception Buffer Full */
+#define	ATMEL_US_NACK		BIT(13)	/* Non Acknowledge */
+#define	ATMEL_US_RIIC		BIT(16)	/* Ring Indicator Input Change */
+#define	ATMEL_US_DSRIC		BIT(17)	/* Data Set Ready Input Change */
+#define	ATMEL_US_DCDIC		BIT(18)	/* Data Carrier Detect Input Change */
+#define	ATMEL_US_CTSIC		BIT(19)	/* Clear to Send Input Change */
+#define	ATMEL_US_RI		BIT(20)	/* RI */
+#define	ATMEL_US_DSR		BIT(21)	/* DSR */
+#define	ATMEL_US_DCD		BIT(22)	/* DCD */
+#define	ATMEL_US_CTS		BIT(23)	/* CTS */
 
-#define ATMEL_US_IDR		0x0c			/* Interrupt Disable Register */
-#define ATMEL_US_IMR		0x10			/* Interrupt Mask Register */
-#define ATMEL_US_CSR		0x14			/* Channel Status Register */
-#define ATMEL_US_RHR		0x18			/* Receiver Holding Register */
-#define ATMEL_US_THR		0x1c			/* Transmitter Holding Register */
-#define		ATMEL_US_SYNH		(1 << 15)		/* Transmit/Receive Sync [AT91SAM9261 only] */
+#define ATMEL_US_IDR		0x0c	/* Interrupt Disable Register */
+#define ATMEL_US_IMR		0x10	/* Interrupt Mask Register */
+#define ATMEL_US_CSR		0x14	/* Channel Status Register */
+#define ATMEL_US_RHR		0x18	/* Receiver Holding Register */
+#define ATMEL_US_THR		0x1c	/* Transmitter Holding Register */
+#define	ATMEL_US_SYNH		BIT(15)	/* Transmit/Receive Sync */
 
-#define ATMEL_US_BRGR		0x20			/* Baud Rate Generator Register */
-#define		ATMEL_US_CD		(0xffff << 0)		/* Clock Divider */
+#define ATMEL_US_BRGR		0x20	/* Baud Rate Generator Register */
+#define	ATMEL_US_CD		GENMASK(15, 0)	/* Clock Divider */
 
-#define ATMEL_US_RTOR		0x24			/* Receiver Time-out Register */
-#define		ATMEL_US_TO		(0xffff << 0)		/* Time-out Value */
+#define ATMEL_US_RTOR		0x24	/* Receiver Time-out Register */
+#define	ATMEL_US_TO		GENMASK(15, 0)	/* Time-out Value */
 
-#define ATMEL_US_TTGR		0x28			/* Transmitter Timeguard Register */
-#define		ATMEL_US_TG		(0xff << 0)		/* Timeguard Value */
+#define ATMEL_US_TTGR		0x28	/* Transmitter Timeguard Register */
+#define	ATMEL_US_TG		GENMASK(7, 0)	/* Timeguard Value */
 
-#define ATMEL_US_FIDI		0x40			/* FI DI Ratio Register */
-#define ATMEL_US_NER		0x44			/* Number of Errors Register */
-#define ATMEL_US_IF		0x4c			/* IrDA Filter Register */
+#define ATMEL_US_FIDI		0x40	/* FI DI Ratio Register */
+#define ATMEL_US_NER		0x44	/* Number of Errors Register */
+#define ATMEL_US_IF		0x4c	/* IrDA Filter Register */
 
-#define ATMEL_US_NAME		0xf0			/* Ip Name */
-#define ATMEL_US_VERSION	0xfc			/* Ip Version */
+#define ATMEL_US_NAME		0xf0	/* Ip Name */
+#define ATMEL_US_VERSION	0xfc	/* Ip Version */
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From b5199d46817766c95ef759684658cd8e359c6d27 Mon Sep 17 00:00:00 2001
From: Cyrille Pitchen <cyrille.pitchen@atmel.com>
Date: Thu, 2 Jul 2015 15:18:12 +0200
Subject: tty/serial: at91: add support to FIFOs

Depending on the hardware, TX and RX FIFOs may be available. The RX
FIFO can avoid receive overruns, especially when DMA transfers are
not used to read data from the Receive Holding Register. For heavy
system load, The CPU is likely not be able to fetch data fast enough
from the RHR.

In addition, the RX FIFO can supersede the DMA/PDC to control the RTS
line when the Hardware Handshaking mode is enabled. Two thresholds
are to be set for that purpose:
- When the number of data in the RX FIFO crosses and becomes lower
  than or equal to the low threshold, the RTS line is set to low
  level: the remote peer is requested to send data.
- When the number of data in the RX FIFO crosses and becomes greater
  than or equal to the high threshold, the RTS line is set to high
  level: the remote peer should stop sending new data.
- low threshold <= high threshold
Once these two thresholds are set properly, this new feature is
enabled by setting the FIFO RTS Control bit of the FIFO Mode Register.

FIFOs also introduce a new multiple data mode: the USART works either
in multiple data mode or in single data (legacy) mode.

If MODE9 bit is set into the Mode Register or if USMODE is set to
either LIN_MASTER, LIN_SLAVE or LON_MODE, FIFOs operate in single
data mode. Otherwise, they operate in multiple data mode.

In this new multiple data mode, accesses to the Receive Holding
Register or Transmit Holding Register slightly change.

Since this driver implements neither the 9bit data feature (MODE9 bit
set into the Mode Register) nor LIN modes, the USART works in
multiple data mode whenever FIFOs are available and enabled. We also
assume that data are 8bit wide.

In single data mode, 32bit access CAN be used to read a single data
from RHR or write a single data into THR.
However in multiple data mode, a 32bit access to RHR now allows us to
read four consecutive data from RX FIFO. Also a 32bit access to THR
now allows to write four consecutive data into TX FIFO. So we MUST
use 8bit access whenever only one data have to be read/written at a
time.

Signed-off-by: Cyrille Pitchen <cyrille.pitchen@atmel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/atmel_serial.c | 100 +++++++++++++++++++++++++++++++++++---
 include/linux/atmel_serial.h      |  36 ++++++++++++++
 2 files changed, 130 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index e7c337de31d1..87de21f0c7a3 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c
@@ -56,6 +56,15 @@
 /* Revisit: We should calculate this based on the actual port settings */
 #define PDC_RX_TIMEOUT		(3 * 10)		/* 3 bytes */
 
+/* The minium number of data FIFOs should be able to contain */
+#define ATMEL_MIN_FIFO_SIZE	8
+/*
+ * These two offsets are substracted from the RX FIFO size to define the RTS
+ * high and low thresholds
+ */
+#define ATMEL_RTS_HIGH_OFFSET	16
+#define ATMEL_RTS_LOW_OFFSET	20
+
 #if defined(CONFIG_SERIAL_ATMEL_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
 #define SUPPORT_SYSRQ
 #endif
@@ -141,6 +150,9 @@ struct atmel_uart_port {
 	struct mctrl_gpios	*gpios;
 	int			gpio_irq[UART_GPIO_MAX];
 	unsigned int		tx_done_mask;
+	u32			fifo_size;
+	u32			rts_high;
+	u32			rts_low;
 	bool			ms_irq_enabled;
 	bool			is_usart;	/* usart or uart */
 	struct timer_list	uart_timer;	/* uart timer */
@@ -191,6 +203,16 @@ static inline void atmel_uart_writel(struct uart_port *port, u32 reg, u32 value)
 	__raw_writel(value, port->membase + reg);
 }
 
+static inline u8 atmel_uart_readb(struct uart_port *port, u32 reg)
+{
+	return __raw_readb(port->membase + reg);
+}
+
+static inline void atmel_uart_writeb(struct uart_port *port, u32 reg, u8 value)
+{
+	__raw_writeb(value, port->membase + reg);
+}
+
 #ifdef CONFIG_SERIAL_ATMEL_PDC
 static bool atmel_use_pdc_rx(struct uart_port *port)
 {
@@ -635,7 +657,7 @@ static void atmel_rx_chars(struct uart_port *port)
 
 	status = atmel_uart_readl(port, ATMEL_US_CSR);
 	while (status & ATMEL_US_RXRDY) {
-		ch = atmel_uart_readl(port, ATMEL_US_RHR);
+		ch = atmel_uart_readb(port, ATMEL_US_RHR);
 
 		/*
 		 * note that the error handling code is
@@ -686,7 +708,7 @@ static void atmel_tx_chars(struct uart_port *port)
 
 	if (port->x_char &&
 	    (atmel_uart_readl(port, ATMEL_US_CSR) & atmel_port->tx_done_mask)) {
-		atmel_uart_writel(port, ATMEL_US_THR, port->x_char);
+		atmel_uart_writeb(port, ATMEL_US_THR, port->x_char);
 		port->icount.tx++;
 		port->x_char = 0;
 	}
@@ -695,7 +717,7 @@ static void atmel_tx_chars(struct uart_port *port)
 
 	while (atmel_uart_readl(port, ATMEL_US_CSR) &
 	       atmel_port->tx_done_mask) {
-		atmel_uart_writel(port, ATMEL_US_THR, xmit->buf[xmit->tail]);
+		atmel_uart_writeb(port, ATMEL_US_THR, xmit->buf[xmit->tail]);
 		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		port->icount.tx++;
 		if (uart_circ_empty(xmit))
@@ -1796,6 +1818,29 @@ static int atmel_startup(struct uart_port *port)
 			atmel_set_ops(port);
 	}
 
+	/*
+	 * Enable FIFO when available
+	 */
+	if (atmel_port->fifo_size) {
+		unsigned int txrdym = ATMEL_US_ONE_DATA;
+		unsigned int rxrdym = ATMEL_US_ONE_DATA;
+		unsigned int fmr;
+
+		atmel_uart_writel(port, ATMEL_US_CR,
+				  ATMEL_US_FIFOEN |
+				  ATMEL_US_RXFCLR |
+				  ATMEL_US_TXFLCLR);
+
+		fmr = ATMEL_US_TXRDYM(txrdym) | ATMEL_US_RXRDYM(rxrdym);
+		if (atmel_port->rts_high &&
+		    atmel_port->rts_low)
+			fmr |=	ATMEL_US_FRTSC |
+				ATMEL_US_RXFTHRES(atmel_port->rts_high) |
+				ATMEL_US_RXFTHRES2(atmel_port->rts_low);
+
+		atmel_uart_writel(port, ATMEL_US_FMR, fmr);
+	}
+
 	/* Save current CSR for comparison in atmel_tasklet_func() */
 	atmel_port->irq_status_prev = atmel_get_lines_status(port);
 	atmel_port->irq_status = atmel_port->irq_status_prev;
@@ -2213,7 +2258,7 @@ static int atmel_poll_get_char(struct uart_port *port)
 	while (!(atmel_uart_readl(port, ATMEL_US_CSR) & ATMEL_US_RXRDY))
 		cpu_relax();
 
-	return atmel_uart_readl(port, ATMEL_US_RHR);
+	return atmel_uart_readb(port, ATMEL_US_RHR);
 }
 
 static void atmel_poll_put_char(struct uart_port *port, unsigned char ch)
@@ -2221,7 +2266,7 @@ static void atmel_poll_put_char(struct uart_port *port, unsigned char ch)
 	while (!(atmel_uart_readl(port, ATMEL_US_CSR) & ATMEL_US_TXRDY))
 		cpu_relax();
 
-	atmel_uart_writel(port, ATMEL_US_THR, ch);
+	atmel_uart_writeb(port, ATMEL_US_THR, ch);
 }
 #endif
 
@@ -2328,7 +2373,7 @@ static void atmel_console_putchar(struct uart_port *port, int ch)
 {
 	while (!(atmel_uart_readl(port, ATMEL_US_CSR) & ATMEL_US_TXRDY))
 		cpu_relax();
-	atmel_uart_writel(port, ATMEL_US_THR, ch);
+	atmel_uart_writeb(port, ATMEL_US_THR, ch);
 }
 
 /*
@@ -2603,6 +2648,48 @@ static int atmel_init_gpios(struct atmel_uart_port *p, struct device *dev)
 	return 0;
 }
 
+static void atmel_serial_probe_fifos(struct atmel_uart_port *port,
+				     struct platform_device *pdev)
+{
+	port->fifo_size = 0;
+	port->rts_low = 0;
+	port->rts_high = 0;
+
+	if (of_property_read_u32(pdev->dev.of_node,
+				 "atmel,fifo-size",
+				 &port->fifo_size))
+		return;
+
+	if (!port->fifo_size)
+		return;
+
+	if (port->fifo_size < ATMEL_MIN_FIFO_SIZE) {
+		port->fifo_size = 0;
+		dev_err(&pdev->dev, "Invalid FIFO size\n");
+		return;
+	}
+
+	/*
+	 * 0 <= rts_low <= rts_high <= fifo_size
+	 * Once their CTS line asserted by the remote peer, some x86 UARTs tend
+	 * to flush their internal TX FIFO, commonly up to 16 data, before
+	 * actually stopping to send new data. So we try to set the RTS High
+	 * Threshold to a reasonably high value respecting this 16 data
+	 * empirical rule when possible.
+	 */
+	port->rts_high = max_t(int, port->fifo_size >> 1,
+			       port->fifo_size - ATMEL_RTS_HIGH_OFFSET);
+	port->rts_low  = max_t(int, port->fifo_size >> 2,
+			       port->fifo_size - ATMEL_RTS_LOW_OFFSET);
+
+	dev_info(&pdev->dev, "Using FIFO (%u data)\n",
+		 port->fifo_size);
+	dev_dbg(&pdev->dev, "RTS High Threshold : %2u data\n",
+		port->rts_high);
+	dev_dbg(&pdev->dev, "RTS Low Threshold  : %2u data\n",
+		port->rts_low);
+}
+
 static int atmel_serial_probe(struct platform_device *pdev)
 {
 	struct atmel_uart_port *port;
@@ -2639,6 +2726,7 @@ static int atmel_serial_probe(struct platform_device *pdev)
 	port = &atmel_ports[ret];
 	port->backup_imr = 0;
 	port->uart.line = ret;
+	atmel_serial_probe_fifos(port, pdev);
 
 	spin_lock_init(&port->lock_suspended);
 
diff --git a/include/linux/atmel_serial.h b/include/linux/atmel_serial.h
index c384c21d65f0..ee696d7e8a43 100644
--- a/include/linux/atmel_serial.h
+++ b/include/linux/atmel_serial.h
@@ -35,6 +35,11 @@
 #define	ATMEL_US_DTRDIS		BIT(17)	/* Data Terminal Ready Disable */
 #define	ATMEL_US_RTSEN		BIT(18)	/* Request To Send Enable */
 #define	ATMEL_US_RTSDIS		BIT(19)	/* Request To Send Disable */
+#define	ATMEL_US_TXFCLR		BIT(24)	/* Transmit FIFO Clear */
+#define	ATMEL_US_RXFCLR		BIT(25)	/* Receive FIFO Clear */
+#define	ATMEL_US_TXFLCLR	BIT(26)	/* Transmit FIFO Lock Clear */
+#define	ATMEL_US_FIFOEN		BIT(30)	/* FIFO enable */
+#define	ATMEL_US_FIFODIS	BIT(31)	/* FIFO disable */
 
 #define ATMEL_US_MR		0x04	/* Mode Register */
 #define	ATMEL_US_USMODE		GENMASK(3, 0)	/* Mode of the USART */
@@ -124,6 +129,37 @@
 #define ATMEL_US_NER		0x44	/* Number of Errors Register */
 #define ATMEL_US_IF		0x4c	/* IrDA Filter Register */
 
+#define ATMEL_US_CMPR		0x90	/* Comparaison Register */
+#define ATMEL_US_FMR		0xa0	/* FIFO Mode Register */
+#define	ATMEL_US_TXRDYM(data)	(((data) & 0x3) << 0)	/* TX Ready Mode */
+#define	ATMEL_US_RXRDYM(data)	(((data) & 0x3) << 4)	/* RX Ready Mode */
+#define		ATMEL_US_ONE_DATA	0x0
+#define		ATMEL_US_TWO_DATA	0x1
+#define		ATMEL_US_FOUR_DATA	0x2
+#define	ATMEL_US_FRTSC		BIT(7)	/* FIFO RTS pin Control */
+#define	ATMEL_US_TXFTHRES(thr)	(((thr) & 0x3f) << 8)	/* TX FIFO Threshold */
+#define	ATMEL_US_RXFTHRES(thr)	(((thr) & 0x3f) << 16)	/* RX FIFO Threshold */
+#define	ATMEL_US_RXFTHRES2(thr)	(((thr) & 0x3f) << 24)	/* RX FIFO Threshold2 */
+
+#define ATMEL_US_FLR		0xa4	/* FIFO Level Register */
+#define	ATMEL_US_TXFL(reg)	(((reg) >> 0) & 0x3f)	/* TX FIFO Level */
+#define	ATMEL_US_RXFL(reg)	(((reg) >> 16) & 0x3f)	/* RX FIFO Level */
+
+#define ATMEL_US_FIER		0xa8	/* FIFO Interrupt Enable Register */
+#define ATMEL_US_FIDR		0xac	/* FIFO Interrupt Disable Register */
+#define ATMEL_US_FIMR		0xb0	/* FIFO Interrupt Mask Register */
+#define ATMEL_US_FESR		0xb4	/* FIFO Event Status Register */
+#define	ATMEL_US_TXFEF		BIT(0)	/* Transmit FIFO Empty Flag */
+#define	ATMEL_US_TXFFF		BIT(1)	/* Transmit FIFO Full Flag */
+#define	ATMEL_US_TXFTHF		BIT(2)	/* Transmit FIFO Threshold Flag */
+#define	ATMEL_US_RXFEF		BIT(3)	/* Receive FIFO Empty Flag */
+#define	ATMEL_US_RXFFF		BIT(4)	/* Receive FIFO Full Flag */
+#define	ATMEL_US_RXFTHF		BIT(5)	/* Receive FIFO Threshold Flag */
+#define	ATMEL_US_TXFPTEF	BIT(6)	/* Transmit FIFO Pointer Error Flag */
+#define	ATMEL_US_RXFPTEF	BIT(7)	/* Receive FIFO Pointer Error Flag */
+#define	ATMEL_US_TXFLOCK	BIT(8)	/* Transmit FIFO Lock (FESR only) */
+#define	ATMEL_US_RXFTHF2	BIT(9)	/* Receive FIFO Threshold Flag 2 */
+
 #define ATMEL_US_NAME		0xf0	/* Ip Name */
 #define ATMEL_US_VERSION	0xfc	/* Ip Version */
 
-- 
cgit v1.2.3-70-g09d2


From e2dfa3d38797058fa03478b08bab3d3c4b081615 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sun, 12 Jul 2015 22:49:08 -0400
Subject: tty: core: Add tty_debug() for printk(KERN_DEBUG) messages

Introduce tty_debug() macro to output uniform debug information for
tty core debug messages (function name and tty name).

Note: printk(KERN_DEBUG) is retained here over pr_debug() since
messages can be enabled in non-DEBUG builds.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c | 41 +++++++++++++++++------------------------
 include/linux/tty.h  |  6 ++++++
 2 files changed, 23 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 1738fcaea891..9537979c9c51 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -531,8 +531,8 @@ static void __proc_set_tty(struct tty_struct *tty)
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 	tty->session = get_pid(task_session(current));
 	if (current->signal->tty) {
-		printk(KERN_DEBUG "%s: %s: current tty %s not NULL!!\n",
-		       __func__, tty->name, current->signal->tty->name);
+		tty_debug(tty, "current tty %s not NULL!!\n",
+			  current->signal->tty->name);
 		tty_kref_put(current->signal->tty);
 	}
 	put_pid(current->signal->tty_old_pgrp);
@@ -775,7 +775,7 @@ static void do_tty_hangup(struct work_struct *work)
 void tty_hangup(struct tty_struct *tty)
 {
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s hangup...\n", tty_name(tty));
+	tty_debug(tty, "\n");
 #endif
 	schedule_work(&tty->hangup_work);
 }
@@ -794,7 +794,7 @@ EXPORT_SYMBOL(tty_hangup);
 void tty_vhangup(struct tty_struct *tty)
 {
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty));
+	tty_debug(tty, "\n")
 #endif
 	__tty_hangup(tty, 0);
 }
@@ -833,7 +833,7 @@ void tty_vhangup_self(void)
 static void tty_vhangup_session(struct tty_struct *tty)
 {
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s vhangup session...\n", tty_name(tty));
+	tty_debug(tty, "\n");
 #endif
 	__tty_hangup(tty, 1);
 }
@@ -930,7 +930,7 @@ void disassociate_ctty(int on_exit)
 		tty_kref_put(tty);
 	} else {
 #ifdef TTY_DEBUG_HANGUP
-		printk(KERN_DEBUG "%s: no current tty\n", __func__);
+		tty_debug(tty, "no current tty\n");
 #endif
 	}
 
@@ -1712,8 +1712,7 @@ static int tty_release_checks(struct tty_struct *tty, int idx)
 {
 #ifdef TTY_PARANOIA_CHECK
 	if (idx < 0 || idx >= tty->driver->num) {
-		printk(KERN_DEBUG "%s: %s: bad idx %d\n",
-				__func__, tty->name, idx);
+		tty_debug(tty, "bad idx %d\n", idx);
 		return -1;
 	}
 
@@ -1722,22 +1721,20 @@ static int tty_release_checks(struct tty_struct *tty, int idx)
 		return 0;
 
 	if (tty != tty->driver->ttys[idx]) {
-		printk(KERN_DEBUG "%s: %s: bad driver table[%d] = %p\n",
-		       __func__, tty->name, idx, tty->driver->ttys[idx]);
+		tty_debug(tty, "bad driver table[%d] = %p\n",
+			  idx, tty->driver->ttys[idx]);
 		return -1;
 	}
 	if (tty->driver->other) {
 		struct tty_struct *o_tty = tty->link;
 
 		if (o_tty != tty->driver->other->ttys[idx]) {
-			printk(KERN_DEBUG "%s: %s: bad other table[%d] = %p\n",
-			       __func__, tty->name, idx,
-			       tty->driver->other->ttys[idx]);
+			tty_debug(tty, "bad other table[%d] = %p\n",
+				  idx, tty->driver->other->ttys[idx]);
 			return -1;
 		}
 		if (o_tty->link != tty) {
-			printk(KERN_DEBUG "%s: %s: bad link = %p\n",
-			       __func__, tty->name, o_tty->link);
+			tty_debug(tty, "bad link = %p\n", o_tty->link);
 			return -1;
 		}
 	}
@@ -1792,8 +1789,7 @@ int tty_release(struct inode *inode, struct file *filp)
 	}
 
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s: %s (tty count=%d)...\n", __func__,
-			tty_name(tty), tty->count);
+	tty_debug(tty, "(tty count=%d)...\n", tty->count);
 #endif
 
 	if (tty->ops->close)
@@ -1905,7 +1901,7 @@ int tty_release(struct inode *inode, struct file *filp)
 		return 0;
 
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s: %s: final close\n", __func__, tty_name(tty));
+	tty_debug(tty, "final close\n");
 #endif
 	/*
 	 * Ask the line discipline code to release its structures
@@ -1916,8 +1912,7 @@ int tty_release(struct inode *inode, struct file *filp)
 	tty_flush_works(tty);
 
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s: %s: freeing structure...\n", __func__,
-	       tty_name(tty));
+	tty_debug(tty, "freeing structure...\n");
 #endif
 	/*
 	 * The release_tty function takes care of the details of clearing
@@ -2108,8 +2103,7 @@ retry_open:
 	    tty->driver->subtype == PTY_TYPE_MASTER)
 		noctty = 1;
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s: %s: (tty count=%d)\n", __func__, tty->name,
-	       tty->count);
+	tty_debug(tty, "(tty count=%d)\n", tty->count);
 #endif
 	if (tty->ops->open)
 		retval = tty->ops->open(tty, filp);
@@ -2119,8 +2113,7 @@ retry_open:
 
 	if (retval) {
 #ifdef TTY_DEBUG_HANGUP
-		printk(KERN_DEBUG "%s: %s: error %d, releasing...\n", __func__,
-		       tty->name, retval);
+		tty_debug(tty, "error %d, releasing...\n", retval);
 #endif
 		tty_unlock(tty); /* need to call tty_release without BTM */
 		tty_release(inode, filp);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index ad6c8913aa3e..d072ded41678 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -709,4 +709,10 @@ static inline void proc_tty_register_driver(struct tty_driver *d) {}
 static inline void proc_tty_unregister_driver(struct tty_driver *d) {}
 #endif
 
+#define tty_debug(tty, f, args...)					\
+	do {								\
+		printk(KERN_DEBUG "%s: %s: " f, __func__,		\
+		       tty_name(tty), ##args);				\
+	} while (0)
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From a3bd4f989f532694337dd30538b635d5213ab86a Mon Sep 17 00:00:00 2001
From: Dong Aisheng <aisheng.dong@freescale.com>
Date: Wed, 22 Jul 2015 20:53:09 +0800
Subject: mmc: sdhci-esdhc-imx: clear f_max in boarddata

After commit 8d86e4fcccf6 ("mmc: sdhci-esdhc-imx: Call mmc_of_parse()"),
it's not used anymore.

Signed-off-by: Dong Aisheng <aisheng.dong@freescale.com>
Reviewed-by: Johan Derycke <johan.derycke@barco.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci-esdhc-imx.c          | 7 +------
 include/linux/platform_data/mmc-esdhc-imx.h | 1 -
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index 1b0e61847e73..c6b9f6492e1a 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -581,13 +581,8 @@ static void esdhc_writeb_le(struct sdhci_host *host, u8 val, int reg)
 static unsigned int esdhc_pltfm_get_max_clock(struct sdhci_host *host)
 {
 	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
-	struct pltfm_imx_data *imx_data = pltfm_host->priv;
-	struct esdhc_platform_data *boarddata = &imx_data->boarddata;
 
-	if (boarddata->f_max && (boarddata->f_max < pltfm_host->clock))
-		return boarddata->f_max;
-	else
-		return pltfm_host->clock;
+	return pltfm_host->clock;
 }
 
 static unsigned int esdhc_pltfm_get_min_clock(struct sdhci_host *host)
diff --git a/include/linux/platform_data/mmc-esdhc-imx.h b/include/linux/platform_data/mmc-esdhc-imx.h
index 75f70f6ac137..e1571efa3f2b 100644
--- a/include/linux/platform_data/mmc-esdhc-imx.h
+++ b/include/linux/platform_data/mmc-esdhc-imx.h
@@ -43,7 +43,6 @@ struct esdhc_platform_data {
 	enum wp_types wp_type;
 	enum cd_types cd_type;
 	int max_bus_width;
-	unsigned int f_max;
 	bool support_vsel;
 	unsigned int delay_line;
 };
-- 
cgit v1.2.3-70-g09d2


From 8766018b6ef73ca124d13b0d0a06dec906726cc8 Mon Sep 17 00:00:00 2001
From: Henry Chen <henryc.chen@mediatek.com>
Date: Fri, 24 Jul 2015 13:24:41 +0800
Subject: regulator: mt6311: Add support for mt6311 regulator

Add regulator support for mt6311.
It has 2 regulaotrs - Buck and LDO, provide the related buck/ldo voltage
data to the driver, and creates the regulator_desc table. Supported
operations for Buck are enabled/disabled and voltage change, only
enabled/disabled for LDO.

Signed-off-by: Henry Chen <henryc.chen@mediatek.com>
Reviewed-by: Javier Martinez Canillas <javier@osg.samsung.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/Kconfig            |   9 ++
 drivers/regulator/Makefile           |   1 +
 drivers/regulator/mt6311-regulator.c | 180 +++++++++++++++++++++++++++++++++++
 drivers/regulator/mt6311-regulator.h |  65 +++++++++++++
 include/linux/regulator/mt6311.h     |  29 ++++++
 5 files changed, 284 insertions(+)
 create mode 100644 drivers/regulator/mt6311-regulator.c
 create mode 100644 drivers/regulator/mt6311-regulator.h
 create mode 100644 include/linux/regulator/mt6311.h

(limited to 'include/linux')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index bef3bde6971b..aab09ac499a0 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -451,6 +451,15 @@ config REGULATOR_MC13892
 	  Say y here to support the regulators found on the Freescale MC13892
 	  PMIC.
 
+config REGULATOR_MT6311
+	tristate "MediaTek MT6311 PMIC"
+	depends on I2C
+	help
+	  Say y here to select this option to enable the power regulator of
+	  MediaTek MT6311 PMIC.
+	  This driver supports the control of different power rails of device
+	  through regulator interface.
+
 config REGULATOR_MT6397
 	tristate "MediaTek MT6397 PMIC"
 	depends on MFD_MT6397
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index 91bf76267404..45e790f92715 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_REGULATOR_MAX77843) += max77843.o
 obj-$(CONFIG_REGULATOR_MC13783) += mc13783-regulator.o
 obj-$(CONFIG_REGULATOR_MC13892) += mc13892-regulator.o
 obj-$(CONFIG_REGULATOR_MC13XXX_CORE) +=  mc13xxx-regulator-core.o
+obj-$(CONFIG_REGULATOR_MT6311) += mt6311-regulator.o
 obj-$(CONFIG_REGULATOR_MT6397)	+= mt6397-regulator.o
 obj-$(CONFIG_REGULATOR_QCOM_RPM) += qcom_rpm-regulator.o
 obj-$(CONFIG_REGULATOR_QCOM_SPMI) += qcom_spmi-regulator.o
diff --git a/drivers/regulator/mt6311-regulator.c b/drivers/regulator/mt6311-regulator.c
new file mode 100644
index 000000000000..096e6202be1c
--- /dev/null
+++ b/drivers/regulator/mt6311-regulator.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ * Author: Henry Chen <henryc.chen@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/err.h>
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
+#include <linux/regulator/of_regulator.h>
+#include <linux/regulator/mt6311.h>
+#include <linux/slab.h>
+#include "mt6311-regulator.h"
+
+static const struct regmap_config mt6311_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = MT6311_FQMTR_CON4,
+};
+
+/* Default limits measured in millivolts and milliamps */
+#define MT6311_MIN_UV		600000
+#define MT6311_MAX_UV		1400000
+#define MT6311_STEP_UV		6250
+
+static const struct regulator_linear_range buck_volt_range[] = {
+	REGULATOR_LINEAR_RANGE(MT6311_MIN_UV, 0, 0x7f, MT6311_STEP_UV),
+};
+
+static struct regulator_ops mt6311_buck_ops = {
+	.list_voltage = regulator_list_voltage_linear_range,
+	.map_voltage = regulator_map_voltage_linear_range,
+	.set_voltage_sel = regulator_set_voltage_sel_regmap,
+	.get_voltage_sel = regulator_get_voltage_sel_regmap,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+};
+
+static struct regulator_ops mt6311_ldo_ops = {
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+};
+
+#define MT6311_BUCK(_id) \
+{\
+	.name = #_id,\
+	.ops = &mt6311_buck_ops,\
+	.of_match = of_match_ptr(#_id),\
+	.regulators_node = of_match_ptr("regulators"),\
+	.type = REGULATOR_VOLTAGE,\
+	.id = MT6311_ID_##_id,\
+	.n_voltages = (MT6311_MAX_UV - MT6311_MIN_UV) / MT6311_STEP_UV + 1,\
+	.min_uV = MT6311_MIN_UV,\
+	.uV_step = MT6311_STEP_UV,\
+	.owner = THIS_MODULE,\
+	.linear_ranges = buck_volt_range, \
+	.n_linear_ranges = ARRAY_SIZE(buck_volt_range), \
+	.enable_reg = MT6311_VDVFS11_CON9,\
+	.enable_mask = MT6311_PMIC_VDVFS11_EN_MASK,\
+	.vsel_reg = MT6311_VDVFS11_CON12,\
+	.vsel_mask = MT6311_PMIC_VDVFS11_VOSEL_MASK,\
+}
+
+#define MT6311_LDO(_id) \
+{\
+	.name = #_id,\
+	.ops = &mt6311_ldo_ops,\
+	.of_match = of_match_ptr(#_id),\
+	.regulators_node = of_match_ptr("regulators"),\
+	.type = REGULATOR_VOLTAGE,\
+	.id = MT6311_ID_##_id,\
+	.owner = THIS_MODULE,\
+	.enable_reg = MT6311_LDO_CON3,\
+	.enable_mask = MT6311_PMIC_RG_VBIASN_EN_MASK,\
+}
+
+static struct regulator_desc mt6311_regulators[] = {
+	MT6311_BUCK(VDVFS),
+	MT6311_LDO(VBIASN),
+};
+
+/*
+ * I2C driver interface functions
+ */
+static int mt6311_i2c_probe(struct i2c_client *i2c,
+		const struct i2c_device_id *id)
+{
+	struct regulator_config config = { };
+	struct regulator_dev *rdev;
+	struct regmap *regmap;
+	int error, i, ret;
+	unsigned int data;
+
+	regmap = devm_regmap_init_i2c(i2c, &mt6311_regmap_config);
+	if (IS_ERR(regmap)) {
+		error = PTR_ERR(regmap);
+		dev_err(&i2c->dev, "Failed to allocate register map: %d\n",
+			error);
+		return error;
+	}
+
+	ret = regmap_read(regmap, MT6311_SWCID, &data);
+	if (ret < 0) {
+		dev_err(&i2c->dev, "Failed to read DEVICE_ID reg: %d\n", ret);
+		return ret;
+	}
+
+	switch (data) {
+	case MT6311_E1_CID_CODE:
+	case MT6311_E2_CID_CODE:
+	case MT6311_E3_CID_CODE:
+		break;
+	default:
+		dev_err(&i2c->dev, "Unsupported device id = 0x%x.\n", data);
+		return -ENODEV;
+	}
+
+	for (i = 0; i < MT6311_MAX_REGULATORS; i++) {
+		config.dev = &i2c->dev;
+		config.regmap = regmap;
+
+		rdev = devm_regulator_register(&i2c->dev,
+			&mt6311_regulators[i], &config);
+		if (IS_ERR(rdev)) {
+			dev_err(&i2c->dev,
+				"Failed to register MT6311 regulator\n");
+			return PTR_ERR(rdev);
+		}
+	}
+
+	return 0;
+}
+
+static const struct i2c_device_id mt6311_i2c_id[] = {
+	{"mt6311", 0},
+	{},
+};
+MODULE_DEVICE_TABLE(i2c, mt6311_i2c_id);
+
+#ifdef CONFIG_OF
+static const struct of_device_id mt6311_dt_ids[] = {
+	{ .compatible = "mediatek,mt6311-regulator",
+	  .data = &mt6311_i2c_id[0] },
+	{},
+};
+MODULE_DEVICE_TABLE(of, mt6311_dt_ids);
+#endif
+
+static struct i2c_driver mt6311_regulator_driver = {
+	.driver = {
+		.name = "mt6311",
+		.owner = THIS_MODULE,
+		.of_match_table = of_match_ptr(mt6311_dt_ids),
+	},
+	.probe = mt6311_i2c_probe,
+	.id_table = mt6311_i2c_id,
+};
+
+module_i2c_driver(mt6311_regulator_driver);
+
+MODULE_AUTHOR("Henry Chen <henryc.chen@mediatek.com>");
+MODULE_DESCRIPTION("Regulator device driver for Mediatek MT6311");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/regulator/mt6311-regulator.h b/drivers/regulator/mt6311-regulator.h
new file mode 100644
index 000000000000..5218db46a798
--- /dev/null
+++ b/drivers/regulator/mt6311-regulator.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ * Author: Henry Chen <henryc.chen@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __MT6311_REGULATOR_H__
+#define __MT6311_REGULATOR_H__
+
+#define MT6311_SWCID              0x01
+
+#define MT6311_TOP_INT_CON        0x18
+#define MT6311_TOP_INT_MON        0x19
+
+#define MT6311_VDVFS11_CON0       0x87
+#define MT6311_VDVFS11_CON7       0x88
+#define MT6311_VDVFS11_CON8       0x89
+#define MT6311_VDVFS11_CON9       0x8A
+#define MT6311_VDVFS11_CON10      0x8B
+#define MT6311_VDVFS11_CON11      0x8C
+#define MT6311_VDVFS11_CON12      0x8D
+#define MT6311_VDVFS11_CON13      0x8E
+#define MT6311_VDVFS11_CON14      0x8F
+#define MT6311_VDVFS11_CON15      0x90
+#define MT6311_VDVFS11_CON16      0x91
+#define MT6311_VDVFS11_CON17      0x92
+#define MT6311_VDVFS11_CON18      0x93
+#define MT6311_VDVFS11_CON19      0x94
+
+#define MT6311_LDO_CON0           0xCC
+#define MT6311_LDO_OCFB0          0xCD
+#define MT6311_LDO_CON2           0xCE
+#define MT6311_LDO_CON3           0xCF
+#define MT6311_LDO_CON4           0xD0
+#define MT6311_FQMTR_CON0         0xD1
+#define MT6311_FQMTR_CON1         0xD2
+#define MT6311_FQMTR_CON2         0xD3
+#define MT6311_FQMTR_CON3         0xD4
+#define MT6311_FQMTR_CON4         0xD5
+
+#define MT6311_PMIC_RG_INT_POL_MASK                      0x1
+#define MT6311_PMIC_RG_INT_EN_MASK                       0x2
+#define MT6311_PMIC_RG_BUCK_OC_INT_STATUS_MASK           0x10
+
+#define MT6311_PMIC_VDVFS11_EN_CTRL_MASK                 0x1
+#define MT6311_PMIC_VDVFS11_VOSEL_CTRL_MASK              0x2
+#define MT6311_PMIC_VDVFS11_EN_SEL_MASK                  0x3
+#define MT6311_PMIC_VDVFS11_VOSEL_SEL_MASK               0xc
+#define MT6311_PMIC_VDVFS11_EN_MASK                      0x1
+#define MT6311_PMIC_VDVFS11_VOSEL_MASK                   0x7F
+#define MT6311_PMIC_VDVFS11_VOSEL_ON_MASK                0x7F
+#define MT6311_PMIC_VDVFS11_VOSEL_SLEEP_MASK             0x7F
+#define MT6311_PMIC_NI_VDVFS11_VOSEL_MASK                0x7F
+
+#define MT6311_PMIC_RG_VBIASN_EN_MASK                    0x1
+
+#endif
diff --git a/include/linux/regulator/mt6311.h b/include/linux/regulator/mt6311.h
new file mode 100644
index 000000000000..8473259395b6
--- /dev/null
+++ b/include/linux/regulator/mt6311.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ * Author: Henry Chen <henryc.chen@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_REGULATOR_MT6311_H
+#define __LINUX_REGULATOR_MT6311_H
+
+#define MT6311_MAX_REGULATORS	2
+
+enum {
+	MT6311_ID_VDVFS = 0,
+	MT6311_ID_VBIASN,
+};
+
+#define MT6311_E1_CID_CODE    0x10
+#define MT6311_E2_CID_CODE    0x20
+#define MT6311_E3_CID_CODE    0x30
+
+#endif /* __LINUX_REGULATOR_MT6311_H */
-- 
cgit v1.2.3-70-g09d2


From fa466c91970a0207d9384016cc7884a7f61834b6 Mon Sep 17 00:00:00 2001
From: Franklin S Cooper Jr <fcooper@ti.com>
Date: Wed, 22 Jul 2015 07:32:22 -0500
Subject: spi: davinci: Choose correct pre-scaler limit based on SOC

Currently the pre-scaler limit is incorrect. The value differs slightly
for various devices so a single value can't be used. Using the compatible
field select the correct pre-scaler limit.

Add new compatible field value for Keystone devices to support their
unique pre-scaler limit value.

Signed-off-by: Franklin S Cooper Jr <fcooper@ti.com>
Reviewed-by: Sekhar Nori <nsekhar@ti.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 .../devicetree/bindings/spi/spi-davinci.txt        |  2 +
 drivers/spi/spi-davinci.c                          | 43 ++++++++++++++++++----
 include/linux/platform_data/spi-davinci.h          |  1 +
 3 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/spi/spi-davinci.txt b/Documentation/devicetree/bindings/spi/spi-davinci.txt
index 12ecfe9e3599..d1e914adcf6e 100644
--- a/Documentation/devicetree/bindings/spi/spi-davinci.txt
+++ b/Documentation/devicetree/bindings/spi/spi-davinci.txt
@@ -12,6 +12,8 @@ Required properties:
 - compatible:
 	- "ti,dm6441-spi" for SPI used similar to that on DM644x SoC family
 	- "ti,da830-spi" for SPI used similar to that on DA8xx SoC family
+	- "ti,keystone-spi" for SPI used similar to that on Keystone2 SoC
+		family
 - reg: Offset and length of SPI controller register space
 - num-cs: Number of chip selects. This includes internal as well as
 	GPIO chip selects.
diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c
index b4605c4158f4..3cf9faa6cc3f 100644
--- a/drivers/spi/spi-davinci.c
+++ b/drivers/spi/spi-davinci.c
@@ -139,6 +139,8 @@ struct davinci_spi {
 	u32			(*get_tx)(struct davinci_spi *);
 
 	u8			*bytes_per_word;
+
+	u8			prescaler_limit;
 };
 
 static struct davinci_spi_config davinci_spi_default_cfg;
@@ -266,7 +268,7 @@ static inline int davinci_spi_get_prescale(struct davinci_spi *dspi,
 	/* Subtract 1 to match what will be programmed into SPI register. */
 	ret = DIV_ROUND_UP(clk_get_rate(dspi->clk), max_speed_hz) - 1;
 
-	if (ret < 0 || ret > 255)
+	if (ret < dspi->prescaler_limit || ret > 255)
 		return -EINVAL;
 
 	return ret;
@@ -833,13 +835,40 @@ rx_dma_failed:
 }
 
 #if defined(CONFIG_OF)
+
+/* OF SPI data structure */
+struct davinci_spi_of_data {
+	u8	version;
+	u8	prescaler_limit;
+};
+
+static const struct davinci_spi_of_data dm6441_spi_data = {
+	.version = SPI_VERSION_1,
+	.prescaler_limit = 2,
+};
+
+static const struct davinci_spi_of_data da830_spi_data = {
+	.version = SPI_VERSION_2,
+	.prescaler_limit = 2,
+};
+
+static const struct davinci_spi_of_data keystone_spi_data = {
+	.version = SPI_VERSION_1,
+	.prescaler_limit = 0,
+};
+
 static const struct of_device_id davinci_spi_of_match[] = {
 	{
 		.compatible = "ti,dm6441-spi",
+		.data = &dm6441_spi_data,
 	},
 	{
 		.compatible = "ti,da830-spi",
-		.data = (void *)SPI_VERSION_2,
+		.data = &da830_spi_data,
+	},
+	{
+		.compatible = "ti,keystone-spi",
+		.data = &keystone_spi_data,
 	},
 	{ },
 };
@@ -858,21 +887,21 @@ static int spi_davinci_get_pdata(struct platform_device *pdev,
 			struct davinci_spi *dspi)
 {
 	struct device_node *node = pdev->dev.of_node;
+	struct davinci_spi_of_data *spi_data;
 	struct davinci_spi_platform_data *pdata;
 	unsigned int num_cs, intr_line = 0;
 	const struct of_device_id *match;
 
 	pdata = &dspi->pdata;
 
-	pdata->version = SPI_VERSION_1;
 	match = of_match_device(davinci_spi_of_match, &pdev->dev);
 	if (!match)
 		return -ENODEV;
 
-	/* match data has the SPI version number for SPI_VERSION_2 */
-	if (match->data == (void *)SPI_VERSION_2)
-		pdata->version = SPI_VERSION_2;
+	spi_data = (struct davinci_spi_of_data *)match->data;
 
+	pdata->version = spi_data->version;
+	pdata->prescaler_limit = spi_data->prescaler_limit;
 	/*
 	 * default num_cs is 1 and all chipsel are internal to the chip
 	 * indicated by chip_sel being NULL or cs_gpios being NULL or
@@ -992,7 +1021,7 @@ static int davinci_spi_probe(struct platform_device *pdev)
 
 	dspi->bitbang.chipselect = davinci_spi_chipselect;
 	dspi->bitbang.setup_transfer = davinci_spi_setup_transfer;
-
+	dspi->prescaler_limit = pdata->prescaler_limit;
 	dspi->version = pdata->version;
 
 	dspi->bitbang.flags = SPI_NO_CS | SPI_LSB_FIRST | SPI_LOOP;
diff --git a/include/linux/platform_data/spi-davinci.h b/include/linux/platform_data/spi-davinci.h
index 8dc2fa47a2aa..f4edcb03c40c 100644
--- a/include/linux/platform_data/spi-davinci.h
+++ b/include/linux/platform_data/spi-davinci.h
@@ -49,6 +49,7 @@ struct davinci_spi_platform_data {
 	u8			num_chipselect;
 	u8			intr_line;
 	u8			*chip_sel;
+	u8			prescaler_limit;
 	bool			cshold_bug;
 	enum dma_event_q	dma_event_q;
 };
-- 
cgit v1.2.3-70-g09d2


From 3a003baeec246f604ed1d2e0087560d7f15edcc6 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 17 Jul 2015 14:41:54 -0700
Subject: regulator: Add over current protection (OCP) support

Some regulators can automatically shut down when they detect an
over current event. Add an op (set_over_current_protection) and a
DT property + constraint to support this capability.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/regulator/regulator.txt | 1 +
 drivers/regulator/core.c                                  | 9 +++++++++
 drivers/regulator/of_regulator.c                          | 3 +++
 include/linux/regulator/driver.h                          | 1 +
 include/linux/regulator/machine.h                         | 1 +
 5 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt
index db88feb28c03..24bd422cecd5 100644
--- a/Documentation/devicetree/bindings/regulator/regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/regulator.txt
@@ -42,6 +42,7 @@ Optional properties:
 - regulator-system-load: Load in uA present on regulator that is not captured by
   any consumer request.
 - regulator-pull-down: Enable pull down resistor when the regulator is disabled.
+- regulator-over-current-protection: Enable over current protection.
 
 Deprecated properties:
 - regulator-compatible: If a regulator chip contains multiple
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index c9f72019bd68..520413e2bca0 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1081,6 +1081,15 @@ static int set_machine_constraints(struct regulator_dev *rdev,
 		}
 	}
 
+	if (rdev->constraints->over_current_protection
+		&& ops->set_over_current_protection) {
+		ret = ops->set_over_current_protection(rdev);
+		if (ret < 0) {
+			rdev_err(rdev, "failed to set over current protection\n");
+			goto out;
+		}
+	}
+
 	print_constraints(rdev);
 	return 0;
 out:
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index b1c485b24ab2..250700c853bf 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -107,6 +107,9 @@ static void of_get_regulation_constraints(struct device_node *np,
 	if (!of_property_read_u32(np, "regulator-system-load", &pval))
 		constraints->system_load = pval;
 
+	constraints->over_current_protection = of_property_read_bool(np,
+					"regulator-over-current-protection");
+
 	for (i = 0; i < ARRAY_SIZE(regulator_states); i++) {
 		switch (i) {
 		case PM_SUSPEND_MEM:
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 4db9fbe4889d..45932228cbf5 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -148,6 +148,7 @@ struct regulator_ops {
 	int (*get_current_limit) (struct regulator_dev *);
 
 	int (*set_input_current_limit) (struct regulator_dev *, int lim_uA);
+	int (*set_over_current_protection) (struct regulator_dev *);
 
 	/* enable/disable regulator */
 	int (*enable) (struct regulator_dev *);
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index b11be1260129..a1067d0b3991 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -147,6 +147,7 @@ struct regulation_constraints {
 	unsigned ramp_disable:1; /* disable ramp delay */
 	unsigned soft_start:1;	/* ramp voltage slowly */
 	unsigned pull_down:1;	/* pull down resistor when regulator off */
+	unsigned over_current_protection:1; /* auto disable on over current */
 };
 
 /**
-- 
cgit v1.2.3-70-g09d2


From e3eea1404f5ff7a2ceb7b5e7ba412a6fd94f2935 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 24 Jul 2015 10:38:12 -0400
Subject: ftrace: Fix breakage of set_ftrace_pid

Commit 4104d326b670 ("ftrace: Remove global function list and call function
directly") simplified the ftrace code by removing the global_ops list with a
new design. But this cleanup also broke the filtering of PIDs that are added
to the set_ftrace_pid file.

Add back the proper hooks to have pid filtering working once again.

Cc: stable@vger.kernel.org # 3.16+
Reported-by: Matt Fleming <matt@console-pimps.org>
Reported-by: Richard Weinberger <richard.weinberger@gmail.com>
Tested-by: Matt Fleming <matt@console-pimps.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  3 +++
 kernel/trace/ftrace.c  | 52 +++++++++++++++++++++++++++++++++-----------------
 2 files changed, 37 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1da602982cf9..6cd8c0ee4b6f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -116,6 +116,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
  *            SAVE_REGS. If another ops with this flag set is already registered
  *            for any of the functions that this ops will be registered for, then
  *            this ops will fail to register or set_filter_ip.
+ * PID     - Is affected by set_ftrace_pid (allows filtering on those pids)
  */
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
@@ -132,6 +133,7 @@ enum {
 	FTRACE_OPS_FL_MODIFYING			= 1 << 11,
 	FTRACE_OPS_FL_ALLOC_TRAMP		= 1 << 12,
 	FTRACE_OPS_FL_IPMODIFY			= 1 << 13,
+	FTRACE_OPS_FL_PID			= 1 << 14,
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -159,6 +161,7 @@ struct ftrace_ops {
 	struct ftrace_ops		*next;
 	unsigned long			flags;
 	void				*private;
+	ftrace_func_t			saved_func;
 	int __percpu			*disabled;
 #ifdef CONFIG_DYNAMIC_FTRACE
 	int				nr_trampolines;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02bece4a99ea..eb11011b5292 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -98,6 +98,13 @@ struct ftrace_pid {
 	struct pid *pid;
 };
 
+static bool ftrace_pids_enabled(void)
+{
+	return !list_empty(&ftrace_pids);
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops);
+
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -109,7 +116,6 @@ static DEFINE_MUTEX(ftrace_lock);
 static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
 static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
-ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
 static struct ftrace_ops global_ops;
 static struct ftrace_ops control_ops;
 
@@ -183,14 +189,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
 	if (!test_tsk_trace_trace(current))
 		return;
 
-	ftrace_pid_function(ip, parent_ip, op, regs);
-}
-
-static void set_ftrace_pid_function(ftrace_func_t func)
-{
-	/* do not set ftrace_pid_function to itself! */
-	if (func != ftrace_pid_func)
-		ftrace_pid_function = func;
+	op->saved_func(ip, parent_ip, op, regs);
 }
 
 /**
@@ -202,7 +201,6 @@ static void set_ftrace_pid_function(ftrace_func_t func)
 void clear_ftrace_function(void)
 {
 	ftrace_trace_function = ftrace_stub;
-	ftrace_pid_function = ftrace_stub;
 }
 
 static void control_ops_disable_all(struct ftrace_ops *ops)
@@ -436,6 +434,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	} else
 		add_ftrace_ops(&ftrace_ops_list, ops);
 
+	/* Always save the function, and reset at unregistering */
+	ops->saved_func = ops->func;
+
+	if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+		ops->func = ftrace_pid_func;
+
 	ftrace_update_trampoline(ops);
 
 	if (ftrace_enabled)
@@ -463,15 +467,28 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	if (ftrace_enabled)
 		update_ftrace_function();
 
+	ops->func = ops->saved_func;
+
 	return 0;
 }
 
 static void ftrace_update_pid_func(void)
 {
+	bool enabled = ftrace_pids_enabled();
+	struct ftrace_ops *op;
+
 	/* Only do something if we are tracing something */
 	if (ftrace_trace_function == ftrace_stub)
 		return;
 
+	do_for_each_ftrace_op(op, ftrace_ops_list) {
+		if (op->flags & FTRACE_OPS_FL_PID) {
+			op->func = enabled ? ftrace_pid_func :
+				op->saved_func;
+			ftrace_update_trampoline(op);
+		}
+	} while_for_each_ftrace_op(op);
+
 	update_ftrace_function();
 }
 
@@ -1133,7 +1150,8 @@ static struct ftrace_ops global_ops = {
 	.local_hash.filter_hash		= EMPTY_HASH,
 	INIT_OPS_HASH(global_ops)
 	.flags				= FTRACE_OPS_FL_RECURSION_SAFE |
-					  FTRACE_OPS_FL_INITIALIZED,
+					  FTRACE_OPS_FL_INITIALIZED |
+					  FTRACE_OPS_FL_PID,
 };
 
 /*
@@ -5023,7 +5041,9 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
 
 static struct ftrace_ops global_ops = {
 	.func			= ftrace_stub,
-	.flags			= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+	.flags			= FTRACE_OPS_FL_RECURSION_SAFE |
+				  FTRACE_OPS_FL_INITIALIZED |
+				  FTRACE_OPS_FL_PID,
 };
 
 static int __init ftrace_nodyn_init(void)
@@ -5080,11 +5100,6 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
 		if (WARN_ON(tr->ops->func != ftrace_stub))
 			printk("ftrace ops had %pS for function\n",
 			       tr->ops->func);
-		/* Only the top level instance does pid tracing */
-		if (!list_empty(&ftrace_pids)) {
-			set_ftrace_pid_function(func);
-			func = ftrace_pid_func;
-		}
 	}
 	tr->ops->func = func;
 	tr->ops->private = tr;
@@ -5371,7 +5386,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos)
 {
 	mutex_lock(&ftrace_lock);
 
-	if (list_empty(&ftrace_pids) && (!*pos))
+	if (!ftrace_pids_enabled() && (!*pos))
 		return (void *) 1;
 
 	return seq_list_start(&ftrace_pids, *pos);
@@ -5610,6 +5625,7 @@ static struct ftrace_ops graph_ops = {
 	.func			= ftrace_stub,
 	.flags			= FTRACE_OPS_FL_RECURSION_SAFE |
 				   FTRACE_OPS_FL_INITIALIZED |
+				   FTRACE_OPS_FL_PID |
 				   FTRACE_OPS_FL_STUB,
 #ifdef FTRACE_GRAPH_TRAMP_ADDR
 	.trampoline		= FTRACE_GRAPH_TRAMP_ADDR,
-- 
cgit v1.2.3-70-g09d2


From e0910bace663b78c026b73bbd711a24ccf410531 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 23 Jul 2015 15:43:56 +0200
Subject: lwtunnel: export linux/lwtunnel.h to userspace

Note also that include/linux/lwtunnel.h is not needed.

CC: Thomas Graf <tgraf@suug.ch>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
Fixes: 499a24256862 ("lwtunnel: infrastructure for handling light weight tunnels like mpls")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lwtunnel.h  | 6 ------
 include/uapi/linux/Kbuild | 1 +
 2 files changed, 1 insertion(+), 6 deletions(-)
 delete mode 100644 include/linux/lwtunnel.h

(limited to 'include/linux')

diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h
deleted file mode 100644
index 97f32f8b4ae1..000000000000
--- a/include/linux/lwtunnel.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _LINUX_LWTUNNEL_H_
-#define _LINUX_LWTUNNEL_H_
-
-#include <uapi/linux/lwtunnel.h>
-
-#endif /* _LINUX_LWTUNNEL_H_ */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1ff9942718fe..aafb9937b162 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -243,6 +243,7 @@ header-y += limits.h
 header-y += llc.h
 header-y += loop.h
 header-y += lp.h
+header-y += lwtunnel.h
 header-y += magic.h
 header-y += major.h
 header-y += map_to_7segment.h
-- 
cgit v1.2.3-70-g09d2


From 0d3f2c92e004c67404fabea19728c1962b777bd6 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Wed, 15 Jul 2015 15:38:29 +0100
Subject: irqchip/gic: Remove redundant gic_set_irqchip_flags

Now that the GIC chip implementation enables IRQCHIP_SKIP_SET_WAKE and
IRQCHIP_MASK_ON_SUSPEND by default, the platforms requiring them need
not override the irqchip flags as before.

This patch removes all the users of gic_set_irqchip_flags and the
function itself.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Michal Simek <michal.simek@xilinx.com>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: Gregory CLEMENT <gregory.clement@free-electrons.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1436971109-20189-2-git-send-email-sudeep.holla@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arm/mach-shmobile/intc-sh73a0.c   | 1 -
 arch/arm/mach-shmobile/setup-r8a7779.c | 1 -
 arch/arm/mach-ux500/cpu.c              | 1 -
 arch/arm/mach-zynq/common.c            | 1 -
 drivers/irqchip/irq-gic.c              | 5 -----
 include/linux/irqchip/arm-gic.h        | 1 -
 6 files changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-shmobile/intc-sh73a0.c b/arch/arm/mach-shmobile/intc-sh73a0.c
index fd63ae6532fc..151a71a41fe3 100644
--- a/arch/arm/mach-shmobile/intc-sh73a0.c
+++ b/arch/arm/mach-shmobile/intc-sh73a0.c
@@ -313,7 +313,6 @@ void __init sh73a0_init_irq(void)
 	void __iomem *gic_cpu_base = IOMEM(0xf0000100);
 	void __iomem *intevtsa = ioremap_nocache(0xffd20100, PAGE_SIZE);
 
-	gic_set_irqchip_flags(IRQCHIP_SKIP_SET_WAKE);
 	gic_init(0, 29, gic_dist_base, gic_cpu_base);
 
 	register_intc_controller(&intcs_desc);
diff --git a/arch/arm/mach-shmobile/setup-r8a7779.c b/arch/arm/mach-shmobile/setup-r8a7779.c
index c03e562be12b..aea5cff9495d 100644
--- a/arch/arm/mach-shmobile/setup-r8a7779.c
+++ b/arch/arm/mach-shmobile/setup-r8a7779.c
@@ -719,7 +719,6 @@ void __init r8a7779_init_irq_dt(void)
 	void __iomem *gic_dist_base = ioremap_nocache(0xf0001000, 0x1000);
 	void __iomem *gic_cpu_base = ioremap_nocache(0xf0000100, 0x1000);
 #endif
-	gic_set_irqchip_flags(IRQCHIP_SKIP_SET_WAKE);
 
 #ifdef CONFIG_ARCH_SHMOBILE_LEGACY
 	gic_init(0, 29, gic_dist_base, gic_cpu_base);
diff --git a/arch/arm/mach-ux500/cpu.c b/arch/arm/mach-ux500/cpu.c
index e31d3d61c998..6cb10c77afd8 100644
--- a/arch/arm/mach-ux500/cpu.c
+++ b/arch/arm/mach-ux500/cpu.c
@@ -56,7 +56,6 @@ void __init ux500_init_irq(void)
 	struct device_node *np;
 	struct resource r;
 
-	gic_set_irqchip_flags(IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND);
 	irqchip_init();
 	np = of_find_compatible_node(NULL, NULL, "stericsson,db8500-prcmu");
 	of_address_to_resource(np, 0, &r);
diff --git a/arch/arm/mach-zynq/common.c b/arch/arm/mach-zynq/common.c
index 616d5840fc2e..2ad1accfba35 100644
--- a/arch/arm/mach-zynq/common.c
+++ b/arch/arm/mach-zynq/common.c
@@ -186,7 +186,6 @@ static void __init zynq_map_io(void)
 
 static void __init zynq_irq_init(void)
 {
-	gic_set_irqchip_flags(IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND);
 	irqchip_init();
 }
 
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 39ff8df8cf64..80fde37076c4 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -881,11 +881,6 @@ static const struct irq_domain_ops gic_irq_domain_ops = {
 	.xlate = gic_irq_domain_xlate,
 };
 
-void gic_set_irqchip_flags(unsigned long flags)
-{
-	gic_chip.flags |= flags;
-}
-
 void __init gic_init_bases(unsigned int gic_nr, int irq_start,
 			   void __iomem *dist_base, void __iomem *cpu_base,
 			   u32 percpu_offset, struct device_node *node)
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 9de976b4f9a7..61a2007eb49a 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -95,7 +95,6 @@
 
 struct device_node;
 
-void gic_set_irqchip_flags(unsigned long flags);
 void gic_init_bases(unsigned int, int, void __iomem *, void __iomem *,
 		    u32 offset, struct device_node *);
 void gic_cascade_irq(unsigned int gic_nr, unsigned int irq);
-- 
cgit v1.2.3-70-g09d2


From be9b22b6a7e6725162c64155a08b71f0654b675c Mon Sep 17 00:00:00 2001
From: Brian Norris <computersforpeace@gmail.com>
Date: Wed, 22 Jul 2015 16:21:39 -0700
Subject: genirq: Add chip_[suspend|resume] PM support to irq_chip

Some (admittedly odd) irqchips perform functions that are not directly
related to any of their child IRQ lines, and therefore need to perform
some tasks during suspend/resume regardless of whether there are
any "installed" interrupts for the irqchip. However, the current
generic-chip framework does not call the chip's irq_{suspend,resume}
when there are no interrupts installed (this makes sense, because there
are no irq_data objects for such a call to be made).

More specifically, irq-bcm7120-l2 configures both a forwarding mask
(which affects other top-level GIC IRQs) and a second-level interrupt
mask (for managing its own child interrupts). The former must be
saved/restored on suspend/resume, even when there's nothing to do for
the latter.

This patch adds a new set of suspend/resume hooks to irq_chip_generic,
to help represent *chip* suspend/resume, rather than IRQ suspend/resume.
These callbacks will always be called for an IRQ chip (regardless of the
installed interrupts) and are based on the per-chip irq_chip_generic
struct, rather than the per-IRQ irq_data struct.

The original problem report is described in extra detail here:
http://lkml.kernel.org/g/20150619224123.GL4917@ld-irv-0074

Signed-off-by: Brian Norris <computersforpeace@gmail.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Cc: Gregory Fong <gregory.0xf0@gmail.com>
Cc: bcm-kernel-feedback-list@broadcom.com
Cc: linux-mips@linux-mips.org
Cc: Kevin Cernekee <cernekee@chromium.org>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1437607300-40858-1-git-send-email-computersforpeace@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h       | 14 ++++++++++++--
 kernel/irq/generic-chip.c |  6 ++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 5284cb166d90..2c8730a108be 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -324,8 +324,10 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips
  * @irq_cpu_online:	configure an interrupt source for a secondary CPU
  * @irq_cpu_offline:	un-configure an interrupt source for a secondary CPU
- * @irq_suspend:	function called from core code on suspend once per chip
- * @irq_resume:		function called from core code on resume once per chip
+ * @irq_suspend:	function called from core code on suspend once per
+ *			chip, when one or more interrupts are installed
+ * @irq_resume:		function called from core code on resume once per chip,
+ *			when one ore more interrupts are installed
  * @irq_pm_shutdown:	function called from core code on shutdown once per chip
  * @irq_calc_mask:	Optional function to set irq_data.mask for special cases
  * @irq_print_chip:	optional to print special chip info in show_interrupts
@@ -760,6 +762,12 @@ struct irq_chip_type {
  * @reg_base:		Register base address (virtual)
  * @reg_readl:		Alternate I/O accessor (defaults to readl if NULL)
  * @reg_writel:		Alternate I/O accessor (defaults to writel if NULL)
+ * @suspend:		Function called from core code on suspend once per
+ *			chip; can be useful instead of irq_chip::suspend to
+ *			handle chip details even when no interrupts are in use
+ * @resume:		Function called from core code on resume once per chip;
+ *			can be useful instead of irq_chip::suspend to handle
+ *			chip details even when no interrupts are in use
  * @irq_base:		Interrupt base nr for this chip
  * @irq_cnt:		Number of interrupts handled by this chip
  * @mask_cache:		Cached mask register shared between all chip types
@@ -786,6 +794,8 @@ struct irq_chip_generic {
 	void __iomem		*reg_base;
 	u32			(*reg_readl)(void __iomem *addr);
 	void			(*reg_writel)(u32 val, void __iomem *addr);
+	void			(*suspend)(struct irq_chip_generic *gc);
+	void			(*resume)(struct irq_chip_generic *gc);
 	unsigned int		irq_base;
 	unsigned int		irq_cnt;
 	u32			mask_cache;
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 15b370daf234..abd286afbd27 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -553,6 +553,9 @@ static int irq_gc_suspend(void)
 			if (data)
 				ct->chip.irq_suspend(data);
 		}
+
+		if (gc->suspend)
+			gc->suspend(gc);
 	}
 	return 0;
 }
@@ -564,6 +567,9 @@ static void irq_gc_resume(void)
 	list_for_each_entry(gc, &gc_list, list) {
 		struct irq_chip_type *ct = gc->chip_types;
 
+		if (gc->resume)
+			gc->resume(gc);
+
 		if (ct->chip.irq_resume) {
 			struct irq_data *data = irq_gc_get_irq_data(gc);
 
-- 
cgit v1.2.3-70-g09d2


From 2be6967cdbc95a9960b620defedbf5e02e2af619 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 23 Jul 2015 23:35:56 +0300
Subject: net/mlx5e: Support ETH_RSS_HASH_XOR

The ConnectX-4 HW implements inverted XOR8.
To make it act as XOR we re-order the HW RSS indirection table.

Set XOR to be the default RSS hash function and add ethtool API to
control it.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  1 +
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 39 ++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 46 +++++++++++++++++-----
 include/linux/mlx5/mlx5_ifc.h                      |  6 +--
 4 files changed, 79 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3d23bd657e3c..61d8433392aa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -195,6 +195,7 @@ struct mlx5e_params {
 	u16 rx_hash_log_tbl_sz;
 	bool lro_en;
 	u32 lro_wqe_sz;
+	u8  rss_hfunc;
 };
 
 enum {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 388938482ff9..cb2853570504 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -662,6 +662,43 @@ out:
 	return err;
 }
 
+static int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+			  u8 *hfunc)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (hfunc)
+		*hfunc = priv->params.rss_hfunc;
+
+	return 0;
+}
+
+static int mlx5e_set_rxfh(struct net_device *netdev, const u32 *indir,
+			  const u8 *key, const u8 hfunc)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err = 0;
+
+	if (hfunc == ETH_RSS_HASH_NO_CHANGE)
+		return 0;
+
+	if ((hfunc != ETH_RSS_HASH_XOR) &&
+	    (hfunc != ETH_RSS_HASH_TOP))
+		return -EINVAL;
+
+	mutex_lock(&priv->state_lock);
+
+	priv->params.rss_hfunc = hfunc;
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		mlx5e_close_locked(priv->netdev);
+		err = mlx5e_open_locked(priv->netdev);
+	}
+
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -676,4 +713,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.set_coalesce      = mlx5e_set_coalesce,
 	.get_settings      = mlx5e_get_settings,
 	.set_settings      = mlx5e_set_settings,
+	.get_rxfh          = mlx5e_get_rxfh,
+	.set_rxfh          = mlx5e_set_rxfh,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 40206da1f9d7..07d36275021e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1158,6 +1158,24 @@ static void mlx5e_close_tises(struct mlx5e_priv *priv)
 		mlx5e_close_tis(priv, tc);
 }
 
+static int mlx5e_rx_hash_fn(int hfunc)
+{
+	return (hfunc == ETH_RSS_HASH_TOP) ?
+	       MLX5_RX_HASH_FN_TOEPLITZ :
+	       MLX5_RX_HASH_FN_INVERTED_XOR8;
+}
+
+static int mlx5e_bits_invert(unsigned long a, int size)
+{
+	int inv = 0;
+	int i;
+
+	for (i = 0; i < size; i++)
+		inv |= (test_bit(size - i - 1, &a) ? 1 : 0) << i;
+
+	return inv;
+}
+
 static int mlx5e_open_rqt(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
@@ -1166,11 +1184,10 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv)
 	void *rqtc;
 	int inlen;
 	int err;
-	int sz;
+	int log_tbl_sz = priv->params.rx_hash_log_tbl_sz;
+	int sz = 1 << log_tbl_sz;
 	int i;
 
-	sz = 1 << priv->params.rx_hash_log_tbl_sz;
-
 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
 	in = mlx5_vzalloc(inlen);
 	if (!in)
@@ -1182,8 +1199,12 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv)
 	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
 
 	for (i = 0; i < sz; i++) {
-		int ix = i % priv->params.num_channels;
+		int ix = i;
+
+		if (priv->params.rss_hfunc == ETH_RSS_HASH_XOR)
+			ix = mlx5e_bits_invert(i, log_tbl_sz);
 
+		ix = ix % priv->params.num_channels;
 		MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix]->rq.rqn);
 	}
 
@@ -1254,12 +1275,16 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
 		MLX5_SET(tirc, tirc, indirect_table,
 			 priv->rqtn);
 		MLX5_SET(tirc, tirc, rx_hash_fn,
-			 MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ);
-		MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
-		netdev_rss_key_fill(MLX5_ADDR_OF(tirc, tirc,
-						 rx_hash_toeplitz_key),
-				    MLX5_FLD_SZ_BYTES(tirc,
-						      rx_hash_toeplitz_key));
+			 mlx5e_rx_hash_fn(priv->params.rss_hfunc));
+		if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) {
+			void *rss_key = MLX5_ADDR_OF(tirc, tirc,
+						     rx_hash_toeplitz_key);
+			size_t len = MLX5_FLD_SZ_BYTES(tirc,
+						       rx_hash_toeplitz_key);
+
+			MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+			netdev_rss_key_fill(rss_key, len);
+		}
 		break;
 	}
 
@@ -1700,6 +1725,7 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 		MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ;
 	priv->params.num_tc                = 1;
 	priv->params.default_vlan_prio     = 0;
+	priv->params.rss_hfunc             = ETH_RSS_HASH_XOR;
 
 	priv->params.lro_en = false && !!MLX5_CAP_ETH(priv->mdev, lro_cap);
 	priv->params.lro_wqe_sz            =
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6d2f6fee041c..c60a62bba652 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1936,9 +1936,9 @@ enum {
 };
 
 enum {
-	MLX5_TIRC_RX_HASH_FN_HASH_NONE           = 0x0,
-	MLX5_TIRC_RX_HASH_FN_HASH_INVERTED_XOR8  = 0x1,
-	MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ       = 0x2,
+	MLX5_RX_HASH_FN_NONE           = 0x0,
+	MLX5_RX_HASH_FN_INVERTED_XOR8  = 0x1,
+	MLX5_RX_HASH_FN_TOEPLITZ       = 0x2,
 };
 
 enum {
-- 
cgit v1.2.3-70-g09d2


From 311c7c71c9bb8786c96fee353fe9886c08b017fe Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 23 Jul 2015 23:35:57 +0300
Subject: net/mlx5e: Allocate DMA coherent memory on reader NUMA node

By affinity hints and XPS, each mlx5e channel is assigned a CPU
core.

Channel DMA coherent memory that is written by the NIC and read
by SW (e.g CQ buffer) is allocated on the NUMA node of the CPU
core assigned for the channel.

Channel DMA coherent memory that is written by SW and read by the
NIC (e.g SQ/RQ buffer) is allocated on the NUMA node of the NIC.

Doorbell record (written by SW and read by the NIC) is an
exception since it is accessed by SW more frequently.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/alloc.c   | 48 +++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c    |  6 ++-
 drivers/net/ethernet/mellanox/mlx5/core/wq.c      | 12 +++---
 drivers/net/ethernet/mellanox/mlx5/core/wq.h      |  3 +-
 include/linux/mlx5/driver.h                       |  8 ++++
 6 files changed, 70 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index 0715b497511f..6cb38304669f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -45,15 +45,34 @@
  * register it in a memory region at HCA virtual address 0.
  */
 
-int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf)
+static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev,
+					   size_t size, dma_addr_t *dma_handle,
+					   int node)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	int original_node;
+	void *cpu_handle;
+
+	mutex_lock(&priv->alloc_mutex);
+	original_node = dev_to_node(&dev->pdev->dev);
+	set_dev_node(&dev->pdev->dev, node);
+	cpu_handle = dma_zalloc_coherent(&dev->pdev->dev, size,
+					 dma_handle, GFP_KERNEL);
+	set_dev_node(&dev->pdev->dev, original_node);
+	mutex_unlock(&priv->alloc_mutex);
+	return cpu_handle;
+}
+
+int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			struct mlx5_buf *buf, int node)
 {
 	dma_addr_t t;
 
 	buf->size = size;
 	buf->npages       = 1;
 	buf->page_shift   = (u8)get_order(size) + PAGE_SHIFT;
-	buf->direct.buf   = dma_zalloc_coherent(&dev->pdev->dev,
-						size, &t, GFP_KERNEL);
+	buf->direct.buf   = mlx5_dma_zalloc_coherent_node(dev, size,
+							  &t, node);
 	if (!buf->direct.buf)
 		return -ENOMEM;
 
@@ -66,6 +85,11 @@ int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf)
 
 	return 0;
 }
+
+int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf)
+{
+	return mlx5_buf_alloc_node(dev, size, buf, dev->priv.numa_node);
+}
 EXPORT_SYMBOL_GPL(mlx5_buf_alloc);
 
 void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf)
@@ -75,7 +99,8 @@ void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf)
 }
 EXPORT_SYMBOL_GPL(mlx5_buf_free);
 
-static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device)
+static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev,
+						 int node)
 {
 	struct mlx5_db_pgdir *pgdir;
 
@@ -84,8 +109,9 @@ static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device)
 		return NULL;
 
 	bitmap_fill(pgdir->bitmap, MLX5_DB_PER_PAGE);
-	pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE,
-					    &pgdir->db_dma, GFP_KERNEL);
+
+	pgdir->db_page = mlx5_dma_zalloc_coherent_node(dev, PAGE_SIZE,
+						       &pgdir->db_dma, node);
 	if (!pgdir->db_page) {
 		kfree(pgdir);
 		return NULL;
@@ -118,7 +144,7 @@ static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir,
 	return 0;
 }
 
-int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
+int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node)
 {
 	struct mlx5_db_pgdir *pgdir;
 	int ret = 0;
@@ -129,7 +155,7 @@ int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
 		if (!mlx5_alloc_db_from_pgdir(pgdir, db))
 			goto out;
 
-	pgdir = mlx5_alloc_db_pgdir(&(dev->pdev->dev));
+	pgdir = mlx5_alloc_db_pgdir(dev, node);
 	if (!pgdir) {
 		ret = -ENOMEM;
 		goto out;
@@ -145,6 +171,12 @@ out:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(mlx5_db_alloc_node);
+
+int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
+{
+	return mlx5_db_alloc_node(dev, db, dev->priv.numa_node);
+}
 EXPORT_SYMBOL_GPL(mlx5_db_alloc);
 
 void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 07d36275021e..57cc8960b73b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -272,6 +272,8 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	int err;
 	int i;
 
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
+
 	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
 				&rq->wq_ctrl);
 	if (err)
@@ -502,6 +504,8 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	if (err)
 		return err;
 
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
+
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
 				 &sq->wq_ctrl);
 	if (err)
@@ -702,7 +706,8 @@ static int mlx5e_create_cq(struct mlx5e_channel *c,
 	int err;
 	u32 i;
 
-	param->wq.numa = cpu_to_node(c->cpu);
+	param->wq.buf_numa_node = cpu_to_node(c->cpu);
+	param->wq.db_numa_node  = cpu_to_node(c->cpu);
 	param->eq_ix   = c->ix;
 
 	err = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
@@ -1000,7 +1005,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	MLX5_SET(wq, wq, log_wq_sz,        priv->params.log_rq_size);
 	MLX5_SET(wq, wq, pd,               priv->pdn);
 
-	param->wq.numa   = dev_to_node(&priv->mdev->pdev->dev);
+	param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
 	param->wq.linear = 1;
 }
 
@@ -1014,7 +1019,7 @@ static void mlx5e_build_sq_param(struct mlx5e_priv *priv,
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd,            priv->pdn);
 
-	param->wq.numa = dev_to_node(&priv->mdev->pdev->dev);
+	param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
 }
 
 static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index afad529838de..c34eafbf1c04 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -455,7 +455,7 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
 	struct mlx5_priv *priv  = &mdev->priv;
 	struct msix_entry *msix = priv->msix_arr;
 	int irq                 = msix[i + MLX5_EQ_VEC_COMP_BASE].vector;
-	int numa_node           = dev_to_node(&mdev->pdev->dev);
+	int numa_node           = priv->numa_node;
 	int err;
 
 	if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
@@ -668,6 +668,10 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 	INIT_LIST_HEAD(&priv->pgdir_list);
 	spin_lock_init(&priv->mkey_lock);
 
+	mutex_init(&priv->alloc_mutex);
+
+	priv->numa_node = dev_to_node(&dev->pdev->dev);
+
 	priv->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), mlx5_debugfs_root);
 	if (!priv->dbg_root)
 		return -ENOMEM;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index 8388411582cf..ce21ee5b2357 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -73,13 +73,14 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride);
 	wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1;
 
-	err = mlx5_db_alloc(mdev, &wq_ctrl->db);
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
 		return err;
 	}
 
-	err = mlx5_buf_alloc(mdev, mlx5_wq_cyc_get_byte_size(wq), &wq_ctrl->buf);
+	err = mlx5_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq),
+				  &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err);
 		goto err_db_free;
@@ -108,13 +109,14 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	wq->log_sz = MLX5_GET(cqc, cqc, log_cq_size);
 	wq->sz_m1 = (1 << wq->log_sz) - 1;
 
-	err = mlx5_db_alloc(mdev, &wq_ctrl->db);
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
 		return err;
 	}
 
-	err = mlx5_buf_alloc(mdev, mlx5_cqwq_get_byte_size(wq), &wq_ctrl->buf);
+	err = mlx5_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq),
+				  &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err);
 		goto err_db_free;
@@ -144,7 +146,7 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride);
 	wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1;
 
-	err = mlx5_db_alloc(mdev, &wq_ctrl->db);
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index e0ddd69fb429..6c2a8f95093c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -37,7 +37,8 @@
 
 struct mlx5_wq_param {
 	int		linear;
-	int		numa;
+	int		buf_numa_node;
+	int		db_numa_node;
 };
 
 struct mlx5_wq_ctrl {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5722d88c2429..1c0d5d062d7c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -463,6 +463,10 @@ struct mlx5_priv {
 	/* end: mr staff */
 
 	/* start: alloc staff */
+	/* protect buffer alocation according to numa node */
+	struct mutex            alloc_mutex;
+	int                     numa_node;
+
 	struct mutex            pgdir_mutex;
 	struct list_head        pgdir_list;
 	/* end: alloc staff */
@@ -672,6 +676,8 @@ void mlx5_health_cleanup(void);
 void  __init mlx5_health_init(void);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			struct mlx5_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);
 void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf);
 struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
@@ -773,6 +779,8 @@ void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev);
 int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev);
 void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
 int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db);
+int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db,
+		       int node);
 void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db);
 
 const char *mlx5_command_str(int command);
-- 
cgit v1.2.3-70-g09d2


From 88a85f99e51fb2373259ab83c8bb130a9bbf3804 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Thu, 23 Jul 2015 23:35:59 +0300
Subject: net/mlx5e: TX latency optimization to save DMA reads

A regular TX WQE execution involves two or more DMA reads -
one to fetch the WQE, and another one per WQE gather entry.

These DMA reads obviously increase the TX latency.
There are two mlx5 mechanisms to bypass these DMA reads:
1) Inline WQE
2) Blue Flame (BF)

An inline WQE contains a whole packet, thus saves the DMA read/s
of the regular WQE gather entry/s. Inline WQE support was already
added in the previous commit.

A BF WQE is written directly to the device I/O mapped memory, thus
enables saving the DMA read that fetches the WQE.

The BF WQE I/O write must be in cache line granularity, thus uses
the CPU write combining mechanism.
A BF WQE I/O write acts also as a TX doorbell for notifying the
device of new TX WQEs.
A BF WQE is written to the same I/O mapped address as the regular TX
doorbell, thus this address is being mapped twice - once by ioremap()
and once by io_mapping_map_wc().

While both mechanisms reduce the TX latency, they both consume more CPU
cycles than a regular WQE:
- A BF WQE must still be written to host memory, in addition to being
  written directly to the device I/O mapped memory.
- An inline WQE involves copying the SKB data into it.

To handle this tradeoff, we introduce here a heuristic algorithm that
strives to avoid using these two mechanisms in case the TX queue is
being back-pressured by the device, and limit their usage rate otherwise.

An inline WQE will always be "Blue Flamed" (written directly to the
device I/O mapped memory) while a BF WQE may not be inlined (may contain
gather entries).

Preliminary testing using netperf UDP_RR shows that the latency goes down
from 17.5us to 16.9us, while the message rate (tested with pktgen) stays
the same.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      | 24 +++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 26 ++++++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 26 +++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/uar.c     |  6 ++++++
 include/linux/mlx5/driver.h                       |  4 +++-
 6 files changed, 79 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index d9dc506188c8..b66edd2c5a61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -60,6 +60,7 @@
 
 #define MLX5E_TX_CQ_POLL_BUDGET        128
 #define MLX5E_UPDATE_STATS_INTERVAL    200 /* msecs */
+#define MLX5E_SQ_BF_BUDGET             16
 
 static const char vport_strings[][ETH_GSTRING_LEN] = {
 	/* vport statistics */
@@ -268,7 +269,9 @@ struct mlx5e_sq {
 	/* dirtied @xmit */
 	u16                        pc ____cacheline_aligned_in_smp;
 	u32                        dma_fifo_pc;
-	u32                        bf_offset;
+	u16                        bf_offset;
+	u16                        prev_cc;
+	u8                         bf_budget;
 	struct mlx5e_sq_stats      stats;
 
 	struct mlx5e_cq            cq;
@@ -281,9 +284,10 @@ struct mlx5e_sq {
 	struct mlx5_wq_cyc         wq;
 	u32                        dma_fifo_mask;
 	void __iomem              *uar_map;
+	void __iomem              *uar_bf_map;
 	struct netdev_queue       *txq;
 	u32                        sqn;
-	u32                        bf_buf_size;
+	u16                        bf_buf_size;
 	u16                        max_inline;
 	u16                        edge;
 	struct device             *pdev;
@@ -493,8 +497,10 @@ int mlx5e_update_priv_params(struct mlx5e_priv *priv,
 			     struct mlx5e_params *new_params);
 
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
-				      struct mlx5e_tx_wqe *wqe)
+				      struct mlx5e_tx_wqe *wqe, int bf_sz)
 {
+	u16 ofst = MLX5_BF_OFFSET + sq->bf_offset;
+
 	/* ensure wqe is visible to device before updating doorbell record */
 	dma_wmb();
 
@@ -505,9 +511,15 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 	 */
 	wmb();
 
-	mlx5_write64((__be32 *)&wqe->ctrl,
-		     sq->uar_map + MLX5_BF_OFFSET + sq->bf_offset,
-		     NULL);
+	if (bf_sz) {
+		__iowrite64_copy(sq->uar_bf_map + ofst, &wqe->ctrl, bf_sz);
+
+		/* flush the write-combining mapped buffer */
+		wmb();
+
+	} else {
+		mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL);
+	}
 
 	sq->bf_offset ^= sq->bf_buf_size;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c55fad431cbf..4a87e9dcf52c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -514,6 +514,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 
 	sq->wq.db       = &sq->wq.db[MLX5_SND_DBR];
 	sq->uar_map     = sq->uar.map;
+	sq->uar_bf_map  = sq->uar.bf_map;
 	sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
 	sq->max_inline  = param->max_inline;
 
@@ -524,11 +525,12 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	txq_ix = c->ix + tc * priv->params.num_channels;
 	sq->txq = netdev_get_tx_queue(priv->netdev, txq_ix);
 
-	sq->pdev    = c->pdev;
-	sq->mkey_be = c->mkey_be;
-	sq->channel = c;
-	sq->tc      = tc;
-	sq->edge    = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
+	sq->pdev      = c->pdev;
+	sq->mkey_be   = c->mkey_be;
+	sq->channel   = c;
+	sq->tc        = tc;
+	sq->edge      = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
+	sq->bf_budget = MLX5E_SQ_BF_BUDGET;
 	priv->txq_to_sq_map[txq_ix] = sq;
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 351ac6982e22..64380bc0cd6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -57,7 +57,7 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw)
 
 	if (notify_hw) {
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-		mlx5e_tx_notify_hw(sq, wqe);
+		mlx5e_tx_notify_hw(sq, wqe, 0);
 	}
 }
 
@@ -110,7 +110,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 }
 
 static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
-					    struct sk_buff *skb)
+					    struct sk_buff *skb, bool bf)
 {
 	/* Some NIC TX decisions, e.g loopback, are based on the packet
 	 * headers and occur before the data gather.
@@ -118,7 +118,7 @@ static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
 	 */
 #define MLX5E_MIN_INLINE (ETH_HLEN + 2/*vlan tag*/)
 
-	if (skb_headlen(skb) <= sq->max_inline)
+	if (bf && (skb_headlen(skb) <= sq->max_inline))
 		return skb_headlen(skb);
 
 	return MLX5E_MIN_INLINE;
@@ -137,6 +137,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 
 	u8  opcode = MLX5_OPCODE_SEND;
 	dma_addr_t dma_addr = 0;
+	bool bf = false;
 	u16 headlen;
 	u16 ds_cnt;
 	u16 ihs;
@@ -149,6 +150,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 	else
 		sq->stats.csum_offload_none++;
 
+	if (sq->cc != sq->prev_cc) {
+		sq->prev_cc = sq->cc;
+		sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0;
+	}
+
 	if (skb_is_gso(skb)) {
 		u32 payload_len;
 
@@ -161,7 +167,10 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 		sq->stats.tso_packets++;
 		sq->stats.tso_bytes += payload_len;
 	} else {
-		ihs = mlx5e_get_inline_hdr_size(sq, skb);
+		bf = sq->bf_budget &&
+		     !skb->xmit_more &&
+		     !skb_shinfo(skb)->nr_frags;
+		ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
 		MLX5E_TX_SKB_CB(skb)->num_bytes = max_t(unsigned int, skb->len,
 							ETH_ZLEN);
 	}
@@ -233,14 +242,21 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 	}
 
 	if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) {
+		int bf_sz = 0;
+
+		if (bf && sq->uar_bf_map)
+			bf_sz = MLX5E_TX_SKB_CB(skb)->num_wqebbs << 3;
+
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-		mlx5e_tx_notify_hw(sq, wqe);
+		mlx5e_tx_notify_hw(sq, wqe, bf_sz);
 	}
 
 	/* fill sq edge with nops to avoid wqe wrap around */
 	while ((sq->pc & wq->sz_m1) > sq->edge)
 		mlx5e_send_nop(sq, false);
 
+	sq->bf_budget = bf ? sq->bf_budget - 1 : 0;
+
 	sq->stats.packets++;
 	return NETDEV_TX_OK;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index c34eafbf1c04..603a8b0908ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -654,6 +654,22 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
 }
 #endif
 
+static int map_bf_area(struct mlx5_core_dev *dev)
+{
+	resource_size_t bf_start = pci_resource_start(dev->pdev, 0);
+	resource_size_t bf_len = pci_resource_len(dev->pdev, 0);
+
+	dev->priv.bf_mapping = io_mapping_create_wc(bf_start, bf_len);
+
+	return dev->priv.bf_mapping ? 0 : -ENOMEM;
+}
+
+static void unmap_bf_area(struct mlx5_core_dev *dev)
+{
+	if (dev->priv.bf_mapping)
+		io_mapping_free(dev->priv.bf_mapping);
+}
+
 static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 {
 	struct mlx5_priv *priv = &dev->priv;
@@ -808,10 +824,13 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 		goto err_stop_eqs;
 	}
 
+	if (map_bf_area(dev))
+		dev_err(&pdev->dev, "Failed to map blue flame area\n");
+
 	err = mlx5_irq_set_affinity_hints(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
-		goto err_free_comp_eqs;
+		goto err_unmap_bf_area;
 	}
 
 	MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
@@ -823,7 +842,9 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 
 	return 0;
 
-err_free_comp_eqs:
+err_unmap_bf_area:
+	unmap_bf_area(dev);
+
 	free_comp_eqs(dev);
 
 err_stop_eqs:
@@ -881,6 +902,7 @@ static void mlx5_dev_cleanup(struct mlx5_core_dev *dev)
 	mlx5_cleanup_qp_table(dev);
 	mlx5_cleanup_cq_table(dev);
 	mlx5_irq_clear_affinity_hints(dev);
+	unmap_bf_area(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
 	mlx5_free_uuars(dev, &priv->uuari);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index 9ef85873ceea..eb05c845ece9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -32,6 +32,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/io-mapping.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
@@ -246,6 +247,10 @@ int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 		goto err_free_uar;
 	}
 
+	if (mdev->priv.bf_mapping)
+		uar->bf_map = io_mapping_map_wc(mdev->priv.bf_mapping,
+						uar->index << PAGE_SHIFT);
+
 	return 0;
 
 err_free_uar:
@@ -257,6 +262,7 @@ EXPORT_SYMBOL(mlx5_alloc_map_uar);
 
 void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 {
+	io_mapping_unmap(uar->bf_map);
 	iounmap(uar->map);
 	mlx5_cmd_free_uar(mdev, uar->index);
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1c0d5d062d7c..5fe0cae1a515 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -380,7 +380,7 @@ struct mlx5_uar {
 	u32			index;
 	struct list_head	bf_list;
 	unsigned		free_bf_bmap;
-	void __iomem	       *wc_map;
+	void __iomem	       *bf_map;
 	void __iomem	       *map;
 };
 
@@ -435,6 +435,8 @@ struct mlx5_priv {
 	struct mlx5_uuar_info	uuari;
 	MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
 
+	struct io_mapping	*bf_mapping;
+
 	/* pages stuff */
 	struct workqueue_struct *pg_wq;
 	struct rb_root		page_root;
-- 
cgit v1.2.3-70-g09d2


From e018a0cce3d849bc73e72686c571420adc40bad2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 24 Jul 2015 21:24:04 +0300
Subject: net/macb: convert to kernel doc

This patch coverts struct description to the kernel doc format. There is no
functional change.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/macb.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/macb.h b/include/linux/platform_data/macb.h
index 044a124bfbbc..21b15f6fee25 100644
--- a/include/linux/platform_data/macb.h
+++ b/include/linux/platform_data/macb.h
@@ -8,11 +8,19 @@
 #ifndef __MACB_PDATA_H__
 #define __MACB_PDATA_H__
 
+/**
+ * struct macb_platform_data - platform data for MACB Ethernet
+ * @phy_mask:		phy mask passed when register the MDIO bus
+ *			within the driver
+ * @phy_irq_pin:	PHY IRQ
+ * @is_rmii:		using RMII interface?
+ * @rev_eth_addr:	reverse Ethernet address byte order
+ */
 struct macb_platform_data {
 	u32		phy_mask;
-	int		phy_irq_pin;	/* PHY IRQ */
-	u8		is_rmii;	/* using RMII interface? */
-	u8		rev_eth_addr;	/* reverse Ethernet address byte order */
+	int		phy_irq_pin;
+	u8		is_rmii;
+	u8		rev_eth_addr;
 };
 
 #endif /* __MACB_PDATA_H__ */
-- 
cgit v1.2.3-70-g09d2


From e6942b7de2dfe44ebde9bae57dadece5abca9de8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 23 Apr 2014 19:32:50 +0200
Subject: atomic: Provide atomic_{or,xor,and}

Implement atomic logic ops -- atomic_{or,xor,and}.

These will replace the atomic_{set,clear}_mask functions that are
available on some archs.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/alpha/include/asm/atomic.h        |  1 -
 arch/arc/include/asm/atomic.h          |  1 -
 arch/arm/include/asm/atomic.h          |  1 -
 arch/arm64/include/asm/atomic.h        |  1 -
 arch/avr32/include/asm/atomic.h        |  2 --
 arch/blackfin/include/asm/atomic.h     |  2 --
 arch/frv/include/asm/atomic.h          |  2 --
 arch/h8300/include/asm/atomic.h        |  2 --
 arch/hexagon/include/asm/atomic.h      |  2 --
 arch/ia64/include/asm/atomic.h         |  2 --
 arch/m32r/include/asm/atomic.h         |  2 --
 arch/m68k/include/asm/atomic.h         |  2 --
 arch/metag/include/asm/atomic_lnkget.h |  2 --
 arch/mips/include/asm/atomic.h         |  2 --
 arch/mn10300/include/asm/atomic.h      |  2 --
 arch/parisc/include/asm/atomic.h       |  2 --
 arch/powerpc/include/asm/atomic.h      |  2 --
 arch/s390/include/asm/atomic.h         |  2 --
 arch/sh/include/asm/atomic-grb.h       |  2 --
 arch/sparc/include/asm/atomic_32.h     |  2 --
 arch/sparc/include/asm/atomic_64.h     |  2 --
 arch/tile/include/asm/atomic_32.h      |  2 --
 arch/tile/include/asm/atomic_64.h      |  2 --
 arch/x86/include/asm/atomic.h          |  2 --
 arch/xtensa/include/asm/atomic.h       |  2 --
 include/asm-generic/atomic.h           | 21 ++++++++++++---------
 include/asm-generic/atomic64.h         |  4 ++++
 include/linux/atomic.h                 | 13 -------------
 lib/atomic64.c                         |  3 +++
 29 files changed, 19 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 0eff853398d2..e8c956098424 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -110,7 +110,6 @@ static __inline__ long atomic64_##op##_return(long i, atomic64_t * v)	\
 ATOMIC_OPS(add)
 ATOMIC_OPS(sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
 #define atomic_andnot atomic_andnot
 #define atomic64_andnot atomic64_andnot
 
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index e90b701fc6a8..2a847821dee1 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -144,7 +144,6 @@ static inline int atomic_##op##_return(int i, atomic_t *v)		\
 ATOMIC_OPS(add, +=, add)
 ATOMIC_OPS(sub, -=, sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
 #define atomic_andnot atomic_andnot
 
 ATOMIC_OP(and, &=, and)
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index ff214bac9cb4..82b75a7cb762 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -194,7 +194,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 ATOMIC_OPS(add, +=, add)
 ATOMIC_OPS(sub, -=, sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
 #define atomic_andnot atomic_andnot
 
 ATOMIC_OP(and, &=, and)
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 2876173397b2..866a71fca9a3 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -85,7 +85,6 @@ static inline int atomic_##op##_return(int i, atomic_t *v)		\
 ATOMIC_OPS(add, add)
 ATOMIC_OPS(sub, sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
 #define atomic_andnot atomic_andnot
 
 ATOMIC_OP(and, and)
diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h
index 115d3005e4bc..97c9bdf83409 100644
--- a/arch/avr32/include/asm/atomic.h
+++ b/arch/avr32/include/asm/atomic.h
@@ -51,8 +51,6 @@ static inline void atomic_##op(int i, atomic_t *v)			\
 	(void)__atomic_##op##_return(i, v);				\
 }
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and, and)
 ATOMIC_OP(or, or)
 ATOMIC_OP(xor, eor)
diff --git a/arch/blackfin/include/asm/atomic.h b/arch/blackfin/include/asm/atomic.h
index eafa55b81a7b..2d6a7a3823c3 100644
--- a/arch/blackfin/include/asm/atomic.h
+++ b/arch/blackfin/include/asm/atomic.h
@@ -28,8 +28,6 @@ asmlinkage int __raw_atomic_test_asm(const volatile int *ptr, int value);
 #define atomic_add_return(i, v) __raw_atomic_add_asm(&(v)->counter, i)
 #define atomic_sub_return(i, v) __raw_atomic_add_asm(&(v)->counter, -(i))
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 #define atomic_or(i, v)  (void)__raw_atomic_or_asm(&(v)->counter, i)
 #define atomic_and(i, v) (void)__raw_atomic_and_asm(&(v)->counter, i)
 #define atomic_xor(i, v) (void)__raw_atomic_xor_asm(&(v)->counter, i)
diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h
index 74d22454d7c6..fc48bea26b40 100644
--- a/arch/frv/include/asm/atomic.h
+++ b/arch/frv/include/asm/atomic.h
@@ -192,8 +192,6 @@ static inline void atomic64_##op(long long i, atomic64_t *v)		\
 	(void)__atomic64_fetch_##op(i, &v->counter);			\
 }
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(or)
 ATOMIC_OP(and)
 ATOMIC_OP(xor)
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index f181f820be33..c4d061f09c44 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -41,8 +41,6 @@ static inline void atomic_##op(int i, atomic_t *v)		\
 ATOMIC_OP_RETURN(add, +=)
 ATOMIC_OP_RETURN(sub, -=)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and, &=)
 ATOMIC_OP(or,  |=)
 ATOMIC_OP(xor, ^=)
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index 4efe2c7c0dd8..811d61f6422d 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -132,8 +132,6 @@ static inline int atomic_##op##_return(int i, atomic_t *v)		\
 ATOMIC_OPS(add)
 ATOMIC_OPS(sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 0809ef5d6b9a..be4beeb77d57 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -69,8 +69,6 @@ ATOMIC_OP(sub, -)
 		: ia64_atomic_sub(__ia64_asr_i, v);			\
 })
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and, &)
 ATOMIC_OP(or, |)
 ATOMIC_OP(xor, ^)
diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h
index 7245463c1e98..b2a13fbd5be0 100644
--- a/arch/m32r/include/asm/atomic.h
+++ b/arch/m32r/include/asm/atomic.h
@@ -94,8 +94,6 @@ static __inline__ int atomic_##op##_return(int i, atomic_t *v)		\
 ATOMIC_OPS(add)
 ATOMIC_OPS(sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index c30e43ea49a3..93ebd96aa494 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -77,8 +77,6 @@ static inline int atomic_##op##_return(int i, atomic_t * v)		\
 ATOMIC_OPS(add, +=, add)
 ATOMIC_OPS(sub, -=, sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and, &=, and)
 ATOMIC_OP(or, |=, or)
 ATOMIC_OP(xor, ^=, eor)
diff --git a/arch/metag/include/asm/atomic_lnkget.h b/arch/metag/include/asm/atomic_lnkget.h
index 930c12cb8d37..0642606de901 100644
--- a/arch/metag/include/asm/atomic_lnkget.h
+++ b/arch/metag/include/asm/atomic_lnkget.h
@@ -74,8 +74,6 @@ static inline int atomic_##op##_return(int i, atomic_t *v)		\
 ATOMIC_OPS(add)
 ATOMIC_OPS(sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 0430ba6ab762..4c42fd9af777 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -137,8 +137,6 @@ static __inline__ int atomic_##op##_return(int i, atomic_t * v)		      \
 ATOMIC_OPS(add, +=, addu)
 ATOMIC_OPS(sub, -=, subu)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and, &=, and)
 ATOMIC_OP(or, |=, or)
 ATOMIC_OP(xor, ^=, xor)
diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h
index 03eea8158cf9..f5a63f0bda46 100644
--- a/arch/mn10300/include/asm/atomic.h
+++ b/arch/mn10300/include/asm/atomic.h
@@ -89,8 +89,6 @@ static inline int atomic_##op##_return(int i, atomic_t *v)		\
 ATOMIC_OPS(add)
 ATOMIC_OPS(sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index be2c50ddebd6..2536965d00ea 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -126,8 +126,6 @@ static __inline__ int atomic_##op##_return(int i, atomic_t *v)		\
 ATOMIC_OPS(add, +=)
 ATOMIC_OPS(sub, -=)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and, &=)
 ATOMIC_OP(or, |=)
 ATOMIC_OP(xor, ^=)
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 6ca89e2aca15..55f106ed12bf 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -67,8 +67,6 @@ static __inline__ int atomic_##op##_return(int a, atomic_t *v)		\
 ATOMIC_OPS(add, add)
 ATOMIC_OPS(sub, subf)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and, and)
 ATOMIC_OP(or, or)
 ATOMIC_OP(xor, xor)
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index b3859d8e001f..d761aeff72da 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -282,8 +282,6 @@ static inline void atomic64_##op(long i, atomic64_t *v)			\
 	__ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_NO_BARRIER);	\
 }
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC64_OP(and, AND)
 ATOMIC64_OP(or, OR)
 ATOMIC64_OP(xor, XOR)
diff --git a/arch/sh/include/asm/atomic-grb.h b/arch/sh/include/asm/atomic-grb.h
index 4b03830d48c7..b94df40e5f2d 100644
--- a/arch/sh/include/asm/atomic-grb.h
+++ b/arch/sh/include/asm/atomic-grb.h
@@ -48,8 +48,6 @@ static inline int atomic_##op##_return(int i, atomic_t *v)		\
 ATOMIC_OPS(add)
 ATOMIC_OPS(sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index e19d8880b146..7dcbebbcaec6 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -17,8 +17,6 @@
 #include <asm/barrier.h>
 #include <asm-generic/atomic64.h>
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 #define ATOMIC_INIT(i)  { (i) }
 
 int atomic_add_return(int, atomic_t *);
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index d6af27c93450..917084ace49d 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -33,8 +33,6 @@ long atomic64_##op##_return(long, atomic64_t *);
 ATOMIC_OPS(add)
 ATOMIC_OPS(sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index 94237922f0dd..d320ce253d86 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -41,8 +41,6 @@ static inline void atomic_##op(int i, atomic_t *v)			\
 	_atomic_##op((unsigned long *)&v->counter, i);			\
 }
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index d07d9fc6e2a1..096a56d6ead4 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -58,8 +58,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 	return oldval;
 }
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 static inline void atomic_and(int i, atomic_t *v)
 {
 	__insn_fetchand4((void *)&v->counter, i);
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index f3a3ec040694..b3493023efda 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -191,8 +191,6 @@ static inline void atomic_##op(int i, atomic_t *v)			\
 			: "memory");					\
 }
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index 4dd2450300a6..31371f43c23b 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -145,8 +145,6 @@ static inline int atomic_##op##_return(int i, atomic_t * v)		\
 ATOMIC_OPS(add)
 ATOMIC_OPS(sub)
 
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-
 ATOMIC_OP(and)
 ATOMIC_OP(or)
 ATOMIC_OP(xor)
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 92947e0a532a..a41b0b8f7404 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -102,24 +102,27 @@ ATOMIC_OP_RETURN(sub, -)
 ATOMIC_OP(and, &)
 #endif
 
-#ifndef atomic_clear_mask
-#define atomic_clear_mask(i, v) atomic_and(~(i), (v))
-#endif
-
 #ifndef atomic_or
-#ifndef CONFIG_ARCH_HAS_ATOMIC_OR
-#define CONFIG_ARCH_HAS_ATOMIC_OR
-#endif
 ATOMIC_OP(or, |)
 #endif
 
-#ifndef atomic_set_mask
-#define atomic_set_mask(i, v)	atomic_or((i), (v))
+#ifndef atomic_xor
+ATOMIC_OP(xor, ^)
 #endif
 
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
+static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
+{
+	atomic_and(~mask, v);
+}
+
+static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
+{
+	atomic_or(mask, v);
+}
+
 /*
  * Atomic operations that C can't guarantee us.  Useful for
  * resource counting etc..
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index 30ad9c86cebb..d48e78ccad3d 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -32,6 +32,10 @@ extern long long atomic64_##op##_return(long long a, atomic64_t *v);
 ATOMIC64_OPS(add)
 ATOMIC64_OPS(sub)
 
+ATOMIC64_OP(and)
+ATOMIC64_OP(or)
+ATOMIC64_OP(xor)
+
 #undef ATOMIC64_OPS
 #undef ATOMIC64_OP_RETURN
 #undef ATOMIC64_OP
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 5b08a8540ecf..7d6279012a1f 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -111,19 +111,6 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 }
 #endif
 
-#ifndef CONFIG_ARCH_HAS_ATOMIC_OR
-static inline void atomic_or(int i, atomic_t *v)
-{
-	int old;
-	int new;
-
-	do {
-		old = atomic_read(v);
-		new = old | i;
-	} while (atomic_cmpxchg(v, old, new) != old);
-}
-#endif /* #ifndef CONFIG_ARCH_HAS_ATOMIC_OR */
-
 #include <asm-generic/atomic-long.h>
 #ifdef CONFIG_GENERIC_ATOMIC64
 #include <asm-generic/atomic64.h>
diff --git a/lib/atomic64.c b/lib/atomic64.c
index 1298c05ef528..2886ebac6567 100644
--- a/lib/atomic64.c
+++ b/lib/atomic64.c
@@ -102,6 +102,9 @@ EXPORT_SYMBOL(atomic64_##op##_return);
 
 ATOMIC64_OPS(add, +=)
 ATOMIC64_OPS(sub, -=)
+ATOMIC64_OP(and, &=)
+ATOMIC64_OP(or, |=)
+ATOMIC64_OP(xor, ^=)
 
 #undef ATOMIC64_OPS
 #undef ATOMIC64_OP_RETURN
-- 
cgit v1.2.3-70-g09d2


From de9e432cb5de1bf2952919dc0b22e4bec0ed8d53 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 24 Apr 2015 01:12:32 +0200
Subject: atomic: Collapse all atomic_{set,clear}_mask definitions

Move the now generic definitions of atomic_{set,clear}_mask() into
linux/atomic.h to avoid endless and pointless repetition.

Also, provide an atomic_andnot() wrapper for those few archs that can
implement that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arc/include/asm/atomic.h          | 10 ----------
 arch/blackfin/include/asm/atomic.h     | 10 ----------
 arch/frv/include/asm/atomic.h          | 10 ----------
 arch/h8300/include/asm/atomic.h        | 10 ----------
 arch/m32r/include/asm/atomic.h         | 11 -----------
 arch/m68k/include/asm/atomic.h         | 10 ----------
 arch/metag/include/asm/atomic_lnkget.h | 10 ----------
 arch/metag/include/asm/atomic_lock1.h  | 10 ----------
 arch/mn10300/include/asm/atomic.h      | 24 ------------------------
 arch/powerpc/kernel/misc_32.S          | 19 -------------------
 arch/s390/include/asm/atomic.h         | 10 ----------
 arch/sh/include/asm/atomic.h           | 10 ----------
 arch/x86/include/asm/atomic.h          | 10 ----------
 arch/xtensa/include/asm/atomic.h       | 10 ----------
 include/asm-generic/atomic.h           | 10 ----------
 include/linux/atomic.h                 | 25 +++++++++++++++++++++++++
 16 files changed, 25 insertions(+), 174 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 2a847821dee1..d8a85e706fba 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -155,16 +155,6 @@ ATOMIC_OP(xor, ^=, xor)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 /**
  * __atomic_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
diff --git a/arch/blackfin/include/asm/atomic.h b/arch/blackfin/include/asm/atomic.h
index 2d6a7a3823c3..1c1c42330c99 100644
--- a/arch/blackfin/include/asm/atomic.h
+++ b/arch/blackfin/include/asm/atomic.h
@@ -32,16 +32,6 @@ asmlinkage int __raw_atomic_test_asm(const volatile int *ptr, int value);
 #define atomic_and(i, v) (void)__raw_atomic_and_asm(&(v)->counter, i)
 #define atomic_xor(i, v) (void)__raw_atomic_xor_asm(&(v)->counter, i)
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 #endif
 
 #include <asm-generic/atomic.h>
diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h
index fc48bea26b40..0da689def4cc 100644
--- a/arch/frv/include/asm/atomic.h
+++ b/arch/frv/include/asm/atomic.h
@@ -198,14 +198,4 @@ ATOMIC_OP(xor)
 
 #undef ATOMIC_OP
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 #endif /* _ASM_ATOMIC_H */
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index c4d061f09c44..702ee539f87d 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -89,14 +89,4 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 	return ret;
 }
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 #endif /* __ARCH_H8300_ATOMIC __ */
diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h
index b2a13fbd5be0..025e2a170493 100644
--- a/arch/m32r/include/asm/atomic.h
+++ b/arch/m32r/include/asm/atomic.h
@@ -243,15 +243,4 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
 	return c;
 }
 
-
-static __inline__ __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static __inline__ __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 #endif	/* _ASM_M32R_ATOMIC_H */
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index 93ebd96aa494..039fac120cc0 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -174,16 +174,6 @@ static inline int atomic_add_negative(int i, atomic_t *v)
 	return c != 0;
 }
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
diff --git a/arch/metag/include/asm/atomic_lnkget.h b/arch/metag/include/asm/atomic_lnkget.h
index 0642606de901..21c4c268b86c 100644
--- a/arch/metag/include/asm/atomic_lnkget.h
+++ b/arch/metag/include/asm/atomic_lnkget.h
@@ -82,16 +82,6 @@ ATOMIC_OP(xor)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	int result, temp;
diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index 7d88725a85da..f8efe380fe8b 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -76,16 +76,6 @@ ATOMIC_OP(xor, ^=)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	int ret;
diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h
index f5a63f0bda46..375e59140c9c 100644
--- a/arch/mn10300/include/asm/atomic.h
+++ b/arch/mn10300/include/asm/atomic.h
@@ -131,30 +131,6 @@ static inline void atomic_dec(atomic_t *v)
 #define atomic_xchg(ptr, v)		(xchg(&(ptr)->counter, (v)))
 #define atomic_cmpxchg(v, old, new)	(cmpxchg(&((v)->counter), (old), (new)))
 
-/**
- * atomic_clear_mask - Atomically clear bits in memory
- * @mask: Mask of the bits to be cleared
- * @v: pointer to word in memory
- *
- * Atomically clears the bits set in mask from the memory word specified.
- */
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-/**
- * atomic_set_mask - Atomically set bits in memory
- * @mask: Mask of the bits to be set
- * @v: pointer to word in memory
- *
- * Atomically sets the bits set in mask from the memory word specified.
- */
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 #endif /* __KERNEL__ */
 #endif /* CONFIG_SMP */
 #endif /* _ASM_ATOMIC_H */
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 7c6bb4b17b49..ed3ab509faca 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -595,25 +595,6 @@ _GLOBAL(copy_page)
 	li	r11,4
 	b	2b
 
-/*
- * void atomic_clear_mask(atomic_t mask, atomic_t *addr)
- * void atomic_set_mask(atomic_t mask, atomic_t *addr);
- */
-_GLOBAL(atomic_clear_mask)
-10:	lwarx	r5,0,r4
-	andc	r5,r5,r3
-	PPC405_ERR77(0,r4)
-	stwcx.	r5,0,r4
-	bne-	10b
-	blr
-_GLOBAL(atomic_set_mask)
-10:	lwarx	r5,0,r4
-	or	r5,r5,r3
-	PPC405_ERR77(0,r4)
-	stwcx.	r5,0,r4
-	bne-	10b
-	blr
-
 /*
  * Extended precision shifts.
  *
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index d761aeff72da..117fa5c921c1 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -132,16 +132,6 @@ ATOMIC_OP(xor, XOR)
 
 #undef ATOMIC_OP
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index cee0245257e1..05b9f74ce2d5 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -25,16 +25,6 @@
 #include <asm/atomic-irq.h>
 #endif
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 #define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
 #define atomic_dec_return(v)		atomic_sub_return(1, (v))
 #define atomic_inc_return(v)		atomic_add_return(1, (v))
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index b3493023efda..fb52aa644aab 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -234,16 +234,6 @@ static __always_inline short int atomic_inc_short(short int *v)
 	return *v;
 }
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 #ifdef CONFIG_X86_32
 # include <asm/atomic64_32.h>
 #else
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index 31371f43c23b..e0be67936990 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -153,16 +153,6 @@ ATOMIC_OP(xor)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
 /**
  * atomic_sub_and_test - subtract value from variable and test result
  * @i: integer value to subtract
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index a41b0b8f7404..d4d7e337fdcb 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -113,16 +113,6 @@ ATOMIC_OP(xor, ^)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_and(~mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 /*
  * Atomic operations that C can't guarantee us.  Useful for
  * resource counting etc..
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 7d6279012a1f..8b98b423388f 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -28,6 +28,23 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define atomic_inc_not_zero(v)		atomic_add_unless((v), 1, 0)
 #endif
 
+#ifndef atomic_andnot
+static inline void atomic_andnot(int i, atomic_t *v)
+{
+	atomic_and(~i, v);
+}
+#endif
+
+static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
+{
+	atomic_andnot(mask, v);
+}
+
+static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
+{
+	atomic_or(mask, v);
+}
+
 /**
  * atomic_inc_not_zero_hint - increment if not null
  * @v: pointer of type atomic_t
@@ -115,4 +132,12 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #ifdef CONFIG_GENERIC_ATOMIC64
 #include <asm-generic/atomic64.h>
 #endif
+
+#ifndef atomic64_andnot
+static inline void atomic64_andnot(long long i, atomic64_t *v)
+{
+	atomic64_and(~i, v);
+}
+#endif
+
 #endif /* _LINUX_ATOMIC_H */
-- 
cgit v1.2.3-70-g09d2


From 91492a44b998cf762150de8f1b40bda1902e8ea7 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Wed, 22 Jul 2015 15:05:18 +0200
Subject: gpio: generic: support input-only chips

Allow chips to indicates that they are input-only and thus cannot set
the output value.  This will be used by the gpio-etraxfs driver.

Signed-off-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-generic.c     | 23 ++++++++++++++++++++---
 include/linux/basic_mmio_gpio.h |  1 +
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-generic.c b/drivers/gpio/gpio-generic.c
index 802e6d2c64e9..a3f07537fe62 100644
--- a/drivers/gpio/gpio-generic.c
+++ b/drivers/gpio/gpio-generic.c
@@ -153,6 +153,10 @@ static int bgpio_get(struct gpio_chip *gc, unsigned int gpio)
 	return !!(bgc->read_reg(bgc->reg_dat) & bgc->pin2mask(bgc, gpio));
 }
 
+static void bgpio_set_none(struct gpio_chip *gc, unsigned int gpio, int val)
+{
+}
+
 static void bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
 {
 	struct bgpio_chip *bgc = to_bgpio_chip(gc);
@@ -279,6 +283,12 @@ static int bgpio_simple_dir_in(struct gpio_chip *gc, unsigned int gpio)
 	return 0;
 }
 
+static int bgpio_dir_out_err(struct gpio_chip *gc, unsigned int gpio,
+				int val)
+{
+	return -EINVAL;
+}
+
 static int bgpio_simple_dir_out(struct gpio_chip *gc, unsigned int gpio,
 				int val)
 {
@@ -460,6 +470,9 @@ static int bgpio_setup_io(struct bgpio_chip *bgc,
 		bgc->reg_set = set;
 		bgc->gc.set = bgpio_set_set;
 		bgc->gc.set_multiple = bgpio_set_multiple_set;
+	} else if (flags & BGPIOF_NO_OUTPUT) {
+		bgc->gc.set = bgpio_set_none;
+		bgc->gc.set_multiple = NULL;
 	} else {
 		bgc->gc.set = bgpio_set;
 		bgc->gc.set_multiple = bgpio_set_multiple;
@@ -476,7 +489,8 @@ static int bgpio_setup_io(struct bgpio_chip *bgc,
 
 static int bgpio_setup_direction(struct bgpio_chip *bgc,
 				 void __iomem *dirout,
-				 void __iomem *dirin)
+				 void __iomem *dirin,
+				 unsigned long flags)
 {
 	if (dirout && dirin) {
 		return -EINVAL;
@@ -491,7 +505,10 @@ static int bgpio_setup_direction(struct bgpio_chip *bgc,
 		bgc->gc.direction_input = bgpio_dir_in_inv;
 		bgc->gc.get_direction = bgpio_get_dir_inv;
 	} else {
-		bgc->gc.direction_output = bgpio_simple_dir_out;
+		if (flags & BGPIOF_NO_OUTPUT)
+			bgc->gc.direction_output = bgpio_dir_out_err;
+		else
+			bgc->gc.direction_output = bgpio_simple_dir_out;
 		bgc->gc.direction_input = bgpio_simple_dir_in;
 	}
 
@@ -543,7 +560,7 @@ int bgpio_init(struct bgpio_chip *bgc, struct device *dev,
 	if (ret)
 		return ret;
 
-	ret = bgpio_setup_direction(bgc, dirout, dirin);
+	ret = bgpio_setup_direction(bgc, dirout, dirin, flags);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/basic_mmio_gpio.h b/include/linux/basic_mmio_gpio.h
index 14eea946e640..ed3768f4ecc7 100644
--- a/include/linux/basic_mmio_gpio.h
+++ b/include/linux/basic_mmio_gpio.h
@@ -75,5 +75,6 @@ int bgpio_init(struct bgpio_chip *bgc, struct device *dev,
 #define BGPIOF_UNREADABLE_REG_DIR	BIT(2) /* reg_dir is unreadable */
 #define BGPIOF_BIG_ENDIAN_BYTE_ORDER	BIT(3)
 #define BGPIOF_READ_OUTPUT_REG_SET     BIT(4) /* reg_set stores output value */
+#define BGPIOF_NO_OUTPUT		BIT(5) /* only input */
 
 #endif /* __BASIC_MMIO_GPIO_H */
-- 
cgit v1.2.3-70-g09d2


From 35068ce8cbf1749ef1a4b9b1493af83b8488c37b Mon Sep 17 00:00:00 2001
From: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Date: Wed, 1 Jul 2015 09:10:43 +0200
Subject: of: constify drv arg of of_driver_match_device stub

With this change the stub has the same signature as the actual function,
preventing this compiler warning when building without CONFIG_OF:

   drivers/base/property.c: In function 'fwnode_driver_match_device':
>> drivers/base/property.c:608:38: warning: passing argument 2 of 'of_driver_match_device' discards 'const' qualifier from pointer target type
      return of_driver_match_device(dev, drv);
                                         ^
   In file included from drivers/base/property.c:18:0:
   include/linux/of_device.h:61:19: note: expected 'struct device_driver *' but argument is of type 'const struct device_driver *'
    static inline int of_driver_match_device(struct device *dev,
                      ^

Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of_device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 4c508549833a..cc7dd687a89d 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -59,7 +59,7 @@ void of_dma_configure(struct device *dev, struct device_node *np);
 #else /* CONFIG_OF */
 
 static inline int of_driver_match_device(struct device *dev,
-					 struct device_driver *drv)
+					 const struct device_driver *drv)
 {
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 77fc29c4bbbbd01ee22c50ce8260fd0f2e08c124 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 27 Jul 2015 14:46:31 +0300
Subject: net/mlx4_core: Preparations for 802.1ad VLAN support

mlx4_core preparation to support hardware accelerated 802.1ad VLAN
device.

To allow 802.1ad accelerated device, "packet has vlan" (phv)
Firmware capability should be available. Firmware without the
phv capability won't behave properly and can't support 802.1ad device
acceleration.

The driver checks the Firmware capability and sets the phv bit
accordingly in SET_PORT command.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c   | 82 +++++++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/fw.h   |  1 +
 drivers/net/ethernet/mellanox/mlx4/main.c | 15 ++++++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |  3 ++
 include/linux/mlx4/device.h               |  5 ++
 5 files changed, 106 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index e30bf57ad7a1..5a1c3d249530 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -154,6 +154,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[26] = "Port ETS Scheduler support",
 		[27] = "Port beacon support",
 		[28] = "RX-ALL support",
+		[29] = "802.1ad offload support",
 	};
 	int i;
 
@@ -307,6 +308,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 
 #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80
 #define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31)
+#define QUERY_FUNC_CAP_PHV_BIT			0x40
 
 	if (vhcr->op_modifier == 1) {
 		struct mlx4_active_ports actv_ports =
@@ -351,6 +353,12 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 		MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier],
 			 QUERY_FUNC_CAP_PHYS_PORT_ID);
 
+		if (dev->caps.phv_bit[port]) {
+			field = QUERY_FUNC_CAP_PHV_BIT;
+			MLX4_PUT(outbox->buf, field,
+				 QUERY_FUNC_CAP_FLAGS0_OFFSET);
+		}
+
 	} else if (vhcr->op_modifier == 0) {
 		struct mlx4_active_ports actv_ports =
 			mlx4_get_active_ports(dev, slave);
@@ -600,6 +608,9 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 		MLX4_GET(func_cap->phys_port_id, outbox,
 			 QUERY_FUNC_CAP_PHYS_PORT_ID);
 
+	MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+	func_cap->flags |= (field & QUERY_FUNC_CAP_PHV_BIT);
+
 	/* All other resources are allocated by the master, but we still report
 	 * 'num' and 'reserved' capabilities as follows:
 	 * - num remains the maximum resource index
@@ -700,6 +711,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
 #define QUERY_DEV_CAP_CONFIG_DEV_OFFSET		0x94
+#define QUERY_DEV_CAP_PHV_EN_OFFSET		0x96
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET		0x9c
@@ -898,6 +910,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV;
 	if (field & (1 << 2))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_IGNORE_FCS;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PHV_EN_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PHV_EN;
+	if (field & 0x40)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN;
+
 	MLX4_GET(dev_cap->reserved_lkey, outbox,
 		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET);
@@ -1992,6 +2010,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 	MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET);
 	MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET);
 
+	/* phv_check enable */
+	MLX4_GET(byte_field, outbox, INIT_HCA_CACHELINE_SZ_OFFSET);
+	if (byte_field & 0x2)
+		param->phv_check_en = 1;
 out:
 	mlx4_free_cmd_mailbox(dev, mailbox);
 
@@ -2758,3 +2780,63 @@ int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave,
 			    0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C,
 			    MLX4_CMD_NATIVE);
 }
+
+static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit)
+{
+#define SET_PORT_GEN_PHV_VALID	0x10
+#define SET_PORT_GEN_PHV_EN	0x80
+
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+
+	context->v_ignore_fcs |=  SET_PORT_GEN_PHV_VALID;
+	if (phv_bit)
+		context->phv_en |=  SET_PORT_GEN_PHV_EN;
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv)
+{
+	int err;
+	struct mlx4_func_cap func_cap;
+
+	memset(&func_cap, 0, sizeof(func_cap));
+	err = mlx4_QUERY_FUNC_CAP(dev, 1, &func_cap);
+	if (!err)
+		*phv = func_cap.flags & QUERY_FUNC_CAP_PHV_BIT;
+	return err;
+}
+EXPORT_SYMBOL(get_phv_bit);
+
+int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val)
+{
+	int ret;
+
+	if (mlx4_is_slave(dev))
+		return -EPERM;
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN &&
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) {
+		ret = mlx4_SET_PORT_phv_bit(dev, port, new_val);
+		if (!ret)
+			dev->caps.phv_bit[port] = new_val;
+		return ret;
+	}
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(set_phv_bit);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 07cb7c2461ad..08de5555c2f4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -204,6 +204,7 @@ struct mlx4_init_hca_param {
 	u16 cqe_size; /* For use only when CQE stride feature enabled */
 	u16 eqe_size; /* For use only when EQE stride feature enabled */
 	u8 rss_ip_frags;
+	u8 phv_check_en; /* for QUERY_HCA */
 };
 
 struct mlx4_init_ib_param {
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index d76f4257e305..6f35b6c06193 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -405,6 +405,21 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 	dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
 
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN) {
+		struct mlx4_init_hca_param hca_param;
+
+		memset(&hca_param, 0, sizeof(hca_param));
+		err = mlx4_QUERY_HCA(dev, &hca_param);
+		/* Turn off PHV_EN flag in case phv_check_en is set.
+		 * phv_check_en is a HW check that parse the packet and verify
+		 * phv bit was reported correctly in the wqe. To allow QinQ
+		 * PHV_EN flag should be set and phv_check_en must be cleared
+		 * otherwise QinQ packets will be drop by the HW.
+		 */
+		if (err || hca_param.phv_check_en)
+			dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_PHV_EN;
+	}
+
 	/* Sense port always allowed on supported devices for ConnectX-1 and -2 */
 	if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT)
 		dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index a092c5c34d43..232b2b55f23b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -787,6 +787,9 @@ struct mlx4_set_port_general_context {
 	u8 pprx;
 	u8 pfcrx;
 	u16 reserved4;
+	u32 reserved5;
+	u8 phv_en;
+	u8 reserved6[3];
 };
 
 struct mlx4_set_port_rqp_calc_context {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index fd13c1ce3b4a..bcbf8c72a77b 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -211,6 +211,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_ETS_CFG		= 1LL <<  26,
 	MLX4_DEV_CAP_FLAG2_PORT_BEACON		= 1LL <<  27,
 	MLX4_DEV_CAP_FLAG2_IGNORE_FCS		= 1LL <<  28,
+	MLX4_DEV_CAP_FLAG2_PHV_EN		= 1LL <<  29,
+	MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN	= 1LL <<  30,
 };
 
 enum {
@@ -581,6 +583,7 @@ struct mlx4_caps {
 	u64			phys_port_id[MLX4_MAX_PORTS + 1];
 	int			tunnel_offload_mode;
 	u8			rx_checksum_flags_port[MLX4_MAX_PORTS + 1];
+	u8			phv_bit[MLX4_MAX_PORTS + 1];
 	u8			alloc_res_qp_mask;
 	u32			dmfs_high_rate_qpn_base;
 	u32			dmfs_high_rate_qpn_range;
@@ -1332,6 +1335,8 @@ int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time);
 int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port,
 			    u8 ignore_fcs_value);
 int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable);
+int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val);
+int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv);
 int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx);
 int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx);
 int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
-- 
cgit v1.2.3-70-g09d2


From e802f8e4c54e6adf4215ef9fa3d6eea8fcb10bf9 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 27 Jul 2015 14:46:33 +0300
Subject: net/mlx4: Prepare VLAN macros for 802.1ad Hardware accelerated
 support

To add Hardware accelerated support in 802.1ad vlan, replace
Current VLAN macros to CVLAN.
Replace:
MLX4_WQE_CTRL_INS_VLAN
MLX4_CQE_VLAN_PRESENT_MASK
With:
MLX4_WQE_CTRL_INS_CVLAN
MLX4_CQE_CVLAN_PRESENT_MASK

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/cq.c            | 2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +-
 include/linux/mlx4/cq.h                    | 2 +-
 include/linux/mlx4/qp.h                    | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 36eb3d012b6d..180a8f7ec82d 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -871,7 +871,7 @@ repoll:
 		if (is_eth) {
 			wc->sl  = be16_to_cpu(cqe->sl_vid) >> 13;
 			if (be32_to_cpu(cqe->vlan_my_qpn) &
-					MLX4_CQE_VLAN_PRESENT_MASK) {
+					MLX4_CQE_CVLAN_PRESENT_MASK) {
 				wc->vlan_id = be16_to_cpu(cqe->sl_vid) &
 					MLX4_CQE_VID_MASK;
 			} else {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 12c65e1ad6a9..10f6c2f1d5a0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -726,7 +726,7 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
 
 	hw_checksum = csum_unfold((__force __sum16)cqe->checksum);
 
-	if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK) &&
+	if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK) &&
 	    !(dev_features & NETIF_F_HW_VLAN_CTAG_RX)) {
 		hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr);
 		hdr += sizeof(struct vlan_hdr);
@@ -907,7 +907,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 				gro_skb->csum_level = 1;
 
 			if ((cqe->vlan_my_qpn &
-			    cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) &&
+			    cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) &&
 			    (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) {
 				u16 vid = be16_to_cpu(cqe->sl_vid);
 
@@ -970,7 +970,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 					PKT_HASH_TYPE_L3);
 
 		if ((be32_to_cpu(cqe->vlan_my_qpn) &
-		    MLX4_CQE_VLAN_PRESENT_MASK) &&
+		    MLX4_CQE_CVLAN_PRESENT_MASK) &&
 		    (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
 			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid));
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index c10d98f6ad96..7c858f67ef28 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -958,7 +958,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		ring->bf.offset ^= ring->bf.buf_size;
 	} else {
 		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
-		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
+		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN *
 			!!skb_vlan_tag_present(skb);
 		tx_desc->ctrl.fence_size = real_size;
 
diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h
index e7ecc12a1163..899a97b20d27 100644
--- a/include/linux/mlx4/cq.h
+++ b/include/linux/mlx4/cq.h
@@ -88,7 +88,7 @@ struct mlx4_ts_cqe {
 
 enum {
 	MLX4_CQE_L2_TUNNEL_IPOK		= 1 << 31,
-	MLX4_CQE_VLAN_PRESENT_MASK	= 1 << 29,
+	MLX4_CQE_CVLAN_PRESENT_MASK	= 1 << 29,
 	MLX4_CQE_L2_TUNNEL		= 1 << 27,
 	MLX4_CQE_L2_TUNNEL_CSUM		= 1 << 26,
 	MLX4_CQE_L2_TUNNEL_IPV4		= 1 << 25,
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 6fed539e5456..6c619006c21f 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -272,7 +272,7 @@ enum {
 	MLX4_WQE_CTRL_SOLICITED		= 1 << 1,
 	MLX4_WQE_CTRL_IP_CSUM		= 1 << 4,
 	MLX4_WQE_CTRL_TCP_UDP_CSUM	= 1 << 5,
-	MLX4_WQE_CTRL_INS_VLAN		= 1 << 6,
+	MLX4_WQE_CTRL_INS_CVLAN		= 1 << 6,
 	MLX4_WQE_CTRL_STRONG_ORDER	= 1 << 7,
 	MLX4_WQE_CTRL_FORCE_LOOPBACK	= 1 << 0,
 };
-- 
cgit v1.2.3-70-g09d2


From e38af4faf01d0b35df6995fb395e5fa4a4898289 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 27 Jul 2015 14:46:34 +0300
Subject: net/mlx4_en: Add support for hardware accelerated 802.1ad vlan

To enable device support in accelerated 802.1ad vlan, the port
capability "packet has vlan enable" (phv_en) should be set.
Firmware won't work properly, in case phv_en is not set.

The user can enable "phv_en" port capability with the new ethtool
private flag phv-bit. The phv-bit private flag default value is OFF,
users who are interested in 802.1ad hardware acceleration should turn ON
the phv-bit private flag:
$ ethtool --set-priv-flags eth1 phv-bit on

Once the private flag is set, the device is ready for 802.1ad vlan
acceleration.

The user should also change the interface device features and turn on
"tx-vlan-stag-hw-insert" which is off by default:
$ ethtool -K eth1  tx-vlan-stag-hw-insert on

"phv-bit" private flag setting is available only for Physical
Functions(PF), the Virtual Function (VF) will be able to use the feature
by setting "tx-vlan-stag-hw-insert" ethtool device feature only if the
feature was enabled by the Hypervisor.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 16 +++++++++
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  | 46 +++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c      | 16 ++++++++-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c      | 13 ++++---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |  1 +
 include/linux/mlx4/cq.h                         |  1 +
 include/linux/mlx4/qp.h                         |  1 +
 7 files changed, 89 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 70f65534e786..f79d8124321e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -102,6 +102,7 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
 
 static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = {
 	"blueflame",
+	"phv-bit"
 };
 
 static const char main_strings[][ETH_GSTRING_LEN] = {
@@ -1797,9 +1798,13 @@ static int mlx4_en_get_ts_info(struct net_device *dev,
 static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
 	bool bf_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_BLUEFLAME);
 	bool bf_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+	bool phv_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_PHV);
+	bool phv_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_PHV);
 	int i;
+	int ret = 0;
 
 	if (bf_enabled_new != bf_enabled_old) {
 		if (bf_enabled_new) {
@@ -1825,6 +1830,17 @@ static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags)
 			bf_enabled_new ?  "Enabled" : "Disabled");
 	}
 
+	if (phv_enabled_new != phv_enabled_old) {
+		ret = set_phv_bit(mdev->dev, priv->port, (int)phv_enabled_new);
+		if (ret)
+			return ret;
+		else if (phv_enabled_new)
+			priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV;
+		else
+			priv->pflags &= ~MLX4_EN_PRIV_FLAGS_PHV;
+		en_info(priv, "PHV bit %s\n",
+			phv_enabled_new ?  "Enabled" : "Disabled");
+	}
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index e0de2fd1ce12..4726122ea76b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2184,6 +2184,25 @@ static int mlx4_en_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	}
 }
 
+static netdev_features_t mlx4_en_fix_features(struct net_device *netdev,
+					      netdev_features_t features)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(netdev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	/* Since there is no support for separate RX C-TAG/S-TAG vlan accel
+	 * enable/disable make sure S-TAG flag is always in same state as
+	 * C-TAG.
+	 */
+	if (features & NETIF_F_HW_VLAN_CTAG_RX &&
+	    !(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN))
+		features |= NETIF_F_HW_VLAN_STAG_RX;
+	else
+		features &= ~NETIF_F_HW_VLAN_STAG_RX;
+
+	return features;
+}
+
 static int mlx4_en_set_features(struct net_device *netdev,
 		netdev_features_t features)
 {
@@ -2218,6 +2237,10 @@ static int mlx4_en_set_features(struct net_device *netdev,
 		en_info(priv, "Turn %s TX vlan strip offload\n",
 			(features & NETIF_F_HW_VLAN_CTAG_TX) ? "ON" : "OFF");
 
+	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_STAG_TX))
+		en_info(priv, "Turn %s TX S-VLAN strip offload\n",
+			(features & NETIF_F_HW_VLAN_STAG_TX) ? "ON" : "OFF");
+
 	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_LOOPBACK)) {
 		en_info(priv, "Turn %s loopback\n",
 			(features & NETIF_F_LOOPBACK) ? "ON" : "OFF");
@@ -2460,6 +2483,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_poll_controller	= mlx4_en_netpoll,
 #endif
 	.ndo_set_features	= mlx4_en_set_features,
+	.ndo_fix_features	= mlx4_en_fix_features,
 	.ndo_setup_tc		= mlx4_en_setup_tc,
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
@@ -2500,6 +2524,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
 	.ndo_poll_controller	= mlx4_en_netpoll,
 #endif
 	.ndo_set_features	= mlx4_en_set_features,
+	.ndo_fix_features	= mlx4_en_fix_features,
 	.ndo_setup_tc		= mlx4_en_setup_tc,
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
@@ -2931,6 +2956,27 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 	dev->hw_features |= NETIF_F_LOOPBACK |
 			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
 
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) {
+		dev->features |= NETIF_F_HW_VLAN_STAG_RX |
+			NETIF_F_HW_VLAN_STAG_FILTER;
+		dev->hw_features |= NETIF_F_HW_VLAN_STAG_RX;
+	}
+
+	if (mlx4_is_slave(mdev->dev)) {
+		int phv;
+
+		err = get_phv_bit(mdev->dev, port, &phv);
+		if (!err && phv) {
+			dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
+			priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV;
+		}
+	} else {
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN &&
+		    !(mdev->dev->caps.flags2 &
+		      MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN))
+			dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
+	}
+
 	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)
 		dev->hw_features |= NETIF_F_RXFCS;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 10f6c2f1d5a0..a67fbb90d69e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -912,6 +912,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 				u16 vid = be16_to_cpu(cqe->sl_vid);
 
 				__vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid);
+			} else if ((be32_to_cpu(cqe->vlan_my_qpn) &
+				  MLX4_CQE_SVLAN_PRESENT_MASK) &&
+				 (dev->features & NETIF_F_HW_VLAN_STAG_RX)) {
+				__vlan_hwaccel_put_tag(gro_skb,
+						       htons(ETH_P_8021AD),
+						       be16_to_cpu(cqe->sl_vid));
 			}
 
 			if (dev->features & NETIF_F_RXHASH)
@@ -973,6 +979,11 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		    MLX4_CQE_CVLAN_PRESENT_MASK) &&
 		    (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
 			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid));
+		else if ((be32_to_cpu(cqe->vlan_my_qpn) &
+			  MLX4_CQE_SVLAN_PRESENT_MASK) &&
+			 (dev->features & NETIF_F_HW_VLAN_STAG_RX))
+			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD),
+					       be16_to_cpu(cqe->sl_vid));
 
 		if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
 			timestamp = mlx4_en_get_cqe_ts(cqe);
@@ -1070,7 +1081,10 @@ static const int frag_sizes[] = {
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
-	int eff_mtu = dev->mtu + ETH_HLEN + VLAN_HLEN;
+	/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
+	 * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
+	 */
+	int eff_mtu = dev->mtu + ETH_HLEN + (2 * VLAN_HLEN);
 	int buf_size = 0;
 	int i = 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 7c858f67ef28..494e7762fdb1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -718,6 +718,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	u32 index, bf_index;
 	__be32 op_own;
 	u16 vlan_tag = 0;
+	u16 vlan_proto = 0;
 	int i_frag;
 	int lso_header_size;
 	void *fragptr = NULL;
@@ -750,9 +751,10 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto tx_drop;
 	}
 
-	if (skb_vlan_tag_present(skb))
+	if (skb_vlan_tag_present(skb)) {
 		vlan_tag = skb_vlan_tag_get(skb);
-
+		vlan_proto = be16_to_cpu(skb->vlan_proto);
+	}
 
 	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);
 
@@ -958,8 +960,11 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		ring->bf.offset ^= ring->bf.buf_size;
 	} else {
 		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
-		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN *
-			!!skb_vlan_tag_present(skb);
+		if (vlan_proto == ETH_P_8021AD)
+			tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN;
+		else if (vlan_proto == ETH_P_8021Q)
+			tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN;
+
 		tx_desc->ctrl.fence_size = real_size;
 
 		/* Ensure new descriptor hits memory
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 666d1669eb52..defcf8c395bf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -95,6 +95,7 @@
  */
 
 #define MLX4_EN_PRIV_FLAGS_BLUEFLAME 1
+#define MLX4_EN_PRIV_FLAGS_PHV	     2
 
 #define MLX4_EN_WATCHDOG_TIMEOUT	(15 * HZ)
 
diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h
index 899a97b20d27..09cebe528488 100644
--- a/include/linux/mlx4/cq.h
+++ b/include/linux/mlx4/cq.h
@@ -89,6 +89,7 @@ struct mlx4_ts_cqe {
 enum {
 	MLX4_CQE_L2_TUNNEL_IPOK		= 1 << 31,
 	MLX4_CQE_CVLAN_PRESENT_MASK	= 1 << 29,
+	MLX4_CQE_SVLAN_PRESENT_MASK	= 1 << 30,
 	MLX4_CQE_L2_TUNNEL		= 1 << 27,
 	MLX4_CQE_L2_TUNNEL_CSUM		= 1 << 26,
 	MLX4_CQE_L2_TUNNEL_IPV4		= 1 << 25,
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 6c619006c21f..de45a51b3f04 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -273,6 +273,7 @@ enum {
 	MLX4_WQE_CTRL_IP_CSUM		= 1 << 4,
 	MLX4_WQE_CTRL_TCP_UDP_CSUM	= 1 << 5,
 	MLX4_WQE_CTRL_INS_CVLAN		= 1 << 6,
+	MLX4_WQE_CTRL_INS_SVLAN		= 1 << 7,
 	MLX4_WQE_CTRL_STRONG_ORDER	= 1 << 7,
 	MLX4_WQE_CTRL_FORCE_LOOPBACK	= 1 << 0,
 };
-- 
cgit v1.2.3-70-g09d2


From 0817b62cc037a56c5e4238c7eb7522299ea27aef Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Tue, 7 Jul 2015 20:48:08 +0200
Subject: clk: change clk_ops' ->determine_rate() prototype
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clock rates are stored in an unsigned long field, but ->determine_rate()
(which returns a rounded rate from a requested one) returns a long
value (errors are reported using negative error codes), which can lead
to long overflow if the clock rate exceed 2Ghz.

Change ->determine_rate() prototype to return 0 or an error code, and pass
a pointer to a clk_rate_request structure containing the expected target
rate and the rate constraints imposed by clk users.

The clk_rate_request structure might be extended in the future to contain
other kind of constraints like the rounding policy, the maximum clock
inaccuracy or other things that are not yet supported by the CCF
(power consumption constraints ?).

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
CC: Jonathan Corbet <corbet@lwn.net>
CC: Tony Lindgren <tony@atomide.com>
CC: Ralf Baechle <ralf@linux-mips.org>
CC: "Emilio López" <emilio@elopez.com.ar>
CC: Maxime Ripard <maxime.ripard@free-electrons.com>
Acked-by: Tero Kristo <t-kristo@ti.com>
CC: Peter De Schrijver <pdeschrijver@nvidia.com>
CC: Prashant Gaikwad <pgaikwad@nvidia.com>
CC: Stephen Warren <swarren@wwwdotorg.org>
CC: Thierry Reding <thierry.reding@gmail.com>
CC: Alexandre Courbot <gnurou@gmail.com>
CC: linux-doc@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: linux-arm-kernel@lists.infradead.org
CC: linux-omap@vger.kernel.org
CC: linux-mips@linux-mips.org
CC: linux-tegra@vger.kernel.org
[sboyd@codeaurora.org: Fix parent dereference problem in
__clk_determine_rate()]
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Tested-by: Romain Perier <romain.perier@gmail.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
[sboyd@codeaurora.org: Folded in fix from Heiko for fixed-rate
clocks without parents or a rate determining op]
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 Documentation/clk.txt               |   8 +-
 arch/arm/mach-omap2/dpll3xxx.c      |  29 +++---
 arch/arm/mach-omap2/dpll44xx.c      |  30 +++---
 arch/mips/alchemy/common/clock.c    |  61 +++++--------
 drivers/clk/at91/clk-programmable.c |  25 ++---
 drivers/clk/at91/clk-usb.c          |  28 +++---
 drivers/clk/bcm/clk-kona.c          |  34 ++++---
 drivers/clk/clk-composite.c         |  48 +++++-----
 drivers/clk/clk.c                   | 176 ++++++++++++++++++++----------------
 drivers/clk/hisilicon/clk-hi3620.c  |  39 ++++----
 drivers/clk/mmp/clk-mix.c           |  20 ++--
 drivers/clk/qcom/clk-pll.c          |  18 ++--
 drivers/clk/qcom/clk-rcg.c          |  44 ++++-----
 drivers/clk/qcom/clk-rcg2.c         |  78 ++++++++--------
 drivers/clk/sunxi/clk-factors.c     |  21 ++---
 drivers/clk/sunxi/clk-sun6i-ar100.c |  21 ++---
 drivers/clk/sunxi/clk-sunxi.c       |  20 ++--
 drivers/clk/tegra/clk-emc.c         |  28 +++---
 include/linux/clk-provider.h        |  49 ++++++----
 include/linux/clk/ti.h              |  16 +---
 20 files changed, 392 insertions(+), 401 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/clk.txt b/Documentation/clk.txt
index f463bdc37f88..5c4bc4d01d0c 100644
--- a/Documentation/clk.txt
+++ b/Documentation/clk.txt
@@ -71,12 +71,8 @@ the operations defined in clk.h:
 		long		(*round_rate)(struct clk_hw *hw,
 						unsigned long rate,
 						unsigned long *parent_rate);
-		long		(*determine_rate)(struct clk_hw *hw,
-						unsigned long rate,
-						unsigned long min_rate,
-						unsigned long max_rate,
-						unsigned long *best_parent_rate,
-						struct clk_hw **best_parent_clk);
+		int		(*determine_rate)(struct clk_hw *hw,
+						  struct clk_rate_request *req);
 		int		(*set_parent)(struct clk_hw *hw, u8 index);
 		u8		(*get_parent)(struct clk_hw *hw);
 		int		(*set_rate)(struct clk_hw *hw,
diff --git a/arch/arm/mach-omap2/dpll3xxx.c b/arch/arm/mach-omap2/dpll3xxx.c
index 44e57ec225d4..8c57ace30421 100644
--- a/arch/arm/mach-omap2/dpll3xxx.c
+++ b/arch/arm/mach-omap2/dpll3xxx.c
@@ -462,43 +462,38 @@ void omap3_noncore_dpll_disable(struct clk_hw *hw)
 /**
  * omap3_noncore_dpll_determine_rate - determine rate for a DPLL
  * @hw: pointer to the clock to determine rate for
- * @rate: target rate for the DPLL
- * @best_parent_rate: pointer for returning best parent rate
- * @best_parent_clk: pointer for returning best parent clock
+ * @req: target rate request
  *
  * Determines which DPLL mode to use for reaching a desired target rate.
  * Checks whether the DPLL shall be in bypass or locked mode, and if
  * locked, calculates the M,N values for the DPLL via round-rate.
- * Returns a positive clock rate with success, negative error value
- * in failure.
+ * Returns a 0 on success, negative error value in failure.
  */
-long omap3_noncore_dpll_determine_rate(struct clk_hw *hw, unsigned long rate,
-				       unsigned long min_rate,
-				       unsigned long max_rate,
-				       unsigned long *best_parent_rate,
-				       struct clk_hw **best_parent_clk)
+int omap3_noncore_dpll_determine_rate(struct clk_hw *hw,
+				      struct clk_rate_request *req)
 {
 	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
 	struct dpll_data *dd;
 
-	if (!hw || !rate)
+	if (!req->rate)
 		return -EINVAL;
 
 	dd = clk->dpll_data;
 	if (!dd)
 		return -EINVAL;
 
-	if (__clk_get_rate(dd->clk_bypass) == rate &&
+	if (__clk_get_rate(dd->clk_bypass) == req->rate &&
 	    (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) {
-		*best_parent_clk = __clk_get_hw(dd->clk_bypass);
+		req->best_parent_hw = __clk_get_hw(dd->clk_bypass);
 	} else {
-		rate = omap2_dpll_round_rate(hw, rate, best_parent_rate);
-		*best_parent_clk = __clk_get_hw(dd->clk_ref);
+		req->rate = omap2_dpll_round_rate(hw, req->rate,
+					  &req->best_parent_rate);
+		req->best_parent_hw = __clk_get_hw(dd->clk_ref);
 	}
 
-	*best_parent_rate = rate;
+	req->best_parent_rate = req->rate;
 
-	return rate;
+	return 0;
 }
 
 /**
diff --git a/arch/arm/mach-omap2/dpll44xx.c b/arch/arm/mach-omap2/dpll44xx.c
index f231be05b9a6..446a4e0d5a6a 100644
--- a/arch/arm/mach-omap2/dpll44xx.c
+++ b/arch/arm/mach-omap2/dpll44xx.c
@@ -191,42 +191,36 @@ out:
 /**
  * omap4_dpll_regm4xen_determine_rate - determine rate for a DPLL
  * @hw: pointer to the clock to determine rate for
- * @rate: target rate for the DPLL
- * @best_parent_rate: pointer for returning best parent rate
- * @best_parent_clk: pointer for returning best parent clock
+ * @req: target rate request
  *
  * Determines which DPLL mode to use for reaching a desired rate.
  * Checks whether the DPLL shall be in bypass or locked mode, and if
  * locked, calculates the M,N values for the DPLL via round-rate.
- * Returns a positive clock rate with success, negative error value
- * in failure.
+ * Returns 0 on success and a negative error value otherwise.
  */
-long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk)
+int omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw,
+				       struct clk_rate_request *req)
 {
 	struct clk_hw_omap *clk = to_clk_hw_omap(hw);
 	struct dpll_data *dd;
 
-	if (!hw || !rate)
+	if (!req->rate)
 		return -EINVAL;
 
 	dd = clk->dpll_data;
 	if (!dd)
 		return -EINVAL;
 
-	if (__clk_get_rate(dd->clk_bypass) == rate &&
+	if (__clk_get_rate(dd->clk_bypass) == req->rate &&
 	    (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) {
-		*best_parent_clk = __clk_get_hw(dd->clk_bypass);
+		req->best_parent_hw = __clk_get_hw(dd->clk_bypass);
 	} else {
-		rate = omap4_dpll_regm4xen_round_rate(hw, rate,
-						      best_parent_rate);
-		*best_parent_clk = __clk_get_hw(dd->clk_ref);
+		req->rate = omap4_dpll_regm4xen_round_rate(hw, req->rate,
+						&req->best_parent_rate);
+		req->best_parent_hw = __clk_get_hw(dd->clk_ref);
 	}
 
-	*best_parent_rate = rate;
+	req->best_parent_rate = req->rate;
 
-	return rate;
+	return 0;
 }
diff --git a/arch/mips/alchemy/common/clock.c b/arch/mips/alchemy/common/clock.c
index 6e46abe0dac6..0b4cf3e9f005 100644
--- a/arch/mips/alchemy/common/clock.c
+++ b/arch/mips/alchemy/common/clock.c
@@ -389,10 +389,9 @@ static long alchemy_calc_div(unsigned long rate, unsigned long prate,
 	return div1;
 }
 
-static long alchemy_clk_fgcs_detr(struct clk_hw *hw, unsigned long rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk,
-					int scale, int maxdiv)
+static int alchemy_clk_fgcs_detr(struct clk_hw *hw,
+				 struct clk_rate_request *req,
+				 int scale, int maxdiv)
 {
 	struct clk *pc, *bpc, *free;
 	long tdv, tpr, pr, nr, br, bpr, diff, lastdiff;
@@ -422,14 +421,14 @@ static long alchemy_clk_fgcs_detr(struct clk_hw *hw, unsigned long rate,
 		}
 
 		pr = clk_get_rate(pc);
-		if (pr < rate)
+		if (pr < req->rate)
 			continue;
 
 		/* what can hardware actually provide */
-		tdv = alchemy_calc_div(rate, pr, scale, maxdiv, NULL);
+		tdv = alchemy_calc_div(req->rate, pr, scale, maxdiv, NULL);
 		nr = pr / tdv;
-		diff = rate - nr;
-		if (nr > rate)
+		diff = req->rate - nr;
+		if (nr > req->rate)
 			continue;
 
 		if (diff < lastdiff) {
@@ -448,15 +447,16 @@ static long alchemy_clk_fgcs_detr(struct clk_hw *hw, unsigned long rate,
 	 */
 	if (lastdiff && free) {
 		for (j = (maxdiv == 4) ? 1 : scale; j <= maxdiv; j += scale) {
-			tpr = rate * j;
+			tpr = req->rate * j;
 			if (tpr < 0)
 				break;
 			pr = clk_round_rate(free, tpr);
 
-			tdv = alchemy_calc_div(rate, pr, scale, maxdiv, NULL);
+			tdv = alchemy_calc_div(req->rate, pr, scale, maxdiv,
+					       NULL);
 			nr = pr / tdv;
-			diff = rate - nr;
-			if (nr > rate)
+			diff = req->rate - nr;
+			if (nr > req->rate)
 				continue;
 			if (diff < lastdiff) {
 				lastdiff = diff;
@@ -469,9 +469,10 @@ static long alchemy_clk_fgcs_detr(struct clk_hw *hw, unsigned long rate,
 		}
 	}
 
-	*best_parent_rate = bpr;
-	*best_parent_clk = __clk_get_hw(bpc);
-	return br;
+	req->best_parent_rate = bpr;
+	req->best_parent_hw = __clk_get_hw(bpc);
+	req->rate = br;
+	return 0;
 }
 
 static int alchemy_clk_fgv1_en(struct clk_hw *hw)
@@ -562,14 +563,10 @@ static unsigned long alchemy_clk_fgv1_recalc(struct clk_hw *hw,
 	return parent_rate / v;
 }
 
-static long alchemy_clk_fgv1_detr(struct clk_hw *hw, unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk)
+static int alchemy_clk_fgv1_detr(struct clk_hw *hw,
+				 struct clk_rate_request *req)
 {
-	return alchemy_clk_fgcs_detr(hw, rate, best_parent_rate,
-				     best_parent_clk, 2, 512);
+	return alchemy_clk_fgcs_detr(hw, req, 2, 512);
 }
 
 /* Au1000, Au1100, Au15x0, Au12x0 */
@@ -696,11 +693,8 @@ static unsigned long alchemy_clk_fgv2_recalc(struct clk_hw *hw,
 	return t;
 }
 
-static long alchemy_clk_fgv2_detr(struct clk_hw *hw, unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk)
+static int alchemy_clk_fgv2_detr(struct clk_hw *hw,
+				 struct clk_rate_request *req)
 {
 	struct alchemy_fgcs_clk *c = to_fgcs_clk(hw);
 	int scale, maxdiv;
@@ -713,8 +707,7 @@ static long alchemy_clk_fgv2_detr(struct clk_hw *hw, unsigned long rate,
 		maxdiv = 512;
 	}
 
-	return alchemy_clk_fgcs_detr(hw, rate, best_parent_rate,
-				     best_parent_clk, scale, maxdiv);
+	return alchemy_clk_fgcs_detr(hw, req, scale, maxdiv);
 }
 
 /* Au1300 larger input mux, no separate disable bit, flexible divider */
@@ -917,17 +910,13 @@ static int alchemy_clk_csrc_setr(struct clk_hw *hw, unsigned long rate,
 	return 0;
 }
 
-static long alchemy_clk_csrc_detr(struct clk_hw *hw, unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk)
+static int alchemy_clk_csrc_detr(struct clk_hw *hw,
+				 struct clk_rate_request *req)
 {
 	struct alchemy_fgcs_clk *c = to_fgcs_clk(hw);
 	int scale = c->dt[2] == 3 ? 1 : 2; /* au1300 check */
 
-	return alchemy_clk_fgcs_detr(hw, rate, best_parent_rate,
-				     best_parent_clk, scale, 4);
+	return alchemy_clk_fgcs_detr(hw, req, scale, 4);
 }
 
 static struct clk_ops alchemy_clkops_csrc = {
diff --git a/drivers/clk/at91/clk-programmable.c b/drivers/clk/at91/clk-programmable.c
index 8c86c0f7847a..43dacad5c96d 100644
--- a/drivers/clk/at91/clk-programmable.c
+++ b/drivers/clk/at91/clk-programmable.c
@@ -54,12 +54,8 @@ static unsigned long clk_programmable_recalc_rate(struct clk_hw *hw,
 	return parent_rate >> pres;
 }
 
-static long clk_programmable_determine_rate(struct clk_hw *hw,
-					    unsigned long rate,
-					    unsigned long min_rate,
-					    unsigned long max_rate,
-					    unsigned long *best_parent_rate,
-					    struct clk_hw **best_parent_hw)
+static int clk_programmable_determine_rate(struct clk_hw *hw,
+					   struct clk_rate_request *req)
 {
 	struct clk *parent = NULL;
 	long best_rate = -EINVAL;
@@ -76,24 +72,29 @@ static long clk_programmable_determine_rate(struct clk_hw *hw,
 		parent_rate = __clk_get_rate(parent);
 		for (shift = 0; shift < PROG_PRES_MASK; shift++) {
 			tmp_rate = parent_rate >> shift;
-			if (tmp_rate <= rate)
+			if (tmp_rate <= req->rate)
 				break;
 		}
 
-		if (tmp_rate > rate)
+		if (tmp_rate > req->rate)
 			continue;
 
-		if (best_rate < 0 || (rate - tmp_rate) < (rate - best_rate)) {
+		if (best_rate < 0 ||
+		    (req->rate - tmp_rate) < (req->rate - best_rate)) {
 			best_rate = tmp_rate;
-			*best_parent_rate = parent_rate;
-			*best_parent_hw = __clk_get_hw(parent);
+			req->best_parent_rate = parent_rate;
+			req->best_parent_hw = __clk_get_hw(parent);
 		}
 
 		if (!best_rate)
 			break;
 	}
 
-	return best_rate;
+	if (best_rate < 0)
+		return best_rate;
+
+	req->rate = best_rate;
+	return 0;
 }
 
 static int clk_programmable_set_parent(struct clk_hw *hw, u8 index)
diff --git a/drivers/clk/at91/clk-usb.c b/drivers/clk/at91/clk-usb.c
index b0cbd2b1ff59..24747df97742 100644
--- a/drivers/clk/at91/clk-usb.c
+++ b/drivers/clk/at91/clk-usb.c
@@ -56,12 +56,8 @@ static unsigned long at91sam9x5_clk_usb_recalc_rate(struct clk_hw *hw,
 	return DIV_ROUND_CLOSEST(parent_rate, (usbdiv + 1));
 }
 
-static long at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
-					      unsigned long rate,
-					      unsigned long min_rate,
-					      unsigned long max_rate,
-					      unsigned long *best_parent_rate,
-					      struct clk_hw **best_parent_hw)
+static int at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
+					     struct clk_rate_request *req)
 {
 	struct clk *parent = NULL;
 	long best_rate = -EINVAL;
@@ -80,23 +76,23 @@ static long at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
 		for (div = 1; div < SAM9X5_USB_MAX_DIV + 2; div++) {
 			unsigned long tmp_parent_rate;
 
-			tmp_parent_rate = rate * div;
+			tmp_parent_rate = req->rate * div;
 			tmp_parent_rate = __clk_round_rate(parent,
 							   tmp_parent_rate);
 			tmp_rate = DIV_ROUND_CLOSEST(tmp_parent_rate, div);
-			if (tmp_rate < rate)
-				tmp_diff = rate - tmp_rate;
+			if (tmp_rate < req->rate)
+				tmp_diff = req->rate - tmp_rate;
 			else
-				tmp_diff = tmp_rate - rate;
+				tmp_diff = tmp_rate - req->rate;
 
 			if (best_diff < 0 || best_diff > tmp_diff) {
 				best_rate = tmp_rate;
 				best_diff = tmp_diff;
-				*best_parent_rate = tmp_parent_rate;
-				*best_parent_hw = __clk_get_hw(parent);
+				req->best_parent_rate = tmp_parent_rate;
+				req->best_parent_hw = __clk_get_hw(parent);
 			}
 
-			if (!best_diff || tmp_rate < rate)
+			if (!best_diff || tmp_rate < req->rate)
 				break;
 		}
 
@@ -104,7 +100,11 @@ static long at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
 			break;
 	}
 
-	return best_rate;
+	if (best_rate < 0)
+		return best_rate;
+
+	req->rate = best_rate;
+	return 0;
 }
 
 static int at91sam9x5_clk_usb_set_parent(struct clk_hw *hw, u8 index)
diff --git a/drivers/clk/bcm/clk-kona.c b/drivers/clk/bcm/clk-kona.c
index 79a98506c433..d9c039c1902c 100644
--- a/drivers/clk/bcm/clk-kona.c
+++ b/drivers/clk/bcm/clk-kona.c
@@ -1017,10 +1017,8 @@ static long kona_peri_clk_round_rate(struct clk_hw *hw, unsigned long rate,
 				rate ? rate : 1, *parent_rate, NULL);
 }
 
-static long kona_peri_clk_determine_rate(struct clk_hw *hw, unsigned long rate,
-		unsigned long min_rate,
-		unsigned long max_rate,
-		unsigned long *best_parent_rate, struct clk_hw **best_parent)
+static int kona_peri_clk_determine_rate(struct clk_hw *hw,
+					struct clk_rate_request *req)
 {
 	struct kona_clk *bcm_clk = to_kona_clk(hw);
 	struct clk *clk = hw->clk;
@@ -1029,6 +1027,7 @@ static long kona_peri_clk_determine_rate(struct clk_hw *hw, unsigned long rate,
 	unsigned long best_delta;
 	unsigned long best_rate;
 	u32 parent_count;
+	long rate;
 	u32 which;
 
 	/*
@@ -1037,14 +1036,21 @@ static long kona_peri_clk_determine_rate(struct clk_hw *hw, unsigned long rate,
 	 */
 	WARN_ON_ONCE(bcm_clk->init_data.flags & CLK_SET_RATE_NO_REPARENT);
 	parent_count = (u32)bcm_clk->init_data.num_parents;
-	if (parent_count < 2)
-		return kona_peri_clk_round_rate(hw, rate, best_parent_rate);
+	if (parent_count < 2) {
+		rate = kona_peri_clk_round_rate(hw, req->rate,
+						&req->best_parent_rate);
+		if (rate < 0)
+			return rate;
+
+		req->rate = rate;
+		return 0;
+	}
 
 	/* Unless we can do better, stick with current parent */
 	current_parent = clk_get_parent(clk);
 	parent_rate = __clk_get_rate(current_parent);
-	best_rate = kona_peri_clk_round_rate(hw, rate, &parent_rate);
-	best_delta = abs(best_rate - rate);
+	best_rate = kona_peri_clk_round_rate(hw, req->rate, &parent_rate);
+	best_delta = abs(best_rate - req->rate);
 
 	/* Check whether any other parent clock can produce a better result */
 	for (which = 0; which < parent_count; which++) {
@@ -1058,17 +1064,19 @@ static long kona_peri_clk_determine_rate(struct clk_hw *hw, unsigned long rate,
 
 		/* We don't support CLK_SET_RATE_PARENT */
 		parent_rate = __clk_get_rate(parent);
-		other_rate = kona_peri_clk_round_rate(hw, rate, &parent_rate);
-		delta = abs(other_rate - rate);
+		other_rate = kona_peri_clk_round_rate(hw, req->rate,
+						      &parent_rate);
+		delta = abs(other_rate - req->rate);
 		if (delta < best_delta) {
 			best_delta = delta;
 			best_rate = other_rate;
-			*best_parent = __clk_get_hw(parent);
-			*best_parent_rate = parent_rate;
+			req->best_parent_hw = __clk_get_hw(parent);
+			req->best_parent_rate = parent_rate;
 		}
 	}
 
-	return best_rate;
+	req->rate = best_rate;
+	return 0;
 }
 
 static int kona_peri_clk_set_parent(struct clk_hw *hw, u8 index)
diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c
index 616f5aef3c26..9e69f346ecc6 100644
--- a/drivers/clk/clk-composite.c
+++ b/drivers/clk/clk-composite.c
@@ -55,11 +55,8 @@ static unsigned long clk_composite_recalc_rate(struct clk_hw *hw,
 	return rate_ops->recalc_rate(rate_hw, parent_rate);
 }
 
-static long clk_composite_determine_rate(struct clk_hw *hw, unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_p)
+static int clk_composite_determine_rate(struct clk_hw *hw,
+					struct clk_rate_request *req)
 {
 	struct clk_composite *composite = to_clk_composite(hw);
 	const struct clk_ops *rate_ops = composite->rate_ops;
@@ -71,25 +68,28 @@ static long clk_composite_determine_rate(struct clk_hw *hw, unsigned long rate,
 	long tmp_rate, best_rate = 0;
 	unsigned long rate_diff;
 	unsigned long best_rate_diff = ULONG_MAX;
+	long rate;
 	int i;
 
 	if (rate_hw && rate_ops && rate_ops->determine_rate) {
 		__clk_hw_set_clk(rate_hw, hw);
-		return rate_ops->determine_rate(rate_hw, rate, min_rate,
-						max_rate,
-						best_parent_rate,
-						best_parent_p);
+		return rate_ops->determine_rate(rate_hw, req);
 	} else if (rate_hw && rate_ops && rate_ops->round_rate &&
 		   mux_hw && mux_ops && mux_ops->set_parent) {
-		*best_parent_p = NULL;
+		req->best_parent_hw = NULL;
 
 		if (__clk_get_flags(hw->clk) & CLK_SET_RATE_NO_REPARENT) {
 			parent = clk_get_parent(mux_hw->clk);
-			*best_parent_p = __clk_get_hw(parent);
-			*best_parent_rate = __clk_get_rate(parent);
+			req->best_parent_hw = __clk_get_hw(parent);
+			req->best_parent_rate = __clk_get_rate(parent);
 
-			return rate_ops->round_rate(rate_hw, rate,
-						    best_parent_rate);
+			rate = rate_ops->round_rate(rate_hw, req->rate,
+						    &req->best_parent_rate);
+			if (rate < 0)
+				return rate;
+
+			req->rate = rate;
+			return 0;
 		}
 
 		for (i = 0; i < __clk_get_num_parents(mux_hw->clk); i++) {
@@ -99,33 +99,33 @@ static long clk_composite_determine_rate(struct clk_hw *hw, unsigned long rate,
 
 			parent_rate = __clk_get_rate(parent);
 
-			tmp_rate = rate_ops->round_rate(rate_hw, rate,
+			tmp_rate = rate_ops->round_rate(rate_hw, req->rate,
 							&parent_rate);
 			if (tmp_rate < 0)
 				continue;
 
-			rate_diff = abs(rate - tmp_rate);
+			rate_diff = abs(req->rate - tmp_rate);
 
-			if (!rate_diff || !*best_parent_p
+			if (!rate_diff || !req->best_parent_hw
 				       || best_rate_diff > rate_diff) {
-				*best_parent_p = __clk_get_hw(parent);
-				*best_parent_rate = parent_rate;
+				req->best_parent_hw = __clk_get_hw(parent);
+				req->best_parent_rate = parent_rate;
 				best_rate_diff = rate_diff;
 				best_rate = tmp_rate;
 			}
 
 			if (!rate_diff)
-				return rate;
+				return 0;
 		}
 
-		return best_rate;
+		req->rate = best_rate;
+		return 0;
 	} else if (mux_hw && mux_ops && mux_ops->determine_rate) {
 		__clk_hw_set_clk(mux_hw, hw);
-		return mux_ops->determine_rate(mux_hw, rate, min_rate,
-					       max_rate, best_parent_rate,
-					       best_parent_p);
+		return mux_ops->determine_rate(mux_hw, req);
 	} else {
 		pr_err("clk: clk_composite_determine_rate function called, but no mux or rate callback set!\n");
+		req->rate = 0;
 		return 0;
 	}
 }
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index ddb4b541016f..4e9ff928ef88 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -436,28 +436,31 @@ static bool mux_is_better_rate(unsigned long rate, unsigned long now,
 	return now <= rate && now > best;
 }
 
-static long
-clk_mux_determine_rate_flags(struct clk_hw *hw, unsigned long rate,
-			     unsigned long min_rate,
-			     unsigned long max_rate,
-			     unsigned long *best_parent_rate,
-			     struct clk_hw **best_parent_p,
+static int
+clk_mux_determine_rate_flags(struct clk_hw *hw, struct clk_rate_request *req,
 			     unsigned long flags)
 {
 	struct clk_core *core = hw->core, *parent, *best_parent = NULL;
-	int i, num_parents;
-	unsigned long parent_rate, best = 0;
+	int i, num_parents, ret;
+	unsigned long best = 0;
+	struct clk_rate_request parent_req = *req;
 
 	/* if NO_REPARENT flag set, pass through to current parent */
 	if (core->flags & CLK_SET_RATE_NO_REPARENT) {
 		parent = core->parent;
-		if (core->flags & CLK_SET_RATE_PARENT)
-			best = __clk_determine_rate(parent ? parent->hw : NULL,
-						    rate, min_rate, max_rate);
-		else if (parent)
+		if (core->flags & CLK_SET_RATE_PARENT) {
+			ret = __clk_determine_rate(parent ? parent->hw : NULL,
+						   &parent_req);
+			if (ret)
+				return ret;
+
+			best = parent_req.rate;
+		} else if (parent) {
 			best = clk_core_get_rate_nolock(parent);
-		else
+		} else {
 			best = clk_core_get_rate_nolock(core);
+		}
+
 		goto out;
 	}
 
@@ -467,24 +470,30 @@ clk_mux_determine_rate_flags(struct clk_hw *hw, unsigned long rate,
 		parent = clk_core_get_parent_by_index(core, i);
 		if (!parent)
 			continue;
-		if (core->flags & CLK_SET_RATE_PARENT)
-			parent_rate = __clk_determine_rate(parent->hw, rate,
-							   min_rate,
-							   max_rate);
-		else
-			parent_rate = clk_core_get_rate_nolock(parent);
-		if (mux_is_better_rate(rate, parent_rate, best, flags)) {
+
+		if (core->flags & CLK_SET_RATE_PARENT) {
+			parent_req = *req;
+			ret = __clk_determine_rate(parent->hw, &parent_req);
+			if (ret)
+				continue;
+		} else {
+			parent_req.rate = clk_core_get_rate_nolock(parent);
+		}
+
+		if (mux_is_better_rate(req->rate, parent_req.rate,
+				       best, flags)) {
 			best_parent = parent;
-			best = parent_rate;
+			best = parent_req.rate;
 		}
 	}
 
 out:
 	if (best_parent)
-		*best_parent_p = best_parent->hw;
-	*best_parent_rate = best;
+		req->best_parent_hw = best_parent->hw;
+	req->best_parent_rate = best;
+	req->rate = best;
 
-	return best;
+	return 0;
 }
 
 struct clk *__clk_lookup(const char *name)
@@ -515,28 +524,17 @@ static void clk_core_get_boundaries(struct clk_core *core,
  * directly as a determine_rate callback (e.g. for a mux), or from a more
  * complex clock that may combine a mux with other operations.
  */
-long __clk_mux_determine_rate(struct clk_hw *hw, unsigned long rate,
-			      unsigned long min_rate,
-			      unsigned long max_rate,
-			      unsigned long *best_parent_rate,
-			      struct clk_hw **best_parent_p)
+int __clk_mux_determine_rate(struct clk_hw *hw,
+			     struct clk_rate_request *req)
 {
-	return clk_mux_determine_rate_flags(hw, rate, min_rate, max_rate,
-					    best_parent_rate,
-					    best_parent_p, 0);
+	return clk_mux_determine_rate_flags(hw, req, 0);
 }
 EXPORT_SYMBOL_GPL(__clk_mux_determine_rate);
 
-long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate,
-			      unsigned long min_rate,
-			      unsigned long max_rate,
-			      unsigned long *best_parent_rate,
-			      struct clk_hw **best_parent_p)
+int __clk_mux_determine_rate_closest(struct clk_hw *hw,
+				     struct clk_rate_request *req)
 {
-	return clk_mux_determine_rate_flags(hw, rate, min_rate, max_rate,
-					    best_parent_rate,
-					    best_parent_p,
-					    CLK_MUX_ROUND_CLOSEST);
+	return clk_mux_determine_rate_flags(hw, req, CLK_MUX_ROUND_CLOSEST);
 }
 EXPORT_SYMBOL_GPL(__clk_mux_determine_rate_closest);
 
@@ -759,14 +757,11 @@ int clk_enable(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_enable);
 
-static unsigned long clk_core_round_rate_nolock(struct clk_core *core,
-						unsigned long rate,
-						unsigned long min_rate,
-						unsigned long max_rate)
+static int clk_core_round_rate_nolock(struct clk_core *core,
+				      struct clk_rate_request *req)
 {
-	unsigned long parent_rate = 0;
 	struct clk_core *parent;
-	struct clk_hw *parent_hw;
+	long rate;
 
 	lockdep_assert_held(&prepare_lock);
 
@@ -774,21 +769,30 @@ static unsigned long clk_core_round_rate_nolock(struct clk_core *core,
 		return 0;
 
 	parent = core->parent;
-	if (parent)
-		parent_rate = parent->rate;
+	if (parent) {
+		req->best_parent_hw = parent->hw;
+		req->best_parent_rate = parent->rate;
+	} else {
+		req->best_parent_hw = NULL;
+		req->best_parent_rate = 0;
+	}
 
 	if (core->ops->determine_rate) {
-		parent_hw = parent ? parent->hw : NULL;
-		return core->ops->determine_rate(core->hw, rate,
-						min_rate, max_rate,
-						&parent_rate, &parent_hw);
-	} else if (core->ops->round_rate)
-		return core->ops->round_rate(core->hw, rate, &parent_rate);
-	else if (core->flags & CLK_SET_RATE_PARENT)
-		return clk_core_round_rate_nolock(core->parent, rate, min_rate,
-						  max_rate);
-	else
-		return core->rate;
+		return core->ops->determine_rate(core->hw, req);
+	} else if (core->ops->round_rate) {
+		rate = core->ops->round_rate(core->hw, req->rate,
+					     &req->best_parent_rate);
+		if (rate < 0)
+			return rate;
+
+		req->rate = rate;
+	} else if (core->flags & CLK_SET_RATE_PARENT) {
+		return clk_core_round_rate_nolock(parent, req);
+	} else {
+		req->rate = core->rate;
+	}
+
+	return 0;
 }
 
 /**
@@ -800,15 +804,14 @@ static unsigned long clk_core_round_rate_nolock(struct clk_core *core,
  *
  * Useful for clk_ops such as .set_rate and .determine_rate.
  */
-unsigned long __clk_determine_rate(struct clk_hw *hw,
-				   unsigned long rate,
-				   unsigned long min_rate,
-				   unsigned long max_rate)
+int __clk_determine_rate(struct clk_hw *hw, struct clk_rate_request *req)
 {
-	if (!hw)
+	if (!hw) {
+		req->rate = 0;
 		return 0;
+	}
 
-	return clk_core_round_rate_nolock(hw->core, rate, min_rate, max_rate);
+	return clk_core_round_rate_nolock(hw->core, req);
 }
 EXPORT_SYMBOL_GPL(__clk_determine_rate);
 
@@ -821,15 +824,20 @@ EXPORT_SYMBOL_GPL(__clk_determine_rate);
  */
 unsigned long __clk_round_rate(struct clk *clk, unsigned long rate)
 {
-	unsigned long min_rate;
-	unsigned long max_rate;
+	struct clk_rate_request req;
+	int ret;
 
 	if (!clk)
 		return 0;
 
-	clk_core_get_boundaries(clk->core, &min_rate, &max_rate);
+	clk_core_get_boundaries(clk->core, &req.min_rate, &req.max_rate);
+	req.rate = rate;
+
+	ret = clk_core_round_rate_nolock(clk->core, &req);
+	if (ret)
+		return 0;
 
-	return clk_core_round_rate_nolock(clk->core, rate, min_rate, max_rate);
+	return req.rate;
 }
 EXPORT_SYMBOL_GPL(__clk_round_rate);
 
@@ -1249,7 +1257,6 @@ static struct clk_core *clk_calc_new_rates(struct clk_core *core,
 {
 	struct clk_core *top = core;
 	struct clk_core *old_parent, *parent;
-	struct clk_hw *parent_hw;
 	unsigned long best_parent_rate = 0;
 	unsigned long new_rate;
 	unsigned long min_rate;
@@ -1270,20 +1277,29 @@ static struct clk_core *clk_calc_new_rates(struct clk_core *core,
 
 	/* find the closest rate and parent clk/rate */
 	if (core->ops->determine_rate) {
-		parent_hw = parent ? parent->hw : NULL;
-		ret = core->ops->determine_rate(core->hw, rate,
-					       min_rate,
-					       max_rate,
-					       &best_parent_rate,
-					       &parent_hw);
+		struct clk_rate_request req;
+
+		req.rate = rate;
+		req.min_rate = min_rate;
+		req.max_rate = max_rate;
+		if (parent) {
+			req.best_parent_hw = parent->hw;
+			req.best_parent_rate = parent->rate;
+		} else {
+			req.best_parent_hw = NULL;
+			req.best_parent_rate = 0;
+		}
+
+		ret = core->ops->determine_rate(core->hw, &req);
 		if (ret < 0)
 			return NULL;
 
-		new_rate = ret;
-		parent = parent_hw ? parent_hw->core : NULL;
+		best_parent_rate = req.best_parent_rate;
+		new_rate = req.rate;
+		parent = req.best_parent_hw ? req.best_parent_hw->core : NULL;
 	} else if (core->ops->round_rate) {
 		ret = core->ops->round_rate(core->hw, rate,
-					   &best_parent_rate);
+					    &best_parent_rate);
 		if (ret < 0)
 			return NULL;
 
diff --git a/drivers/clk/hisilicon/clk-hi3620.c b/drivers/clk/hisilicon/clk-hi3620.c
index 715d34a5ef9b..a0674ba6659e 100644
--- a/drivers/clk/hisilicon/clk-hi3620.c
+++ b/drivers/clk/hisilicon/clk-hi3620.c
@@ -294,34 +294,29 @@ static unsigned long mmc_clk_recalc_rate(struct clk_hw *hw,
 	}
 }
 
-static long mmc_clk_determine_rate(struct clk_hw *hw, unsigned long rate,
-			      unsigned long min_rate,
-			      unsigned long max_rate,
-			      unsigned long *best_parent_rate,
-			      struct clk_hw **best_parent_p)
+static int mmc_clk_determine_rate(struct clk_hw *hw,
+				  struct clk_rate_request *req)
 {
 	struct clk_mmc *mclk = to_mmc(hw);
-	unsigned long best = 0;
 
-	if ((rate <= 13000000) && (mclk->id == HI3620_MMC_CIUCLK1)) {
-		rate = 13000000;
-		best = 26000000;
-	} else if (rate <= 26000000) {
-		rate = 25000000;
-		best = 180000000;
-	} else if (rate <= 52000000) {
-		rate = 50000000;
-		best = 360000000;
-	} else if (rate <= 100000000) {
-		rate = 100000000;
-		best = 720000000;
+	if ((req->rate <= 13000000) && (mclk->id == HI3620_MMC_CIUCLK1)) {
+		req->rate = 13000000;
+		req->best_parent_rate = 26000000;
+	} else if (req->rate <= 26000000) {
+		req->rate = 25000000;
+		req->best_parent_rate = 180000000;
+	} else if (req->rate <= 52000000) {
+		req->rate = 50000000;
+		req->best_parent_rate = 360000000;
+	} else if (req->rate <= 100000000) {
+		req->rate = 100000000;
+		req->best_parent_rate = 720000000;
 	} else {
 		/* max is 180M */
-		rate = 180000000;
-		best = 1440000000;
+		req->rate = 180000000;
+		req->best_parent_rate = 1440000000;
 	}
-	*best_parent_rate = best;
-	return rate;
+	return 0;
 }
 
 static u32 mmc_clk_delay(u32 val, u32 para, u32 off, u32 len)
diff --git a/drivers/clk/mmp/clk-mix.c b/drivers/clk/mmp/clk-mix.c
index de6a873175d2..7a37432761f9 100644
--- a/drivers/clk/mmp/clk-mix.c
+++ b/drivers/clk/mmp/clk-mix.c
@@ -201,11 +201,8 @@ error:
 	return ret;
 }
 
-static long mmp_clk_mix_determine_rate(struct clk_hw *hw, unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk)
+static int mmp_clk_mix_determine_rate(struct clk_hw *hw,
+				      struct clk_rate_request *req)
 {
 	struct mmp_clk_mix *mix = to_clk_mix(hw);
 	struct mmp_clk_mix_clk_table *item;
@@ -221,7 +218,7 @@ static long mmp_clk_mix_determine_rate(struct clk_hw *hw, unsigned long rate,
 	parent = NULL;
 	mix_rate_best = 0;
 	parent_rate_best = 0;
-	gap_best = rate;
+	gap_best = req->rate;
 	parent_best = NULL;
 
 	if (mix->table) {
@@ -233,7 +230,7 @@ static long mmp_clk_mix_determine_rate(struct clk_hw *hw, unsigned long rate,
 							item->parent_index);
 			parent_rate = __clk_get_rate(parent);
 			mix_rate = parent_rate / item->divisor;
-			gap = abs(mix_rate - rate);
+			gap = abs(mix_rate - req->rate);
 			if (parent_best == NULL || gap < gap_best) {
 				parent_best = parent;
 				parent_rate_best = parent_rate;
@@ -251,7 +248,7 @@ static long mmp_clk_mix_determine_rate(struct clk_hw *hw, unsigned long rate,
 			for (j = 0; j < div_val_max; j++) {
 				div = _get_div(mix, j);
 				mix_rate = parent_rate / div;
-				gap = abs(mix_rate - rate);
+				gap = abs(mix_rate - req->rate);
 				if (parent_best == NULL || gap < gap_best) {
 					parent_best = parent;
 					parent_rate_best = parent_rate;
@@ -265,10 +262,11 @@ static long mmp_clk_mix_determine_rate(struct clk_hw *hw, unsigned long rate,
 	}
 
 found:
-	*best_parent_rate = parent_rate_best;
-	*best_parent_clk = __clk_get_hw(parent_best);
+	req->best_parent_rate = parent_rate_best;
+	req->best_parent_hw = __clk_get_hw(parent_best);
+	req->rate = mix_rate_best;
 
-	return mix_rate_best;
+	return 0;
 }
 
 static int mmp_clk_mix_set_rate_and_parent(struct clk_hw *hw,
diff --git a/drivers/clk/qcom/clk-pll.c b/drivers/clk/qcom/clk-pll.c
index 245d5063a385..6017a76b47c8 100644
--- a/drivers/clk/qcom/clk-pll.c
+++ b/drivers/clk/qcom/clk-pll.c
@@ -135,19 +135,23 @@ struct pll_freq_tbl *find_freq(const struct pll_freq_tbl *f, unsigned long rate)
 	return NULL;
 }
 
-static long
-clk_pll_determine_rate(struct clk_hw *hw, unsigned long rate,
-		       unsigned long min_rate, unsigned long max_rate,
-		       unsigned long *p_rate, struct clk_hw **p)
+static int
+clk_pll_determine_rate(struct clk_hw *hw, struct clk_rate_request *req)
 {
+	struct clk *parent = __clk_get_parent(hw->clk);
 	struct clk_pll *pll = to_clk_pll(hw);
 	const struct pll_freq_tbl *f;
 
-	f = find_freq(pll->freq_tbl, rate);
+	req->best_parent_hw = __clk_get_hw(parent);
+	req->best_parent_rate = __clk_get_rate(parent);
+
+	f = find_freq(pll->freq_tbl, req->rate);
 	if (!f)
-		return clk_pll_recalc_rate(hw, *p_rate);
+		req->rate = clk_pll_recalc_rate(hw, req->best_parent_rate);
+	else
+		req->rate = f->freq;
 
-	return f->freq;
+	return 0;
 }
 
 static int
diff --git a/drivers/clk/qcom/clk-rcg.c b/drivers/clk/qcom/clk-rcg.c
index 7b3d62674203..2bc42bb21b3d 100644
--- a/drivers/clk/qcom/clk-rcg.c
+++ b/drivers/clk/qcom/clk-rcg.c
@@ -404,13 +404,11 @@ clk_dyn_rcg_recalc_rate(struct clk_hw *hw, unsigned long parent_rate)
 	return calc_rate(parent_rate, m, n, mode, pre_div);
 }
 
-static long _freq_tbl_determine_rate(struct clk_hw *hw,
-		const struct freq_tbl *f, unsigned long rate,
-		unsigned long min_rate, unsigned long max_rate,
-		unsigned long *p_rate, struct clk_hw **p_hw,
+static int _freq_tbl_determine_rate(struct clk_hw *hw, const struct freq_tbl *f,
+		struct clk_rate_request *req,
 		const struct parent_map *parent_map)
 {
-	unsigned long clk_flags;
+	unsigned long clk_flags, rate = req->rate;
 	struct clk *p;
 	int index;
 
@@ -435,25 +433,24 @@ static long _freq_tbl_determine_rate(struct clk_hw *hw,
 	} else {
 		rate =  __clk_get_rate(p);
 	}
-	*p_hw = __clk_get_hw(p);
-	*p_rate = rate;
+	req->best_parent_hw = __clk_get_hw(p);
+	req->best_parent_rate = rate;
+	req->rate = f->freq;
 
-	return f->freq;
+	return 0;
 }
 
-static long clk_rcg_determine_rate(struct clk_hw *hw, unsigned long rate,
-		unsigned long min_rate, unsigned long max_rate,
-		unsigned long *p_rate, struct clk_hw **p)
+static int clk_rcg_determine_rate(struct clk_hw *hw,
+				  struct clk_rate_request *req)
 {
 	struct clk_rcg *rcg = to_clk_rcg(hw);
 
-	return _freq_tbl_determine_rate(hw, rcg->freq_tbl, rate, min_rate,
-			max_rate, p_rate, p, rcg->s.parent_map);
+	return _freq_tbl_determine_rate(hw, rcg->freq_tbl, req,
+					rcg->s.parent_map);
 }
 
-static long clk_dyn_rcg_determine_rate(struct clk_hw *hw, unsigned long rate,
-		unsigned long min_rate, unsigned long max_rate,
-		unsigned long *p_rate, struct clk_hw **p)
+static int clk_dyn_rcg_determine_rate(struct clk_hw *hw,
+				      struct clk_rate_request *req)
 {
 	struct clk_dyn_rcg *rcg = to_clk_dyn_rcg(hw);
 	u32 reg;
@@ -464,13 +461,11 @@ static long clk_dyn_rcg_determine_rate(struct clk_hw *hw, unsigned long rate,
 	bank = reg_to_bank(rcg, reg);
 	s = &rcg->s[bank];
 
-	return _freq_tbl_determine_rate(hw, rcg->freq_tbl, rate, min_rate,
-			max_rate, p_rate, p, s->parent_map);
+	return _freq_tbl_determine_rate(hw, rcg->freq_tbl, req, s->parent_map);
 }
 
-static long clk_rcg_bypass_determine_rate(struct clk_hw *hw, unsigned long rate,
-		unsigned long min_rate, unsigned long max_rate,
-		unsigned long *p_rate, struct clk_hw **p_hw)
+static int clk_rcg_bypass_determine_rate(struct clk_hw *hw,
+					 struct clk_rate_request *req)
 {
 	struct clk_rcg *rcg = to_clk_rcg(hw);
 	const struct freq_tbl *f = rcg->freq_tbl;
@@ -478,10 +473,11 @@ static long clk_rcg_bypass_determine_rate(struct clk_hw *hw, unsigned long rate,
 	int index = qcom_find_src_index(hw, rcg->s.parent_map, f->src);
 
 	p = clk_get_parent_by_index(hw->clk, index);
-	*p_hw = __clk_get_hw(p);
-	*p_rate = __clk_round_rate(p, rate);
+	req->best_parent_hw = __clk_get_hw(p);
+	req->best_parent_rate = __clk_round_rate(p, req->rate);
+	req->rate = req->best_parent_rate;
 
-	return *p_rate;
+	return 0;
 }
 
 static int __clk_rcg_set_rate(struct clk_rcg *rcg, const struct freq_tbl *f)
diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c
index b95d17fbb8d7..aa6c3bdac040 100644
--- a/drivers/clk/qcom/clk-rcg2.c
+++ b/drivers/clk/qcom/clk-rcg2.c
@@ -176,11 +176,10 @@ clk_rcg2_recalc_rate(struct clk_hw *hw, unsigned long parent_rate)
 	return calc_rate(parent_rate, m, n, mode, hid_div);
 }
 
-static long _freq_tbl_determine_rate(struct clk_hw *hw,
-		const struct freq_tbl *f, unsigned long rate,
-		unsigned long *p_rate, struct clk_hw **p_hw)
+static int _freq_tbl_determine_rate(struct clk_hw *hw,
+		const struct freq_tbl *f, struct clk_rate_request *req)
 {
-	unsigned long clk_flags;
+	unsigned long clk_flags, rate = req->rate;
 	struct clk *p;
 	struct clk_rcg2 *rcg = to_clk_rcg2(hw);
 	int index;
@@ -210,19 +209,19 @@ static long _freq_tbl_determine_rate(struct clk_hw *hw,
 	} else {
 		rate =  __clk_get_rate(p);
 	}
-	*p_hw = __clk_get_hw(p);
-	*p_rate = rate;
+	req->best_parent_hw = __clk_get_hw(p);
+	req->best_parent_rate = rate;
+	req->rate = f->freq;
 
-	return f->freq;
+	return 0;
 }
 
-static long clk_rcg2_determine_rate(struct clk_hw *hw, unsigned long rate,
-		unsigned long min_rate, unsigned long max_rate,
-		unsigned long *p_rate, struct clk_hw **p)
+static int clk_rcg2_determine_rate(struct clk_hw *hw,
+				   struct clk_rate_request *req)
 {
 	struct clk_rcg2 *rcg = to_clk_rcg2(hw);
 
-	return _freq_tbl_determine_rate(hw, rcg->freq_tbl, rate, p_rate, p);
+	return _freq_tbl_determine_rate(hw, rcg->freq_tbl, req);
 }
 
 static int clk_rcg2_configure(struct clk_rcg2 *rcg, const struct freq_tbl *f)
@@ -374,35 +373,34 @@ static int clk_edp_pixel_set_rate_and_parent(struct clk_hw *hw,
 	return clk_edp_pixel_set_rate(hw, rate, parent_rate);
 }
 
-static long clk_edp_pixel_determine_rate(struct clk_hw *hw, unsigned long rate,
-				 unsigned long min_rate,
-				 unsigned long max_rate,
-				 unsigned long *p_rate, struct clk_hw **p)
+static int clk_edp_pixel_determine_rate(struct clk_hw *hw,
+					struct clk_rate_request *req)
 {
 	struct clk_rcg2 *rcg = to_clk_rcg2(hw);
 	const struct freq_tbl *f = rcg->freq_tbl;
 	const struct frac_entry *frac;
 	int delta = 100000;
-	s64 src_rate = *p_rate;
 	s64 request;
 	u32 mask = BIT(rcg->hid_width) - 1;
 	u32 hid_div;
 	int index = qcom_find_src_index(hw, rcg->parent_map, f->src);
+	struct clk *p = clk_get_parent_by_index(hw->clk, index);
 
 	/* Force the correct parent */
-	*p = __clk_get_hw(clk_get_parent_by_index(hw->clk, index));
+	req->best_parent_hw = __clk_get_hw(p);
+	req->best_parent_rate = __clk_get_rate(p);
 
-	if (src_rate == 810000000)
+	if (req->best_parent_rate == 810000000)
 		frac = frac_table_810m;
 	else
 		frac = frac_table_675m;
 
 	for (; frac->num; frac++) {
-		request = rate;
+		request = req->rate;
 		request *= frac->den;
 		request = div_s64(request, frac->num);
-		if ((src_rate < (request - delta)) ||
-		    (src_rate > (request + delta)))
+		if ((req->best_parent_rate < (request - delta)) ||
+		    (req->best_parent_rate > (request + delta)))
 			continue;
 
 		regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + CFG_REG,
@@ -410,8 +408,10 @@ static long clk_edp_pixel_determine_rate(struct clk_hw *hw, unsigned long rate,
 		hid_div >>= CFG_SRC_DIV_SHIFT;
 		hid_div &= mask;
 
-		return calc_rate(src_rate, frac->num, frac->den, !!frac->den,
-				 hid_div);
+		req->rate = calc_rate(req->best_parent_rate,
+				      frac->num, frac->den,
+				      !!frac->den, hid_div);
+		return 0;
 	}
 
 	return -EINVAL;
@@ -428,9 +428,8 @@ const struct clk_ops clk_edp_pixel_ops = {
 };
 EXPORT_SYMBOL_GPL(clk_edp_pixel_ops);
 
-static long clk_byte_determine_rate(struct clk_hw *hw, unsigned long rate,
-			 unsigned long min_rate, unsigned long max_rate,
-			 unsigned long *p_rate, struct clk_hw **p_hw)
+static int clk_byte_determine_rate(struct clk_hw *hw,
+				   struct clk_rate_request *req)
 {
 	struct clk_rcg2 *rcg = to_clk_rcg2(hw);
 	const struct freq_tbl *f = rcg->freq_tbl;
@@ -439,17 +438,19 @@ static long clk_byte_determine_rate(struct clk_hw *hw, unsigned long rate,
 	u32 mask = BIT(rcg->hid_width) - 1;
 	struct clk *p;
 
-	if (rate == 0)
+	if (req->rate == 0)
 		return -EINVAL;
 
 	p = clk_get_parent_by_index(hw->clk, index);
-	*p_hw = __clk_get_hw(p);
-	*p_rate = parent_rate = __clk_round_rate(p, rate);
+	req->best_parent_hw = __clk_get_hw(p);
+	req->best_parent_rate = parent_rate = __clk_round_rate(p, req->rate);
 
-	div = DIV_ROUND_UP((2 * parent_rate), rate) - 1;
+	div = DIV_ROUND_UP((2 * parent_rate), req->rate) - 1;
 	div = min_t(u32, div, mask);
 
-	return calc_rate(parent_rate, 0, 0, 0, div);
+	req->rate = calc_rate(parent_rate, 0, 0, 0, div);
+
+	return 0;
 }
 
 static int clk_byte_set_rate(struct clk_hw *hw, unsigned long rate,
@@ -494,10 +495,8 @@ static const struct frac_entry frac_table_pixel[] = {
 	{ }
 };
 
-static long clk_pixel_determine_rate(struct clk_hw *hw, unsigned long rate,
-				 unsigned long min_rate,
-				 unsigned long max_rate,
-				 unsigned long *p_rate, struct clk_hw **p)
+static int clk_pixel_determine_rate(struct clk_hw *hw,
+				    struct clk_rate_request *req)
 {
 	struct clk_rcg2 *rcg = to_clk_rcg2(hw);
 	unsigned long request, src_rate;
@@ -507,18 +506,19 @@ static long clk_pixel_determine_rate(struct clk_hw *hw, unsigned long rate,
 	int index = qcom_find_src_index(hw, rcg->parent_map, f->src);
 	struct clk *parent = clk_get_parent_by_index(hw->clk, index);
 
-	*p = __clk_get_hw(parent);
+	req->best_parent_hw = __clk_get_hw(parent);
 
 	for (; frac->num; frac++) {
-		request = (rate * frac->den) / frac->num;
+		request = (req->rate * frac->den) / frac->num;
 
 		src_rate = __clk_round_rate(parent, request);
 		if ((src_rate < (request - delta)) ||
 			(src_rate > (request + delta)))
 			continue;
 
-		*p_rate = src_rate;
-		return (src_rate * frac->num) / frac->den;
+		req->best_parent_rate = src_rate;
+		req->rate = (src_rate * frac->num) / frac->den;
+		return 0;
 	}
 
 	return -EINVAL;
diff --git a/drivers/clk/sunxi/clk-factors.c b/drivers/clk/sunxi/clk-factors.c
index 8c20190a3e9f..7a485870991d 100644
--- a/drivers/clk/sunxi/clk-factors.c
+++ b/drivers/clk/sunxi/clk-factors.c
@@ -79,11 +79,8 @@ static long clk_factors_round_rate(struct clk_hw *hw, unsigned long rate,
 	return rate;
 }
 
-static long clk_factors_determine_rate(struct clk_hw *hw, unsigned long rate,
-				       unsigned long min_rate,
-				       unsigned long max_rate,
-				       unsigned long *best_parent_rate,
-				       struct clk_hw **best_parent_p)
+static int clk_factors_determine_rate(struct clk_hw *hw,
+				      struct clk_rate_request *req)
 {
 	struct clk *clk = hw->clk, *parent, *best_parent = NULL;
 	int i, num_parents;
@@ -96,13 +93,14 @@ static long clk_factors_determine_rate(struct clk_hw *hw, unsigned long rate,
 		if (!parent)
 			continue;
 		if (__clk_get_flags(clk) & CLK_SET_RATE_PARENT)
-			parent_rate = __clk_round_rate(parent, rate);
+			parent_rate = __clk_round_rate(parent, req->rate);
 		else
 			parent_rate = __clk_get_rate(parent);
 
-		child_rate = clk_factors_round_rate(hw, rate, &parent_rate);
+		child_rate = clk_factors_round_rate(hw, req->rate,
+						    &parent_rate);
 
-		if (child_rate <= rate && child_rate > best_child_rate) {
+		if (child_rate <= req->rate && child_rate > best_child_rate) {
 			best_parent = parent;
 			best = parent_rate;
 			best_child_rate = child_rate;
@@ -110,10 +108,11 @@ static long clk_factors_determine_rate(struct clk_hw *hw, unsigned long rate,
 	}
 
 	if (best_parent)
-		*best_parent_p = __clk_get_hw(best_parent);
-	*best_parent_rate = best;
+		req->best_parent_hw = __clk_get_hw(best_parent);
+	req->best_parent_rate = best;
+	req->rate = best_child_rate;
 
-	return best_child_rate;
+	return 0;
 }
 
 static int clk_factors_set_rate(struct clk_hw *hw, unsigned long rate,
diff --git a/drivers/clk/sunxi/clk-sun6i-ar100.c b/drivers/clk/sunxi/clk-sun6i-ar100.c
index 63cf149195ae..d70c1ea345db 100644
--- a/drivers/clk/sunxi/clk-sun6i-ar100.c
+++ b/drivers/clk/sunxi/clk-sun6i-ar100.c
@@ -44,17 +44,14 @@ static unsigned long ar100_recalc_rate(struct clk_hw *hw,
 	return (parent_rate >> shift) / (div + 1);
 }
 
-static long ar100_determine_rate(struct clk_hw *hw, unsigned long rate,
-				 unsigned long min_rate,
-				 unsigned long max_rate,
-				 unsigned long *best_parent_rate,
-				 struct clk_hw **best_parent_clk)
+static int ar100_determine_rate(struct clk_hw *hw,
+				struct clk_rate_request *req)
 {
 	int nparents = __clk_get_num_parents(hw->clk);
 	long best_rate = -EINVAL;
 	int i;
 
-	*best_parent_clk = NULL;
+	req->best_parent_hw = NULL;
 
 	for (i = 0; i < nparents; i++) {
 		unsigned long parent_rate;
@@ -65,7 +62,7 @@ static long ar100_determine_rate(struct clk_hw *hw, unsigned long rate,
 
 		parent = clk_get_parent_by_index(hw->clk, i);
 		parent_rate = __clk_get_rate(parent);
-		div = DIV_ROUND_UP(parent_rate, rate);
+		div = DIV_ROUND_UP(parent_rate, req->rate);
 
 		/*
 		 * The AR100 clk contains 2 divisors:
@@ -101,14 +98,16 @@ static long ar100_determine_rate(struct clk_hw *hw, unsigned long rate,
 			continue;
 
 		tmp_rate = (parent_rate >> shift) / div;
-		if (!*best_parent_clk || tmp_rate > best_rate) {
-			*best_parent_clk = __clk_get_hw(parent);
-			*best_parent_rate = parent_rate;
+		if (!req->best_parent_hw || tmp_rate > best_rate) {
+			req->best_parent_hw = __clk_get_hw(parent);
+			req->best_parent_rate = parent_rate;
 			best_rate = tmp_rate;
 		}
 	}
 
-	return best_rate;
+	req->rate = best_rate;
+
+	return 0;
 }
 
 static int ar100_set_parent(struct clk_hw *hw, u8 index)
diff --git a/drivers/clk/sunxi/clk-sunxi.c b/drivers/clk/sunxi/clk-sunxi.c
index 9a82f17d2d73..d0f72a151bf1 100644
--- a/drivers/clk/sunxi/clk-sunxi.c
+++ b/drivers/clk/sunxi/clk-sunxi.c
@@ -118,11 +118,8 @@ static long sun6i_ahb1_clk_round(unsigned long rate, u8 *divp, u8 *pre_divp,
 	return (parent_rate / calcm) >> calcp;
 }
 
-static long sun6i_ahb1_clk_determine_rate(struct clk_hw *hw, unsigned long rate,
-					  unsigned long min_rate,
-					  unsigned long max_rate,
-					  unsigned long *best_parent_rate,
-					  struct clk_hw **best_parent_clk)
+static int sun6i_ahb1_clk_determine_rate(struct clk_hw *hw,
+					 struct clk_rate_request *req)
 {
 	struct clk *clk = hw->clk, *parent, *best_parent = NULL;
 	int i, num_parents;
@@ -135,14 +132,14 @@ static long sun6i_ahb1_clk_determine_rate(struct clk_hw *hw, unsigned long rate,
 		if (!parent)
 			continue;
 		if (__clk_get_flags(clk) & CLK_SET_RATE_PARENT)
-			parent_rate = __clk_round_rate(parent, rate);
+			parent_rate = __clk_round_rate(parent, req->rate);
 		else
 			parent_rate = __clk_get_rate(parent);
 
-		child_rate = sun6i_ahb1_clk_round(rate, NULL, NULL, i,
+		child_rate = sun6i_ahb1_clk_round(req->rate, NULL, NULL, i,
 						  parent_rate);
 
-		if (child_rate <= rate && child_rate > best_child_rate) {
+		if (child_rate <= req->rate && child_rate > best_child_rate) {
 			best_parent = parent;
 			best = parent_rate;
 			best_child_rate = child_rate;
@@ -150,10 +147,11 @@ static long sun6i_ahb1_clk_determine_rate(struct clk_hw *hw, unsigned long rate,
 	}
 
 	if (best_parent)
-		*best_parent_clk = __clk_get_hw(best_parent);
-	*best_parent_rate = best;
+		req->best_parent_hw = __clk_get_hw(best_parent);
+	req->best_parent_rate = best;
+	req->rate = best_child_rate;
 
-	return best_child_rate;
+	return 0;
 }
 
 static int sun6i_ahb1_clk_set_rate(struct clk_hw *hw, unsigned long rate,
diff --git a/drivers/clk/tegra/clk-emc.c b/drivers/clk/tegra/clk-emc.c
index 7649685c86bc..08ae518c9950 100644
--- a/drivers/clk/tegra/clk-emc.c
+++ b/drivers/clk/tegra/clk-emc.c
@@ -116,11 +116,7 @@ static unsigned long emc_recalc_rate(struct clk_hw *hw,
  * safer since things have EMC rate floors. Also don't touch parent_rate
  * since we don't want the CCF to play with our parent clocks.
  */
-static long emc_determine_rate(struct clk_hw *hw, unsigned long rate,
-			       unsigned long min_rate,
-			       unsigned long max_rate,
-			       unsigned long *best_parent_rate,
-			       struct clk_hw **best_parent_hw)
+static int emc_determine_rate(struct clk_hw *hw, struct clk_rate_request *req)
 {
 	struct tegra_clk_emc *tegra;
 	u8 ram_code = tegra_read_ram_code();
@@ -135,22 +131,28 @@ static long emc_determine_rate(struct clk_hw *hw, unsigned long rate,
 
 		timing = tegra->timings + i;
 
-		if (timing->rate > max_rate) {
+		if (timing->rate > req->max_rate) {
 			i = min(i, 1);
-			return tegra->timings[i - 1].rate;
+			req->rate = tegra->timings[i - 1].rate;
+			return 0;
 		}
 
-		if (timing->rate < min_rate)
+		if (timing->rate < req->min_rate)
 			continue;
 
-		if (timing->rate >= rate)
-			return timing->rate;
+		if (timing->rate >= req->rate) {
+			req->rate = timing->rate;
+			return 0;
+		}
 	}
 
-	if (timing)
-		return timing->rate;
+	if (timing) {
+		req->rate = timing->rate;
+		return 0;
+	}
 
-	return __clk_get_rate(hw->clk);
+	req->rate = __clk_get_rate(hw->clk);
+	return 0;
 }
 
 static u8 emc_get_parent(struct clk_hw *hw)
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 78842f46f152..14998f05acf2 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -37,6 +37,28 @@ struct clk_hw;
 struct clk_core;
 struct dentry;
 
+/**
+ * struct clk_rate_request - Structure encoding the clk constraints that
+ * a clock user might require.
+ *
+ * @rate:		Requested clock rate. This field will be adjusted by
+ *			clock drivers according to hardware capabilities.
+ * @min_rate:		Minimum rate imposed by clk users.
+ * @max_rate:		Maximum rate a imposed by clk users.
+ * @best_parent_rate:	The best parent rate a parent can provide to fulfill the
+ *			requested constraints.
+ * @best_parent_hw:	The most appropriate parent clock that fulfills the
+ *			requested constraints.
+ *
+ */
+struct clk_rate_request {
+	unsigned long rate;
+	unsigned long min_rate;
+	unsigned long max_rate;
+	unsigned long best_parent_rate;
+	struct clk_hw *best_parent_hw;
+};
+
 /**
  * struct clk_ops -  Callback operations for hardware clocks; these are to
  * be provided by the clock implementation, and will be called by drivers
@@ -176,12 +198,8 @@ struct clk_ops {
 					unsigned long parent_rate);
 	long		(*round_rate)(struct clk_hw *hw, unsigned long rate,
 					unsigned long *parent_rate);
-	long		(*determine_rate)(struct clk_hw *hw,
-					  unsigned long rate,
-					  unsigned long min_rate,
-					  unsigned long max_rate,
-					  unsigned long *best_parent_rate,
-					  struct clk_hw **best_parent_hw);
+	int		(*determine_rate)(struct clk_hw *hw,
+					  struct clk_rate_request *req);
 	int		(*set_parent)(struct clk_hw *hw, u8 index);
 	u8		(*get_parent)(struct clk_hw *hw);
 	int		(*set_rate)(struct clk_hw *hw, unsigned long rate,
@@ -578,20 +596,11 @@ unsigned long __clk_get_flags(struct clk *clk);
 bool __clk_is_prepared(struct clk *clk);
 bool __clk_is_enabled(struct clk *clk);
 struct clk *__clk_lookup(const char *name);
-long __clk_mux_determine_rate(struct clk_hw *hw, unsigned long rate,
-			      unsigned long min_rate,
-			      unsigned long max_rate,
-			      unsigned long *best_parent_rate,
-			      struct clk_hw **best_parent_p);
-unsigned long __clk_determine_rate(struct clk_hw *core,
-				   unsigned long rate,
-				   unsigned long min_rate,
-				   unsigned long max_rate);
-long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate,
-			      unsigned long min_rate,
-			      unsigned long max_rate,
-			      unsigned long *best_parent_rate,
-			      struct clk_hw **best_parent_p);
+int __clk_mux_determine_rate(struct clk_hw *hw,
+			     struct clk_rate_request *req);
+int __clk_determine_rate(struct clk_hw *core, struct clk_rate_request *req);
+int __clk_mux_determine_rate_closest(struct clk_hw *hw,
+				     struct clk_rate_request *req);
 void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent);
 
 static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src)
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 79b76e13d904..448b4f87b9eb 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -269,23 +269,15 @@ int omap3_noncore_dpll_set_rate_and_parent(struct clk_hw *hw,
 					   unsigned long rate,
 					   unsigned long parent_rate,
 					   u8 index);
-long omap3_noncore_dpll_determine_rate(struct clk_hw *hw,
-				       unsigned long rate,
-				       unsigned long min_rate,
-				       unsigned long max_rate,
-				       unsigned long *best_parent_rate,
-				       struct clk_hw **best_parent_clk);
+int omap3_noncore_dpll_determine_rate(struct clk_hw *hw,
+				      struct clk_rate_request *req);
 unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw,
 					 unsigned long parent_rate);
 long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw,
 				    unsigned long target_rate,
 				    unsigned long *parent_rate);
-long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw,
-					unsigned long rate,
-					unsigned long min_rate,
-					unsigned long max_rate,
-					unsigned long *best_parent_rate,
-					struct clk_hw **best_parent_clk);
+int omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw,
+				       struct clk_rate_request *req);
 u8 omap2_init_dpll_parent(struct clk_hw *hw);
 unsigned long omap3_dpll_recalc(struct clk_hw *hw, unsigned long parent_rate);
 long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
-- 
cgit v1.2.3-70-g09d2


From 730daa164e7c7e31c08fab940549f4acc3329432 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jul 2015 18:02:48 -0700
Subject: Yama: remove needless CONFIG_SECURITY_YAMA_STACKED

Now that minor LSMs can cleanly stack with major LSMs, remove the unneeded
config for Yama to be made to explicitly stack. Just selecting the main
Yama CONFIG will allow it to work, regardless of the major LSM. Since
distros using Yama are already forcing it to stack, this is effectively
a no-op change.

Additionally add MAINTAINERS entry.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 Documentation/security/Yama.txt       | 10 ++++------
 MAINTAINERS                           |  6 ++++++
 arch/mips/configs/pistachio_defconfig |  1 -
 include/linux/lsm_hooks.h             |  6 ++++--
 security/Kconfig                      |  5 -----
 security/security.c                   | 11 +++--------
 security/yama/Kconfig                 |  9 +--------
 security/yama/yama_lsm.c              | 32 ++++++++++----------------------
 8 files changed, 28 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/Yama.txt b/Documentation/security/Yama.txt
index 227a63f018a2..d9ee7d7a6c7f 100644
--- a/Documentation/security/Yama.txt
+++ b/Documentation/security/Yama.txt
@@ -1,9 +1,7 @@
-Yama is a Linux Security Module that collects a number of system-wide DAC
-security protections that are not handled by the core kernel itself. To
-select it at boot time, specify "security=yama" (though this will disable
-any other LSM).
-
-Yama is controlled through sysctl in /proc/sys/kernel/yama:
+Yama is a Linux Security Module that collects system-wide DAC security
+protections that are not handled by the core kernel itself. This is
+selectable at build-time with CONFIG_SECURITY_YAMA, and can be controlled
+at run-time through sysctls in /proc/sys/kernel/yama:
 
 - ptrace_scope
 
diff --git a/MAINTAINERS b/MAINTAINERS
index a2264167791a..f8be2f797197 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9102,6 +9102,12 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git
 S:	Supported
 F:	security/apparmor/
 
+YAMA SECURITY MODULE
+M:	Kees Cook <keescook@chromium.org>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git yama/tip
+S:	Supported
+F:	security/yama/
+
 SENSABLE PHANTOM
 M:	Jiri Slaby <jirislaby@gmail.com>
 S:	Maintained
diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig
index 1646cce032c3..642b50946943 100644
--- a/arch/mips/configs/pistachio_defconfig
+++ b/arch/mips/configs/pistachio_defconfig
@@ -320,7 +320,6 @@ CONFIG_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_YAMA=y
-CONFIG_SECURITY_YAMA_STACKED=y
 CONFIG_DEFAULT_SECURITY_DAC=y
 CONFIG_CRYPTO_AUTHENC=y
 CONFIG_CRYPTO_HMAC=y
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 9429f054c323..ec3a6bab29de 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1881,8 +1881,10 @@ static inline void security_delete_hooks(struct security_hook_list *hooks,
 
 extern int __init security_module_enable(const char *module);
 extern void __init capability_add_hooks(void);
-#ifdef CONFIG_SECURITY_YAMA_STACKED
-void __init yama_add_hooks(void);
+#ifdef CONFIG_SECURITY_YAMA
+extern void __init yama_add_hooks(void);
+#else
+static inline void __init yama_add_hooks(void) { }
 #endif
 
 #endif /* ! __LINUX_LSM_HOOKS_H */
diff --git a/security/Kconfig b/security/Kconfig
index bf4ec46474b6..e45237897b43 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -132,7 +132,6 @@ choice
 	default DEFAULT_SECURITY_SMACK if SECURITY_SMACK
 	default DEFAULT_SECURITY_TOMOYO if SECURITY_TOMOYO
 	default DEFAULT_SECURITY_APPARMOR if SECURITY_APPARMOR
-	default DEFAULT_SECURITY_YAMA if SECURITY_YAMA
 	default DEFAULT_SECURITY_DAC
 
 	help
@@ -151,9 +150,6 @@ choice
 	config DEFAULT_SECURITY_APPARMOR
 		bool "AppArmor" if SECURITY_APPARMOR=y
 
-	config DEFAULT_SECURITY_YAMA
-		bool "Yama" if SECURITY_YAMA=y
-
 	config DEFAULT_SECURITY_DAC
 		bool "Unix Discretionary Access Controls"
 
@@ -165,7 +161,6 @@ config DEFAULT_SECURITY
 	default "smack" if DEFAULT_SECURITY_SMACK
 	default "tomoyo" if DEFAULT_SECURITY_TOMOYO
 	default "apparmor" if DEFAULT_SECURITY_APPARMOR
-	default "yama" if DEFAULT_SECURITY_YAMA
 	default "" if DEFAULT_SECURITY_DAC
 
 endmenu
diff --git a/security/security.c b/security/security.c
index 595fffab48b0..e693ffcf9266 100644
--- a/security/security.c
+++ b/security/security.c
@@ -56,18 +56,13 @@ int __init security_init(void)
 	pr_info("Security Framework initialized\n");
 
 	/*
-	 * Always load the capability module.
+	 * Load minor LSMs, with the capability module always first.
 	 */
 	capability_add_hooks();
-#ifdef CONFIG_SECURITY_YAMA_STACKED
-	/*
-	 * If Yama is configured for stacking load it next.
-	 */
 	yama_add_hooks();
-#endif
+
 	/*
-	 * Load the chosen module if there is one.
-	 * This will also find yama if it is stacking
+	 * Load all the remaining security modules.
 	 */
 	do_security_initcalls();
 
diff --git a/security/yama/Kconfig b/security/yama/Kconfig
index 3123e1da2fed..90c605eea892 100644
--- a/security/yama/Kconfig
+++ b/security/yama/Kconfig
@@ -6,14 +6,7 @@ config SECURITY_YAMA
 	  This selects Yama, which extends DAC support with additional
 	  system-wide security settings beyond regular Linux discretionary
 	  access controls. Currently available is ptrace scope restriction.
+	  Like capabilities, this security module stacks with other LSMs.
 	  Further information can be found in Documentation/security/Yama.txt.
 
 	  If you are unsure how to answer this question, answer N.
-
-config SECURITY_YAMA_STACKED
-	bool "Yama stacked with other LSMs"
-	depends on SECURITY_YAMA
-	default n
-	help
-	  When Yama is built into the kernel, force it to stack with the
-	  selected primary LSM.
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 9ed32502470e..d3c19c970a06 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -353,11 +353,6 @@ static struct security_hook_list yama_hooks[] = {
 	LSM_HOOK_INIT(task_free, yama_task_free),
 };
 
-void __init yama_add_hooks(void)
-{
-	security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks));
-}
-
 #ifdef CONFIG_SYSCTL
 static int yama_dointvec_minmax(struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -396,25 +391,18 @@ static struct ctl_table yama_sysctl_table[] = {
 	},
 	{ }
 };
-#endif /* CONFIG_SYSCTL */
-
-static __init int yama_init(void)
+static void __init yama_init_sysctl(void)
 {
-#ifndef CONFIG_SECURITY_YAMA_STACKED
-	/*
-	 * If yama is being stacked this is already taken care of.
-	 */
-	if (!security_module_enable("yama"))
-		return 0;
-#endif
-	pr_info("Yama: becoming mindful.\n");
-
-#ifdef CONFIG_SYSCTL
 	if (!register_sysctl_paths(yama_sysctl_path, yama_sysctl_table))
 		panic("Yama: sysctl registration failed.\n");
-#endif
-
-	return 0;
 }
+#else
+static inline void yama_init_sysctl(void) { }
+#endif /* CONFIG_SYSCTL */
 
-security_initcall(yama_init);
+void __init yama_add_hooks(void)
+{
+	pr_info("Yama: becoming mindful.\n");
+	security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks));
+	yama_init_sysctl();
+}
-- 
cgit v1.2.3-70-g09d2


From 13b2c4a0c3b1cd37ee6bcfbb5b6e2b94e9a75364 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 27 Jul 2015 18:03:56 +0300
Subject: PM / QoS: Make it possible to expose device latency tolerance to
 userspace

Typically when a device is created the bus core it belongs to (for example
PCI) does not know if the device supports things like latency tolerance.
This is left to the driver that binds to the device in question. However,
at that time the device has already been created and there is no way to set
its dev->power.set_latency_tolerance anymore.

So follow what has been done for other PM QoS attributes as well and allow
drivers to expose and hide latency tolerance from userspace, if the device
supports it.

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/base/power/power.h |  2 ++
 drivers/base/power/qos.c   | 37 +++++++++++++++++++++++++++++++++++++
 drivers/base/power/sysfs.c | 11 +++++++++++
 include/linux/pm_qos.h     |  5 +++++
 4 files changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index f1a5d95e7b20..998fa6b23084 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -73,6 +73,8 @@ extern int pm_qos_sysfs_add_resume_latency(struct device *dev);
 extern void pm_qos_sysfs_remove_resume_latency(struct device *dev);
 extern int pm_qos_sysfs_add_flags(struct device *dev);
 extern void pm_qos_sysfs_remove_flags(struct device *dev);
+extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev);
+extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev);
 
 #else /* CONFIG_PM */
 
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index e56d538d039e..7f3646e459cb 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -883,3 +883,40 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
 	mutex_unlock(&dev_pm_qos_mtx);
 	return ret;
 }
+
+/**
+ * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace
+ * @dev: Device whose latency tolerance to expose
+ */
+int dev_pm_qos_expose_latency_tolerance(struct device *dev)
+{
+	int ret;
+
+	if (!dev->power.set_latency_tolerance)
+		return -EINVAL;
+
+	mutex_lock(&dev_pm_qos_sysfs_mtx);
+	ret = pm_qos_sysfs_add_latency_tolerance(dev);
+	mutex_unlock(&dev_pm_qos_sysfs_mtx);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance);
+
+/**
+ * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace
+ * @dev: Device whose latency tolerance to hide
+ */
+void dev_pm_qos_hide_latency_tolerance(struct device *dev)
+{
+	mutex_lock(&dev_pm_qos_sysfs_mtx);
+	pm_qos_sysfs_remove_latency_tolerance(dev);
+	mutex_unlock(&dev_pm_qos_sysfs_mtx);
+
+	/* Remove the request from user space now */
+	pm_runtime_get_sync(dev);
+	dev_pm_qos_update_user_latency_tolerance(dev,
+		PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT);
+	pm_runtime_put(dev);
+}
+EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index d2be3f9c211c..a7b46798c81d 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -738,6 +738,17 @@ void pm_qos_sysfs_remove_flags(struct device *dev)
 	sysfs_unmerge_group(&dev->kobj, &pm_qos_flags_attr_group);
 }
 
+int pm_qos_sysfs_add_latency_tolerance(struct device *dev)
+{
+	return sysfs_merge_group(&dev->kobj,
+				 &pm_qos_latency_tolerance_attr_group);
+}
+
+void pm_qos_sysfs_remove_latency_tolerance(struct device *dev)
+{
+	sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group);
+}
+
 void rpm_sysfs_remove(struct device *dev)
 {
 	sysfs_unmerge_group(&dev->kobj, &pm_runtime_attr_group);
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 7b3ae0cffc05..0f65d36c2a75 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -161,6 +161,8 @@ void dev_pm_qos_hide_flags(struct device *dev);
 int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set);
 s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev);
 int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val);
+int dev_pm_qos_expose_latency_tolerance(struct device *dev);
+void dev_pm_qos_hide_latency_tolerance(struct device *dev);
 
 static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev)
 {
@@ -229,6 +231,9 @@ static inline s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev)
 			{ return PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; }
 static inline int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
 			{ return 0; }
+static inline int dev_pm_qos_expose_latency_tolerance(struct device *dev)
+			{ return 0; }
+static inline void dev_pm_qos_hide_latency_tolerance(struct device *dev) {}
 
 static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return 0; }
 static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; }
-- 
cgit v1.2.3-70-g09d2


From 2e0fed7f7cdc41679e209c5636ad7537dc6210a9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 27 Jul 2015 18:03:59 +0300
Subject: klist: implement klist_prev()

klist_prev() gets the previous element in the list. It is useful to traverse
through the list in reverse order, for example, to provide LIFO (last in first
out) variant of access.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/klist.h |  1 +
 lib/klist.c           | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/klist.h b/include/linux/klist.h
index 61e5b723ae73..953f283f8451 100644
--- a/include/linux/klist.h
+++ b/include/linux/klist.h
@@ -63,6 +63,7 @@ extern void klist_iter_init(struct klist *k, struct klist_iter *i);
 extern void klist_iter_init_node(struct klist *k, struct klist_iter *i,
 				 struct klist_node *n);
 extern void klist_iter_exit(struct klist_iter *i);
+extern struct klist_node *klist_prev(struct klist_iter *i);
 extern struct klist_node *klist_next(struct klist_iter *i);
 
 #endif
diff --git a/lib/klist.c b/lib/klist.c
index 89b485a2a58d..d74cf7a29afd 100644
--- a/lib/klist.c
+++ b/lib/klist.c
@@ -323,6 +323,47 @@ static struct klist_node *to_klist_node(struct list_head *n)
 	return container_of(n, struct klist_node, n_node);
 }
 
+/**
+ * klist_prev - Ante up prev node in list.
+ * @i: Iterator structure.
+ *
+ * First grab list lock. Decrement the reference count of the previous
+ * node, if there was one. Grab the prev node, increment its reference
+ * count, drop the lock, and return that prev node.
+ */
+struct klist_node *klist_prev(struct klist_iter *i)
+{
+	void (*put)(struct klist_node *) = i->i_klist->put;
+	struct klist_node *last = i->i_cur;
+	struct klist_node *prev;
+
+	spin_lock(&i->i_klist->k_lock);
+
+	if (last) {
+		prev = to_klist_node(last->n_node.prev);
+		if (!klist_dec_and_del(last))
+			put = NULL;
+	} else
+		prev = to_klist_node(i->i_klist->k_list.prev);
+
+	i->i_cur = NULL;
+	while (prev != to_klist_node(&i->i_klist->k_list)) {
+		if (likely(!knode_dead(prev))) {
+			kref_get(&prev->n_ref);
+			i->i_cur = prev;
+			break;
+		}
+		prev = to_klist_node(prev->n_node.prev);
+	}
+
+	spin_unlock(&i->i_klist->k_lock);
+
+	if (put && last)
+		put(last);
+	return i->i_cur;
+}
+EXPORT_SYMBOL_GPL(klist_prev);
+
 /**
  * klist_next - Ante up next node in list.
  * @i: Iterator structure.
-- 
cgit v1.2.3-70-g09d2


From 3d060aeb72113cda0acf906bfe26914fc689506a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 27 Jul 2015 18:04:00 +0300
Subject: driver core: implement device_for_each_child_reverse()

The new function device_for_each_child_reverse() is helpful to traverse the
registered devices in a reversed order, e.g. in the case when an operation on
each device should be done first on the last added device, then on one before
last and so on.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/base/core.c    | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/device.h |  2 ++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index dafae6d2f7ac..7d6279554afc 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1252,6 +1252,19 @@ void device_unregister(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(device_unregister);
 
+static struct device *prev_device(struct klist_iter *i)
+{
+	struct klist_node *n = klist_prev(i);
+	struct device *dev = NULL;
+	struct device_private *p;
+
+	if (n) {
+		p = to_device_private_parent(n);
+		dev = p->device;
+	}
+	return dev;
+}
+
 static struct device *next_device(struct klist_iter *i)
 {
 	struct klist_node *n = klist_next(i);
@@ -1340,6 +1353,36 @@ int device_for_each_child(struct device *parent, void *data,
 }
 EXPORT_SYMBOL_GPL(device_for_each_child);
 
+/**
+ * device_for_each_child_reverse - device child iterator in reversed order.
+ * @parent: parent struct device.
+ * @fn: function to be called for each device.
+ * @data: data for the callback.
+ *
+ * Iterate over @parent's child devices, and call @fn for each,
+ * passing it @data.
+ *
+ * We check the return of @fn each time. If it returns anything
+ * other than 0, we break out and return that value.
+ */
+int device_for_each_child_reverse(struct device *parent, void *data,
+				  int (*fn)(struct device *dev, void *data))
+{
+	struct klist_iter i;
+	struct device *child;
+	int error = 0;
+
+	if (!parent->p)
+		return 0;
+
+	klist_iter_init(&parent->p->klist_children, &i);
+	while ((child = prev_device(&i)) && !error)
+		error = fn(child, data);
+	klist_iter_exit(&i);
+	return error;
+}
+EXPORT_SYMBOL_GPL(device_for_each_child_reverse);
+
 /**
  * device_find_child - device iterator for locating a particular device.
  * @parent: parent struct device
diff --git a/include/linux/device.h b/include/linux/device.h
index 5a31bf3a4024..af6fbc35d8a6 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -958,6 +958,8 @@ extern int __must_check device_add(struct device *dev);
 extern void device_del(struct device *dev);
 extern int device_for_each_child(struct device *dev, void *data,
 		     int (*fn)(struct device *dev, void *data));
+extern int device_for_each_child_reverse(struct device *dev, void *data,
+		     int (*fn)(struct device *dev, void *data));
 extern struct device *device_find_child(struct device *dev, void *data,
 				int (*match)(struct device *dev, void *data));
 extern int device_rename(struct device *dev, const char *new_name);
-- 
cgit v1.2.3-70-g09d2


From 28355f81969962cf01aef5b13d7de5b4ab0c5f13 Mon Sep 17 00:00:00 2001
From: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Date: Tue, 14 Jul 2015 10:29:54 +0200
Subject: gpio: defer probe if pinctrl cannot be found

When an OF node has a pin range for its GPIOs, return -EPROBE_DEFER if
the pin controller isn't available.

Otherwise, the GPIO range wouldn't be set at all unless the pin
controller probed always before the GPIO chip.

With this change, the probe of the GPIO chip will be deferred and will
be retried at a later point, hopefully once the pin controller has been
registered and probed already.

Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-of.c | 27 ++++++++++++++++++---------
 drivers/gpio/gpiolib.c    |  5 ++++-
 include/linux/of_gpio.h   |  4 ++--
 3 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index 1e36ec5e2e0c..fa6e3c8823d6 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -335,7 +335,7 @@ void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc)
 EXPORT_SYMBOL(of_mm_gpiochip_remove);
 
 #ifdef CONFIG_PINCTRL
-static void of_gpiochip_add_pin_range(struct gpio_chip *chip)
+static int of_gpiochip_add_pin_range(struct gpio_chip *chip)
 {
 	struct device_node *np = chip->of_node;
 	struct of_phandle_args pinspec;
@@ -346,7 +346,7 @@ static void of_gpiochip_add_pin_range(struct gpio_chip *chip)
 	struct property *group_names;
 
 	if (!np)
-		return;
+		return 0;
 
 	group_names = of_find_property(np, group_names_propname, NULL);
 
@@ -358,7 +358,7 @@ static void of_gpiochip_add_pin_range(struct gpio_chip *chip)
 
 		pctldev = of_pinctrl_get(pinspec.np);
 		if (!pctldev)
-			break;
+			return -EPROBE_DEFER;
 
 		if (pinspec.args[2]) {
 			if (group_names) {
@@ -378,7 +378,7 @@ static void of_gpiochip_add_pin_range(struct gpio_chip *chip)
 					pinspec.args[1],
 					pinspec.args[2]);
 			if (ret)
-				break;
+				return ret;
 		} else {
 			/* npins == 0: special range */
 			if (pinspec.args[1]) {
@@ -408,32 +408,41 @@ static void of_gpiochip_add_pin_range(struct gpio_chip *chip)
 			ret = gpiochip_add_pingroup_range(chip, pctldev,
 						pinspec.args[0], name);
 			if (ret)
-				break;
+				return ret;
 		}
 	}
+
+	return 0;
 }
 
 #else
-static void of_gpiochip_add_pin_range(struct gpio_chip *chip) {}
+static int of_gpiochip_add_pin_range(struct gpio_chip *chip) { return 0; }
 #endif
 
-void of_gpiochip_add(struct gpio_chip *chip)
+int of_gpiochip_add(struct gpio_chip *chip)
 {
+	int status;
+
 	if ((!chip->of_node) && (chip->dev))
 		chip->of_node = chip->dev->of_node;
 
 	if (!chip->of_node)
-		return;
+		return 0;
 
 	if (!chip->of_xlate) {
 		chip->of_gpio_n_cells = 2;
 		chip->of_xlate = of_gpio_simple_xlate;
 	}
 
-	of_gpiochip_add_pin_range(chip);
+	status = of_gpiochip_add_pin_range(chip);
+	if (status)
+		return status;
+
 	of_node_get(chip->of_node);
 
 	of_gpiochip_scan_hogs(chip);
+
+	return 0;
 }
 
 void of_gpiochip_remove(struct gpio_chip *chip)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 9312bbcb19b9..1b5b8da71154 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -290,7 +290,10 @@ int gpiochip_add(struct gpio_chip *chip)
 	if (!chip->owner && chip->dev && chip->dev->driver)
 		chip->owner = chip->dev->driver->owner;
 
-	of_gpiochip_add(chip);
+	status = of_gpiochip_add(chip);
+	if (status)
+		goto err_remove_chip;
+
 	acpi_gpiochip_add(chip);
 
 	status = gpiochip_sysfs_register(chip);
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 69dbe312b11b..f3191828f037 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -54,7 +54,7 @@ extern int of_mm_gpiochip_add(struct device_node *np,
 			      struct of_mm_gpio_chip *mm_gc);
 extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc);
 
-extern void of_gpiochip_add(struct gpio_chip *gc);
+extern int of_gpiochip_add(struct gpio_chip *gc);
 extern void of_gpiochip_remove(struct gpio_chip *gc);
 extern int of_gpio_simple_xlate(struct gpio_chip *gc,
 				const struct of_phandle_args *gpiospec,
@@ -76,7 +76,7 @@ static inline int of_gpio_simple_xlate(struct gpio_chip *gc,
 	return -ENOSYS;
 }
 
-static inline void of_gpiochip_add(struct gpio_chip *gc) { }
+static inline int of_gpiochip_add(struct gpio_chip *gc) { return 0; }
 static inline void of_gpiochip_remove(struct gpio_chip *gc) { }
 
 #endif /* CONFIG_OF_GPIO */
-- 
cgit v1.2.3-70-g09d2


From 559ed40752dc63e68f9b9ad301b20e6a3fe5cf21 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 26 Jul 2015 02:07:47 +0200
Subject: cpufreq: Avoid attempts to create duplicate symbolic links

After commit 87549141d516 (cpufreq: Stop migrating sysfs files on
hotplug) there is a problem with CPUs that share cpufreq policy
objects with other CPUs and are initially offline.

Say CPU1 shares a policy with CPU0 which is online and is registered
first.  As part of the registration process, cpufreq_add_dev() is
called for it.  It creates the policy object and a symbolic link
to it from the CPU1's sysfs directory.  If CPU1 is registered
subsequently and it is offline at that time, cpufreq_add_dev() will
attempt to create a symbolic link to the policy object for it, but
that link is present already, so a warning about that will be
triggered.

To avoid that warning, make cpufreq use an additional CPU mask
containing related CPUs that are actually present for each policy
object.  That mask is initialized when the policy object is populated
after its creation (for the first online CPU using it) and it includes
CPUs from the "policy CPUs" mask returned by the cpufreq driver's
->init() callback that are physically present at that time.  Symbolic
links to the policy are created only for the CPUs in that mask.

If cpufreq_add_dev() is invoked for an offline CPU, it checks the
new mask and only creates the symlink if the CPU was not in it (the
CPU is added to the mask at the same time).

In turn, cpufreq_remove_dev() drops the given CPU from the new mask,
removes its symlink to the policy object and returns, unless it is
the CPU owning the policy object.  In that case, the policy object
is moved to a new CPU's sysfs directory or deleted if the CPU being
removed was the last user of the policy.

While at it, notice that cpufreq_remove_dev() can't fail, because
its return value is ignored, so make it ignore return values from
__cpufreq_remove_dev_prepare() and __cpufreq_remove_dev_finish()
and prevent these functions from aborting on errors returned by
__cpufreq_governor().  Also drop the now unused sif argument from
them.

Fixes: 87549141d516 (cpufreq: Stop migrating sysfs files on hotplug)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reported-and-tested-by: Russell King <linux@arm.linux.org.uk>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 108 +++++++++++++++++++++++-----------------------
 include/linux/cpufreq.h   |   1 +
 2 files changed, 56 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 26063afb3eba..7a3c30c4336f 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1002,7 +1002,7 @@ static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
 	int ret = 0;
 
 	/* Some related CPUs might not be present (physically hotplugged) */
-	for_each_cpu_and(j, policy->related_cpus, cpu_present_mask) {
+	for_each_cpu(j, policy->real_cpus) {
 		if (j == policy->kobj_cpu)
 			continue;
 
@@ -1019,7 +1019,7 @@ static void cpufreq_remove_dev_symlink(struct cpufreq_policy *policy)
 	unsigned int j;
 
 	/* Some related CPUs might not be present (physically hotplugged) */
-	for_each_cpu_and(j, policy->related_cpus, cpu_present_mask) {
+	for_each_cpu(j, policy->real_cpus) {
 		if (j == policy->kobj_cpu)
 			continue;
 
@@ -1163,11 +1163,14 @@ static struct cpufreq_policy *cpufreq_policy_alloc(struct device *dev)
 	if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL))
 		goto err_free_cpumask;
 
+	if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL))
+		goto err_free_rcpumask;
+
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &dev->kobj,
 				   "cpufreq");
 	if (ret) {
 		pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
-		goto err_free_rcpumask;
+		goto err_free_real_cpus;
 	}
 
 	INIT_LIST_HEAD(&policy->policy_list);
@@ -1184,6 +1187,8 @@ static struct cpufreq_policy *cpufreq_policy_alloc(struct device *dev)
 
 	return policy;
 
+err_free_real_cpus:
+	free_cpumask_var(policy->real_cpus);
 err_free_rcpumask:
 	free_cpumask_var(policy->related_cpus);
 err_free_cpumask:
@@ -1234,6 +1239,7 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy, bool notify)
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
 	cpufreq_policy_put_kobj(policy, notify);
+	free_cpumask_var(policy->real_cpus);
 	free_cpumask_var(policy->related_cpus);
 	free_cpumask_var(policy->cpus);
 	kfree(policy);
@@ -1258,14 +1264,17 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 
 	pr_debug("adding CPU %u\n", cpu);
 
-	/*
-	 * Only possible if 'cpu' wasn't physically present earlier and we are
-	 * here from subsys_interface add callback. A hotplug notifier will
-	 * follow and we will handle it like logical CPU hotplug then. For now,
-	 * just create the sysfs link.
-	 */
-	if (cpu_is_offline(cpu))
-		return add_cpu_dev_symlink(per_cpu(cpufreq_cpu_data, cpu), cpu);
+	if (cpu_is_offline(cpu)) {
+		/*
+		 * Only possible if we are here from the subsys_interface add
+		 * callback.  A hotplug notifier will follow and we will handle
+		 * it as CPU online then.  For now, just create the sysfs link,
+		 * unless there is no policy or the link is already present.
+		 */
+		policy = per_cpu(cpufreq_cpu_data, cpu);
+		return policy && !cpumask_test_and_set_cpu(cpu, policy->real_cpus)
+			? add_cpu_dev_symlink(policy, cpu) : 0;
+	}
 
 	if (!down_read_trylock(&cpufreq_rwsem))
 		return 0;
@@ -1307,6 +1316,10 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 	/* related cpus should atleast have policy->cpus */
 	cpumask_or(policy->related_cpus, policy->related_cpus, policy->cpus);
 
+	/* Remember which CPUs have been present at the policy creation time. */
+	if (!recover_policy)
+		cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask);
+
 	/*
 	 * affected cpus must always be the one, which are online. We aren't
 	 * managing offline cpus here.
@@ -1420,8 +1433,7 @@ nomem_out:
 	return ret;
 }
 
-static int __cpufreq_remove_dev_prepare(struct device *dev,
-					struct subsys_interface *sif)
+static int __cpufreq_remove_dev_prepare(struct device *dev)
 {
 	unsigned int cpu = dev->id;
 	int ret = 0;
@@ -1437,10 +1449,8 @@ static int __cpufreq_remove_dev_prepare(struct device *dev,
 
 	if (has_target()) {
 		ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-		if (ret) {
+		if (ret)
 			pr_err("%s: Failed to stop governor\n", __func__);
-			return ret;
-		}
 	}
 
 	down_write(&policy->rwsem);
@@ -1473,8 +1483,7 @@ static int __cpufreq_remove_dev_prepare(struct device *dev,
 	return ret;
 }
 
-static int __cpufreq_remove_dev_finish(struct device *dev,
-				       struct subsys_interface *sif)
+static int __cpufreq_remove_dev_finish(struct device *dev)
 {
 	unsigned int cpu = dev->id;
 	int ret;
@@ -1492,10 +1501,8 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
 	/* If cpu is last user of policy, free policy */
 	if (has_target()) {
 		ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
-		if (ret) {
+		if (ret)
 			pr_err("%s: Failed to exit governor\n", __func__);
-			return ret;
-		}
 	}
 
 	/*
@@ -1506,10 +1513,6 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
 	if (cpufreq_driver->exit)
 		cpufreq_driver->exit(policy);
 
-	/* Free the policy only if the driver is getting removed. */
-	if (sif)
-		cpufreq_policy_free(policy, true);
-
 	return 0;
 }
 
@@ -1521,42 +1524,41 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
 static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
 	unsigned int cpu = dev->id;
-	int ret;
-
-	/*
-	 * Only possible if 'cpu' is getting physically removed now. A hotplug
-	 * notifier should have already been called and we just need to remove
-	 * link or free policy here.
-	 */
-	if (cpu_is_offline(cpu)) {
-		struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
-		struct cpumask mask;
+	struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
 
-		if (!policy)
-			return 0;
+	if (!policy)
+		return 0;
 
-		cpumask_copy(&mask, policy->related_cpus);
-		cpumask_clear_cpu(cpu, &mask);
+	if (cpu_online(cpu)) {
+		__cpufreq_remove_dev_prepare(dev);
+		__cpufreq_remove_dev_finish(dev);
+	}
 
-		/*
-		 * Free policy only if all policy->related_cpus are removed
-		 * physically.
-		 */
-		if (cpumask_intersects(&mask, cpu_present_mask)) {
-			remove_cpu_dev_symlink(policy, cpu);
-			return 0;
-		}
+	cpumask_clear_cpu(cpu, policy->real_cpus);
 
+	if (cpumask_empty(policy->real_cpus)) {
 		cpufreq_policy_free(policy, true);
 		return 0;
 	}
 
-	ret = __cpufreq_remove_dev_prepare(dev, sif);
+	if (cpu != policy->kobj_cpu) {
+		remove_cpu_dev_symlink(policy, cpu);
+	} else {
+		/*
+		 * The CPU owning the policy object is going away.  Move it to
+		 * another suitable CPU.
+		 */
+		unsigned int new_cpu = cpumask_first(policy->real_cpus);
+		struct device *new_dev = get_cpu_device(new_cpu);
+
+		dev_dbg(dev, "%s: Moving policy object to CPU%u\n", __func__, new_cpu);
 
-	if (!ret)
-		ret = __cpufreq_remove_dev_finish(dev, sif);
+		sysfs_remove_link(&new_dev->kobj, "cpufreq");
+		policy->kobj_cpu = new_cpu;
+		WARN_ON(kobject_move(&policy->kobj, &new_dev->kobj));
+	}
 
-	return ret;
+	return 0;
 }
 
 static void handle_update(struct work_struct *work)
@@ -2395,11 +2397,11 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
 			break;
 
 		case CPU_DOWN_PREPARE:
-			__cpufreq_remove_dev_prepare(dev, NULL);
+			__cpufreq_remove_dev_prepare(dev);
 			break;
 
 		case CPU_POST_DEAD:
-			__cpufreq_remove_dev_finish(dev, NULL);
+			__cpufreq_remove_dev_finish(dev);
 			break;
 
 		case CPU_DOWN_FAILED:
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 29ad97c34fd5..bde1e567b3a9 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -62,6 +62,7 @@ struct cpufreq_policy {
 	/* CPUs sharing clock, require sw coordination */
 	cpumask_var_t		cpus;	/* Online CPUs only */
 	cpumask_var_t		related_cpus; /* Online + Offline CPUs */
+	cpumask_var_t		real_cpus; /* Related and present */
 
 	unsigned int		shared_type; /* ACPI: ANY or ALL affected CPUs
 						should set cpufreq */
-- 
cgit v1.2.3-70-g09d2


From 841df7df196237ea63233f0f9eaa41db53afd70f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.com>
Date: Tue, 28 Jul 2015 14:57:14 -0400
Subject: jbd2: avoid infinite loop when destroying aborted journal

Commit 6f6a6fda2945 "jbd2: fix ocfs2 corrupt when updating journal
superblock fails" changed jbd2_cleanup_journal_tail() to return EIO
when the journal is aborted. That makes logic in
jbd2_log_do_checkpoint() bail out which is fine, except that
jbd2_journal_destroy() expects jbd2_log_do_checkpoint() to always make
a progress in cleaning the journal. Without it jbd2_journal_destroy()
just loops in an infinite loop.

Fix jbd2_journal_destroy() to cleanup journal checkpoint lists of
jbd2_log_do_checkpoint() fails with error.

Reported-by: Eryu Guan <guaneryu@gmail.com>
Tested-by: Eryu Guan <guaneryu@gmail.com>
Fixes: 6f6a6fda294506dfe0e3e0a253bb2d2923f28f0a
Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 39 +++++++++++++++++++++++++++++++++------
 fs/jbd2/commit.c     |  2 +-
 fs/jbd2/journal.c    | 11 ++++++++++-
 include/linux/jbd2.h |  3 ++-
 4 files changed, 46 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 4227dc4f7437..8c44654ce274 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -417,12 +417,12 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
  * journal_clean_one_cp_list
  *
  * Find all the written-back checkpoint buffers in the given list and
- * release them.
+ * release them. If 'destroy' is set, clean all buffers unconditionally.
  *
  * Called with j_list_lock held.
  * Returns 1 if we freed the transaction, 0 otherwise.
  */
-static int journal_clean_one_cp_list(struct journal_head *jh)
+static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
 {
 	struct journal_head *last_jh;
 	struct journal_head *next_jh = jh;
@@ -436,7 +436,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh)
 	do {
 		jh = next_jh;
 		next_jh = jh->b_cpnext;
-		ret = __try_to_free_cp_buf(jh);
+		if (!destroy)
+			ret = __try_to_free_cp_buf(jh);
+		else
+			ret = __jbd2_journal_remove_checkpoint(jh) + 1;
 		if (!ret)
 			return freed;
 		if (ret == 2)
@@ -459,10 +462,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh)
  * journal_clean_checkpoint_list
  *
  * Find all the written-back checkpoint buffers in the journal and release them.
+ * If 'destroy' is set, release all buffers unconditionally.
  *
  * Called with j_list_lock held.
  */
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
 	int ret;
@@ -476,7 +480,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 	do {
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		ret = journal_clean_one_cp_list(transaction->t_checkpoint_list);
+		ret = journal_clean_one_cp_list(transaction->t_checkpoint_list,
+						destroy);
 		/*
 		 * This function only frees up some memory if possible so we
 		 * dont have an obligation to finish processing. Bail out if
@@ -492,7 +497,7 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 		 * we can possibly see not yet submitted buffers on io_list
 		 */
 		ret = journal_clean_one_cp_list(transaction->
-				t_checkpoint_io_list);
+				t_checkpoint_io_list, destroy);
 		if (need_resched())
 			return;
 		/*
@@ -505,6 +510,28 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 	} while (transaction != last_transaction);
 }
 
+/*
+ * Remove buffers from all checkpoint lists as journal is aborted and we just
+ * need to free memory
+ */
+void jbd2_journal_destroy_checkpoint(journal_t *journal)
+{
+	/*
+	 * We loop because __jbd2_journal_clean_checkpoint_list() may abort
+	 * early due to a need of rescheduling.
+	 */
+	while (1) {
+		spin_lock(&journal->j_list_lock);
+		if (!journal->j_checkpoint_transactions) {
+			spin_unlock(&journal->j_list_lock);
+			break;
+		}
+		__jbd2_journal_clean_checkpoint_list(journal, true);
+		spin_unlock(&journal->j_list_lock);
+		cond_resched();
+	}
+}
+
 /*
  * journal_remove_checkpoint: called after a buffer has been committed
  * to disk (either by being write-back flushed to disk, or being
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b73e0215baa7..362e5f614450 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -510,7 +510,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * frees some memory
 	 */
 	spin_lock(&journal->j_list_lock);
-	__jbd2_journal_clean_checkpoint_list(journal);
+	__jbd2_journal_clean_checkpoint_list(journal, false);
 	spin_unlock(&journal->j_list_lock);
 
 	jbd_debug(3, "JBD2: commit phase 1\n");
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fe1b4bdecdfa..8270fe9e3641 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1693,8 +1693,17 @@ int jbd2_journal_destroy(journal_t *journal)
 	while (journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
 		mutex_lock(&journal->j_checkpoint_mutex);
-		jbd2_log_do_checkpoint(journal);
+		err = jbd2_log_do_checkpoint(journal);
 		mutex_unlock(&journal->j_checkpoint_mutex);
+		/*
+		 * If checkpointing failed, just free the buffers to avoid
+		 * looping forever
+		 */
+		if (err) {
+			jbd2_journal_destroy_checkpoint(journal);
+			spin_lock(&journal->j_list_lock);
+			break;
+		}
 		spin_lock(&journal->j_list_lock);
 	}
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index edb640ae9a94..eb1cebed3f36 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1042,8 +1042,9 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 extern void jbd2_journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal);
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
 int __jbd2_journal_remove_checkpoint(struct journal_head *);
+void jbd2_journal_destroy_checkpoint(journal_t *journal);
 void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
 
 
-- 
cgit v1.2.3-70-g09d2


From 9783c0d98501aa146ff467916ab4b8830a655d7c Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 16 Jul 2015 12:50:27 -0700
Subject: clk: Allow providers to configure min/max rates

clk providers are using the consumer APIs to set min/max rates on
the clock they're providing. To encourage clk providers to move
away from the consumer APIs, add a provider API to set the
min/max rate of a clock. The assumption is that this is done
before the clock can be requested via clk_get() and that the
clock rate is already within the boundaries of the min/max that's
configured.

Tested-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk.c            | 16 ++++++++++++++--
 include/linux/clk-provider.h |  2 ++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index bd6dfbe04cf0..1ac237fe2fdb 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -58,6 +58,8 @@ struct clk_core {
 	unsigned long		flags;
 	unsigned int		enable_count;
 	unsigned int		prepare_count;
+	unsigned long		min_rate;
+	unsigned long		max_rate;
 	unsigned long		accuracy;
 	int			phase;
 	struct hlist_head	children;
@@ -512,8 +514,8 @@ static void clk_core_get_boundaries(struct clk_core *core,
 {
 	struct clk *clk_user;
 
-	*min_rate = 0;
-	*max_rate = ULONG_MAX;
+	*min_rate = core->min_rate;
+	*max_rate = core->max_rate;
 
 	hlist_for_each_entry(clk_user, &core->clks, clks_node)
 		*min_rate = max(*min_rate, clk_user->min_rate);
@@ -522,6 +524,14 @@ static void clk_core_get_boundaries(struct clk_core *core,
 		*max_rate = min(*max_rate, clk_user->max_rate);
 }
 
+void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate,
+			   unsigned long max_rate)
+{
+	hw->core->min_rate = min_rate;
+	hw->core->max_rate = max_rate;
+}
+EXPORT_SYMBOL_GPL(clk_hw_set_rate_range);
+
 /*
  * Helper for finding best parent to provide a given frequency. This can be used
  * directly as a determine_rate callback (e.g. for a mux), or from a more
@@ -2498,6 +2508,8 @@ struct clk *clk_register(struct device *dev, struct clk_hw *hw)
 	core->hw = hw;
 	core->flags = hw->init->flags;
 	core->num_parents = hw->init->num_parents;
+	core->min_rate = 0;
+	core->max_rate = ULONG_MAX;
 	hw->core = core;
 
 	/* allocate local copy in case parent_names is __initdata */
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 2116e2b8a5f2..d62e7eab1dbe 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -619,6 +619,8 @@ int __clk_determine_rate(struct clk_hw *core, struct clk_rate_request *req);
 int __clk_mux_determine_rate_closest(struct clk_hw *hw,
 				     struct clk_rate_request *req);
 void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent);
+void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate,
+			   unsigned long max_rate);
 
 static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src)
 {
-- 
cgit v1.2.3-70-g09d2


From afe76c8fd030dd6b75fa69f7af7b7eb1e212f248 Mon Sep 17 00:00:00 2001
From: Jim Quinlan <jim2101024@gmail.com>
Date: Fri, 15 May 2015 15:45:47 -0400
Subject: clk: allow a clk divider with max divisor when zero

This commit allows certain Broadcom STB clock dividers to be used with
clk-divider.c.  It allows for a clock whose field value is the equal
to the divisor, execpt when the field value is zero, in which case the
divisor is 2^width.  For example, consider a divisor clock with a two
bit field:

value		divisor
0		4
1		1
2		2
3		3

Signed-off-by: Jim Quinlan <jim2101024@gmail.com>
Signed-off-by: Michael Turquette <mturquette@baylibre.com>
---
 drivers/clk/clk-divider.c    | 16 +++++++++++-----
 include/linux/clk-provider.h |  4 ++++
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c
index 706b5783c360..2cab88b9c1a8 100644
--- a/drivers/clk/clk-divider.c
+++ b/drivers/clk/clk-divider.c
@@ -78,12 +78,14 @@ static unsigned int _get_table_div(const struct clk_div_table *table,
 }
 
 static unsigned int _get_div(const struct clk_div_table *table,
-			     unsigned int val, unsigned long flags)
+			     unsigned int val, unsigned long flags, u8 width)
 {
 	if (flags & CLK_DIVIDER_ONE_BASED)
 		return val;
 	if (flags & CLK_DIVIDER_POWER_OF_TWO)
 		return 1 << val;
+	if (flags & CLK_DIVIDER_MAX_AT_ZERO)
+		return val ? val : div_mask(width) + 1;
 	if (table)
 		return _get_table_div(table, val);
 	return val + 1;
@@ -101,12 +103,14 @@ static unsigned int _get_table_val(const struct clk_div_table *table,
 }
 
 static unsigned int _get_val(const struct clk_div_table *table,
-			     unsigned int div, unsigned long flags)
+			     unsigned int div, unsigned long flags, u8 width)
 {
 	if (flags & CLK_DIVIDER_ONE_BASED)
 		return div;
 	if (flags & CLK_DIVIDER_POWER_OF_TWO)
 		return __ffs(div);
+	if (flags & CLK_DIVIDER_MAX_AT_ZERO)
+		return (div == div_mask(width) + 1) ? 0 : div;
 	if (table)
 		return  _get_table_val(table, div);
 	return div - 1;
@@ -117,9 +121,10 @@ unsigned long divider_recalc_rate(struct clk_hw *hw, unsigned long parent_rate,
 				  const struct clk_div_table *table,
 				  unsigned long flags)
 {
+	struct clk_divider *divider = to_clk_divider(hw);
 	unsigned int div;
 
-	div = _get_div(table, val, flags);
+	div = _get_div(table, val, flags, divider->width);
 	if (!div) {
 		WARN(!(flags & CLK_DIVIDER_ALLOW_ZERO),
 			"%s: Zero divisor and CLK_DIVIDER_ALLOW_ZERO not set\n",
@@ -351,7 +356,8 @@ static long clk_divider_round_rate(struct clk_hw *hw, unsigned long rate,
 	if (divider->flags & CLK_DIVIDER_READ_ONLY) {
 		bestdiv = readl(divider->reg) >> divider->shift;
 		bestdiv &= div_mask(divider->width);
-		bestdiv = _get_div(divider->table, bestdiv, divider->flags);
+		bestdiv = _get_div(divider->table, bestdiv, divider->flags,
+			divider->width);
 		return DIV_ROUND_UP(*prate, bestdiv);
 	}
 
@@ -370,7 +376,7 @@ int divider_get_val(unsigned long rate, unsigned long parent_rate,
 	if (!_is_valid_div(table, div, flags))
 		return -EINVAL;
 
-	value = _get_val(table, div, flags);
+	value = _get_val(table, div, flags, width);
 
 	return min_t(unsigned int, value, div_mask(width));
 }
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 402478ed9933..699a25075170 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -361,6 +361,9 @@ struct clk_div_table {
  *	to the closest integer instead of the up one.
  * CLK_DIVIDER_READ_ONLY - The divider settings are preconfigured and should
  *	not be changed by the clock framework.
+ * CLK_DIVIDER_MAX_AT_ZERO - For dividers which are like CLK_DIVIDER_ONE_BASED
+ *	except when the value read from the register is zero, the divisor is
+ *	2^width of the field.
  */
 struct clk_divider {
 	struct clk_hw	hw;
@@ -378,6 +381,7 @@ struct clk_divider {
 #define CLK_DIVIDER_HIWORD_MASK		BIT(3)
 #define CLK_DIVIDER_ROUND_CLOSEST	BIT(4)
 #define CLK_DIVIDER_READ_ONLY		BIT(5)
+#define CLK_DIVIDER_MAX_AT_ZERO		BIT(6)
 
 extern const struct clk_ops clk_divider_ops;
 
-- 
cgit v1.2.3-70-g09d2


From 37bff2c159a3629b592e54162239cb8c337c965d Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 24 Jul 2015 09:31:29 -0700
Subject: clk: gpio: Mark parent_names array const

Let's encourage const arrays of parent names like other basic
clock types.

Cc: Sergej Sawazki <ce3a@gmx.de>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-gpio.c       | 13 +++++++------
 include/linux/clk-provider.h |  2 +-
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-gpio.c b/drivers/clk/clk-gpio.c
index 41277a1526c7..10819e248414 100644
--- a/drivers/clk/clk-gpio.c
+++ b/drivers/clk/clk-gpio.c
@@ -95,7 +95,7 @@ const struct clk_ops clk_gpio_mux_ops = {
 EXPORT_SYMBOL_GPL(clk_gpio_mux_ops);
 
 static struct clk *clk_register_gpio(struct device *dev, const char *name,
-		const char **parent_names, u8 num_parents, unsigned gpio,
+		const char * const *parent_names, u8 num_parents, unsigned gpio,
 		bool active_low, unsigned long flags,
 		const struct clk_ops *clk_gpio_ops)
 {
@@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(clk_register_gpio_gate);
  * @flags: clock flags
  */
 struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
-		const char **parent_names, u8 num_parents, unsigned gpio,
+		const char * const *parent_names, u8 num_parents, unsigned gpio,
 		bool active_low, unsigned long flags)
 {
 	if (num_parents != 2) {
@@ -213,7 +213,7 @@ struct clk_gpio_delayed_register_data {
 	struct mutex lock;
 	struct clk *clk;
 	struct clk *(*clk_register_get)(const char *name,
-			const char **parent_names, u8 num_parents,
+			const char * const *parent_names, u8 num_parents,
 			unsigned gpio, bool active_low);
 };
 
@@ -273,7 +273,7 @@ out:
 }
 
 static struct clk *of_clk_gpio_gate_delayed_register_get(const char *name,
-		const char **parent_names, u8 num_parents,
+		const char * const *parent_names, u8 num_parents,
 		unsigned gpio, bool active_low)
 {
 	return clk_register_gpio_gate(NULL, name, parent_names[0],
@@ -281,7 +281,7 @@ static struct clk *of_clk_gpio_gate_delayed_register_get(const char *name,
 }
 
 static struct clk *of_clk_gpio_mux_delayed_register_get(const char *name,
-		const char **parent_names, u8 num_parents, unsigned gpio,
+		const char * const *parent_names, u8 num_parents, unsigned gpio,
 		bool active_low)
 {
 	return clk_register_gpio_mux(NULL, name, parent_names, num_parents,
@@ -291,7 +291,8 @@ static struct clk *of_clk_gpio_mux_delayed_register_get(const char *name,
 static void __init of_gpio_clk_setup(struct device_node *node,
 		const char *gpio_name,
 		struct clk *(*clk_register_get)(const char *name,
-				const char **parent_names, u8 num_parents,
+				const char * const *parent_names,
+				u8 num_parents,
 				unsigned gpio, bool active_low))
 {
 	struct clk_gpio_delayed_register_data *data;
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 699a25075170..06a56e55cfaf 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -583,7 +583,7 @@ void of_gpio_clk_gate_setup(struct device_node *node);
 
 extern const struct clk_ops clk_gpio_mux_ops;
 struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
-		const char **parent_names, u8 num_parents, unsigned gpio,
+		const char * const *parent_names, u8 num_parents, unsigned gpio,
 		bool active_low, unsigned long flags);
 
 void of_gpio_mux_clk_setup(struct device_node *node);
-- 
cgit v1.2.3-70-g09d2


From 4b638df4c9d556a6d947d6dbac364bee37b68b8e Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Date: Fri, 26 Jun 2015 14:50:10 -0700
Subject: soc: qcom: Add Shared Memory Manager driver

The Shared Memory Manager driver implements an interface for allocating
and accessing items in the memory area shared among all of the
processors in a Qualcomm platform.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Acked-by: Andy Gross <agross@codeaurora.org>
Signed-off-by: Andy Gross <agross@codeaurora.org>
---
 drivers/soc/qcom/Kconfig      |   8 +
 drivers/soc/qcom/Makefile     |   1 +
 drivers/soc/qcom/smem.c       | 775 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/smem.h |  11 +
 4 files changed, 795 insertions(+)
 create mode 100644 drivers/soc/qcom/smem.c
 create mode 100644 include/linux/soc/qcom/smem.h

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index 5eea374c8fa6..8544e1594c2c 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -17,3 +17,11 @@ config QCOM_PM
 	  QCOM Platform specific power driver to manage cores and L2 low power
 	  modes. It interface with various system drivers to put the cores in
 	  low power modes.
+
+config QCOM_SMEM
+	tristate "Qualcomm Shared Memory Manager (SMEM)"
+	depends on ARCH_QCOM
+	help
+	  Say y here to enable support for the Qualcomm Shared Memory Manager.
+	  The driver provides an interface to items in a heap shared among all
+	  processors in a Qualcomm platform.
diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index 931d385386c5..3a033c43c0ef 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_QCOM_GSBI)	+=	qcom_gsbi.o
 obj-$(CONFIG_QCOM_PM)	+=	spm.o
+obj-$(CONFIG_QCOM_SMEM) +=	smem.o
diff --git a/drivers/soc/qcom/smem.c b/drivers/soc/qcom/smem.c
new file mode 100644
index 000000000000..7c2c324c4b10
--- /dev/null
+++ b/drivers/soc/qcom/smem.c
@@ -0,0 +1,775 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications AB.
+ * Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/hwspinlock.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/soc/qcom/smem.h>
+
+/*
+ * The Qualcomm shared memory system is a allocate only heap structure that
+ * consists of one of more memory areas that can be accessed by the processors
+ * in the SoC.
+ *
+ * All systems contains a global heap, accessible by all processors in the SoC,
+ * with a table of contents data structure (@smem_header) at the beginning of
+ * the main shared memory block.
+ *
+ * The global header contains meta data for allocations as well as a fixed list
+ * of 512 entries (@smem_global_entry) that can be initialized to reference
+ * parts of the shared memory space.
+ *
+ *
+ * In addition to this global heap a set of "private" heaps can be set up at
+ * boot time with access restrictions so that only certain processor pairs can
+ * access the data.
+ *
+ * These partitions are referenced from an optional partition table
+ * (@smem_ptable), that is found 4kB from the end of the main smem region. The
+ * partition table entries (@smem_ptable_entry) lists the involved processors
+ * (or hosts) and their location in the main shared memory region.
+ *
+ * Each partition starts with a header (@smem_partition_header) that identifies
+ * the partition and holds properties for the two internal memory regions. The
+ * two regions are cached and non-cached memory respectively. Each region
+ * contain a link list of allocation headers (@smem_private_entry) followed by
+ * their data.
+ *
+ * Items in the non-cached region are allocated from the start of the partition
+ * while items in the cached region are allocated from the end. The free area
+ * is hence the region between the cached and non-cached offsets.
+ *
+ *
+ * To synchronize allocations in the shared memory heaps a remote spinlock must
+ * be held - currently lock number 3 of the sfpb or tcsr is used for this on all
+ * platforms.
+ *
+ */
+
+/*
+ * Item 3 of the global heap contains an array of versions for the various
+ * software components in the SoC. We verify that the boot loader version is
+ * what the expected version (SMEM_EXPECTED_VERSION) as a sanity check.
+ */
+#define SMEM_ITEM_VERSION	3
+#define  SMEM_MASTER_SBL_VERSION_INDEX	7
+#define  SMEM_EXPECTED_VERSION		11
+
+/*
+ * The first 8 items are only to be allocated by the boot loader while
+ * initializing the heap.
+ */
+#define SMEM_ITEM_LAST_FIXED	8
+
+/* Highest accepted item number, for both global and private heaps */
+#define SMEM_ITEM_COUNT		512
+
+/* Processor/host identifier for the application processor */
+#define SMEM_HOST_APPS		0
+
+/* Max number of processors/hosts in a system */
+#define SMEM_HOST_COUNT		9
+
+/**
+  * struct smem_proc_comm - proc_comm communication struct (legacy)
+  * @command:	current command to be executed
+  * @status:	status of the currently requested command
+  * @params:	parameters to the command
+  */
+struct smem_proc_comm {
+	u32 command;
+	u32 status;
+	u32 params[2];
+};
+
+/**
+ * struct smem_global_entry - entry to reference smem items on the heap
+ * @allocated:	boolean to indicate if this entry is used
+ * @offset:	offset to the allocated space
+ * @size:	size of the allocated space, 8 byte aligned
+ * @aux_base:	base address for the memory region used by this unit, or 0 for
+ *		the default region. bits 0,1 are reserved
+ */
+struct smem_global_entry {
+	u32 allocated;
+	u32 offset;
+	u32 size;
+	u32 aux_base; /* bits 1:0 reserved */
+};
+#define AUX_BASE_MASK		0xfffffffc
+
+/**
+ * struct smem_header - header found in beginning of primary smem region
+ * @proc_comm:		proc_comm communication interface (legacy)
+ * @version:		array of versions for the various subsystems
+ * @initialized:	boolean to indicate that smem is initialized
+ * @free_offset:	index of the first unallocated byte in smem
+ * @available:		number of bytes available for allocation
+ * @reserved:		reserved field, must be 0
+ * toc:			array of references to items
+ */
+struct smem_header {
+	struct smem_proc_comm proc_comm[4];
+	u32 version[32];
+	u32 initialized;
+	u32 free_offset;
+	u32 available;
+	u32 reserved;
+	struct smem_global_entry toc[SMEM_ITEM_COUNT];
+};
+
+/**
+ * struct smem_ptable_entry - one entry in the @smem_ptable list
+ * @offset:	offset, within the main shared memory region, of the partition
+ * @size:	size of the partition
+ * @flags:	flags for the partition (currently unused)
+ * @host0:	first processor/host with access to this partition
+ * @host1:	second processor/host with access to this partition
+ * @reserved:	reserved entries for later use
+ */
+struct smem_ptable_entry {
+	u32 offset;
+	u32 size;
+	u32 flags;
+	u16 host0;
+	u16 host1;
+	u32 reserved[8];
+};
+
+/**
+ * struct smem_ptable - partition table for the private partitions
+ * @magic:	magic number, must be SMEM_PTABLE_MAGIC
+ * @version:	version of the partition table
+ * @num_entries: number of partitions in the table
+ * @reserved:	for now reserved entries
+ * @entry:	list of @smem_ptable_entry for the @num_entries partitions
+ */
+struct smem_ptable {
+	u32 magic;
+	u32 version;
+	u32 num_entries;
+	u32 reserved[5];
+	struct smem_ptable_entry entry[];
+};
+#define SMEM_PTABLE_MAGIC	0x434f5424 /* "$TOC" */
+
+/**
+ * struct smem_partition_header - header of the partitions
+ * @magic:	magic number, must be SMEM_PART_MAGIC
+ * @host0:	first processor/host with access to this partition
+ * @host1:	second processor/host with access to this partition
+ * @size:	size of the partition
+ * @offset_free_uncached: offset to the first free byte of uncached memory in
+ *		this partition
+ * @offset_free_cached: offset to the first free byte of cached memory in this
+ *		partition
+ * @reserved:	for now reserved entries
+ */
+struct smem_partition_header {
+	u32 magic;
+	u16 host0;
+	u16 host1;
+	u32 size;
+	u32 offset_free_uncached;
+	u32 offset_free_cached;
+	u32 reserved[3];
+};
+#define SMEM_PART_MAGIC		0x54525024 /* "$PRT" */
+
+/**
+ * struct smem_private_entry - header of each item in the private partition
+ * @canary:	magic number, must be SMEM_PRIVATE_CANARY
+ * @item:	identifying number of the smem item
+ * @size:	size of the data, including padding bytes
+ * @padding_data: number of bytes of padding of data
+ * @padding_hdr: number of bytes of padding between the header and the data
+ * @reserved:	for now reserved entry
+ */
+struct smem_private_entry {
+	u16 canary;
+	u16 item;
+	u32 size; /* includes padding bytes */
+	u16 padding_data;
+	u16 padding_hdr;
+	u32 reserved;
+};
+#define SMEM_PRIVATE_CANARY	0xa5a5
+
+/**
+ * struct smem_region - representation of a chunk of memory used for smem
+ * @aux_base:	identifier of aux_mem base
+ * @virt_base:	virtual base address of memory with this aux_mem identifier
+ * @size:	size of the memory region
+ */
+struct smem_region {
+	u32 aux_base;
+	void __iomem *virt_base;
+	size_t size;
+};
+
+/**
+ * struct qcom_smem - device data for the smem device
+ * @dev:	device pointer
+ * @hwlock:	reference to a hwspinlock
+ * @partitions:	list of pointers to partitions affecting the current
+ *		processor/host
+ * @num_regions: number of @regions
+ * @regions:	list of the memory regions defining the shared memory
+ */
+struct qcom_smem {
+	struct device *dev;
+
+	struct hwspinlock *hwlock;
+
+	struct smem_partition_header *partitions[SMEM_HOST_COUNT];
+
+	unsigned num_regions;
+	struct smem_region regions[0];
+};
+
+/* Pointer to the one and only smem handle */
+static struct qcom_smem *__smem;
+
+/* Timeout (ms) for the trylock of remote spinlocks */
+#define HWSPINLOCK_TIMEOUT	1000
+
+static int qcom_smem_alloc_private(struct qcom_smem *smem,
+				   unsigned host,
+				   unsigned item,
+				   size_t size)
+{
+	struct smem_partition_header *phdr;
+	struct smem_private_entry *hdr;
+	size_t alloc_size;
+	void *p;
+
+	/* We're not going to find it if there's no matching partition */
+	if (host >= SMEM_HOST_COUNT || !smem->partitions[host])
+		return -ENOENT;
+
+	phdr = smem->partitions[host];
+
+	p = (void *)phdr + sizeof(*phdr);
+	while (p < (void *)phdr + phdr->offset_free_uncached) {
+		hdr = p;
+
+		if (hdr->canary != SMEM_PRIVATE_CANARY) {
+			dev_err(smem->dev,
+				"Found invalid canary in host %d partition\n",
+				host);
+			return -EINVAL;
+		}
+
+		if (hdr->item == item)
+			return -EEXIST;
+
+		p += sizeof(*hdr) + hdr->padding_hdr + hdr->size;
+	}
+
+	/* Check that we don't grow into the cached region */
+	alloc_size = sizeof(*hdr) + ALIGN(size, 8);
+	if (p + alloc_size >= (void *)phdr + phdr->offset_free_cached) {
+		dev_err(smem->dev, "Out of memory\n");
+		return -ENOSPC;
+	}
+
+	hdr = p;
+	hdr->canary = SMEM_PRIVATE_CANARY;
+	hdr->item = item;
+	hdr->size = ALIGN(size, 8);
+	hdr->padding_data = hdr->size - size;
+	hdr->padding_hdr = 0;
+
+	/*
+	 * Ensure the header is written before we advance the free offset, so
+	 * that remote processors that does not take the remote spinlock still
+	 * gets a consistent view of the linked list.
+	 */
+	wmb();
+	phdr->offset_free_uncached += alloc_size;
+
+	return 0;
+}
+
+static int qcom_smem_alloc_global(struct qcom_smem *smem,
+				  unsigned item,
+				  size_t size)
+{
+	struct smem_header *header;
+	struct smem_global_entry *entry;
+
+	if (WARN_ON(item >= SMEM_ITEM_COUNT))
+		return -EINVAL;
+
+	header = smem->regions[0].virt_base;
+	entry = &header->toc[item];
+	if (entry->allocated)
+		return -EEXIST;
+
+	size = ALIGN(size, 8);
+	if (WARN_ON(size > header->available))
+		return -ENOMEM;
+
+	entry->offset = header->free_offset;
+	entry->size = size;
+
+	/*
+	 * Ensure the header is consistent before we mark the item allocated,
+	 * so that remote processors will get a consistent view of the item
+	 * even though they do not take the spinlock on read.
+	 */
+	wmb();
+	entry->allocated = 1;
+
+	header->free_offset += size;
+	header->available -= size;
+
+	return 0;
+}
+
+/**
+ * qcom_smem_alloc() - allocate space for a smem item
+ * @host:	remote processor id, or -1
+ * @item:	smem item handle
+ * @size:	number of bytes to be allocated
+ *
+ * Allocate space for a given smem item of size @size, given that the item is
+ * not yet allocated.
+ */
+int qcom_smem_alloc(unsigned host, unsigned item, size_t size)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!__smem)
+		return -EPROBE_DEFER;
+
+	if (item < SMEM_ITEM_LAST_FIXED) {
+		dev_err(__smem->dev,
+			"Rejecting allocation of static entry %d\n", item);
+		return -EINVAL;
+	}
+
+	ret = hwspin_lock_timeout_irqsave(__smem->hwlock,
+					  HWSPINLOCK_TIMEOUT,
+					  &flags);
+	if (ret)
+		return ret;
+
+	ret = qcom_smem_alloc_private(__smem, host, item, size);
+	if (ret == -ENOENT)
+		ret = qcom_smem_alloc_global(__smem, item, size);
+
+	hwspin_unlock_irqrestore(__smem->hwlock, &flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(qcom_smem_alloc);
+
+static int qcom_smem_get_global(struct qcom_smem *smem,
+				unsigned item,
+				void **ptr,
+				size_t *size)
+{
+	struct smem_header *header;
+	struct smem_region *area;
+	struct smem_global_entry *entry;
+	u32 aux_base;
+	unsigned i;
+
+	if (WARN_ON(item >= SMEM_ITEM_COUNT))
+		return -EINVAL;
+
+	header = smem->regions[0].virt_base;
+	entry = &header->toc[item];
+	if (!entry->allocated)
+		return -ENXIO;
+
+	if (ptr != NULL) {
+		aux_base = entry->aux_base & AUX_BASE_MASK;
+
+		for (i = 0; i < smem->num_regions; i++) {
+			area = &smem->regions[i];
+
+			if (area->aux_base == aux_base || !aux_base) {
+				*ptr = area->virt_base + entry->offset;
+				break;
+			}
+		}
+	}
+	if (size != NULL)
+		*size = entry->size;
+
+	return 0;
+}
+
+static int qcom_smem_get_private(struct qcom_smem *smem,
+				 unsigned host,
+				 unsigned item,
+				 void **ptr,
+				 size_t *size)
+{
+	struct smem_partition_header *phdr;
+	struct smem_private_entry *hdr;
+	void *p;
+
+	/* We're not going to find it if there's no matching partition */
+	if (host >= SMEM_HOST_COUNT || !smem->partitions[host])
+		return -ENOENT;
+
+	phdr = smem->partitions[host];
+
+	p = (void *)phdr + sizeof(*phdr);
+	while (p < (void *)phdr + phdr->offset_free_uncached) {
+		hdr = p;
+
+		if (hdr->canary != SMEM_PRIVATE_CANARY) {
+			dev_err(smem->dev,
+				"Found invalid canary in host %d partition\n",
+				host);
+			return -EINVAL;
+		}
+
+		if (hdr->item == item) {
+			if (ptr != NULL)
+				*ptr = p + sizeof(*hdr) + hdr->padding_hdr;
+
+			if (size != NULL)
+				*size = hdr->size - hdr->padding_data;
+
+			return 0;
+		}
+
+		p += sizeof(*hdr) + hdr->padding_hdr + hdr->size;
+	}
+
+	return -ENOENT;
+}
+
+/**
+ * qcom_smem_get() - resolve ptr of size of a smem item
+ * @host:	the remote processor, or -1
+ * @item:	smem item handle
+ * @ptr:	pointer to be filled out with address of the item
+ * @size:	pointer to be filled out with size of the item
+ *
+ * Looks up pointer and size of a smem item.
+ */
+int qcom_smem_get(unsigned host, unsigned item, void **ptr, size_t *size)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!__smem)
+		return -EPROBE_DEFER;
+
+	ret = hwspin_lock_timeout_irqsave(__smem->hwlock,
+					  HWSPINLOCK_TIMEOUT,
+					  &flags);
+	if (ret)
+		return ret;
+
+	ret = qcom_smem_get_private(__smem, host, item, ptr, size);
+	if (ret == -ENOENT)
+		ret = qcom_smem_get_global(__smem, item, ptr, size);
+
+	hwspin_unlock_irqrestore(__smem->hwlock, &flags);
+	return ret;
+
+}
+EXPORT_SYMBOL(qcom_smem_get);
+
+/**
+ * qcom_smem_get_free_space() - retrieve amount of free space in a partition
+ * @host:	the remote processor identifying a partition, or -1
+ *
+ * To be used by smem clients as a quick way to determine if any new
+ * allocations has been made.
+ */
+int qcom_smem_get_free_space(unsigned host)
+{
+	struct smem_partition_header *phdr;
+	struct smem_header *header;
+	unsigned ret;
+
+	if (!__smem)
+		return -EPROBE_DEFER;
+
+	if (host < SMEM_HOST_COUNT && __smem->partitions[host]) {
+		phdr = __smem->partitions[host];
+		ret = phdr->offset_free_cached - phdr->offset_free_uncached;
+	} else {
+		header = __smem->regions[0].virt_base;
+		ret = header->available;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(qcom_smem_get_free_space);
+
+static int qcom_smem_get_sbl_version(struct qcom_smem *smem)
+{
+	unsigned *versions;
+	size_t size;
+	int ret;
+
+	ret = qcom_smem_get_global(smem, SMEM_ITEM_VERSION,
+				   (void **)&versions, &size);
+	if (ret < 0) {
+		dev_err(smem->dev, "Unable to read the version item\n");
+		return -ENOENT;
+	}
+
+	if (size < sizeof(unsigned) * SMEM_MASTER_SBL_VERSION_INDEX) {
+		dev_err(smem->dev, "Version item is too small\n");
+		return -EINVAL;
+	}
+
+	return versions[SMEM_MASTER_SBL_VERSION_INDEX];
+}
+
+static int qcom_smem_enumerate_partitions(struct qcom_smem *smem,
+					  unsigned local_host)
+{
+	struct smem_partition_header *header;
+	struct smem_ptable_entry *entry;
+	struct smem_ptable *ptable;
+	unsigned remote_host;
+	int i;
+
+	ptable = smem->regions[0].virt_base + smem->regions[0].size - SZ_4K;
+	if (ptable->magic != SMEM_PTABLE_MAGIC)
+		return 0;
+
+	if (ptable->version != 1) {
+		dev_err(smem->dev,
+			"Unsupported partition header version %d\n",
+			ptable->version);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < ptable->num_entries; i++) {
+		entry = &ptable->entry[i];
+
+		if (entry->host0 != local_host && entry->host1 != local_host)
+			continue;
+
+		if (!entry->offset)
+			continue;
+
+		if (!entry->size)
+			continue;
+
+		if (entry->host0 == local_host)
+			remote_host = entry->host1;
+		else
+			remote_host = entry->host0;
+
+		if (remote_host >= SMEM_HOST_COUNT) {
+			dev_err(smem->dev,
+				"Invalid remote host %d\n",
+				remote_host);
+			return -EINVAL;
+		}
+
+		if (smem->partitions[remote_host]) {
+			dev_err(smem->dev,
+				"Already found a partition for host %d\n",
+				remote_host);
+			return -EINVAL;
+		}
+
+		header = smem->regions[0].virt_base + entry->offset;
+
+		if (header->magic != SMEM_PART_MAGIC) {
+			dev_err(smem->dev,
+				"Partition %d has invalid magic\n", i);
+			return -EINVAL;
+		}
+
+		if (header->host0 != local_host && header->host1 != local_host) {
+			dev_err(smem->dev,
+				"Partition %d hosts are invalid\n", i);
+			return -EINVAL;
+		}
+
+		if (header->host0 != remote_host && header->host1 != remote_host) {
+			dev_err(smem->dev,
+				"Partition %d hosts are invalid\n", i);
+			return -EINVAL;
+		}
+
+		if (header->size != entry->size) {
+			dev_err(smem->dev,
+				"Partition %d has invalid size\n", i);
+			return -EINVAL;
+		}
+
+		if (header->offset_free_uncached > header->size) {
+			dev_err(smem->dev,
+				"Partition %d has invalid free pointer\n", i);
+			return -EINVAL;
+		}
+
+		smem->partitions[remote_host] = header;
+	}
+
+	return 0;
+}
+
+static int qcom_smem_count_mem_regions(struct platform_device *pdev)
+{
+	struct resource *res;
+	int num_regions = 0;
+	int i;
+
+	for (i = 0; i < pdev->num_resources; i++) {
+		res = &pdev->resource[i];
+
+		if (resource_type(res) == IORESOURCE_MEM)
+			num_regions++;
+	}
+
+	return num_regions;
+}
+
+static int qcom_smem_probe(struct platform_device *pdev)
+{
+	struct smem_header *header;
+	struct device_node *np;
+	struct qcom_smem *smem;
+	struct resource *res;
+	struct resource r;
+	size_t array_size;
+	int num_regions = 0;
+	int hwlock_id;
+	u32 version;
+	int ret;
+	int i;
+
+	num_regions = qcom_smem_count_mem_regions(pdev) + 1;
+
+	array_size = num_regions * sizeof(struct smem_region);
+	smem = devm_kzalloc(&pdev->dev, sizeof(*smem) + array_size, GFP_KERNEL);
+	if (!smem)
+		return -ENOMEM;
+
+	smem->dev = &pdev->dev;
+	smem->num_regions = num_regions;
+
+	np = of_parse_phandle(pdev->dev.of_node, "memory-region", 0);
+	if (!np) {
+		dev_err(&pdev->dev, "No memory-region specified\n");
+		return -EINVAL;
+	}
+
+	ret = of_address_to_resource(np, 0, &r);
+	of_node_put(np);
+	if (ret)
+		return ret;
+
+	smem->regions[0].aux_base = (u32)r.start;
+	smem->regions[0].size = resource_size(&r);
+	smem->regions[0].virt_base = devm_ioremap_nocache(&pdev->dev,
+							  r.start,
+							  resource_size(&r));
+	if (!smem->regions[0].virt_base)
+		return -ENOMEM;
+
+	for (i = 1; i < num_regions; i++) {
+		res = platform_get_resource(pdev, IORESOURCE_MEM, i - 1);
+
+		smem->regions[i].aux_base = (u32)res->start;
+		smem->regions[i].size = resource_size(res);
+		smem->regions[i].virt_base = devm_ioremap_nocache(&pdev->dev,
+								  res->start,
+								  resource_size(res));
+		if (!smem->regions[i].virt_base)
+			return -ENOMEM;
+	}
+
+	header = smem->regions[0].virt_base;
+	if (header->initialized != 1 || header->reserved) {
+		dev_err(&pdev->dev, "SMEM is not initialized by SBL\n");
+		return -EINVAL;
+	}
+
+	version = qcom_smem_get_sbl_version(smem);
+	if (version >> 16 != SMEM_EXPECTED_VERSION) {
+		dev_err(&pdev->dev, "Unsupported SMEM version 0x%x\n", version);
+		return -EINVAL;
+	}
+
+	ret = qcom_smem_enumerate_partitions(smem, SMEM_HOST_APPS);
+	if (ret < 0)
+		return ret;
+
+	hwlock_id = of_hwspin_lock_get_id(pdev->dev.of_node, 0);
+	if (hwlock_id < 0) {
+		dev_err(&pdev->dev, "failed to retrieve hwlock\n");
+		return hwlock_id;
+	}
+
+	smem->hwlock = hwspin_lock_request_specific(hwlock_id);
+	if (!smem->hwlock)
+		return -ENXIO;
+
+	__smem = smem;
+
+	return 0;
+}
+
+static int qcom_smem_remove(struct platform_device *pdev)
+{
+	__smem = NULL;
+	hwspin_lock_free(__smem->hwlock);
+
+	return 0;
+}
+
+static const struct of_device_id qcom_smem_of_match[] = {
+	{ .compatible = "qcom,smem" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, qcom_smem_of_match);
+
+static struct platform_driver qcom_smem_driver = {
+	.probe = qcom_smem_probe,
+	.remove = qcom_smem_remove,
+	.driver  = {
+		.name = "qcom-smem",
+		.of_match_table = qcom_smem_of_match,
+		.suppress_bind_attrs = true,
+	},
+};
+
+static int __init qcom_smem_init(void)
+{
+	return platform_driver_register(&qcom_smem_driver);
+}
+arch_initcall(qcom_smem_init);
+
+static void __exit qcom_smem_exit(void)
+{
+	platform_driver_unregister(&qcom_smem_driver);
+}
+module_exit(qcom_smem_exit)
+
+MODULE_AUTHOR("Bjorn Andersson <bjorn.andersson@sonymobile.com>");
+MODULE_DESCRIPTION("Qualcomm Shared Memory Manager");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/soc/qcom/smem.h b/include/linux/soc/qcom/smem.h
new file mode 100644
index 000000000000..bc9630d3aced
--- /dev/null
+++ b/include/linux/soc/qcom/smem.h
@@ -0,0 +1,11 @@
+#ifndef __QCOM_SMEM_H__
+#define __QCOM_SMEM_H__
+
+#define QCOM_SMEM_HOST_ANY -1
+
+int qcom_smem_alloc(unsigned host, unsigned item, size_t size);
+int qcom_smem_get(unsigned host, unsigned item, void **ptr, size_t *size);
+
+int qcom_smem_get_free_space(unsigned host);
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From 0933328a1b8adb6c8b2b8c8b823dad0295659c40 Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Wed, 29 Jul 2015 00:09:02 +0200
Subject: stmmac: remove unused stmmac_of_data struct

As dwmac-* drivers that need OF match have been converted
to use their own internal OF match data structure this can
now be removed.

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/stmmac.txt |  2 --
 include/linux/stmmac.h              | 18 ------------------
 2 files changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 5fddefa69baf..de5c42342ec3 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -274,8 +274,6 @@ capability register can replace what has been passed from the platform.
 Please see the following document:
 	Documentation/devicetree/bindings/net/stmmac.txt
 
-and the stmmac_of_data structure inside the include/linux/stmmac.h header file.
-
 4.11) This is a summary of the content of some relevant files:
  o stmmac_main.c: to implement the main network device driver;
  o stmmac_mdio.c: to provide mdio functions;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c86a20047cb1..b43cd56b78e9 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -125,22 +125,4 @@ struct plat_stmmacenet_data {
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
 };
-
-/* of_data for SoC glue layer device tree bindings */
-
-struct stmmac_of_data {
-	int has_gmac;
-	int enh_desc;
-	int tx_coe;
-	int rx_coe;
-	int bugged_jumbo;
-	int pmt;
-	int riwt_off;
-	void (*fix_mac_speed)(void *priv, unsigned int speed);
-	void (*bus_setup)(void __iomem *ioaddr);
-	void *(*setup)(struct platform_device *pdev);
-	void (*free)(struct platform_device *pdev, void *priv);
-	int (*init)(struct platform_device *pdev, void *priv);
-	void (*exit)(struct platform_device *pdev, void *priv);
-};
 #endif
-- 
cgit v1.2.3-70-g09d2


From 75fee59550a9899fd9438ebc0a64c972829a8dd2 Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Wed, 29 Jul 2015 00:09:03 +0200
Subject: stmmac: remove setup/free glue callbacks

As all dwmac-* drivers have been converted to have a proper probe
function the setup callback can now be removed. Also remove the
free callback that wasn't used by any driver.

New dwmac-* drivers should implement standard probe and remove
functions to preform any needed setup and teardown.

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/stmmac.txt                   | 8 ++------
 drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c   | 7 -------
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 ---
 include/linux/stmmac.h                                | 2 --
 4 files changed, 2 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index de5c42342ec3..2903b1cf4d70 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -135,8 +135,6 @@ struct plat_stmmacenet_data {
 	int maxmtu;
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	void (*bus_setup)(void __iomem *ioaddr);
-	void *(*setup)(struct platform_device *pdev);
-	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
@@ -177,12 +175,10 @@ Where:
  o bus_setup: perform HW setup of the bus. For example, on some ST platforms
 	     this field is used to configure the AMBA  bridge to generate more
 	     efficient STBus traffic.
- o setup/init/exit: callbacks used for calling a custom initialization;
+ o init/exit: callbacks used for calling a custom initialization;
 	     this is sometime necessary on some platforms (e.g. ST boxes)
 	     where the HW needs to have set some PIO lines or system cfg
-	     registers. setup should return a pointer to private data,
-	     which will be stored in bsp_priv, and then passed to init and
-	     exit callbacks. init/exit callbacks should not use or modify
+	     registers.  init/exit callbacks should not use or modify
 	     platform data.
  o bsp_priv: another private pointer.
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
index f4fe9f1a33b4..b1e5f24708c9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
@@ -46,13 +46,6 @@ static int dwmac_generic_probe(struct platform_device *pdev)
 		plat_dat->unicast_filter_entries = 1;
 	}
 
-	/* Custom setup (if needed) */
-	if (plat_dat->setup) {
-		plat_dat->bsp_priv = plat_dat->setup(pdev);
-		if (IS_ERR(plat_dat->bsp_priv))
-			return PTR_ERR(plat_dat->bsp_priv);
-	}
-
 	/* Custom initialisation (if needed) */
 	if (plat_dat->init) {
 		ret = plat_dat->init(pdev, plat_dat->bsp_priv);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 55e569b330b2..1cb660405f35 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -300,9 +300,6 @@ int stmmac_pltfr_remove(struct platform_device *pdev)
 	if (priv->plat->exit)
 		priv->plat->exit(pdev, priv->plat->bsp_priv);
 
-	if (priv->plat->free)
-		priv->plat->free(pdev, priv->plat->bsp_priv);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stmmac_pltfr_remove);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index b43cd56b78e9..eead8ab93c0a 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -119,8 +119,6 @@ struct plat_stmmacenet_data {
 	int rx_fifo_size;
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	void (*bus_setup)(void __iomem *ioaddr);
-	void *(*setup)(struct platform_device *pdev);
-	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
-- 
cgit v1.2.3-70-g09d2


From d71ba788345c2b5646101766e0c52714a9b5ed7f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 29 Jul 2015 11:56:48 +0200
Subject: KVM: move code related to KVM_SET_BOOT_CPU_ID to x86

This is another remnant of ia64 support.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/x86.c              | 21 +++++++++++++++++++++
 include/linux/kvm_host.h        | 16 ----------------
 virt/kvm/kvm_main.c             | 14 --------------
 4 files changed, 24 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fa32b5314dcd..2f9e504f9f0c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -667,6 +667,7 @@ struct kvm_arch {
 	#endif
 
 	bool boot_vcpu_runs_old_kvmclock;
+	u32 bsp_vcpu_id;
 
 	u64 disabled_quirks;
 };
@@ -1215,5 +1216,7 @@ int __x86_set_memory_region(struct kvm *kvm,
 			    const struct kvm_userspace_memory_region *mem);
 int x86_set_memory_region(struct kvm *kvm,
 			  const struct kvm_userspace_memory_region *mem);
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 28076c266a9a..c675ea3351cf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2461,6 +2461,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_TSC_DEADLINE_TIMER:
 	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_DISABLE_QUIRKS:
+	case KVM_CAP_SET_BOOT_CPU_ID:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
@@ -3777,6 +3778,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_reinject(kvm, &control);
 		break;
 	}
+	case KVM_SET_BOOT_CPU_ID:
+		r = 0;
+		mutex_lock(&kvm->lock);
+		if (atomic_read(&kvm->online_vcpus) != 0)
+			r = -EBUSY;
+		else
+			kvm->arch.bsp_vcpu_id = arg;
+		mutex_unlock(&kvm->lock);
+		break;
 	case KVM_XEN_HVM_CONFIG: {
 		r = -EFAULT;
 		if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
@@ -7291,6 +7301,17 @@ void kvm_arch_check_processor_compat(void *rtn)
 	kvm_x86_ops->check_processor_compatibility(rtn);
 }
 
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
+
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
+}
+
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 {
 	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 51103f0feb7e..bd1097a95704 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -364,9 +364,6 @@ struct kvm {
 	struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct srcu_struct srcu;
 	struct srcu_struct irq_srcu;
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-	u32 bsp_vcpu_id;
-#endif
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	atomic_t online_vcpus;
 	int last_boosted_vcpu;
@@ -1059,22 +1056,9 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
-static inline bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
-{
-	return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
-}
-
-static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
-{
-	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
-}
-
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
-
 #else
-
 static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
-
 #endif
 
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b8a44453670..8dc4828f623f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2618,9 +2618,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-	case KVM_CAP_SET_BOOT_CPU_ID:
-#endif
 	case KVM_CAP_INTERNAL_ERROR_DATA:
 #ifdef CONFIG_HAVE_KVM_MSI
 	case KVM_CAP_SIGNAL_MSI:
@@ -2716,17 +2713,6 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_ioeventfd(kvm, &data);
 		break;
 	}
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-	case KVM_SET_BOOT_CPU_ID:
-		r = 0;
-		mutex_lock(&kvm->lock);
-		if (atomic_read(&kvm->online_vcpus) != 0)
-			r = -EBUSY;
-		else
-			kvm->bsp_vcpu_id = arg;
-		mutex_unlock(&kvm->lock);
-		break;
-#endif
 #ifdef CONFIG_HAVE_KVM_MSI
 	case KVM_SIGNAL_MSI: {
 		struct kvm_msi msi;
-- 
cgit v1.2.3-70-g09d2


From e075867681ca9b8c0b8823e24d0fb4ce3b4f2655 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 10 Oct 2014 02:44:01 +0200
Subject: jiffies: Remove HZ > USEC_PER_SEC special case

HZ never goes much further 1000 and a bit. And if we ever reach one tick
per microsecond, we might be having a problem.

Lets stop maintaining this special case, just leave a paranoid check.

Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc; John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/jiffies.h |  9 +--------
 kernel/time/time.c      | 10 +++++++---
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 535fd3bb1ba8..7c6febede6ba 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -363,18 +363,11 @@ static inline unsigned long msecs_to_jiffies(const unsigned int m)
 }
 
 extern unsigned long __usecs_to_jiffies(const unsigned int u);
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+#if !(USEC_PER_SEC % HZ)
 static inline unsigned long _usecs_to_jiffies(const unsigned int u)
 {
 	return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
 }
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
-static inline unsigned long _usecs_to_jiffies(const unsigned int u)
-{
-	return u * (HZ / USEC_PER_SEC);
-}
-static inline unsigned long _usecs_to_jiffies(const unsigned int u)
-{
 #else
 static inline unsigned long _usecs_to_jiffies(const unsigned int u)
 {
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 85d5bb1d67eb..ad1bf23e6eb7 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -268,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs);
 
 unsigned int jiffies_to_usecs(const unsigned long j)
 {
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+	/*
+	 * Hz usually doesn't go much further MSEC_PER_SEC.
+	 * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
+	 */
+	BUILD_BUG_ON(HZ > USEC_PER_SEC);
+
+#if !(USEC_PER_SEC % HZ)
 	return (USEC_PER_SEC / HZ) * j;
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
-	return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
 #else
 # if BITS_PER_LONG == 32
 	return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
-- 
cgit v1.2.3-70-g09d2


From 03f6199a359e460714b6bd08c10b566760f150a6 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@ezchip.com>
Date: Fri, 10 Jul 2015 15:37:25 -0400
Subject: nohz: Prevent tilegx network driver interrupts

Normally the tilegx networking shim sends irqs to all the cores
to distribute the load of processing incoming-packet interrupts,
so that you can get to multiple Gb's of traffic inbound.

However, in nohz_full mode we don't want to interrupt the
nohz_full cores by default, so we limit the set of cores we use
to only the online housekeeping cores.

To make client code easier to read, we introduce a new nohz_full
accessor, housekeeping_cpumask(), which returns a pointer to the
housekeeping_mask if nohz_full is enabled, and otherwise returns
the cpu_possible_mask.

Signed-off-by: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 drivers/net/ethernet/tile/tilegx.c | 4 +++-
 include/linux/tick.h               | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index a3f7610002aa..0a15acc075b3 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -40,6 +40,7 @@
 #include <linux/tcp.h>
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
+#include <linux/tick.h>
 
 #include <asm/checksum.h>
 #include <asm/homecache.h>
@@ -2273,7 +2274,8 @@ static int __init tile_net_init_module(void)
 		tile_net_dev_init(name, mac);
 
 	if (!network_cpus_init())
-		network_cpus_map = *cpu_online_mask;
+		cpumask_and(&network_cpus_map, housekeeping_cpumask(),
+			    cpu_online_mask);
 
 	return 0;
 }
diff --git a/include/linux/tick.h b/include/linux/tick.h
index edbfc9a5293e..1ca93f2de6f5 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -163,6 +163,15 @@ static inline void tick_nohz_full_kick_all(void) { }
 static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
 #endif
 
+static inline const struct cpumask *housekeeping_cpumask(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		return housekeeping_mask;
+#endif
+	return cpu_possible_mask;
+}
+
 static inline bool is_housekeeping_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
-- 
cgit v1.2.3-70-g09d2


From 73738a95d00467812664b7f86ba3052f5faf96d7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 27 May 2015 19:22:08 +0200
Subject: nohz: Restart nohz full tick from irq exit

Restart the tick when necessary from the irq exit path. It makes nohz
full more flexible, simplify the related IPIs and doesn't bring
significant overhead on irq exit.

In a longer term view, it will allow us to piggyback the nohz kick
on the scheduler IPI in the future instead of sending a dedicated IPI
that often doubles the scheduler IPI on task wakeup. This will require
more changes though including careful review of resched_curr() callers
to include nohz full needs.

Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/tick.h     |  8 --------
 kernel/time/tick-sched.c | 34 ++++++++++------------------------
 2 files changed, 10 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 1ca93f2de6f5..7d35b0fec399 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -147,7 +147,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
 		cpumask_or(mask, mask, tick_nohz_full_mask);
 }
 
-extern void __tick_nohz_full_check(void);
 extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void tick_nohz_full_kick_all(void);
@@ -156,7 +155,6 @@ extern void __tick_nohz_task_switch(struct task_struct *tsk);
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
-static inline void __tick_nohz_full_check(void) { }
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void tick_nohz_full_kick(void) { }
 static inline void tick_nohz_full_kick_all(void) { }
@@ -190,12 +188,6 @@ static inline void housekeeping_affine(struct task_struct *t)
 #endif
 }
 
-static inline void tick_nohz_full_check(void)
-{
-	if (tick_nohz_full_enabled())
-		__tick_nohz_full_check();
-}
-
 static inline void tick_nohz_task_switch(struct task_struct *tsk)
 {
 	if (tick_nohz_full_enabled())
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d6c8eff6e7b4..a06cd4af0ff1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -197,25 +197,9 @@ static bool can_stop_full_tick(void)
 	return true;
 }
 
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
-
-/*
- * Re-evaluate the need for the tick on the current CPU
- * and restart it if necessary.
- */
-void __tick_nohz_full_check(void)
-{
-	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
-
-	if (tick_nohz_full_cpu(smp_processor_id())) {
-		if (ts->tick_stopped && !can_stop_full_tick())
-			tick_nohz_restart_sched_tick(ts, ktime_get());
-	}
-}
-
 static void nohz_full_kick_work_func(struct irq_work *work)
 {
-	__tick_nohz_full_check();
+	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
 }
 
 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -250,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu)
 
 static void nohz_full_kick_ipi(void *info)
 {
-	__tick_nohz_full_check();
+	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
 }
 
 /*
@@ -703,7 +687,9 @@ out:
 	return tick;
 }
 
-static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
+
+static void tick_nohz_full_update_tick(struct tick_sched *ts)
 {
 #ifdef CONFIG_NO_HZ_FULL
 	int cpu = smp_processor_id();
@@ -714,10 +700,10 @@ static void tick_nohz_full_stop_tick(struct tick_sched *ts)
 	if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
 		return;
 
-	if (!can_stop_full_tick())
-		return;
-
-	tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+	if (can_stop_full_tick())
+		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+	else if (ts->tick_stopped)
+		tick_nohz_restart_sched_tick(ts, ktime_get());
 #endif
 }
 
@@ -847,7 +833,7 @@ void tick_nohz_irq_exit(void)
 	if (ts->inidle)
 		__tick_nohz_idle_enter(ts);
 	else
-		tick_nohz_full_stop_tick(ts);
+		tick_nohz_full_update_tick(ts);
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From de734f89b67c2df30e35a09e7e56a3659e5b6ac6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 11 Jun 2015 18:07:12 +0200
Subject: nohz: Remove useless argument on tick_nohz_task_switch()

Leftover from early code.

Cc: Christoph Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/tick.h     | 8 ++++----
 kernel/sched/core.c      | 2 +-
 kernel/time/tick-sched.c | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7d35b0fec399..48d901f83f92 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -150,7 +150,7 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
 extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void tick_nohz_full_kick_all(void);
-extern void __tick_nohz_task_switch(struct task_struct *tsk);
+extern void __tick_nohz_task_switch(void);
 #else
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
@@ -158,7 +158,7 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void tick_nohz_full_kick(void) { }
 static inline void tick_nohz_full_kick_all(void) { }
-static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
+static inline void __tick_nohz_task_switch(void) { }
 #endif
 
 static inline const struct cpumask *housekeeping_cpumask(void)
@@ -188,10 +188,10 @@ static inline void housekeeping_affine(struct task_struct *t)
 #endif
 }
 
-static inline void tick_nohz_task_switch(struct task_struct *tsk)
+static inline void tick_nohz_task_switch(void)
 {
 	if (tick_nohz_full_enabled())
-		__tick_nohz_task_switch(tsk);
+		__tick_nohz_task_switch();
 }
 
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..4d34035bb3ee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2489,7 +2489,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		put_task_struct(prev);
 	}
 
-	tick_nohz_task_switch(current);
+	tick_nohz_task_switch();
 	return rq;
 }
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6b0d14d4c350..3319e16f31e5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -258,7 +258,7 @@ void tick_nohz_full_kick_all(void)
  * It might need the tick due to per task/process properties:
  * perf events, posix cpu timers, ...
  */
-void __tick_nohz_task_switch(struct task_struct *tsk)
+void __tick_nohz_task_switch(void)
 {
 	unsigned long flags;
 
-- 
cgit v1.2.3-70-g09d2


From 4246a0b63bd8f56a1469b12eafeb875b1041a451 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Jul 2015 15:29:37 +0200
Subject: block: add a bi_error field to struct bio

Currently we have two different ways to signal an I/O error on a BIO:

 (1) by clearing the BIO_UPTODATE flag
 (2) by returning a Linux errno value to the bi_end_io callback

The first one has the drawback of only communicating a single possible
error (-EIO), and the second one has the drawback of not beeing persistent
when bios are queued up, and are not passed along from child to parent
bio in the ever more popular chaining scenario.  Having both mechanisms
available has the additional drawback of utterly confusing driver authors
and introducing bugs where various I/O submitters only deal with one of
them, and the others have to add boilerplate code to deal with both kinds
of error returns.

So add a new bi_error field to store an errno value directly in struct
bio and remove the existing mechanisms to clean all this up.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/biodoc.txt      |  2 +-
 arch/m68k/emu/nfblock.c             |  2 +-
 arch/powerpc/sysdev/axonram.c       |  2 +-
 arch/xtensa/platforms/iss/simdisk.c | 12 ++-----
 block/bio-integrity.c               | 11 +++----
 block/bio.c                         | 43 +++++++++++--------------
 block/blk-core.c                    | 15 ++++-----
 block/blk-lib.c                     | 30 ++++++++----------
 block/blk-map.c                     |  2 +-
 block/blk-mq.c                      |  6 ++--
 block/bounce.c                      | 27 ++++++++--------
 drivers/block/aoe/aoecmd.c          | 10 +++---
 drivers/block/aoe/aoedev.c          |  2 +-
 drivers/block/brd.c                 | 13 +++++---
 drivers/block/drbd/drbd_actlog.c    |  4 +--
 drivers/block/drbd/drbd_bitmap.c    | 19 +++---------
 drivers/block/drbd/drbd_int.h       | 11 ++++---
 drivers/block/drbd/drbd_req.c       | 10 +++---
 drivers/block/drbd/drbd_worker.c    | 44 +++++++-------------------
 drivers/block/floppy.c              |  7 +++--
 drivers/block/null_blk.c            |  2 +-
 drivers/block/pktcdvd.c             | 32 +++++++++----------
 drivers/block/ps3vram.c             |  3 +-
 drivers/block/rsxx/dev.c            |  9 ++++--
 drivers/block/umem.c                |  4 +--
 drivers/block/xen-blkback/blkback.c |  4 +--
 drivers/block/xen-blkfront.c        |  9 ++----
 drivers/block/zram/zram_drv.c       |  5 ++-
 drivers/md/bcache/btree.c           | 10 +++---
 drivers/md/bcache/closure.h         |  2 +-
 drivers/md/bcache/io.c              |  8 ++---
 drivers/md/bcache/journal.c         |  8 ++---
 drivers/md/bcache/movinggc.c        |  8 ++---
 drivers/md/bcache/request.c         | 27 ++++++++--------
 drivers/md/bcache/super.c           | 14 ++++-----
 drivers/md/bcache/writeback.c       | 10 +++---
 drivers/md/dm-bio-prison.c          |  6 ++--
 drivers/md/dm-bufio.c               | 26 ++++++++++------
 drivers/md/dm-cache-target.c        | 24 +++++++-------
 drivers/md/dm-crypt.c               | 14 ++++-----
 drivers/md/dm-flakey.c              |  2 +-
 drivers/md/dm-io.c                  |  6 ++--
 drivers/md/dm-log-writes.c          | 11 +++----
 drivers/md/dm-raid1.c               | 24 +++++++-------
 drivers/md/dm-snap.c                |  6 ++--
 drivers/md/dm-stripe.c              |  2 +-
 drivers/md/dm-thin.c                | 41 +++++++++++++-----------
 drivers/md/dm-verity.c              |  9 +++---
 drivers/md/dm-zero.c                |  2 +-
 drivers/md/dm.c                     | 15 +++++----
 drivers/md/faulty.c                 |  4 +--
 drivers/md/linear.c                 |  2 +-
 drivers/md/md.c                     | 18 +++++------
 drivers/md/multipath.c              | 12 +++----
 drivers/md/raid0.c                  |  2 +-
 drivers/md/raid1.c                  | 53 ++++++++++++++++---------------
 drivers/md/raid10.c                 | 55 +++++++++++++++-----------------
 drivers/md/raid5.c                  | 52 +++++++++++++++----------------
 drivers/nvdimm/blk.c                |  5 +--
 drivers/nvdimm/btt.c                |  5 +--
 drivers/nvdimm/pmem.c               |  2 +-
 drivers/s390/block/dcssblk.c        |  2 +-
 drivers/s390/block/xpram.c          |  3 +-
 drivers/target/target_core_iblock.c | 21 +++++--------
 drivers/target/target_core_pscsi.c  |  6 ++--
 fs/btrfs/check-integrity.c          | 10 +++---
 fs/btrfs/compression.c              | 24 ++++++++------
 fs/btrfs/disk-io.c                  | 35 +++++++++++----------
 fs/btrfs/extent_io.c                | 30 +++++++-----------
 fs/btrfs/inode.c                    | 50 ++++++++++++++++--------------
 fs/btrfs/raid56.c                   | 62 +++++++++++++++++--------------------
 fs/btrfs/scrub.c                    | 22 ++++++-------
 fs/btrfs/volumes.c                  | 23 +++++++-------
 fs/buffer.c                         |  4 +--
 fs/direct-io.c                      | 13 ++++----
 fs/ext4/page-io.c                   | 15 ++++-----
 fs/ext4/readpage.c                  |  6 ++--
 fs/f2fs/data.c                      | 10 +++---
 fs/gfs2/lops.c                      | 10 +++---
 fs/gfs2/ops_fstype.c                |  6 ++--
 fs/jfs/jfs_logmgr.c                 |  8 ++---
 fs/jfs/jfs_metapage.c               |  8 ++---
 fs/logfs/dev_bdev.c                 | 12 +++----
 fs/mpage.c                          |  4 +--
 fs/nfs/blocklayout/blocklayout.c    | 14 ++++-----
 fs/nilfs2/segbuf.c                  |  5 ++-
 fs/ocfs2/cluster/heartbeat.c        |  9 +++---
 fs/xfs/xfs_aops.c                   |  5 ++-
 fs/xfs/xfs_buf.c                    |  7 ++---
 include/linux/bio.h                 | 13 +++++---
 include/linux/blk_types.h           |  4 +--
 include/linux/swap.h                |  4 +--
 kernel/power/swap.c                 | 12 +++----
 kernel/trace/blktrace.c             | 10 ++----
 mm/page_io.c                        | 12 +++----
 95 files changed, 622 insertions(+), 682 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index fd12c0d835fd..5be8a7f4cc7f 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -1109,7 +1109,7 @@ it will loop and handle as many sectors (on a bio-segment granularity)
 as specified.
 
 Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the
-right thing to use is bio_endio(bio, uptodate) instead.
+right thing to use is bio_endio(bio) instead.
 
 If the driver is dropping the io_request_lock from its request_fn strategy,
 then it just needs to replace that with q->queue_lock instead.
diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 2d75ae246167..f2a00c591bf7 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -76,7 +76,7 @@ static void nfhd_make_request(struct request_queue *queue, struct bio *bio)
 				bvec_to_phys(&bvec));
 		sec += len;
 	}
-	bio_endio(bio, 0);
+	bio_endio(bio);
 }
 
 static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index ee90db17b097..f86250c48b53 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -132,7 +132,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 		phys_mem += vec.bv_len;
 		transfered += vec.bv_len;
 	}
-	bio_endio(bio, 0);
+	bio_endio(bio);
 }
 
 /**
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 48eebacdf5fe..fa84ca990caa 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -101,8 +101,9 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
 	spin_unlock(&dev->lock);
 }
 
-static int simdisk_xfer_bio(struct simdisk *dev, struct bio *bio)
+static void simdisk_make_request(struct request_queue *q, struct bio *bio)
 {
+	struct simdisk *dev = q->queuedata;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
 	sector_t sector = bio->bi_iter.bi_sector;
@@ -116,17 +117,10 @@ static int simdisk_xfer_bio(struct simdisk *dev, struct bio *bio)
 		sector += len;
 		__bio_kunmap_atomic(buffer);
 	}
-	return 0;
-}
 
-static void simdisk_make_request(struct request_queue *q, struct bio *bio)
-{
-	struct simdisk *dev = q->queuedata;
-	int status = simdisk_xfer_bio(dev, bio);
-	bio_endio(bio, status);
+	bio_endio(bio);
 }
 
-
 static int simdisk_open(struct block_device *bdev, fmode_t mode)
 {
 	struct simdisk *dev = bdev->bd_disk->private_data;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 719b7152aed1..4aecca79374a 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -355,13 +355,12 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 		container_of(work, struct bio_integrity_payload, bip_work);
 	struct bio *bio = bip->bip_bio;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	int error;
 
-	error = bio_integrity_process(bio, bi->verify_fn);
+	bio->bi_error = bio_integrity_process(bio, bi->verify_fn);
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-	bio_endio(bio, error);
+	bio_endio(bio);
 }
 
 /**
@@ -376,7 +375,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
  * in process context.	This function postpones completion
  * accordingly.
  */
-void bio_integrity_endio(struct bio *bio, int error)
+void bio_integrity_endio(struct bio *bio)
 {
 	struct bio_integrity_payload *bip = bio_integrity(bio);
 
@@ -386,9 +385,9 @@ void bio_integrity_endio(struct bio *bio, int error)
 	 * integrity metadata.  Restore original bio end_io handler
 	 * and run it.
 	 */
-	if (error) {
+	if (bio->bi_error) {
 		bio->bi_end_io = bip->bip_end_io;
-		bio_endio(bio, error);
+		bio_endio(bio);
 
 		return;
 	}
diff --git a/block/bio.c b/block/bio.c
index 2a00d349cd68..a23f489f398f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -269,7 +269,6 @@ static void bio_free(struct bio *bio)
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
-	bio->bi_flags = 1 << BIO_UPTODATE;
 	atomic_set(&bio->__bi_remaining, 1);
 	atomic_set(&bio->__bi_cnt, 1);
 }
@@ -292,14 +291,17 @@ void bio_reset(struct bio *bio)
 	__bio_free(bio);
 
 	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags | (1 << BIO_UPTODATE);
+	bio->bi_flags = flags;
 	atomic_set(&bio->__bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);
 
-static void bio_chain_endio(struct bio *bio, int error)
+static void bio_chain_endio(struct bio *bio)
 {
-	bio_endio(bio->bi_private, error);
+	struct bio *parent = bio->bi_private;
+
+	parent->bi_error = bio->bi_error;
+	bio_endio(parent);
 	bio_put(bio);
 }
 
@@ -896,11 +898,11 @@ struct submit_bio_ret {
 	int error;
 };
 
-static void submit_bio_wait_endio(struct bio *bio, int error)
+static void submit_bio_wait_endio(struct bio *bio)
 {
 	struct submit_bio_ret *ret = bio->bi_private;
 
-	ret->error = error;
+	ret->error = bio->bi_error;
 	complete(&ret->event);
 }
 
@@ -1445,7 +1447,7 @@ void bio_unmap_user(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_unmap_user);
 
-static void bio_map_kern_endio(struct bio *bio, int err)
+static void bio_map_kern_endio(struct bio *bio)
 {
 	bio_put(bio);
 }
@@ -1501,13 +1503,13 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 }
 EXPORT_SYMBOL(bio_map_kern);
 
-static void bio_copy_kern_endio(struct bio *bio, int err)
+static void bio_copy_kern_endio(struct bio *bio)
 {
 	bio_free_pages(bio);
 	bio_put(bio);
 }
 
-static void bio_copy_kern_endio_read(struct bio *bio, int err)
+static void bio_copy_kern_endio_read(struct bio *bio)
 {
 	char *p = bio->bi_private;
 	struct bio_vec *bvec;
@@ -1518,7 +1520,7 @@ static void bio_copy_kern_endio_read(struct bio *bio, int err)
 		p += bvec->bv_len;
 	}
 
-	bio_copy_kern_endio(bio, err);
+	bio_copy_kern_endio(bio);
 }
 
 /**
@@ -1778,25 +1780,15 @@ static inline bool bio_remaining_done(struct bio *bio)
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
- * @error:	error, if any
  *
  * Description:
- *   bio_endio() will end I/O on the whole bio. bio_endio() is the
- *   preferred way to end I/O on a bio, it takes care of clearing
- *   BIO_UPTODATE on error. @error is 0 on success, and and one of the
- *   established -Exxxx (-EIO, for instance) error values in case
- *   something went wrong. No one should call bi_end_io() directly on a
- *   bio unless they own it and thus know that it has an end_io
- *   function.
+ *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
+ *   way to end I/O on a bio. No one should call bi_end_io() directly on a
+ *   bio unless they own it and thus know that it has an end_io function.
  **/
-void bio_endio(struct bio *bio, int error)
+void bio_endio(struct bio *bio)
 {
 	while (bio) {
-		if (error)
-			clear_bit(BIO_UPTODATE, &bio->bi_flags);
-		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-			error = -EIO;
-
 		if (unlikely(!bio_remaining_done(bio)))
 			break;
 
@@ -1810,11 +1802,12 @@ void bio_endio(struct bio *bio, int error)
 		 */
 		if (bio->bi_end_io == bio_chain_endio) {
 			struct bio *parent = bio->bi_private;
+			parent->bi_error = bio->bi_error;
 			bio_put(bio);
 			bio = parent;
 		} else {
 			if (bio->bi_end_io)
-				bio->bi_end_io(bio, error);
+				bio->bi_end_io(bio);
 			bio = NULL;
 		}
 	}
diff --git a/block/blk-core.c b/block/blk-core.c
index 627ed0c593fb..7ef15b947b91 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -143,9 +143,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
 	if (error)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-		error = -EIO;
+		bio->bi_error = error;
 
 	if (unlikely(rq->cmd_flags & REQ_QUIET))
 		set_bit(BIO_QUIET, &bio->bi_flags);
@@ -154,7 +152,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 
 	/* don't actually finish bio if it's part of flush sequence */
 	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
-		bio_endio(bio, error);
+		bio_endio(bio);
 }
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -1620,7 +1618,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		bio_endio(bio, -EIO);
+		bio->bi_error = -EIO;
+		bio_endio(bio);
 		return;
 	}
 
@@ -1673,7 +1672,8 @@ get_rq:
 	 */
 	req = get_request(q, rw_flags, bio, GFP_NOIO);
 	if (IS_ERR(req)) {
-		bio_endio(bio, PTR_ERR(req));	/* @q is dead */
+		bio->bi_error = PTR_ERR(req);
+		bio_endio(bio);
 		goto out_unlock;
 	}
 
@@ -1896,7 +1896,8 @@ generic_make_request_checks(struct bio *bio)
 	return true;
 
 end_io:
-	bio_endio(bio, err);
+	bio->bi_error = err;
+	bio_endio(bio);
 	return false;
 }
 
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 7688ee3f5d72..6dee17443f14 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -11,16 +11,16 @@
 
 struct bio_batch {
 	atomic_t		done;
-	unsigned long		flags;
+	int			error;
 	struct completion	*wait;
 };
 
-static void bio_batch_end_io(struct bio *bio, int err)
+static void bio_batch_end_io(struct bio *bio)
 {
 	struct bio_batch *bb = bio->bi_private;
 
-	if (err && (err != -EOPNOTSUPP))
-		clear_bit(BIO_UPTODATE, &bb->flags);
+	if (bio->bi_error && bio->bi_error != -EOPNOTSUPP)
+		bb->error = bio->bi_error;
 	if (atomic_dec_and_test(&bb->done))
 		complete(bb->wait);
 	bio_put(bio);
@@ -78,7 +78,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	}
 
 	atomic_set(&bb.done, 1);
-	bb.flags = 1 << BIO_UPTODATE;
+	bb.error = 0;
 	bb.wait = &wait;
 
 	blk_start_plug(&plug);
@@ -134,9 +134,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	if (!atomic_dec_and_test(&bb.done))
 		wait_for_completion_io(&wait);
 
-	if (!test_bit(BIO_UPTODATE, &bb.flags))
-		ret = -EIO;
-
+	if (bb.error)
+		return bb.error;
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
@@ -172,7 +171,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		return -EOPNOTSUPP;
 
 	atomic_set(&bb.done, 1);
-	bb.flags = 1 << BIO_UPTODATE;
+	bb.error = 0;
 	bb.wait = &wait;
 
 	while (nr_sects) {
@@ -208,9 +207,8 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	if (!atomic_dec_and_test(&bb.done))
 		wait_for_completion_io(&wait);
 
-	if (!test_bit(BIO_UPTODATE, &bb.flags))
-		ret = -ENOTSUPP;
-
+	if (bb.error)
+		return bb.error;
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
@@ -236,7 +234,7 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	DECLARE_COMPLETION_ONSTACK(wait);
 
 	atomic_set(&bb.done, 1);
-	bb.flags = 1 << BIO_UPTODATE;
+	bb.error = 0;
 	bb.wait = &wait;
 
 	ret = 0;
@@ -270,10 +268,8 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	if (!atomic_dec_and_test(&bb.done))
 		wait_for_completion_io(&wait);
 
-	if (!test_bit(BIO_UPTODATE, &bb.flags))
-		/* One of bios in the batch was completed with error.*/
-		ret = -EIO;
-
+	if (bb.error)
+		return bb.error;
 	return ret;
 }
 
diff --git a/block/blk-map.c b/block/blk-map.c
index da310a105429..5fe1c30bfba7 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -103,7 +103,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		 * normal IO completion path
 		 */
 		bio_get(bio);
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		__blk_rq_unmap_user(bio);
 		return -EINVAL;
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7d842db59699..94559025c5e6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1199,7 +1199,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	struct blk_mq_alloc_data alloc_data;
 
 	if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 		return NULL;
 	}
 
@@ -1283,7 +1283,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 		return;
 	}
 
@@ -1368,7 +1368,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 		return;
 	}
 
diff --git a/block/bounce.c b/block/bounce.c
index b17311227c12..f4db245b9f3a 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -123,7 +123,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
 	}
 }
 
-static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+static void bounce_end_io(struct bio *bio, mempool_t *pool)
 {
 	struct bio *bio_orig = bio->bi_private;
 	struct bio_vec *bvec, *org_vec;
@@ -141,39 +141,40 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
 		mempool_free(bvec->bv_page, pool);
 	}
 
-	bio_endio(bio_orig, err);
+	bio_orig->bi_error = bio->bi_error;
+	bio_endio(bio_orig);
 	bio_put(bio);
 }
 
-static void bounce_end_io_write(struct bio *bio, int err)
+static void bounce_end_io_write(struct bio *bio)
 {
-	bounce_end_io(bio, page_pool, err);
+	bounce_end_io(bio, page_pool);
 }
 
-static void bounce_end_io_write_isa(struct bio *bio, int err)
+static void bounce_end_io_write_isa(struct bio *bio)
 {
 
-	bounce_end_io(bio, isa_page_pool, err);
+	bounce_end_io(bio, isa_page_pool);
 }
 
-static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
 {
 	struct bio *bio_orig = bio->bi_private;
 
-	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+	if (!bio->bi_error)
 		copy_to_high_bio_irq(bio_orig, bio);
 
-	bounce_end_io(bio, pool, err);
+	bounce_end_io(bio, pool);
 }
 
-static void bounce_end_io_read(struct bio *bio, int err)
+static void bounce_end_io_read(struct bio *bio)
 {
-	__bounce_end_io_read(bio, page_pool, err);
+	__bounce_end_io_read(bio, page_pool);
 }
 
-static void bounce_end_io_read_isa(struct bio *bio, int err)
+static void bounce_end_io_read_isa(struct bio *bio)
 {
-	__bounce_end_io_read(bio, isa_page_pool, err);
+	__bounce_end_io_read(bio, isa_page_pool);
 }
 
 #ifdef CONFIG_NEED_BOUNCE_POOL
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 422b7d84f686..ad80c85e0857 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1110,7 +1110,7 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
 		d->ip.rq = NULL;
 	do {
 		bio = rq->bio;
-		bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
+		bok = !fastfail && !bio->bi_error;
 	} while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size));
 
 	/* cf. http://lkml.org/lkml/2006/10/31/28 */
@@ -1172,7 +1172,7 @@ ktiocomplete(struct frame *f)
 			ahout->cmdstat, ahin->cmdstat,
 			d->aoemajor, d->aoeminor);
 noskb:		if (buf)
-			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+			buf->bio->bi_error = -EIO;
 		goto out;
 	}
 
@@ -1185,7 +1185,7 @@ noskb:		if (buf)
 				"aoe: runt data size in read from",
 				(long) d->aoemajor, d->aoeminor,
 			       skb->len, n);
-			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+			buf->bio->bi_error = -EIO;
 			break;
 		}
 		if (n > f->iter.bi_size) {
@@ -1193,7 +1193,7 @@ noskb:		if (buf)
 				"aoe: too-large data size in read from",
 				(long) d->aoemajor, d->aoeminor,
 				n, f->iter.bi_size);
-			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+			buf->bio->bi_error = -EIO;
 			break;
 		}
 		bvcpy(skb, f->buf->bio, f->iter, n);
@@ -1695,7 +1695,7 @@ aoe_failbuf(struct aoedev *d, struct buf *buf)
 	if (buf == NULL)
 		return;
 	buf->iter.bi_size = 0;
-	clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+	buf->bio->bi_error = -EIO;
 	if (buf->nframesout == 0)
 		aoe_end_buf(d, buf);
 }
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index e774c50b6842..ffd1947500c6 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -170,7 +170,7 @@ aoe_failip(struct aoedev *d)
 	if (rq == NULL)
 		return;
 	while ((bio = d->ip.nxbio)) {
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		bio->bi_error = -EIO;
 		d->ip.nxbio = bio->bi_next;
 		n = (unsigned long) rq->special;
 		rq->special = (void *) --n;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index e573e470bd8a..f9ab74505e69 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -331,14 +331,12 @@ static void brd_make_request(struct request_queue *q, struct bio *bio)
 	struct bio_vec bvec;
 	sector_t sector;
 	struct bvec_iter iter;
-	int err = -EIO;
 
 	sector = bio->bi_iter.bi_sector;
 	if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
-		goto out;
+		goto io_error;
 
 	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
-		err = 0;
 		discard_from_brd(brd, sector, bio->bi_iter.bi_size);
 		goto out;
 	}
@@ -349,15 +347,20 @@ static void brd_make_request(struct request_queue *q, struct bio *bio)
 
 	bio_for_each_segment(bvec, bio, iter) {
 		unsigned int len = bvec.bv_len;
+		int err;
+
 		err = brd_do_bvec(brd, bvec.bv_page, len,
 					bvec.bv_offset, rw, sector);
 		if (err)
-			break;
+			goto io_error;
 		sector += len >> SECTOR_SHIFT;
 	}
 
 out:
-	bio_endio(bio, err);
+	bio_endio(bio);
+	return;
+io_error:
+	bio_io_error(bio);
 }
 
 static int brd_rw_page(struct block_device *bdev, sector_t sector,
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 1318e3217cb0..b3868e7a1ffd 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -175,11 +175,11 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
 	device->md_io.submit_jif = jiffies;
 	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 	else
 		submit_bio(rw, bio);
 	wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
-	if (bio_flagged(bio, BIO_UPTODATE))
+	if (!bio->bi_error)
 		err = device->md_io.error;
 
  out:
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 434c77dcc99e..e5e0f19ceda0 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -941,36 +941,27 @@ static void drbd_bm_aio_ctx_destroy(struct kref *kref)
 }
 
 /* bv_page may be a copy, or may be the original */
-static void drbd_bm_endio(struct bio *bio, int error)
+static void drbd_bm_endio(struct bio *bio)
 {
 	struct drbd_bm_aio_ctx *ctx = bio->bi_private;
 	struct drbd_device *device = ctx->device;
 	struct drbd_bitmap *b = device->bitmap;
 	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
-	int uptodate = bio_flagged(bio, BIO_UPTODATE);
-
-
-	/* strange behavior of some lower level drivers...
-	 * fail the request by clearing the uptodate flag,
-	 * but do not return any error?!
-	 * do we want to WARN() on this? */
-	if (!error && !uptodate)
-		error = -EIO;
 
 	if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
 	    !bm_test_page_unchanged(b->bm_pages[idx]))
 		drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
 
-	if (error) {
+	if (bio->bi_error) {
 		/* ctx error will hold the completed-last non-zero error code,
 		 * in case error codes differ. */
-		ctx->error = error;
+		ctx->error = bio->bi_error;
 		bm_set_page_io_err(b->bm_pages[idx]);
 		/* Not identical to on disk version of it.
 		 * Is BM_PAGE_IO_ERROR enough? */
 		if (__ratelimit(&drbd_ratelimit_state))
 			drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
-					error, idx);
+					bio->bi_error, idx);
 	} else {
 		bm_clear_page_io_err(b->bm_pages[idx]);
 		dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
@@ -1031,7 +1022,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
 
 	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
 		bio->bi_rw |= rw;
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 	} else {
 		submit_bio(rw, bio);
 		/* this should not count as user activity and cause the
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index efd19c2da9c2..a08c4a9179f1 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1481,9 +1481,9 @@ extern int drbd_khelper(struct drbd_device *device, char *cmd);
 
 /* drbd_worker.c */
 /* bi_end_io handlers */
-extern void drbd_md_endio(struct bio *bio, int error);
-extern void drbd_peer_request_endio(struct bio *bio, int error);
-extern void drbd_request_endio(struct bio *bio, int error);
+extern void drbd_md_endio(struct bio *bio);
+extern void drbd_peer_request_endio(struct bio *bio);
+extern void drbd_request_endio(struct bio *bio);
 extern int drbd_worker(struct drbd_thread *thi);
 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
 void drbd_resync_after_changed(struct drbd_device *device);
@@ -1604,12 +1604,13 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 	__release(local);
 	if (!bio->bi_bdev) {
 		drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
-		bio_endio(bio, -ENODEV);
+		bio->bi_error = -ENODEV;
+		bio_endio(bio);
 		return;
 	}
 
 	if (drbd_insert_fault(device, fault_type))
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 	else
 		generic_make_request(bio);
 }
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3907202fb9d9..9cb41166366e 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -201,7 +201,8 @@ void start_new_tl_epoch(struct drbd_connection *connection)
 void complete_master_bio(struct drbd_device *device,
 		struct bio_and_error *m)
 {
-	bio_endio(m->bio, m->error);
+	m->bio->bi_error = m->error;
+	bio_endio(m->bio);
 	dec_ap_bio(device);
 }
 
@@ -1153,12 +1154,12 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 				      rw == WRITE ? DRBD_FAULT_DT_WR
 				    : rw == READ  ? DRBD_FAULT_DT_RD
 				    :               DRBD_FAULT_DT_RA))
-			bio_endio(bio, -EIO);
+			bio_io_error(bio);
 		else
 			generic_make_request(bio);
 		put_ldev(device);
 	} else
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 }
 
 static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
@@ -1191,7 +1192,8 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 		/* only pass the error to the upper layers.
 		 * if user cannot handle io errors, that's not our business. */
 		drbd_err(device, "could not kmalloc() req\n");
-		bio_endio(bio, -ENOMEM);
+		bio->bi_error = -ENOMEM;
+		bio_endio(bio);
 		return ERR_PTR(-ENOMEM);
 	}
 	req->start_jif = start_jif;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index d0fae55d871d..5578c1477ba6 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -65,12 +65,12 @@ rwlock_t global_state_lock;
 /* used for synchronous meta data and bitmap IO
  * submitted by drbd_md_sync_page_io()
  */
-void drbd_md_endio(struct bio *bio, int error)
+void drbd_md_endio(struct bio *bio)
 {
 	struct drbd_device *device;
 
 	device = bio->bi_private;
-	device->md_io.error = error;
+	device->md_io.error = bio->bi_error;
 
 	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
 	 * to timeout on the lower level device, and eventually detach from it.
@@ -170,31 +170,20 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
 /* writes on behalf of the partner, or resync writes,
  * "submitted" by the receiver.
  */
-void drbd_peer_request_endio(struct bio *bio, int error)
+void drbd_peer_request_endio(struct bio *bio)
 {
 	struct drbd_peer_request *peer_req = bio->bi_private;
 	struct drbd_device *device = peer_req->peer_device->device;
-	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 	int is_write = bio_data_dir(bio) == WRITE;
 	int is_discard = !!(bio->bi_rw & REQ_DISCARD);
 
-	if (error && __ratelimit(&drbd_ratelimit_state))
+	if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
 		drbd_warn(device, "%s: error=%d s=%llus\n",
 				is_write ? (is_discard ? "discard" : "write")
-					: "read", error,
+					: "read", bio->bi_error,
 				(unsigned long long)peer_req->i.sector);
-	if (!error && !uptodate) {
-		if (__ratelimit(&drbd_ratelimit_state))
-			drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
-					is_write ? "write" : "read",
-					(unsigned long long)peer_req->i.sector);
-		/* strange behavior of some lower level drivers...
-		 * fail the request by clearing the uptodate flag,
-		 * but do not return any error?! */
-		error = -EIO;
-	}
 
-	if (error)
+	if (bio->bi_error)
 		set_bit(__EE_WAS_ERROR, &peer_req->flags);
 
 	bio_put(bio); /* no need for the bio anymore */
@@ -208,24 +197,13 @@ void drbd_peer_request_endio(struct bio *bio, int error)
 
 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
  */
-void drbd_request_endio(struct bio *bio, int error)
+void drbd_request_endio(struct bio *bio)
 {
 	unsigned long flags;
 	struct drbd_request *req = bio->bi_private;
 	struct drbd_device *device = req->device;
 	struct bio_and_error m;
 	enum drbd_req_event what;
-	int uptodate = bio_flagged(bio, BIO_UPTODATE);
-
-	if (!error && !uptodate) {
-		drbd_warn(device, "p %s: setting error to -EIO\n",
-			 bio_data_dir(bio) == WRITE ? "write" : "read");
-		/* strange behavior of some lower level drivers...
-		 * fail the request by clearing the uptodate flag,
-		 * but do not return any error?! */
-		error = -EIO;
-	}
-
 
 	/* If this request was aborted locally before,
 	 * but now was completed "successfully",
@@ -259,14 +237,14 @@ void drbd_request_endio(struct bio *bio, int error)
 		if (__ratelimit(&drbd_ratelimit_state))
 			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
 
-		if (!error)
+		if (!bio->bi_error)
 			panic("possible random memory corruption caused by delayed completion of aborted local request\n");
 	}
 
 	/* to avoid recursion in __req_mod */
-	if (unlikely(error)) {
+	if (unlikely(bio->bi_error)) {
 		if (bio->bi_rw & REQ_DISCARD)
-			what = (error == -EOPNOTSUPP)
+			what = (bio->bi_error == -EOPNOTSUPP)
 				? DISCARD_COMPLETED_NOTSUPP
 				: DISCARD_COMPLETED_WITH_ERROR;
 		else
@@ -279,7 +257,7 @@ void drbd_request_endio(struct bio *bio, int error)
 		what = COMPLETED_OK;
 
 	bio_put(req->private_bio);
-	req->private_bio = ERR_PTR(error);
+	req->private_bio = ERR_PTR(bio->bi_error);
 
 	/* not req_mod(), we need irqsave here! */
 	spin_lock_irqsave(&device->resource->req_lock, flags);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index a08cda955285..331363e7de0f 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3771,13 +3771,14 @@ struct rb0_cbdata {
 	struct completion complete;
 };
 
-static void floppy_rb0_cb(struct bio *bio, int err)
+static void floppy_rb0_cb(struct bio *bio)
 {
 	struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private;
 	int drive = cbdata->drive;
 
-	if (err) {
-		pr_info("floppy: error %d while reading block 0\n", err);
+	if (bio->bi_error) {
+		pr_info("floppy: error %d while reading block 0\n",
+			bio->bi_error);
 		set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
 	}
 	complete(&cbdata->complete);
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 69de41a87b74..016a59afcf24 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -222,7 +222,7 @@ static void end_cmd(struct nullb_cmd *cmd)
 		blk_end_request_all(cmd->rq, 0);
 		break;
 	case NULL_Q_BIO:
-		bio_endio(cmd->bio, 0);
+		bio_endio(cmd->bio);
 		break;
 	}
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 4c20c228184c..a7a259e031da 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -977,7 +977,7 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
 	}
 }
 
-static void pkt_end_io_read(struct bio *bio, int err)
+static void pkt_end_io_read(struct bio *bio)
 {
 	struct packet_data *pkt = bio->bi_private;
 	struct pktcdvd_device *pd = pkt->pd;
@@ -985,9 +985,9 @@ static void pkt_end_io_read(struct bio *bio, int err)
 
 	pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n",
 		bio, (unsigned long long)pkt->sector,
-		(unsigned long long)bio->bi_iter.bi_sector, err);
+		(unsigned long long)bio->bi_iter.bi_sector, bio->bi_error);
 
-	if (err)
+	if (bio->bi_error)
 		atomic_inc(&pkt->io_errors);
 	if (atomic_dec_and_test(&pkt->io_wait)) {
 		atomic_inc(&pkt->run_sm);
@@ -996,13 +996,13 @@ static void pkt_end_io_read(struct bio *bio, int err)
 	pkt_bio_finished(pd);
 }
 
-static void pkt_end_io_packet_write(struct bio *bio, int err)
+static void pkt_end_io_packet_write(struct bio *bio)
 {
 	struct packet_data *pkt = bio->bi_private;
 	struct pktcdvd_device *pd = pkt->pd;
 	BUG_ON(!pd);
 
-	pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err);
+	pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_error);
 
 	pd->stats.pkt_ended++;
 
@@ -1340,22 +1340,22 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 	pkt_queue_bio(pd, pkt->w_bio);
 }
 
-static void pkt_finish_packet(struct packet_data *pkt, int uptodate)
+static void pkt_finish_packet(struct packet_data *pkt, int error)
 {
 	struct bio *bio;
 
-	if (!uptodate)
+	if (error)
 		pkt->cache_valid = 0;
 
 	/* Finish all bios corresponding to this packet */
-	while ((bio = bio_list_pop(&pkt->orig_bios)))
-		bio_endio(bio, uptodate ? 0 : -EIO);
+	while ((bio = bio_list_pop(&pkt->orig_bios))) {
+		bio->bi_error = error;
+		bio_endio(bio);
+	}
 }
 
 static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
 {
-	int uptodate;
-
 	pkt_dbg(2, pd, "pkt %d\n", pkt->id);
 
 	for (;;) {
@@ -1384,7 +1384,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data
 			if (atomic_read(&pkt->io_wait) > 0)
 				return;
 
-			if (test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags)) {
+			if (!pkt->w_bio->bi_error) {
 				pkt_set_state(pkt, PACKET_FINISHED_STATE);
 			} else {
 				pkt_set_state(pkt, PACKET_RECOVERY_STATE);
@@ -1401,8 +1401,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data
 			break;
 
 		case PACKET_FINISHED_STATE:
-			uptodate = test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags);
-			pkt_finish_packet(pkt, uptodate);
+			pkt_finish_packet(pkt, pkt->w_bio->bi_error);
 			return;
 
 		default:
@@ -2332,13 +2331,14 @@ static void pkt_close(struct gendisk *disk, fmode_t mode)
 }
 
 
-static void pkt_end_io_read_cloned(struct bio *bio, int err)
+static void pkt_end_io_read_cloned(struct bio *bio)
 {
 	struct packet_stacked_data *psd = bio->bi_private;
 	struct pktcdvd_device *pd = psd->pd;
 
+	psd->bio->bi_error = bio->bi_error;
 	bio_put(bio);
-	bio_endio(psd->bio, err);
+	bio_endio(psd->bio);
 	mempool_free(psd, psd_pool);
 	pkt_bio_finished(pd);
 }
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index b1612eb16172..49b4706b162c 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -593,7 +593,8 @@ out:
 	next = bio_list_peek(&priv->list);
 	spin_unlock_irq(&priv->lock);
 
-	bio_endio(bio, error);
+	bio->bi_error = error;
+	bio_endio(bio);
 	return next;
 }
 
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index ac8c62cb4875..63b9d2ffa8ee 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -137,7 +137,10 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
 		if (!card->eeh_state && card->gendisk)
 			disk_stats_complete(card, meta->bio, meta->start_time);
 
-		bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0);
+		if (atomic_read(&meta->error))
+			bio_io_error(meta->bio);
+		else
+			bio_endio(meta->bio);
 		kmem_cache_free(bio_meta_pool, meta);
 	}
 }
@@ -199,7 +202,9 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
 queue_err:
 	kmem_cache_free(bio_meta_pool, bio_meta);
 req_err:
-	bio_endio(bio, st);
+	if (st)
+		bio->bi_error = st;
+	bio_endio(bio);
 }
 
 /*----------------- Device Setup -------------------*/
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 4cf81b5bf0f7..3b3afd2ec5d6 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -456,7 +456,7 @@ static void process_page(unsigned long data)
 				PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE);
 		if (control & DMASCR_HARD_ERROR) {
 			/* error */
-			clear_bit(BIO_UPTODATE, &bio->bi_flags);
+			bio->bi_error = -EIO;
 			dev_printk(KERN_WARNING, &card->dev->dev,
 				"I/O error on sector %d/%d\n",
 				le32_to_cpu(desc->local_addr)>>9,
@@ -505,7 +505,7 @@ static void process_page(unsigned long data)
 
 		return_bio = bio->bi_next;
 		bio->bi_next = NULL;
-		bio_endio(bio, 0);
+		bio_endio(bio);
 	}
 }
 
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index ced96777b677..662648e08596 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1078,9 +1078,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
 /*
  * bio callback.
  */
-static void end_block_io_op(struct bio *bio, int error)
+static void end_block_io_op(struct bio *bio)
 {
-	__end_block_io_op(bio->bi_private, error);
+	__end_block_io_op(bio->bi_private, bio->bi_error);
 	bio_put(bio);
 }
 
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 6d89ed35d80c..d542db7a6c73 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -82,7 +82,6 @@ struct blk_shadow {
 struct split_bio {
 	struct bio *bio;
 	atomic_t pending;
-	int err;
 };
 
 static DEFINE_MUTEX(blkfront_mutex);
@@ -1478,16 +1477,14 @@ static int blkfront_probe(struct xenbus_device *dev,
 	return 0;
 }
 
-static void split_bio_end(struct bio *bio, int error)
+static void split_bio_end(struct bio *bio)
 {
 	struct split_bio *split_bio = bio->bi_private;
 
-	if (error)
-		split_bio->err = error;
-
 	if (atomic_dec_and_test(&split_bio->pending)) {
 		split_bio->bio->bi_phys_segments = 0;
-		bio_endio(split_bio->bio, split_bio->err);
+		split_bio->bio->bi_error = bio->bi_error;
+		bio_endio(split_bio->bio);
 		kfree(split_bio);
 	}
 	bio_put(bio);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f439ad2800da..68c3d4800464 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -850,7 +850,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 
 	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
 		zram_bio_discard(zram, index, offset, bio);
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		return;
 	}
 
@@ -883,8 +883,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 		update_position(&index, &offset, &bvec);
 	}
 
-	set_bit(BIO_UPTODATE, &bio->bi_flags);
-	bio_endio(bio, 0);
+	bio_endio(bio);
 	return;
 
 out:
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 00cde40db572..83392f856dfd 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -278,7 +278,7 @@ err:
 	goto out;
 }
 
-static void btree_node_read_endio(struct bio *bio, int error)
+static void btree_node_read_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 	closure_put(cl);
@@ -305,7 +305,7 @@ static void bch_btree_node_read(struct btree *b)
 	bch_submit_bbio(bio, b->c, &b->key, 0);
 	closure_sync(&cl);
 
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+	if (bio->bi_error)
 		set_btree_node_io_error(b);
 
 	bch_bbio_free(bio, b->c);
@@ -371,15 +371,15 @@ static void btree_node_write_done(struct closure *cl)
 	__btree_node_write_done(cl);
 }
 
-static void btree_node_write_endio(struct bio *bio, int error)
+static void btree_node_write_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 	struct btree *b = container_of(cl, struct btree, io);
 
-	if (error)
+	if (bio->bi_error)
 		set_btree_node_io_error(b);
 
-	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+	bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree");
 	closure_put(cl);
 }
 
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 79a6d63e8ed3..782cc2c8a185 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -38,7 +38,7 @@
  * they are running owned by the thread that is running them. Otherwise, suppose
  * you submit some bios and wish to have a function run when they all complete:
  *
- * foo_endio(struct bio *bio, int error)
+ * foo_endio(struct bio *bio)
  * {
  *	closure_put(cl);
  * }
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index bf6a9ca18403..9440df94bc83 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -55,19 +55,19 @@ static void bch_bio_submit_split_done(struct closure *cl)
 
 	s->bio->bi_end_io = s->bi_end_io;
 	s->bio->bi_private = s->bi_private;
-	bio_endio(s->bio, 0);
+	bio_endio(s->bio);
 
 	closure_debug_destroy(&s->cl);
 	mempool_free(s, s->p->bio_split_hook);
 }
 
-static void bch_bio_submit_split_endio(struct bio *bio, int error)
+static void bch_bio_submit_split_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
 
-	if (error)
-		clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
+	if (bio->bi_error)
+		s->bio->bi_error = bio->bi_error;
 
 	bio_put(bio);
 	closure_put(cl);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 418607a6ba33..d6a4e16030a6 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -24,7 +24,7 @@
  * bit.
  */
 
-static void journal_read_endio(struct bio *bio, int error)
+static void journal_read_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 	closure_put(cl);
@@ -401,7 +401,7 @@ retry:
 
 #define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)
 
-static void journal_discard_endio(struct bio *bio, int error)
+static void journal_discard_endio(struct bio *bio)
 {
 	struct journal_device *ja =
 		container_of(bio, struct journal_device, discard_bio);
@@ -547,11 +547,11 @@ void bch_journal_next(struct journal *j)
 		pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
 }
 
-static void journal_write_endio(struct bio *bio, int error)
+static void journal_write_endio(struct bio *bio)
 {
 	struct journal_write *w = bio->bi_private;
 
-	cache_set_err_on(error, w->c, "journal io error");
+	cache_set_err_on(bio->bi_error, w->c, "journal io error");
 	closure_put(&w->c->journal.io);
 }
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index cd7490311e51..b929fc944e9c 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -60,20 +60,20 @@ static void write_moving_finish(struct closure *cl)
 	closure_return_with_destructor(cl, moving_io_destructor);
 }
 
-static void read_moving_endio(struct bio *bio, int error)
+static void read_moving_endio(struct bio *bio)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	struct moving_io *io = container_of(bio->bi_private,
 					    struct moving_io, cl);
 
-	if (error)
-		io->op.error = error;
+	if (bio->bi_error)
+		io->op.error = bio->bi_error;
 	else if (!KEY_DIRTY(&b->key) &&
 		 ptr_stale(io->op.c, &b->key, 0)) {
 		io->op.error = -EINTR;
 	}
 
-	bch_bbio_endio(io->op.c, bio, error, "reading data to move");
+	bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move");
 }
 
 static void moving_init(struct moving_io *io)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index f292790997d7..a09b9462ff49 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -173,22 +173,22 @@ static void bch_data_insert_error(struct closure *cl)
 	bch_data_insert_keys(cl);
 }
 
-static void bch_data_insert_endio(struct bio *bio, int error)
+static void bch_data_insert_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
-	if (error) {
+	if (bio->bi_error) {
 		/* TODO: We could try to recover from this. */
 		if (op->writeback)
-			op->error = error;
+			op->error = bio->bi_error;
 		else if (!op->replace)
 			set_closure_fn(cl, bch_data_insert_error, op->wq);
 		else
 			set_closure_fn(cl, NULL, NULL);
 	}
 
-	bch_bbio_endio(op->c, bio, error, "writing data to cache");
+	bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache");
 }
 
 static void bch_data_insert_start(struct closure *cl)
@@ -477,7 +477,7 @@ struct search {
 	struct data_insert_op	iop;
 };
 
-static void bch_cache_read_endio(struct bio *bio, int error)
+static void bch_cache_read_endio(struct bio *bio)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	struct closure *cl = bio->bi_private;
@@ -490,15 +490,15 @@ static void bch_cache_read_endio(struct bio *bio, int error)
 	 * from the backing device.
 	 */
 
-	if (error)
-		s->iop.error = error;
+	if (bio->bi_error)
+		s->iop.error = bio->bi_error;
 	else if (!KEY_DIRTY(&b->key) &&
 		 ptr_stale(s->iop.c, &b->key, 0)) {
 		atomic_long_inc(&s->iop.c->cache_read_races);
 		s->iop.error = -EINTR;
 	}
 
-	bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
+	bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache");
 }
 
 /*
@@ -591,13 +591,13 @@ static void cache_lookup(struct closure *cl)
 
 /* Common code for the make_request functions */
 
-static void request_endio(struct bio *bio, int error)
+static void request_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 
-	if (error) {
+	if (bio->bi_error) {
 		struct search *s = container_of(cl, struct search, cl);
-		s->iop.error = error;
+		s->iop.error = bio->bi_error;
 		/* Only cache read errors are recoverable */
 		s->recoverable = false;
 	}
@@ -613,7 +613,8 @@ static void bio_complete(struct search *s)
 				    &s->d->disk->part0, s->start_time);
 
 		trace_bcache_request_end(s->d, s->orig_bio);
-		bio_endio(s->orig_bio, s->iop.error);
+		s->orig_bio->bi_error = s->iop.error;
+		bio_endio(s->orig_bio);
 		s->orig_bio = NULL;
 	}
 }
@@ -992,7 +993,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
 	} else {
 		if ((bio->bi_rw & REQ_DISCARD) &&
 		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
-			bio_endio(bio, 0);
+			bio_endio(bio);
 		else
 			bch_generic_make_request(bio, &d->bio_split_hook);
 	}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fc8e545ced18..be01fd3c87f1 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -221,7 +221,7 @@ err:
 	return err;
 }
 
-static void write_bdev_super_endio(struct bio *bio, int error)
+static void write_bdev_super_endio(struct bio *bio)
 {
 	struct cached_dev *dc = bio->bi_private;
 	/* XXX: error checking */
@@ -290,11 +290,11 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
 	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
 }
 
-static void write_super_endio(struct bio *bio, int error)
+static void write_super_endio(struct bio *bio)
 {
 	struct cache *ca = bio->bi_private;
 
-	bch_count_io_errors(ca, error, "writing superblock");
+	bch_count_io_errors(ca, bio->bi_error, "writing superblock");
 	closure_put(&ca->set->sb_write);
 }
 
@@ -339,12 +339,12 @@ void bcache_write_super(struct cache_set *c)
 
 /* UUID io */
 
-static void uuid_endio(struct bio *bio, int error)
+static void uuid_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
 
-	cache_set_err_on(error, c, "accessing uuids");
+	cache_set_err_on(bio->bi_error, c, "accessing uuids");
 	bch_bbio_free(bio, c);
 	closure_put(cl);
 }
@@ -512,11 +512,11 @@ static struct uuid_entry *uuid_find_empty(struct cache_set *c)
  * disk.
  */
 
-static void prio_endio(struct bio *bio, int error)
+static void prio_endio(struct bio *bio)
 {
 	struct cache *ca = bio->bi_private;
 
-	cache_set_err_on(error, ca->set, "accessing priorities");
+	cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
 	bch_bbio_free(bio, ca->set);
 	closure_put(&ca->prio);
 }
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f1986bcd1bf0..b4fc874c30fd 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -166,12 +166,12 @@ static void write_dirty_finish(struct closure *cl)
 	closure_return_with_destructor(cl, dirty_io_destructor);
 }
 
-static void dirty_endio(struct bio *bio, int error)
+static void dirty_endio(struct bio *bio)
 {
 	struct keybuf_key *w = bio->bi_private;
 	struct dirty_io *io = w->private;
 
-	if (error)
+	if (bio->bi_error)
 		SET_KEY_DIRTY(&w->key, false);
 
 	closure_put(&io->cl);
@@ -193,15 +193,15 @@ static void write_dirty(struct closure *cl)
 	continue_at(cl, write_dirty_finish, system_wq);
 }
 
-static void read_dirty_endio(struct bio *bio, int error)
+static void read_dirty_endio(struct bio *bio)
 {
 	struct keybuf_key *w = bio->bi_private;
 	struct dirty_io *io = w->private;
 
 	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
-			    error, "reading dirty data from cache");
+			    bio->bi_error, "reading dirty data from cache");
 
-	dirty_endio(bio, error);
+	dirty_endio(bio);
 }
 
 static void read_dirty_submit(struct closure *cl)
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index cd6d1d21e057..03af174485d3 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -236,8 +236,10 @@ void dm_cell_error(struct dm_bio_prison *prison,
 	bio_list_init(&bios);
 	dm_cell_release(prison, cell, &bios);
 
-	while ((bio = bio_list_pop(&bios)))
-		bio_endio(bio, error);
+	while ((bio = bio_list_pop(&bios))) {
+		bio->bi_error = error;
+		bio_endio(bio);
+	}
 }
 EXPORT_SYMBOL_GPL(dm_cell_error);
 
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 86dbbc737402..83cc52eaf56d 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -545,7 +545,8 @@ static void dmio_complete(unsigned long error, void *context)
 {
 	struct dm_buffer *b = context;
 
-	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
+	b->bio.bi_error = error ? -EIO : 0;
+	b->bio.bi_end_io(&b->bio);
 }
 
 static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
@@ -575,13 +576,16 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
 	b->bio.bi_end_io = end_io;
 
 	r = dm_io(&io_req, 1, &region, NULL);
-	if (r)
-		end_io(&b->bio, r);
+	if (r) {
+		b->bio.bi_error = r;
+		end_io(&b->bio);
+	}
 }
 
-static void inline_endio(struct bio *bio, int error)
+static void inline_endio(struct bio *bio)
 {
 	bio_end_io_t *end_fn = bio->bi_private;
+	int error = bio->bi_error;
 
 	/*
 	 * Reset the bio to free any attached resources
@@ -589,7 +593,8 @@ static void inline_endio(struct bio *bio, int error)
 	 */
 	bio_reset(bio);
 
-	end_fn(bio, error);
+	bio->bi_error = error;
+	end_fn(bio);
 }
 
 static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
@@ -661,13 +666,14 @@ static void submit_io(struct dm_buffer *b, int rw, sector_t block,
  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
  * it.
  */
-static void write_endio(struct bio *bio, int error)
+static void write_endio(struct bio *bio)
 {
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
-	b->write_error = error;
-	if (unlikely(error)) {
+	b->write_error = bio->bi_error;
+	if (unlikely(bio->bi_error)) {
 		struct dm_bufio_client *c = b->c;
+		int error = bio->bi_error;
 		(void)cmpxchg(&c->async_write_error, 0, error);
 	}
 
@@ -1026,11 +1032,11 @@ found_buffer:
  * The endio routine for reading: set the error, clear the bit and wake up
  * anyone waiting on the buffer.
  */
-static void read_endio(struct bio *bio, int error)
+static void read_endio(struct bio *bio)
 {
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
-	b->read_error = error;
+	b->read_error = bio->bi_error;
 
 	BUG_ON(!test_bit(B_READING, &b->state));
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1b4e1756b169..04d0dadc48b1 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -919,14 +919,14 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
 	wake_worker(cache);
 }
 
-static void writethrough_endio(struct bio *bio, int err)
+static void writethrough_endio(struct bio *bio)
 {
 	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
 
 	dm_unhook_bio(&pb->hook_info, bio);
 
-	if (err) {
-		bio_endio(bio, err);
+	if (bio->bi_error) {
+		bio_endio(bio);
 		return;
 	}
 
@@ -1231,7 +1231,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
 			 * The block was promoted via an overwrite, so it's dirty.
 			 */
 			set_dirty(cache, mg->new_oblock, mg->cblock);
-			bio_endio(mg->new_ocell->holder, 0);
+			bio_endio(mg->new_ocell->holder);
 			cell_defer(cache, mg->new_ocell, false);
 		}
 		free_io_migration(mg);
@@ -1284,7 +1284,7 @@ static void issue_copy(struct dm_cache_migration *mg)
 	}
 }
 
-static void overwrite_endio(struct bio *bio, int err)
+static void overwrite_endio(struct bio *bio)
 {
 	struct dm_cache_migration *mg = bio->bi_private;
 	struct cache *cache = mg->cache;
@@ -1294,7 +1294,7 @@ static void overwrite_endio(struct bio *bio, int err)
 
 	dm_unhook_bio(&pb->hook_info, bio);
 
-	if (err)
+	if (bio->bi_error)
 		mg->err = true;
 
 	mg->requeue_holder = false;
@@ -1358,7 +1358,7 @@ static void issue_discard(struct dm_cache_migration *mg)
 		b = to_dblock(from_dblock(b) + 1);
 	}
 
-	bio_endio(bio, 0);
+	bio_endio(bio);
 	cell_defer(mg->cache, mg->new_ocell, false);
 	free_migration(mg);
 }
@@ -1631,7 +1631,7 @@ static void process_discard_bio(struct cache *cache, struct prealloc *structs,
 
 	calc_discard_block_range(cache, bio, &b, &e);
 	if (b == e) {
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		return;
 	}
 
@@ -2213,8 +2213,10 @@ static void requeue_deferred_bios(struct cache *cache)
 	bio_list_merge(&bios, &cache->deferred_bios);
 	bio_list_init(&cache->deferred_bios);
 
-	while ((bio = bio_list_pop(&bios)))
-		bio_endio(bio, DM_ENDIO_REQUEUE);
+	while ((bio = bio_list_pop(&bios))) {
+		bio->bi_error = DM_ENDIO_REQUEUE;
+		bio_endio(bio);
+	}
 }
 
 static int more_work(struct cache *cache)
@@ -3119,7 +3121,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 			 * This is a duplicate writethrough io that is no
 			 * longer needed because the block has been demoted.
 			 */
-			bio_endio(bio, 0);
+			bio_endio(bio);
 			// FIXME: remap everything as a miss
 			cell_defer(cache, cell, false);
 			r = DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0f48fed44a17..744b80c608e5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1076,7 +1076,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	if (io->ctx.req)
 		crypt_free_req(cc, io->ctx.req, base_bio);
 
-	bio_endio(base_bio, error);
+	base_bio->bi_error = error;
+	bio_endio(base_bio);
 }
 
 /*
@@ -1096,15 +1097,12 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
  * The work is done per CPU global for all dm-crypt instances.
  * They should not depend on each other and do not block.
  */
-static void crypt_endio(struct bio *clone, int error)
+static void crypt_endio(struct bio *clone)
 {
 	struct dm_crypt_io *io = clone->bi_private;
 	struct crypt_config *cc = io->cc;
 	unsigned rw = bio_data_dir(clone);
 
-	if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
-		error = -EIO;
-
 	/*
 	 * free the processed pages
 	 */
@@ -1113,13 +1111,13 @@ static void crypt_endio(struct bio *clone, int error)
 
 	bio_put(clone);
 
-	if (rw == READ && !error) {
+	if (rw == READ && !clone->bi_error) {
 		kcryptd_queue_crypt(io);
 		return;
 	}
 
-	if (unlikely(error))
-		io->error = error;
+	if (unlikely(clone->bi_error))
+		io->error = clone->bi_error;
 
 	crypt_dec_pending(io);
 }
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b257e46876d3..04481247aab8 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -296,7 +296,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
 		 * Drop writes?
 		 */
 		if (test_bit(DROP_WRITES, &fc->flags)) {
-			bio_endio(bio, 0);
+			bio_endio(bio);
 			return DM_MAPIO_SUBMITTED;
 		}
 
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 74adcd2c967e..efc6659f9d6a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -134,12 +134,12 @@ static void dec_count(struct io *io, unsigned int region, int error)
 		complete_io(io);
 }
 
-static void endio(struct bio *bio, int error)
+static void endio(struct bio *bio)
 {
 	struct io *io;
 	unsigned region;
 
-	if (error && bio_data_dir(bio) == READ)
+	if (bio->bi_error && bio_data_dir(bio) == READ)
 		zero_fill_bio(bio);
 
 	/*
@@ -149,7 +149,7 @@ static void endio(struct bio *bio, int error)
 
 	bio_put(bio);
 
-	dec_count(io, region, error);
+	dec_count(io, region, bio->bi_error);
 }
 
 /*-----------------------------------------------------------------
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index ad1b049ae2ab..e9d17488d5e3 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -146,16 +146,16 @@ static void put_io_block(struct log_writes_c *lc)
 	}
 }
 
-static void log_end_io(struct bio *bio, int err)
+static void log_end_io(struct bio *bio)
 {
 	struct log_writes_c *lc = bio->bi_private;
 	struct bio_vec *bvec;
 	int i;
 
-	if (err) {
+	if (bio->bi_error) {
 		unsigned long flags;
 
-		DMERR("Error writing log block, error=%d", err);
+		DMERR("Error writing log block, error=%d", bio->bi_error);
 		spin_lock_irqsave(&lc->blocks_lock, flags);
 		lc->logging_enabled = false;
 		spin_unlock_irqrestore(&lc->blocks_lock, flags);
@@ -205,7 +205,6 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
 	bio->bi_bdev = lc->logdev->bdev;
 	bio->bi_end_io = log_end_io;
 	bio->bi_private = lc;
-	set_bit(BIO_UPTODATE, &bio->bi_flags);
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page) {
@@ -270,7 +269,6 @@ static int log_one_block(struct log_writes_c *lc,
 	bio->bi_bdev = lc->logdev->bdev;
 	bio->bi_end_io = log_end_io;
 	bio->bi_private = lc;
-	set_bit(BIO_UPTODATE, &bio->bi_flags);
 
 	for (i = 0; i < block->vec_cnt; i++) {
 		/*
@@ -292,7 +290,6 @@ static int log_one_block(struct log_writes_c *lc,
 			bio->bi_bdev = lc->logdev->bdev;
 			bio->bi_end_io = log_end_io;
 			bio->bi_private = lc;
-			set_bit(BIO_UPTODATE, &bio->bi_flags);
 
 			ret = bio_add_page(bio, block->vecs[i].bv_page,
 					   block->vecs[i].bv_len, 0);
@@ -606,7 +603,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
 		WARN_ON(flush_bio || fua_bio);
 		if (lc->device_supports_discard)
 			goto map_bio;
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d83696bf403b..e1eabfb2f52d 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -490,9 +490,11 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio)
 		 * If device is suspended, complete the bio.
 		 */
 		if (dm_noflush_suspending(ms->ti))
-			bio_endio(bio, DM_ENDIO_REQUEUE);
+			bio->bi_error = DM_ENDIO_REQUEUE;
 		else
-			bio_endio(bio, -EIO);
+			bio->bi_error = -EIO;
+
+		bio_endio(bio);
 		return;
 	}
 
@@ -515,7 +517,7 @@ static void read_callback(unsigned long error, void *context)
 	bio_set_m(bio, NULL);
 
 	if (likely(!error)) {
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		return;
 	}
 
@@ -531,7 +533,7 @@ static void read_callback(unsigned long error, void *context)
 
 	DMERR_LIMIT("Read failure on mirror device %s.  Failing I/O.",
 		    m->dev->name);
-	bio_endio(bio, -EIO);
+	bio_io_error(bio);
 }
 
 /* Asynchronous read. */
@@ -580,7 +582,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 		if (likely(m))
 			read_async_bio(m, bio);
 		else
-			bio_endio(bio, -EIO);
+			bio_io_error(bio);
 	}
 }
 
@@ -598,7 +600,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 
 static void write_callback(unsigned long error, void *context)
 {
-	unsigned i, ret = 0;
+	unsigned i;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
 	int should_wake = 0;
@@ -614,7 +616,7 @@ static void write_callback(unsigned long error, void *context)
 	 * regions with the same code.
 	 */
 	if (likely(!error)) {
-		bio_endio(bio, ret);
+		bio_endio(bio);
 		return;
 	}
 
@@ -623,7 +625,8 @@ static void write_callback(unsigned long error, void *context)
 	 * degrade the array.
 	 */
 	if (bio->bi_rw & REQ_DISCARD) {
-		bio_endio(bio, -EOPNOTSUPP);
+		bio->bi_error = -EOPNOTSUPP;
+		bio_endio(bio);
 		return;
 	}
 
@@ -828,13 +831,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 		 * be wrong if the failed leg returned after reboot and
 		 * got replicated back to the good legs.)
 		 */
-
 		if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
-			bio_endio(bio, -EIO);
+			bio_io_error(bio);
 		else if (errors_handled(ms) && !keep_log(ms))
 			hold_bio(ms, bio);
 		else
-			bio_endio(bio, 0);
+			bio_endio(bio);
 	}
 }
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 7c82d3ccce87..dd8ca0bb0980 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1490,7 +1490,7 @@ out:
 		error_bios(snapshot_bios);
 	} else {
 		if (full_bio)
-			bio_endio(full_bio, 0);
+			bio_endio(full_bio);
 		flush_bios(snapshot_bios);
 	}
 
@@ -1580,11 +1580,11 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
 }
 
-static void full_bio_end_io(struct bio *bio, int error)
+static void full_bio_end_io(struct bio *bio)
 {
 	void *callback_data = bio->bi_private;
 
-	dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
+	dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0);
 }
 
 static void start_full_bio(struct dm_snap_pending_exception *pe,
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index a672a1502c14..4f94c7da82f6 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -273,7 +273,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
 		return DM_MAPIO_REMAPPED;
 	} else {
 		/* The range doesn't map to the target stripe */
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 }
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c33f61a4cc28..2ade2c46dca9 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -614,8 +614,10 @@ static void error_bio_list(struct bio_list *bios, int error)
 {
 	struct bio *bio;
 
-	while ((bio = bio_list_pop(bios)))
-		bio_endio(bio, error);
+	while ((bio = bio_list_pop(bios))) {
+		bio->bi_error = error;
+		bio_endio(bio);
+	}
 }
 
 static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
@@ -864,14 +866,14 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 	complete_mapping_preparation(m);
 }
 
-static void overwrite_endio(struct bio *bio, int err)
+static void overwrite_endio(struct bio *bio)
 {
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
 
 	bio->bi_end_io = m->saved_bi_end_io;
 
-	m->err = err;
+	m->err = bio->bi_error;
 	complete_mapping_preparation(m);
 }
 
@@ -996,7 +998,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 */
 	if (bio) {
 		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
-		bio_endio(bio, 0);
+		bio_endio(bio);
 	} else {
 		inc_all_io_entry(tc->pool, m->cell->holder);
 		remap_and_issue(tc, m->cell->holder, m->data_block);
@@ -1026,7 +1028,7 @@ static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 
 static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
 {
-	bio_endio(m->bio, 0);
+	bio_endio(m->bio);
 	free_discard_mapping(m);
 }
 
@@ -1040,7 +1042,7 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
 		metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
 		bio_io_error(m->bio);
 	} else
-		bio_endio(m->bio, 0);
+		bio_endio(m->bio);
 
 	cell_defer_no_holder(tc, m->cell);
 	mempool_free(m, tc->pool->mapping_pool);
@@ -1111,7 +1113,8 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 	 * Even if r is set, there could be sub discards in flight that we
 	 * need to wait for.
 	 */
-	bio_endio(m->bio, r);
+	m->bio->bi_error = r;
+	bio_endio(m->bio);
 	cell_defer_no_holder(tc, m->cell);
 	mempool_free(m, pool->mapping_pool);
 }
@@ -1487,9 +1490,10 @@ static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
 {
 	int error = should_error_unserviceable_bio(pool);
 
-	if (error)
-		bio_endio(bio, error);
-	else
+	if (error) {
+		bio->bi_error = error;
+		bio_endio(bio);
+	} else
 		retry_on_resume(bio);
 }
 
@@ -1625,7 +1629,7 @@ static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_priso
 	 * will prevent completion until the sub range discards have
 	 * completed.
 	 */
-	bio_endio(bio, 0);
+	bio_endio(bio);
 }
 
 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
@@ -1639,7 +1643,7 @@ static void process_discard_bio(struct thin_c *tc, struct bio *bio)
 		/*
 		 * The discard covers less than a block.
 		 */
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		return;
 	}
 
@@ -1784,7 +1788,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 	if (bio_data_dir(bio) == READ) {
 		zero_fill_bio(bio);
 		cell_defer_no_holder(tc, cell);
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		return;
 	}
 
@@ -1849,7 +1853,7 @@ static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 
 			} else {
 				zero_fill_bio(bio);
-				bio_endio(bio, 0);
+				bio_endio(bio);
 			}
 		} else
 			provision_block(tc, bio, block, cell);
@@ -1920,7 +1924,7 @@ static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
 		}
 
 		zero_fill_bio(bio);
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		break;
 
 	default:
@@ -1945,7 +1949,7 @@ static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell
 
 static void process_bio_success(struct thin_c *tc, struct bio *bio)
 {
-	bio_endio(bio, 0);
+	bio_endio(bio);
 }
 
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
@@ -2581,7 +2585,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 	thin_hook_bio(tc, bio);
 
 	if (tc->requeue_mode) {
-		bio_endio(bio, DM_ENDIO_REQUEUE);
+		bio->bi_error = DM_ENDIO_REQUEUE;
+		bio_endio(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index bb9c6a00e4b0..4b34df8fdb58 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -458,8 +458,9 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
 
 	bio->bi_end_io = io->orig_bi_end_io;
 	bio->bi_private = io->orig_bi_private;
+	bio->bi_error = error;
 
-	bio_endio(bio, error);
+	bio_endio(bio);
 }
 
 static void verity_work(struct work_struct *w)
@@ -469,12 +470,12 @@ static void verity_work(struct work_struct *w)
 	verity_finish_io(io, verity_verify_io(io));
 }
 
-static void verity_end_io(struct bio *bio, int error)
+static void verity_end_io(struct bio *bio)
 {
 	struct dm_verity_io *io = bio->bi_private;
 
-	if (error) {
-		verity_finish_io(io, error);
+	if (bio->bi_error) {
+		verity_finish_io(io, bio->bi_error);
 		return;
 	}
 
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index b9a64bbce304..766bc93006e6 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -47,7 +47,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
 		break;
 	}
 
-	bio_endio(bio, 0);
+	bio_endio(bio);
 
 	/* accepted bio, don't make new request */
 	return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f331d888e7f5..7f367fcace03 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -944,7 +944,8 @@ static void dec_pending(struct dm_io *io, int error)
 		} else {
 			/* done with normal IO or empty flush */
 			trace_block_bio_complete(md->queue, bio, io_error);
-			bio_endio(bio, io_error);
+			bio->bi_error = io_error;
+			bio_endio(bio);
 		}
 	}
 }
@@ -957,17 +958,15 @@ static void disable_write_same(struct mapped_device *md)
 	limits->max_write_same_sectors = 0;
 }
 
-static void clone_endio(struct bio *bio, int error)
+static void clone_endio(struct bio *bio)
 {
+	int error = bio->bi_error;
 	int r = error;
 	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 	struct dm_io *io = tio->io;
 	struct mapped_device *md = tio->io->md;
 	dm_endio_fn endio = tio->ti->type->end_io;
 
-	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
-		error = -EIO;
-
 	if (endio) {
 		r = endio(tio->ti, bio, error);
 		if (r < 0 || r == DM_ENDIO_REQUEUE)
@@ -996,7 +995,7 @@ static void clone_endio(struct bio *bio, int error)
 /*
  * Partial completion handling for request-based dm
  */
-static void end_clone_bio(struct bio *clone, int error)
+static void end_clone_bio(struct bio *clone)
 {
 	struct dm_rq_clone_bio_info *info =
 		container_of(clone, struct dm_rq_clone_bio_info, clone);
@@ -1013,13 +1012,13 @@ static void end_clone_bio(struct bio *clone, int error)
 		 * the remainder.
 		 */
 		return;
-	else if (error) {
+	else if (bio->bi_error) {
 		/*
 		 * Don't notice the error to the upper layer yet.
 		 * The error handling decision is made by the target driver,
 		 * when the request is completed.
 		 */
-		tio->error = error;
+		tio->error = bio->bi_error;
 		return;
 	}
 
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 1277eb26b58a..4a8e15058e8b 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -70,7 +70,7 @@
 #include <linux/seq_file.h>
 
 
-static void faulty_fail(struct bio *bio, int error)
+static void faulty_fail(struct bio *bio)
 {
 	struct bio *b = bio->bi_private;
 
@@ -181,7 +181,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
 			/* special case - don't decrement, don't generic_make_request,
 			 * just fail immediately
 			 */
-			bio_endio(bio, -EIO);
+			bio_io_error(bio);
 			return;
 		}
 
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index fa7d577f3d12..aefd66142eef 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -297,7 +297,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 		if (unlikely((split->bi_rw & REQ_DISCARD) &&
 			 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
 			/* Just ignore it */
-			bio_endio(split, 0);
+			bio_endio(split);
 		} else
 			generic_make_request(split);
 	} while (split != bio);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d429c30cd514..ac4381a6625c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -263,7 +263,9 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
-		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+		if (bio_sectors(bio) != 0)
+			bio->bi_error = -EROFS;
+		bio_endio(bio);
 		return;
 	}
 	smp_rmb(); /* Ensure implications of  'active' are visible */
@@ -377,7 +379,7 @@ static int md_mergeable_bvec(struct request_queue *q,
  * Generic flush handling for md
  */
 
-static void md_end_flush(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio)
 {
 	struct md_rdev *rdev = bio->bi_private;
 	struct mddev *mddev = rdev->mddev;
@@ -433,7 +435,7 @@ static void md_submit_flush_data(struct work_struct *ws)
 
 	if (bio->bi_iter.bi_size == 0)
 		/* an empty barrier - all done */
-		bio_endio(bio, 0);
+		bio_endio(bio);
 	else {
 		bio->bi_rw &= ~REQ_FLUSH;
 		mddev->pers->make_request(mddev, bio);
@@ -728,15 +730,13 @@ void md_rdev_clear(struct md_rdev *rdev)
 }
 EXPORT_SYMBOL_GPL(md_rdev_clear);
 
-static void super_written(struct bio *bio, int error)
+static void super_written(struct bio *bio)
 {
 	struct md_rdev *rdev = bio->bi_private;
 	struct mddev *mddev = rdev->mddev;
 
-	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-		printk("md: super_written gets error=%d, uptodate=%d\n",
-		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
-		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
+	if (bio->bi_error) {
+		printk("md: super_written gets error=%d\n", bio->bi_error);
 		md_error(mddev, rdev);
 	}
 
@@ -791,7 +791,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 	bio_add_page(bio, page, size, 0);
 	submit_bio_wait(rw, bio);
 
-	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	ret = !bio->bi_error;
 	bio_put(bio);
 	return ret;
 }
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index ac3ede2bd00e..082a489af9d3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -77,18 +77,18 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
 	struct bio *bio = mp_bh->master_bio;
 	struct mpconf *conf = mp_bh->mddev->private;
 
-	bio_endio(bio, err);
+	bio->bi_error = err;
+	bio_endio(bio);
 	mempool_free(mp_bh, conf->pool);
 }
 
-static void multipath_end_request(struct bio *bio, int error)
+static void multipath_end_request(struct bio *bio)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct multipath_bh *mp_bh = bio->bi_private;
 	struct mpconf *conf = mp_bh->mddev->private;
 	struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
 
-	if (uptodate)
+	if (!bio->bi_error)
 		multipath_end_bh_io(mp_bh, 0);
 	else if (!(bio->bi_rw & REQ_RAHEAD)) {
 		/*
@@ -101,7 +101,7 @@ static void multipath_end_request(struct bio *bio, int error)
 		       (unsigned long long)bio->bi_iter.bi_sector);
 		multipath_reschedule_retry(mp_bh);
 	} else
-		multipath_end_bh_io(mp_bh, error);
+		multipath_end_bh_io(mp_bh, bio->bi_error);
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
@@ -123,7 +123,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 
 	mp_bh->path = multipath_map(conf);
 	if (mp_bh->path < 0) {
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 		mempool_free(mp_bh, conf->pool);
 		return;
 	}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index efb654eb5399..e6e0ae56f66b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -543,7 +543,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 		if (unlikely((split->bi_rw & REQ_DISCARD) &&
 			 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
 			/* Just ignore it */
-			bio_endio(split, 0);
+			bio_endio(split);
 		} else
 			generic_make_request(split);
 	} while (split != bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f80f1af61ce7..9aa7d1fb2bc1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -255,9 +255,10 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		done = 1;
 
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		bio->bi_error = -EIO;
+
 	if (done) {
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		/*
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
@@ -312,9 +313,9 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
 	return mirror;
 }
 
-static void raid1_end_read_request(struct bio *bio, int error)
+static void raid1_end_read_request(struct bio *bio)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int uptodate = !bio->bi_error;
 	struct r1bio *r1_bio = bio->bi_private;
 	int mirror;
 	struct r1conf *conf = r1_bio->mddev->private;
@@ -397,9 +398,8 @@ static void r1_bio_write_done(struct r1bio *r1_bio)
 	}
 }
 
-static void raid1_end_write_request(struct bio *bio, int error)
+static void raid1_end_write_request(struct bio *bio)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r1bio *r1_bio = bio->bi_private;
 	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
 	struct r1conf *conf = r1_bio->mddev->private;
@@ -410,7 +410,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	/*
 	 * 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate) {
+	if (bio->bi_error) {
 		set_bit(WriteErrorSeen,
 			&conf->mirrors[mirror].rdev->flags);
 		if (!test_and_set_bit(WantReplacement,
@@ -793,7 +793,7 @@ static void flush_pending_writes(struct r1conf *conf)
 			if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 			    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 				/* Just ignore it */
-				bio_endio(bio, 0);
+				bio_endio(bio);
 			else
 				generic_make_request(bio);
 			bio = next;
@@ -1068,7 +1068,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 		    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 			/* Just ignore it */
-			bio_endio(bio, 0);
+			bio_endio(bio);
 		else
 			generic_make_request(bio);
 		bio = next;
@@ -1734,7 +1734,7 @@ abort:
 	return err;
 }
 
-static void end_sync_read(struct bio *bio, int error)
+static void end_sync_read(struct bio *bio)
 {
 	struct r1bio *r1_bio = bio->bi_private;
 
@@ -1745,16 +1745,16 @@ static void end_sync_read(struct bio *bio, int error)
 	 * or re-read if the read failed.
 	 * We don't do much here, just schedule handling by raid1d
 	 */
-	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+	if (!bio->bi_error)
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 	if (atomic_dec_and_test(&r1_bio->remaining))
 		reschedule_retry(r1_bio);
 }
 
-static void end_sync_write(struct bio *bio, int error)
+static void end_sync_write(struct bio *bio)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int uptodate = !bio->bi_error;
 	struct r1bio *r1_bio = bio->bi_private;
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
@@ -1941,7 +1941,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 		idx ++;
 	}
 	set_bit(R1BIO_Uptodate, &r1_bio->state);
-	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio->bi_error = 0;
 	return 1;
 }
 
@@ -1965,15 +1965,14 @@ static void process_checks(struct r1bio *r1_bio)
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		int j;
 		int size;
-		int uptodate;
+		int error;
 		struct bio *b = r1_bio->bios[i];
 		if (b->bi_end_io != end_sync_read)
 			continue;
-		/* fixup the bio for reuse, but preserve BIO_UPTODATE */
-		uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
+		/* fixup the bio for reuse, but preserve errno */
+		error = b->bi_error;
 		bio_reset(b);
-		if (!uptodate)
-			clear_bit(BIO_UPTODATE, &b->bi_flags);
+		b->bi_error = error;
 		b->bi_vcnt = vcnt;
 		b->bi_iter.bi_size = r1_bio->sectors << 9;
 		b->bi_iter.bi_sector = r1_bio->sector +
@@ -1996,7 +1995,7 @@ static void process_checks(struct r1bio *r1_bio)
 	}
 	for (primary = 0; primary < conf->raid_disks * 2; primary++)
 		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
-		    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+		    !r1_bio->bios[primary]->bi_error) {
 			r1_bio->bios[primary]->bi_end_io = NULL;
 			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
 			break;
@@ -2006,14 +2005,14 @@ static void process_checks(struct r1bio *r1_bio)
 		int j;
 		struct bio *pbio = r1_bio->bios[primary];
 		struct bio *sbio = r1_bio->bios[i];
-		int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
+		int error = sbio->bi_error;
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
-		/* Now we can 'fixup' the BIO_UPTODATE flag */
-		set_bit(BIO_UPTODATE, &sbio->bi_flags);
+		/* Now we can 'fixup' the error value */
+		sbio->bi_error = 0;
 
-		if (uptodate) {
+		if (!error) {
 			for (j = vcnt; j-- ; ) {
 				struct page *p, *s;
 				p = pbio->bi_io_vec[j].bv_page;
@@ -2028,7 +2027,7 @@ static void process_checks(struct r1bio *r1_bio)
 		if (j >= 0)
 			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
 		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-			      && uptodate)) {
+			      && !error)) {
 			/* No need to write to this device. */
 			sbio->bi_end_io = NULL;
 			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2269,11 +2268,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 		struct bio *bio = r1_bio->bios[m];
 		if (bio->bi_end_io == NULL)
 			continue;
-		if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+		if (!bio->bi_error &&
 		    test_bit(R1BIO_MadeGood, &r1_bio->state)) {
 			rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
 		}
-		if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+		if (bio->bi_error &&
 		    test_bit(R1BIO_WriteError, &r1_bio->state)) {
 			if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
 				md_error(conf->mddev, rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 940f2f365461..929e9a26d81b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -101,7 +101,7 @@ static int _enough(struct r10conf *conf, int previous, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 				int *skipped);
 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
-static void end_reshape_write(struct bio *bio, int error);
+static void end_reshape_write(struct bio *bio);
 static void end_reshape(struct r10conf *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -307,9 +307,9 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	} else
 		done = 1;
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		bio->bi_error = -EIO;
 	if (done) {
-		bio_endio(bio, 0);
+		bio_endio(bio);
 		/*
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
@@ -358,9 +358,9 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 	return r10_bio->devs[slot].devnum;
 }
 
-static void raid10_end_read_request(struct bio *bio, int error)
+static void raid10_end_read_request(struct bio *bio)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int uptodate = !bio->bi_error;
 	struct r10bio *r10_bio = bio->bi_private;
 	int slot, dev;
 	struct md_rdev *rdev;
@@ -438,9 +438,8 @@ static void one_write_done(struct r10bio *r10_bio)
 	}
 }
 
-static void raid10_end_write_request(struct bio *bio, int error)
+static void raid10_end_write_request(struct bio *bio)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	int dev;
 	int dec_rdev = 1;
@@ -460,7 +459,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate) {
+	if (bio->bi_error) {
 		if (repl)
 			/* Never record new bad blocks to replacement,
 			 * just fail it.
@@ -957,7 +956,7 @@ static void flush_pending_writes(struct r10conf *conf)
 			if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 			    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 				/* Just ignore it */
-				bio_endio(bio, 0);
+				bio_endio(bio);
 			else
 				generic_make_request(bio);
 			bio = next;
@@ -1133,7 +1132,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 		    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 			/* Just ignore it */
-			bio_endio(bio, 0);
+			bio_endio(bio);
 		else
 			generic_make_request(bio);
 		bio = next;
@@ -1916,7 +1915,7 @@ abort:
 	return err;
 }
 
-static void end_sync_read(struct bio *bio, int error)
+static void end_sync_read(struct bio *bio)
 {
 	struct r10bio *r10_bio = bio->bi_private;
 	struct r10conf *conf = r10_bio->mddev->private;
@@ -1928,7 +1927,7 @@ static void end_sync_read(struct bio *bio, int error)
 	} else
 		d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
-	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+	if (!bio->bi_error)
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 	else
 		/* The write handler will notice the lack of
@@ -1977,9 +1976,8 @@ static void end_sync_request(struct r10bio *r10_bio)
 	}
 }
 
-static void end_sync_write(struct bio *bio, int error)
+static void end_sync_write(struct bio *bio)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
@@ -1996,7 +1994,7 @@ static void end_sync_write(struct bio *bio, int error)
 	else
 		rdev = conf->mirrors[d].rdev;
 
-	if (!uptodate) {
+	if (bio->bi_error) {
 		if (repl)
 			md_error(mddev, rdev);
 		else {
@@ -2044,7 +2042,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 
 	/* find the first device with a block */
 	for (i=0; i<conf->copies; i++)
-		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
+		if (!r10_bio->devs[i].bio->bi_error)
 			break;
 
 	if (i == conf->copies)
@@ -2064,7 +2062,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			continue;
 		if (i == first)
 			continue;
-		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
+		if (!r10_bio->devs[i].bio->bi_error) {
 			/* We know that the bi_io_vec layout is the same for
 			 * both 'first' and 'i', so we just compare them.
 			 * All vec entries are PAGE_SIZE;
@@ -2706,8 +2704,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 			rdev = conf->mirrors[dev].rdev;
 			if (r10_bio->devs[m].bio == NULL)
 				continue;
-			if (test_bit(BIO_UPTODATE,
-				     &r10_bio->devs[m].bio->bi_flags)) {
+			if (!r10_bio->devs[m].bio->bi_error) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
@@ -2722,8 +2719,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 			rdev = conf->mirrors[dev].replacement;
 			if (r10_bio->devs[m].repl_bio == NULL)
 				continue;
-			if (test_bit(BIO_UPTODATE,
-				     &r10_bio->devs[m].repl_bio->bi_flags)) {
+
+			if (!r10_bio->devs[m].repl_bio->bi_error) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
@@ -2748,8 +2745,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 					r10_bio->devs[m].addr,
 					r10_bio->sectors, 0);
 				rdev_dec_pending(rdev, conf->mddev);
-			} else if (bio != NULL &&
-				   !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+			} else if (bio != NULL && bio->bi_error) {
 				if (!narrow_write_error(r10_bio, m)) {
 					md_error(conf->mddev, rdev);
 					set_bit(R10BIO_Degraded,
@@ -3263,7 +3259,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			bio = r10_bio->devs[i].bio;
 			bio_reset(bio);
-			clear_bit(BIO_UPTODATE, &bio->bi_flags);
+			bio->bi_error = -EIO;
 			if (conf->mirrors[d].rdev == NULL ||
 			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
 				continue;
@@ -3300,7 +3296,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
 			bio_reset(bio);
-			clear_bit(BIO_UPTODATE, &bio->bi_flags);
+			bio->bi_error = -EIO;
 
 			sector = r10_bio->devs[i].addr;
 			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
@@ -3377,7 +3373,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct(bio->bi_bdev, nr_sectors);
-			set_bit(BIO_UPTODATE, &bio->bi_flags);
+			bio->bi_error = 0;
 			generic_make_request(bio);
 		}
 	}
@@ -4380,7 +4376,7 @@ read_more:
 	read_bio->bi_end_io = end_sync_read;
 	read_bio->bi_rw = READ;
 	read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
-	__set_bit(BIO_UPTODATE, &read_bio->bi_flags);
+	read_bio->bi_error = 0;
 	read_bio->bi_vcnt = 0;
 	read_bio->bi_iter.bi_size = 0;
 	r10_bio->master_bio = read_bio;
@@ -4601,9 +4597,8 @@ static int handle_reshape_read_error(struct mddev *mddev,
 	return 0;
 }
 
-static void end_reshape_write(struct bio *bio, int error)
+static void end_reshape_write(struct bio *bio)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
@@ -4620,7 +4615,7 @@ static void end_reshape_write(struct bio *bio, int error)
 		rdev = conf->mirrors[d].rdev;
 	}
 
-	if (!uptodate) {
+	if (bio->bi_error) {
 		/* FIXME should record badblock */
 		md_error(mddev, rdev);
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59e44e99eef3..84d6eec1033e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -233,7 +233,7 @@ static void return_io(struct bio *return_bi)
 		bi->bi_iter.bi_size = 0;
 		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 					 bi, 0);
-		bio_endio(bi, 0);
+		bio_endio(bi);
 		bi = return_bi;
 	}
 }
@@ -887,9 +887,9 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 }
 
 static void
-raid5_end_read_request(struct bio *bi, int error);
+raid5_end_read_request(struct bio *bi);
 static void
-raid5_end_write_request(struct bio *bi, int error);
+raid5_end_write_request(struct bio *bi);
 
 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 {
@@ -2277,12 +2277,11 @@ static void shrink_stripes(struct r5conf *conf)
 	conf->slab_cache = NULL;
 }
 
-static void raid5_end_read_request(struct bio * bi, int error)
+static void raid5_end_read_request(struct bio * bi)
 {
 	struct stripe_head *sh = bi->bi_private;
 	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks, i;
-	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev = NULL;
 	sector_t s;
@@ -2291,9 +2290,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		if (bi == &sh->dev[i].req)
 			break;
 
-	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
+	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		uptodate);
+		bi->bi_error);
 	if (i == disks) {
 		BUG();
 		return;
@@ -2312,7 +2311,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		s = sh->sector + rdev->new_data_offset;
 	else
 		s = sh->sector + rdev->data_offset;
-	if (uptodate) {
+	if (!bi->bi_error) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			/* Note that this cannot happen on a
@@ -2400,13 +2399,12 @@ static void raid5_end_read_request(struct bio * bi, int error)
 	release_stripe(sh);
 }
 
-static void raid5_end_write_request(struct bio *bi, int error)
+static void raid5_end_write_request(struct bio *bi)
 {
 	struct stripe_head *sh = bi->bi_private;
 	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	struct md_rdev *uninitialized_var(rdev);
-	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	sector_t first_bad;
 	int bad_sectors;
 	int replacement = 0;
@@ -2429,23 +2427,23 @@ static void raid5_end_write_request(struct bio *bi, int error)
 			break;
 		}
 	}
-	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		uptodate);
+		bi->bi_error);
 	if (i == disks) {
 		BUG();
 		return;
 	}
 
 	if (replacement) {
-		if (!uptodate)
+		if (bi->bi_error)
 			md_error(conf->mddev, rdev);
 		else if (is_badblock(rdev, sh->sector,
 				     STRIPE_SECTORS,
 				     &first_bad, &bad_sectors))
 			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
 	} else {
-		if (!uptodate) {
+		if (bi->bi_error) {
 			set_bit(STRIPE_DEGRADED, &sh->state);
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2466,7 +2464,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
-	if (sh->batch_head && !uptodate && !replacement)
+	if (sh->batch_head && bi->bi_error && !replacement)
 		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
 
 	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
@@ -3107,7 +3105,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		while (bi && bi->bi_iter.bi_sector <
 			sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+			bi->bi_error = -EIO;
 			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
@@ -3131,7 +3130,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		while (bi && bi->bi_iter.bi_sector <
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+			bi->bi_error = -EIO;
 			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
@@ -3156,7 +3156,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			       sh->dev[i].sector + STRIPE_SECTORS) {
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+				bi->bi_error = -EIO;
 				if (!raid5_dec_bi_active_stripes(bi)) {
 					bi->bi_next = *return_bi;
 					*return_bi = bi;
@@ -4749,12 +4750,11 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
  *  first).
  *  If the read failed..
  */
-static void raid5_align_endio(struct bio *bi, int error)
+static void raid5_align_endio(struct bio *bi)
 {
 	struct bio* raid_bi  = bi->bi_private;
 	struct mddev *mddev;
 	struct r5conf *conf;
-	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	struct md_rdev *rdev;
 
 	bio_put(bi);
@@ -4766,10 +4766,10 @@ static void raid5_align_endio(struct bio *bi, int error)
 
 	rdev_dec_pending(rdev, conf->mddev);
 
-	if (!error && uptodate) {
+	if (!bi->bi_error) {
 		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
 					 raid_bi, 0);
-		bio_endio(raid_bi, 0);
+		bio_endio(raid_bi);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_quiescent);
 		return;
@@ -5133,7 +5133,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
 		md_write_end(mddev);
-		bio_endio(bi, 0);
+		bio_endio(bi);
 	}
 }
 
@@ -5297,7 +5297,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			release_stripe_plug(mddev, sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			bi->bi_error = -EIO;
 			break;
 		}
 	}
@@ -5311,7 +5311,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 
 		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 					 bi, 0);
-		bio_endio(bi, 0);
+		bio_endio(bi);
 	}
 }
 
@@ -5707,7 +5707,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	if (remaining == 0) {
 		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
 					 raid_bio, 0);
-		bio_endio(raid_bio, 0);
+		bio_endio(raid_bio);
 	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_quiescent);
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 4f97b248c236..0df77cb07df6 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -180,7 +180,7 @@ static void nd_blk_make_request(struct request_queue *q, struct bio *bio)
 	 * another kernel subsystem, and we just pass it through.
 	 */
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		err = -EIO;
+		bio->bi_error = -EIO;
 		goto out;
 	}
 
@@ -199,6 +199,7 @@ static void nd_blk_make_request(struct request_queue *q, struct bio *bio)
 					"io error in %s sector %lld, len %d,\n",
 					(rw == READ) ? "READ" : "WRITE",
 					(unsigned long long) iter.bi_sector, len);
+			bio->bi_error = err;
 			break;
 		}
 	}
@@ -206,7 +207,7 @@ static void nd_blk_make_request(struct request_queue *q, struct bio *bio)
 		nd_iostat_end(bio, start);
 
  out:
-	bio_endio(bio, err);
+	bio_endio(bio);
 }
 
 static int nd_blk_rw_bytes(struct nd_namespace_common *ndns,
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 411c7b2bb37a..341202ed32b4 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1189,7 +1189,7 @@ static void btt_make_request(struct request_queue *q, struct bio *bio)
 	 * another kernel subsystem, and we just pass it through.
 	 */
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		err = -EIO;
+		bio->bi_error = -EIO;
 		goto out;
 	}
 
@@ -1211,6 +1211,7 @@ static void btt_make_request(struct request_queue *q, struct bio *bio)
 					"io error in %s sector %lld, len %d,\n",
 					(rw == READ) ? "READ" : "WRITE",
 					(unsigned long long) iter.bi_sector, len);
+			bio->bi_error = err;
 			break;
 		}
 	}
@@ -1218,7 +1219,7 @@ static void btt_make_request(struct request_queue *q, struct bio *bio)
 		nd_iostat_end(bio, start);
 
 out:
-	bio_endio(bio, err);
+	bio_endio(bio);
 }
 
 static int btt_rw_page(struct block_device *bdev, sector_t sector,
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ade9eb917a4d..4c079d5cb539 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -77,7 +77,7 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio)
 	if (bio_data_dir(bio))
 		wmb_pmem();
 
-	bio_endio(bio, 0);
+	bio_endio(bio);
 }
 
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index da212813f2d5..8bcb822b0bac 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -871,7 +871,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)
 		}
 		bytes_done += bvec.bv_len;
 	}
-	bio_endio(bio, 0);
+	bio_endio(bio);
 	return;
 fail:
 	bio_io_error(bio);
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index 7d4e9397ac31..93856b9b6214 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -220,8 +220,7 @@ static void xpram_make_request(struct request_queue *q, struct bio *bio)
 			index++;
 		}
 	}
-	set_bit(BIO_UPTODATE, &bio->bi_flags);
-	bio_endio(bio, 0);
+	bio_endio(bio);
 	return;
 fail:
 	bio_io_error(bio);
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 6d88d24e6cce..5a9982f5d5d6 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -306,20 +306,13 @@ static void iblock_complete_cmd(struct se_cmd *cmd)
 	kfree(ibr);
 }
 
-static void iblock_bio_done(struct bio *bio, int err)
+static void iblock_bio_done(struct bio *bio)
 {
 	struct se_cmd *cmd = bio->bi_private;
 	struct iblock_req *ibr = cmd->priv;
 
-	/*
-	 * Set -EIO if !BIO_UPTODATE and the passed is still err=0
-	 */
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && !err)
-		err = -EIO;
-
-	if (err != 0) {
-		pr_err("test_bit(BIO_UPTODATE) failed for bio: %p,"
-			" err: %d\n", bio, err);
+	if (bio->bi_error) {
+		pr_err("bio error: %p,  err: %d\n", bio, bio->bi_error);
 		/*
 		 * Bump the ib_bio_err_cnt and release bio.
 		 */
@@ -370,15 +363,15 @@ static void iblock_submit_bios(struct bio_list *list, int rw)
 	blk_finish_plug(&plug);
 }
 
-static void iblock_end_io_flush(struct bio *bio, int err)
+static void iblock_end_io_flush(struct bio *bio)
 {
 	struct se_cmd *cmd = bio->bi_private;
 
-	if (err)
-		pr_err("IBLOCK: cache flush failed: %d\n", err);
+	if (bio->bi_error)
+		pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_error);
 
 	if (cmd) {
-		if (err)
+		if (bio->bi_error)
 			target_complete_cmd(cmd, SAM_STAT_CHECK_CONDITION);
 		else
 			target_complete_cmd(cmd, SAM_STAT_GOOD);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 08e9084ee615..de18790eb21c 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -852,7 +852,7 @@ static ssize_t pscsi_show_configfs_dev_params(struct se_device *dev, char *b)
 	return bl;
 }
 
-static void pscsi_bi_endio(struct bio *bio, int error)
+static void pscsi_bi_endio(struct bio *bio)
 {
 	bio_put(bio);
 }
@@ -973,7 +973,7 @@ fail:
 	while (*hbio) {
 		bio = *hbio;
 		*hbio = (*hbio)->bi_next;
-		bio_endio(bio, 0);	/* XXX: should be error */
+		bio_endio(bio);
 	}
 	return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 }
@@ -1061,7 +1061,7 @@ fail_free_bio:
 	while (hbio) {
 		struct bio *bio = hbio;
 		hbio = hbio->bi_next;
-		bio_endio(bio, 0);	/* XXX: should be error */
+		bio_endio(bio);
 	}
 	ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 fail:
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce7dec88f4b8..541fbfaed276 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -343,7 +343,7 @@ static int btrfsic_process_written_superblock(
 		struct btrfsic_state *state,
 		struct btrfsic_block *const block,
 		struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bio_end_io(struct bio *bp);
 static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
 static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
 					      const struct btrfsic_block *block,
@@ -2207,7 +2207,7 @@ continue_loop:
 	goto again;
 }
 
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+static void btrfsic_bio_end_io(struct bio *bp)
 {
 	struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
 	int iodone_w_error;
@@ -2215,7 +2215,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
 	/* mutex is not held! This is not save if IO is not yet completed
 	 * on umount */
 	iodone_w_error = 0;
-	if (bio_error_status)
+	if (bp->bi_error)
 		iodone_w_error = 1;
 
 	BUG_ON(NULL == block);
@@ -2230,7 +2230,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
 		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
 			printk(KERN_INFO
 			       "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
-			       bio_error_status,
+			       bp->bi_error,
 			       btrfsic_get_block_type(dev_state->state, block),
 			       block->logical_bytenr, dev_state->name,
 			       block->dev_bytenr, block->mirror_num);
@@ -2252,7 +2252,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
 		block = next_block;
 	} while (NULL != block);
 
-	bp->bi_end_io(bp, bio_error_status);
+	bp->bi_end_io(bp);
 }
 
 static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ce62324c78e7..302266ec2cdb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -152,7 +152,7 @@ fail:
  * The compressed pages are freed here, and it must be run
  * in process context
  */
-static void end_compressed_bio_read(struct bio *bio, int err)
+static void end_compressed_bio_read(struct bio *bio)
 {
 	struct compressed_bio *cb = bio->bi_private;
 	struct inode *inode;
@@ -160,7 +160,7 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	unsigned long index;
 	int ret;
 
-	if (err)
+	if (bio->bi_error)
 		cb->errors = 1;
 
 	/* if there are more bios still pending for this compressed
@@ -210,7 +210,7 @@ csum_failed:
 		bio_for_each_segment_all(bvec, cb->orig_bio, i)
 			SetPageChecked(bvec->bv_page);
 
-		bio_endio(cb->orig_bio, 0);
+		bio_endio(cb->orig_bio);
 	}
 
 	/* finally free the cb struct */
@@ -266,7 +266,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
  * This also calls the writeback end hooks for the file pages so that
  * metadata and checksums can be updated in the file.
  */
-static void end_compressed_bio_write(struct bio *bio, int err)
+static void end_compressed_bio_write(struct bio *bio)
 {
 	struct extent_io_tree *tree;
 	struct compressed_bio *cb = bio->bi_private;
@@ -274,7 +274,7 @@ static void end_compressed_bio_write(struct bio *bio, int err)
 	struct page *page;
 	unsigned long index;
 
-	if (err)
+	if (bio->bi_error)
 		cb->errors = 1;
 
 	/* if there are more bios still pending for this compressed
@@ -293,7 +293,7 @@ static void end_compressed_bio_write(struct bio *bio, int err)
 					 cb->start,
 					 cb->start + cb->len - 1,
 					 NULL,
-					 err ? 0 : 1);
+					 bio->bi_error ? 0 : 1);
 	cb->compressed_pages[0]->mapping = NULL;
 
 	end_compressed_writeback(inode, cb);
@@ -697,8 +697,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 			ret = btrfs_map_bio(root, READ, comp_bio,
 					    mirror_num, 0);
-			if (ret)
-				bio_endio(comp_bio, ret);
+			if (ret) {
+				bio->bi_error = ret;
+				bio_endio(comp_bio);
+			}
 
 			bio_put(comp_bio);
 
@@ -724,8 +726,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
-	if (ret)
-		bio_endio(comp_bio, ret);
+	if (ret) {
+		bio->bi_error = ret;
+		bio_endio(comp_bio);
+	}
 
 	bio_put(comp_bio);
 	return 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a9aadb2ad525..a8c0de888a9d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -703,7 +703,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 	return -EIO;	/* we fixed nothing */
 }
 
-static void end_workqueue_bio(struct bio *bio, int err)
+static void end_workqueue_bio(struct bio *bio)
 {
 	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
 	struct btrfs_fs_info *fs_info;
@@ -711,7 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	btrfs_work_func_t func;
 
 	fs_info = end_io_wq->info;
-	end_io_wq->error = err;
+	end_io_wq->error = bio->bi_error;
 
 	if (bio->bi_rw & REQ_WRITE) {
 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
@@ -808,7 +808,8 @@ static void run_one_async_done(struct btrfs_work *work)
 
 	/* If an error occured we just want to clean up the bio and move on */
 	if (async->error) {
-		bio_endio(async->bio, async->error);
+		async->bio->bi_error = async->error;
+		bio_endio(async->bio);
 		return;
 	}
 
@@ -908,8 +909,10 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
 	ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
-	if (ret)
-		bio_endio(bio, ret);
+	if (ret) {
+		bio->bi_error = ret;
+		bio_endio(bio);
+	}
 	return ret;
 }
 
@@ -960,10 +963,13 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 					  __btree_submit_bio_done);
 	}
 
-	if (ret) {
+	if (ret)
+		goto out_w_error;
+	return 0;
+
 out_w_error:
-		bio_endio(bio, ret);
-	}
+	bio->bi_error = ret;
+	bio_endio(bio);
 	return ret;
 }
 
@@ -1735,16 +1741,15 @@ static void end_workqueue_fn(struct btrfs_work *work)
 {
 	struct bio *bio;
 	struct btrfs_end_io_wq *end_io_wq;
-	int error;
 
 	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
 	bio = end_io_wq->bio;
 
-	error = end_io_wq->error;
+	bio->bi_error = end_io_wq->error;
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-	bio_endio(bio, error);
+	bio_endio(bio);
 }
 
 static int cleaner_kthread(void *arg)
@@ -3323,10 +3328,8 @@ static int write_dev_supers(struct btrfs_device *device,
  * endio for the write_dev_flush, this will wake anyone waiting
  * for the barrier when it is done
  */
-static void btrfs_end_empty_barrier(struct bio *bio, int err)
+static void btrfs_end_empty_barrier(struct bio *bio)
 {
-	if (err)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	if (bio->bi_private)
 		complete(bio->bi_private);
 	bio_put(bio);
@@ -3354,8 +3357,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 		wait_for_completion(&device->flush_wait);
 
-		if (!bio_flagged(bio, BIO_UPTODATE)) {
-			ret = -EIO;
+		if (bio->bi_error) {
+			ret = bio->bi_error;
 			btrfs_dev_stat_inc_and_print(device,
 				BTRFS_DEV_STAT_FLUSH_ERRS);
 		}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 02d05817cbdf..c22f175ed024 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2486,7 +2486,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_writepage(struct bio *bio, int err)
+static void end_bio_extent_writepage(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	u64 start;
@@ -2516,7 +2516,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 		start = page_offset(page);
 		end = start + bvec->bv_offset + bvec->bv_len - 1;
 
-		if (end_extent_writepage(page, err, start, end))
+		if (end_extent_writepage(page, bio->bi_error, start, end))
 			continue;
 
 		end_page_writeback(page);
@@ -2548,10 +2548,10 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_readpage(struct bio *bio, int err)
+static void end_bio_extent_readpage(struct bio *bio)
 {
 	struct bio_vec *bvec;
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int uptodate = !bio->bi_error;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 	struct extent_io_tree *tree;
 	u64 offset = 0;
@@ -2564,16 +2564,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	int ret;
 	int i;
 
-	if (err)
-		uptodate = 0;
-
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
 
 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
-			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
-			 io_bio->mirror_num);
+			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector,
+			 bio->bi_error, io_bio->mirror_num);
 		tree = &BTRFS_I(inode)->io_tree;
 
 		/* We always issue full-page reads, but if some block
@@ -2614,8 +2611,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 
 		if (tree->ops && tree->ops->readpage_io_failed_hook) {
 			ret = tree->ops->readpage_io_failed_hook(page, mirror);
-			if (!ret && !err &&
-			    test_bit(BIO_UPTODATE, &bio->bi_flags))
+			if (!ret && !bio->bi_error)
 				uptodate = 1;
 		} else {
 			/*
@@ -2631,10 +2627,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			ret = bio_readpage_error(bio, offset, page, start, end,
 						 mirror);
 			if (ret == 0) {
-				uptodate =
-					test_bit(BIO_UPTODATE, &bio->bi_flags);
-				if (err)
-					uptodate = 0;
+				uptodate = !bio->bi_error;
 				offset += len;
 				continue;
 			}
@@ -2684,7 +2677,7 @@ readpage_ok:
 		endio_readpage_release_extent(tree, extent_start, extent_len,
 					      uptodate);
 	if (io_bio->end_io)
-		io_bio->end_io(io_bio, err);
+		io_bio->end_io(io_bio, bio->bi_error);
 	bio_put(bio);
 }
 
@@ -3696,7 +3689,7 @@ static void set_btree_ioerr(struct page *page)
 	}
 }
 
-static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+static void end_bio_extent_buffer_writepage(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	struct extent_buffer *eb;
@@ -3709,7 +3702,8 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
 		BUG_ON(!eb);
 		done = atomic_dec_and_test(&eb->io_pages);
 
-		if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+		if (bio->bi_error ||
+		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
 			ClearPageUptodate(page);
 			set_btree_ioerr(page);
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b33c0cf02668..6b8becfe2057 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1845,8 +1845,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 	int ret;
 
 	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
-	if (ret)
-		bio_endio(bio, ret);
+	if (ret) {
+		bio->bi_error = ret;
+		bio_endio(bio);
+	}
 	return ret;
 }
 
@@ -1906,8 +1908,10 @@ mapit:
 	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
 
 out:
-	if (ret < 0)
-		bio_endio(bio, ret);
+	if (ret < 0) {
+		bio->bi_error = ret;
+		bio_endio(bio);
+	}
 	return ret;
 }
 
@@ -7689,13 +7693,13 @@ struct btrfs_retry_complete {
 	int uptodate;
 };
 
-static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+static void btrfs_retry_endio_nocsum(struct bio *bio)
 {
 	struct btrfs_retry_complete *done = bio->bi_private;
 	struct bio_vec *bvec;
 	int i;
 
-	if (err)
+	if (bio->bi_error)
 		goto end;
 
 	done->uptodate = 1;
@@ -7744,7 +7748,7 @@ try_again:
 	return 0;
 }
 
-static void btrfs_retry_endio(struct bio *bio, int err)
+static void btrfs_retry_endio(struct bio *bio)
 {
 	struct btrfs_retry_complete *done = bio->bi_private;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
@@ -7753,7 +7757,7 @@ static void btrfs_retry_endio(struct bio *bio, int err)
 	int ret;
 	int i;
 
-	if (err)
+	if (bio->bi_error)
 		goto end;
 
 	uptodate = 1;
@@ -7836,12 +7840,13 @@ static int btrfs_subio_endio_read(struct inode *inode,
 	}
 }
 
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static void btrfs_endio_direct_read(struct bio *bio)
 {
 	struct btrfs_dio_private *dip = bio->bi_private;
 	struct inode *inode = dip->inode;
 	struct bio *dio_bio;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	int err = bio->bi_error;
 
 	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
 		err = btrfs_subio_endio_read(inode, io_bio, err);
@@ -7852,17 +7857,14 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 
 	kfree(dip);
 
-	/* If we had a csum failure make sure to clear the uptodate flag */
-	if (err)
-		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
-	dio_end_io(dio_bio, err);
+	dio_end_io(dio_bio, bio->bi_error);
 
 	if (io_bio->end_io)
 		io_bio->end_io(io_bio, err);
 	bio_put(bio);
 }
 
-static void btrfs_endio_direct_write(struct bio *bio, int err)
+static void btrfs_endio_direct_write(struct bio *bio)
 {
 	struct btrfs_dio_private *dip = bio->bi_private;
 	struct inode *inode = dip->inode;
@@ -7876,7 +7878,8 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
 again:
 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
 						   &ordered_offset,
-						   ordered_bytes, !err);
+						   ordered_bytes,
+						   !bio->bi_error);
 	if (!ret)
 		goto out_test;
 
@@ -7899,10 +7902,7 @@ out_test:
 
 	kfree(dip);
 
-	/* If we had an error make sure to clear the uptodate flag */
-	if (err)
-		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
-	dio_end_io(dio_bio, err);
+	dio_end_io(dio_bio, bio->bi_error);
 	bio_put(bio);
 }
 
@@ -7917,9 +7917,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
 	return 0;
 }
 
-static void btrfs_end_dio_bio(struct bio *bio, int err)
+static void btrfs_end_dio_bio(struct bio *bio)
 {
 	struct btrfs_dio_private *dip = bio->bi_private;
+	int err = bio->bi_error;
 
 	if (err)
 		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
@@ -7948,8 +7949,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 	if (dip->errors) {
 		bio_io_error(dip->orig_bio);
 	} else {
-		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
-		bio_endio(dip->orig_bio, 0);
+		dip->dio_bio->bi_error = 0;
+		bio_endio(dip->orig_bio);
 	}
 out:
 	bio_put(bio);
@@ -8220,7 +8221,8 @@ free_ordered:
 	 * callbacks - they require an allocated dip and a clone of dio_bio.
 	 */
 	if (io_bio && dip) {
-		bio_endio(io_bio, ret);
+		io_bio->bi_error = -EIO;
+		bio_endio(io_bio);
 		/*
 		 * The end io callbacks free our dip, do the final put on io_bio
 		 * and all the cleanup and final put for dio_bio (through
@@ -8247,7 +8249,7 @@ free_ordered:
 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
 			      file_offset + dio_bio->bi_iter.bi_size - 1);
 		}
-		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+		dio_bio->bi_error = -EIO;
 		/*
 		 * Releases and cleans up our dio_bio, no need to bio_put()
 		 * nor bio_endio()/bio_io_error() against dio_bio.
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fa72068bd256..0a02e24900aa 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -851,7 +851,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
  * this frees the rbio and runs through all the bios in the
  * bio_list and calls end_io on them
  */
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
 {
 	struct bio *cur = bio_list_get(&rbio->bio_list);
 	struct bio *next;
@@ -864,9 +864,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 	while (cur) {
 		next = cur->bi_next;
 		cur->bi_next = NULL;
-		if (uptodate)
-			set_bit(BIO_UPTODATE, &cur->bi_flags);
-		bio_endio(cur, err);
+		cur->bi_error = err;
+		bio_endio(cur);
 		cur = next;
 	}
 }
@@ -875,9 +874,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
  * end io function used by finish_rmw.  When we finally
  * get here, we've written a full stripe
  */
-static void raid_write_end_io(struct bio *bio, int err)
+static void raid_write_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
+	int err = bio->bi_error;
 
 	if (err)
 		fail_bio_stripe(rbio, bio);
@@ -893,7 +893,7 @@ static void raid_write_end_io(struct bio *bio, int err)
 	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
 		err = -EIO;
 
-	rbio_orig_end_io(rbio, err, 0);
+	rbio_orig_end_io(rbio, err);
 	return;
 }
 
@@ -1071,7 +1071,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 		 * devices or if they are not contiguous
 		 */
 		if (last_end == disk_start && stripe->dev->bdev &&
-		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
+		    !last->bi_error &&
 		    last->bi_bdev == stripe->dev->bdev) {
 			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
 			if (ret == PAGE_CACHE_SIZE)
@@ -1087,7 +1087,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 	bio->bi_iter.bi_size = 0;
 	bio->bi_bdev = stripe->dev->bdev;
 	bio->bi_iter.bi_sector = disk_start >> 9;
-	set_bit(BIO_UPTODATE, &bio->bi_flags);
 
 	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
 	bio_list_add(bio_list, bio);
@@ -1312,13 +1311,12 @@ write_data:
 
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_write_end_io;
-		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
 		submit_bio(WRITE, bio);
 	}
 	return;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO, 0);
+	rbio_orig_end_io(rbio, -EIO);
 }
 
 /*
@@ -1441,11 +1439,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
  * This will usually kick off finish_rmw once all the bios are read in, but it
  * may trigger parity reconstruction if we had any errors along the way
  */
-static void raid_rmw_end_io(struct bio *bio, int err)
+static void raid_rmw_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 
-	if (err)
+	if (bio->bi_error)
 		fail_bio_stripe(rbio, bio);
 	else
 		set_bio_pages_uptodate(bio);
@@ -1455,7 +1453,6 @@ static void raid_rmw_end_io(struct bio *bio, int err)
 	if (!atomic_dec_and_test(&rbio->stripes_pending))
 		return;
 
-	err = 0;
 	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
 		goto cleanup;
 
@@ -1469,7 +1466,7 @@ static void raid_rmw_end_io(struct bio *bio, int err)
 
 cleanup:
 
-	rbio_orig_end_io(rbio, -EIO, 0);
+	rbio_orig_end_io(rbio, -EIO);
 }
 
 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
@@ -1572,14 +1569,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 		btrfs_bio_wq_end_io(rbio->fs_info, bio,
 				    BTRFS_WQ_ENDIO_RAID56);
 
-		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
 		submit_bio(READ, bio);
 	}
 	/* the actual write will happen once the reads are done */
 	return 0;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO, 0);
+	rbio_orig_end_io(rbio, -EIO);
 	return -EIO;
 
 finish:
@@ -1964,7 +1960,7 @@ cleanup_io:
 		else
 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
-		rbio_orig_end_io(rbio, err, err == 0);
+		rbio_orig_end_io(rbio, err);
 	} else if (err == 0) {
 		rbio->faila = -1;
 		rbio->failb = -1;
@@ -1976,7 +1972,7 @@ cleanup_io:
 		else
 			BUG();
 	} else {
-		rbio_orig_end_io(rbio, err, 0);
+		rbio_orig_end_io(rbio, err);
 	}
 }
 
@@ -1984,7 +1980,7 @@ cleanup_io:
  * This is called only for stripes we've read from disk to
  * reconstruct the parity.
  */
-static void raid_recover_end_io(struct bio *bio, int err)
+static void raid_recover_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 
@@ -1992,7 +1988,7 @@ static void raid_recover_end_io(struct bio *bio, int err)
 	 * we only read stripe pages off the disk, set them
 	 * up to date if there were no errors
 	 */
-	if (err)
+	if (bio->bi_error)
 		fail_bio_stripe(rbio, bio);
 	else
 		set_bio_pages_uptodate(bio);
@@ -2002,7 +1998,7 @@ static void raid_recover_end_io(struct bio *bio, int err)
 		return;
 
 	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
-		rbio_orig_end_io(rbio, -EIO, 0);
+		rbio_orig_end_io(rbio, -EIO);
 	else
 		__raid_recover_end_io(rbio);
 }
@@ -2094,7 +2090,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 		btrfs_bio_wq_end_io(rbio->fs_info, bio,
 				    BTRFS_WQ_ENDIO_RAID56);
 
-		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
 		submit_bio(READ, bio);
 	}
 out:
@@ -2102,7 +2097,7 @@ out:
 
 cleanup:
 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
-		rbio_orig_end_io(rbio, -EIO, 0);
+		rbio_orig_end_io(rbio, -EIO);
 	return -EIO;
 }
 
@@ -2277,11 +2272,12 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
  * end io function used by finish_rmw.  When we finally
  * get here, we've written a full stripe
  */
-static void raid_write_parity_end_io(struct bio *bio, int err)
+static void raid_write_parity_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
+	int err = bio->bi_error;
 
-	if (err)
+	if (bio->bi_error)
 		fail_bio_stripe(rbio, bio);
 
 	bio_put(bio);
@@ -2294,7 +2290,7 @@ static void raid_write_parity_end_io(struct bio *bio, int err)
 	if (atomic_read(&rbio->error))
 		err = -EIO;
 
-	rbio_orig_end_io(rbio, err, 0);
+	rbio_orig_end_io(rbio, err);
 }
 
 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
@@ -2437,7 +2433,7 @@ submit_write:
 	nr_data = bio_list_size(&bio_list);
 	if (!nr_data) {
 		/* Every parity is right */
-		rbio_orig_end_io(rbio, 0, 0);
+		rbio_orig_end_io(rbio, 0);
 		return;
 	}
 
@@ -2450,13 +2446,12 @@ submit_write:
 
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_write_parity_end_io;
-		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
 		submit_bio(WRITE, bio);
 	}
 	return;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO, 0);
+	rbio_orig_end_io(rbio, -EIO);
 }
 
 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2524,7 +2519,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
 	return;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO, 0);
+	rbio_orig_end_io(rbio, -EIO);
 }
 
 /*
@@ -2535,11 +2530,11 @@ cleanup:
  * This will usually kick off finish_rmw once all the bios are read in, but it
  * may trigger parity reconstruction if we had any errors along the way
  */
-static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+static void raid56_parity_scrub_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 
-	if (err)
+	if (bio->bi_error)
 		fail_bio_stripe(rbio, bio);
 	else
 		set_bio_pages_uptodate(bio);
@@ -2632,14 +2627,13 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
 		btrfs_bio_wq_end_io(rbio->fs_info, bio,
 				    BTRFS_WQ_ENDIO_RAID56);
 
-		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
 		submit_bio(READ, bio);
 	}
 	/* the actual write will happen once the reads are done */
 	return;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO, 0);
+	rbio_orig_end_io(rbio, -EIO);
 	return;
 
 finish:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 94db0fa5225a..ebb8260186fe 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -278,7 +278,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 		       u64 physical, struct btrfs_device *dev, u64 flags,
 		       u64 gen, int mirror_num, u8 *csum, int force,
 		       u64 physical_for_dev_replace);
-static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io(struct bio *bio);
 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 static void scrub_block_complete(struct scrub_block *sblock);
 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
@@ -295,7 +295,7 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 				    struct scrub_page *spage);
 static void scrub_wr_submit(struct scrub_ctx *sctx);
-static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io(struct bio *bio);
 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 static int write_page_nocow(struct scrub_ctx *sctx,
 			    u64 physical_for_dev_replace, struct page *page);
@@ -1429,11 +1429,11 @@ struct scrub_bio_ret {
 	int error;
 };
 
-static void scrub_bio_wait_endio(struct bio *bio, int error)
+static void scrub_bio_wait_endio(struct bio *bio)
 {
 	struct scrub_bio_ret *ret = bio->bi_private;
 
-	ret->error = error;
+	ret->error = bio->bi_error;
 	complete(&ret->event);
 }
 
@@ -1790,12 +1790,12 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
 	btrfsic_submit_bio(WRITE, sbio->bio);
 }
 
-static void scrub_wr_bio_end_io(struct bio *bio, int err)
+static void scrub_wr_bio_end_io(struct bio *bio)
 {
 	struct scrub_bio *sbio = bio->bi_private;
 	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
 
-	sbio->err = err;
+	sbio->err = bio->bi_error;
 	sbio->bio = bio;
 
 	btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
@@ -2098,7 +2098,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
 		 */
 		printk_ratelimited(KERN_WARNING
 			"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
-		bio_endio(sbio->bio, -EIO);
+		bio_io_error(sbio->bio);
 	} else {
 		btrfsic_submit_bio(READ, sbio->bio);
 	}
@@ -2260,12 +2260,12 @@ leave_nomem:
 	return 0;
 }
 
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_bio_end_io(struct bio *bio)
 {
 	struct scrub_bio *sbio = bio->bi_private;
 	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
 
-	sbio->err = err;
+	sbio->err = bio->bi_error;
 	sbio->bio = bio;
 
 	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
@@ -2672,11 +2672,11 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
 	scrub_pending_bio_dec(sctx);
 }
 
-static void scrub_parity_bio_endio(struct bio *bio, int error)
+static void scrub_parity_bio_endio(struct bio *bio)
 {
 	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
 
-	if (error)
+	if (bio->bi_error)
 		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
 			  sparity->nsectors);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fbe7c104531c..8f2ca18c71f4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5741,23 +5741,23 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
 {
 	bio->bi_private = bbio->private;
 	bio->bi_end_io = bbio->end_io;
-	bio_endio(bio, err);
+	bio_endio(bio);
 
 	btrfs_put_bbio(bbio);
 }
 
-static void btrfs_end_bio(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio)
 {
 	struct btrfs_bio *bbio = bio->bi_private;
 	int is_orig_bio = 0;
 
-	if (err) {
+	if (bio->bi_error) {
 		atomic_inc(&bbio->error);
-		if (err == -EIO || err == -EREMOTEIO) {
+		if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) {
 			unsigned int stripe_index =
 				btrfs_io_bio(bio)->stripe_index;
 			struct btrfs_device *dev;
@@ -5795,17 +5795,16 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		 * beyond the tolerance of the btrfs bio
 		 */
 		if (atomic_read(&bbio->error) > bbio->max_errors) {
-			err = -EIO;
+			bio->bi_error = -EIO;
 		} else {
 			/*
 			 * this bio is actually up to date, we didn't
 			 * go over the max number of errors
 			 */
-			set_bit(BIO_UPTODATE, &bio->bi_flags);
-			err = 0;
+			bio->bi_error = 0;
 		}
 
-		btrfs_end_bbio(bbio, bio, err);
+		btrfs_end_bbio(bbio, bio);
 	} else if (!is_orig_bio) {
 		bio_put(bio);
 	}
@@ -5826,7 +5825,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 	struct btrfs_pending_bios *pending_bios;
 
 	if (device->missing || !device->bdev) {
-		bio_endio(bio, -EIO);
+		bio_io_error(bio);
 		return;
 	}
 
@@ -5973,8 +5972,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		bio->bi_iter.bi_sector = logical >> 9;
-
-		btrfs_end_bbio(bbio, bio, -EIO);
+		bio->bi_error = -EIO;
+		btrfs_end_bbio(bbio, bio);
 	}
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 1cf7a53a0277..7a49bb84ecb5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2957,14 +2957,14 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 }
 EXPORT_SYMBOL(generic_block_bmap);
 
-static void end_bio_bh_io_sync(struct bio *bio, int err)
+static void end_bio_bh_io_sync(struct bio *bio)
 {
 	struct buffer_head *bh = bio->bi_private;
 
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
 		set_bit(BH_Quiet, &bh->b_state);
 
-	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
+	bh->b_end_io(bh, !bio->bi_error);
 	bio_put(bio);
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 745d2342651a..e1639c8c14d5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -285,7 +285,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
 /*
  * Asynchronous IO callback. 
  */
-static void dio_bio_end_aio(struct bio *bio, int error)
+static void dio_bio_end_aio(struct bio *bio)
 {
 	struct dio *dio = bio->bi_private;
 	unsigned long remaining;
@@ -318,7 +318,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
  * During I/O bi_private points at the dio.  After I/O, bi_private is used to
  * implement a singly-linked list of completed BIOs, at dio->bio_list.
  */
-static void dio_bio_end_io(struct bio *bio, int error)
+static void dio_bio_end_io(struct bio *bio)
 {
 	struct dio *dio = bio->bi_private;
 	unsigned long flags;
@@ -345,9 +345,9 @@ void dio_end_io(struct bio *bio, int error)
 	struct dio *dio = bio->bi_private;
 
 	if (dio->is_async)
-		dio_bio_end_aio(bio, error);
+		dio_bio_end_aio(bio);
 	else
-		dio_bio_end_io(bio, error);
+		dio_bio_end_io(bio);
 }
 EXPORT_SYMBOL_GPL(dio_end_io);
 
@@ -457,11 +457,10 @@ static struct bio *dio_await_one(struct dio *dio)
  */
 static int dio_bio_complete(struct dio *dio, struct bio *bio)
 {
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec;
 	unsigned i;
 
-	if (!uptodate)
+	if (bio->bi_error)
 		dio->io_error = -EIO;
 
 	if (dio->is_async && dio->rw == READ) {
@@ -476,7 +475,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 		}
 		bio_put(bio);
 	}
-	return uptodate ? 0 : -EIO;
+	return bio->bi_error;
 }
 
 /*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 5602450f03f6..aa95566f14be 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -61,7 +61,6 @@ static void buffer_io_error(struct buffer_head *bh)
 static void ext4_finish_bio(struct bio *bio)
 {
 	int i;
-	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec;
 
 	bio_for_each_segment_all(bvec, bio, i) {
@@ -88,7 +87,7 @@ static void ext4_finish_bio(struct bio *bio)
 		}
 #endif
 
-		if (error) {
+		if (bio->bi_error) {
 			SetPageError(page);
 			set_bit(AS_EIO, &page->mapping->flags);
 		}
@@ -107,7 +106,7 @@ static void ext4_finish_bio(struct bio *bio)
 				continue;
 			}
 			clear_buffer_async_write(bh);
-			if (error)
+			if (bio->bi_error)
 				buffer_io_error(bh);
 		} while ((bh = bh->b_this_page) != head);
 		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
@@ -310,27 +309,25 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
 }
 
 /* BIO completion function for page writeback */
-static void ext4_end_bio(struct bio *bio, int error)
+static void ext4_end_bio(struct bio *bio)
 {
 	ext4_io_end_t *io_end = bio->bi_private;
 	sector_t bi_sector = bio->bi_iter.bi_sector;
 
 	BUG_ON(!io_end);
 	bio->bi_end_io = NULL;
-	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
-		error = 0;
 
-	if (error) {
+	if (bio->bi_error) {
 		struct inode *inode = io_end->inode;
 
 		ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
 			     "(offset %llu size %ld starting block %llu)",
-			     error, inode->i_ino,
+			     bio->bi_error, inode->i_ino,
 			     (unsigned long long) io_end->offset,
 			     (long) io_end->size,
 			     (unsigned long long)
 			     bi_sector >> (inode->i_blkbits - 9));
-		mapping_set_error(inode->i_mapping, error);
+		mapping_set_error(inode->i_mapping, bio->bi_error);
 	}
 
 	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index ec3ef93a52db..5de5b871c178 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -98,7 +98,7 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
  * status of that page is hard.  See end_buffer_async_read() for the details.
  * There is no point in duplicating all that complexity.
  */
-static void mpage_end_io(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio)
 {
 	struct bio_vec *bv;
 	int i;
@@ -106,7 +106,7 @@ static void mpage_end_io(struct bio *bio, int err)
 	if (ext4_bio_encrypted(bio)) {
 		struct ext4_crypto_ctx *ctx = bio->bi_private;
 
-		if (err) {
+		if (bio->bi_error) {
 			ext4_release_crypto_ctx(ctx);
 		} else {
 			INIT_WORK(&ctx->r.work, completion_pages);
@@ -118,7 +118,7 @@ static void mpage_end_io(struct bio *bio, int err)
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
 
-		if (!err) {
+		if (!bio->bi_error) {
 			SetPageUptodate(page);
 		} else {
 			ClearPageUptodate(page);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9bedfa8dd3a5..8f0baa7ffb50 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -29,13 +29,13 @@
 static struct kmem_cache *extent_tree_slab;
 static struct kmem_cache *extent_node_slab;
 
-static void f2fs_read_end_io(struct bio *bio, int err)
+static void f2fs_read_end_io(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
 
 	if (f2fs_bio_encrypted(bio)) {
-		if (err) {
+		if (bio->bi_error) {
 			f2fs_release_crypto_ctx(bio->bi_private);
 		} else {
 			f2fs_end_io_crypto_work(bio->bi_private, bio);
@@ -46,7 +46,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 
-		if (!err) {
+		if (!bio->bi_error) {
 			SetPageUptodate(page);
 		} else {
 			ClearPageUptodate(page);
@@ -57,7 +57,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-static void f2fs_write_end_io(struct bio *bio, int err)
+static void f2fs_write_end_io(struct bio *bio)
 {
 	struct f2fs_sb_info *sbi = bio->bi_private;
 	struct bio_vec *bvec;
@@ -68,7 +68,7 @@ static void f2fs_write_end_io(struct bio *bio, int err)
 
 		f2fs_restore_and_release_control_page(&page);
 
-		if (unlikely(err)) {
+		if (unlikely(bio->bi_error)) {
 			set_page_dirty(page);
 			set_bit(AS_EIO, &page->mapping->flags);
 			f2fs_stop_checkpoint(sbi);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 2c1ae861dc94..c0a1b967deba 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -202,22 +202,22 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
  *
  */
 
-static void gfs2_end_log_write(struct bio *bio, int error)
+static void gfs2_end_log_write(struct bio *bio)
 {
 	struct gfs2_sbd *sdp = bio->bi_private;
 	struct bio_vec *bvec;
 	struct page *page;
 	int i;
 
-	if (error) {
-		sdp->sd_log_error = error;
-		fs_err(sdp, "Error %d writing to log\n", error);
+	if (bio->bi_error) {
+		sdp->sd_log_error = bio->bi_error;
+		fs_err(sdp, "Error %d writing to log\n", bio->bi_error);
 	}
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		page = bvec->bv_page;
 		if (page_has_buffers(page))
-			gfs2_end_log_write_bh(sdp, bvec, error);
+			gfs2_end_log_write_bh(sdp, bvec, bio->bi_error);
 		else
 			mempool_free(page, gfs2_page_pool);
 	}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1e3a93f2f71d..02586e7eb964 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -171,14 +171,14 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
 	return -EINVAL;
 }
 
-static void end_bio_io_page(struct bio *bio, int error)
+static void end_bio_io_page(struct bio *bio)
 {
 	struct page *page = bio->bi_private;
 
-	if (!error)
+	if (!bio->bi_error)
 		SetPageUptodate(page);
 	else
-		pr_warn("error %d reading superblock\n", error);
+		pr_warn("error %d reading superblock\n", bio->bi_error);
 	unlock_page(page);
 }
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index bc462dcd7a40..d301acfdb80d 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2011,7 +2011,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 	/*check if journaling to disk has been disabled*/
 	if (log->no_integrity) {
 		bio->bi_iter.bi_size = 0;
-		lbmIODone(bio, 0);
+		lbmIODone(bio);
 	} else {
 		submit_bio(READ_SYNC, bio);
 	}
@@ -2158,7 +2158,7 @@ static void lbmStartIO(struct lbuf * bp)
 	/* check if journaling to disk has been disabled */
 	if (log->no_integrity) {
 		bio->bi_iter.bi_size = 0;
-		lbmIODone(bio, 0);
+		lbmIODone(bio);
 	} else {
 		submit_bio(WRITE_SYNC, bio);
 		INCREMENT(lmStat.submitted);
@@ -2196,7 +2196,7 @@ static int lbmIOWait(struct lbuf * bp, int flag)
  *
  * executed at INTIODONE level
  */
-static void lbmIODone(struct bio *bio, int error)
+static void lbmIODone(struct bio *bio)
 {
 	struct lbuf *bp = bio->bi_private;
 	struct lbuf *nextbp, *tail;
@@ -2212,7 +2212,7 @@ static void lbmIODone(struct bio *bio, int error)
 
 	bp->l_flag |= lbmDONE;
 
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+	if (bio->bi_error) {
 		bp->l_flag |= lbmERROR;
 
 		jfs_err("lbmIODone: I/O error in JFS log");
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 16a0922beb59..a3eb316b1ac3 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -276,11 +276,11 @@ static void last_read_complete(struct page *page)
 	unlock_page(page);
 }
 
-static void metapage_read_end_io(struct bio *bio, int err)
+static void metapage_read_end_io(struct bio *bio)
 {
 	struct page *page = bio->bi_private;
 
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+	if (bio->bi_error) {
 		printk(KERN_ERR "metapage_read_end_io: I/O error\n");
 		SetPageError(page);
 	}
@@ -331,13 +331,13 @@ static void last_write_complete(struct page *page)
 	end_page_writeback(page);
 }
 
-static void metapage_write_end_io(struct bio *bio, int err)
+static void metapage_write_end_io(struct bio *bio)
 {
 	struct page *page = bio->bi_private;
 
 	BUG_ON(!PagePrivate(page));
 
-	if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+	if (bio->bi_error) {
 		printk(KERN_ERR "metapage_write_end_io: I/O error\n");
 		SetPageError(page);
 	}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 76279e11982d..cea0cc9878b7 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -53,16 +53,14 @@ static int bdev_readpage(void *_sb, struct page *page)
 
 static DECLARE_WAIT_QUEUE_HEAD(wq);
 
-static void writeseg_end_io(struct bio *bio, int err)
+static void writeseg_end_io(struct bio *bio)
 {
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec;
 	int i;
 	struct super_block *sb = bio->bi_private;
 	struct logfs_super *super = logfs_super(sb);
 
-	BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
-	BUG_ON(err);
+	BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		end_page_writeback(bvec->bv_page);
@@ -153,14 +151,12 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
 }
 
 
-static void erase_end_io(struct bio *bio, int err) 
+static void erase_end_io(struct bio *bio)
 { 
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 
 	struct super_block *sb = bio->bi_private; 
 	struct logfs_super *super = logfs_super(sb); 
 
-	BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ 
-	BUG_ON(err); 
+	BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ 
 	BUG_ON(bio->bi_vcnt == 0); 
 	bio_put(bio); 
 	if (atomic_dec_and_test(&super->s_pending_writes))
diff --git a/fs/mpage.c b/fs/mpage.c
index ca0244b69de8..abac9361b3f1 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -42,14 +42,14 @@
  * status of that page is hard.  See end_buffer_async_read() for the details.
  * There is no point in duplicating all that complexity.
  */
-static void mpage_end_io(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio)
 {
 	struct bio_vec *bv;
 	int i;
 
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
-		page_endio(page, bio_data_dir(bio), err);
+		page_endio(page, bio_data_dir(bio), bio->bi_error);
 	}
 
 	bio_put(bio);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index d2554fe140a3..9cd4eb3a1e22 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -116,7 +116,7 @@ bl_submit_bio(int rw, struct bio *bio)
 
 static struct bio *
 bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
-		void (*end_io)(struct bio *, int err), struct parallel_io *par)
+		bio_end_io_t end_io, struct parallel_io *par)
 {
 	struct bio *bio;
 
@@ -139,8 +139,7 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
 static struct bio *
 do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
 		struct page *page, struct pnfs_block_dev_map *map,
-		struct pnfs_block_extent *be,
-		void (*end_io)(struct bio *, int err),
+		struct pnfs_block_extent *be, bio_end_io_t end_io,
 		struct parallel_io *par, unsigned int offset, int *len)
 {
 	struct pnfs_block_dev *dev =
@@ -183,11 +182,11 @@ retry:
 	return bio;
 }
 
-static void bl_end_io_read(struct bio *bio, int err)
+static void bl_end_io_read(struct bio *bio)
 {
 	struct parallel_io *par = bio->bi_private;
 
-	if (err) {
+	if (bio->bi_error) {
 		struct nfs_pgio_header *header = par->data;
 
 		if (!header->pnfs_error)
@@ -316,13 +315,12 @@ out:
 	return PNFS_ATTEMPTED;
 }
 
-static void bl_end_io_write(struct bio *bio, int err)
+static void bl_end_io_write(struct bio *bio)
 {
 	struct parallel_io *par = bio->bi_private;
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct nfs_pgio_header *header = par->data;
 
-	if (!uptodate) {
+	if (bio->bi_error) {
 		if (!header->pnfs_error)
 			header->pnfs_error = -EIO;
 		pnfs_set_lo_fail(header->lseg);
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 42468e5ab3e7..550b10efb14e 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -338,12 +338,11 @@ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
 /*
  * BIO operations
  */
-static void nilfs_end_bio_write(struct bio *bio, int err)
+static void nilfs_end_bio_write(struct bio *bio)
 {
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct nilfs_segment_buffer *segbuf = bio->bi_private;
 
-	if (!uptodate)
+	if (bio->bi_error)
 		atomic_inc(&segbuf->sb_err);
 
 	bio_put(bio);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 16eff45727ee..140de3c93d2e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -372,14 +372,13 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
 	wait_for_completion(&wc->wc_io_complete);
 }
 
-static void o2hb_bio_end_io(struct bio *bio,
-			   int error)
+static void o2hb_bio_end_io(struct bio *bio)
 {
 	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
 
-	if (error) {
-		mlog(ML_ERROR, "IO Error %d\n", error);
-		wc->wc_error = error;
+	if (bio->bi_error) {
+		mlog(ML_ERROR, "IO Error %d\n", bio->bi_error);
+		wc->wc_error = bio->bi_error;
 	}
 
 	o2hb_bio_wait_dec(wc, 1);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3859f5e27a4d..3714844a81d8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -351,12 +351,11 @@ xfs_imap_valid(
  */
 STATIC void
 xfs_end_bio(
-	struct bio		*bio,
-	int			error)
+	struct bio		*bio)
 {
 	xfs_ioend_t		*ioend = bio->bi_private;
 
-	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
+	ioend->io_error = bio->bi_error;
 
 	/* Toss bio and pass work off to an xfsdatad thread */
 	bio->bi_private = NULL;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index a4b7d92e946c..01bd6781974e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1096,8 +1096,7 @@ xfs_bwrite(
 
 STATIC void
 xfs_buf_bio_end_io(
-	struct bio		*bio,
-	int			error)
+	struct bio		*bio)
 {
 	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
 
@@ -1105,10 +1104,10 @@ xfs_buf_bio_end_io(
 	 * don't overwrite existing errors - otherwise we can lose errors on
 	 * buffers that require multiple bios to complete.
 	 */
-	if (error) {
+	if (bio->bi_error) {
 		spin_lock(&bp->b_lock);
 		if (!bp->b_io_error)
-			bp->b_io_error = error;
+			bp->b_io_error = bio->bi_error;
 		spin_unlock(&bp->b_lock);
 	}
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5e963a6d7c14..6b918177002d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -195,8 +195,6 @@ static inline bool bvec_gap_to_prev(struct bio_vec *bprv, unsigned int offset)
 	return offset || ((bprv->bv_offset + bprv->bv_len) & (PAGE_SIZE - 1));
 }
 
-#define bio_io_error(bio) bio_endio((bio), -EIO)
-
 /*
  * drivers should _never_ use the all version - the bio may have been split
  * before it got to the driver and the driver won't own all of it
@@ -426,7 +424,14 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
 
 }
 
-extern void bio_endio(struct bio *, int);
+extern void bio_endio(struct bio *);
+
+static inline void bio_io_error(struct bio *bio)
+{
+	bio->bi_error = -EIO;
+	bio_endio(bio);
+}
+
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
 
@@ -717,7 +722,7 @@ extern void bio_integrity_free(struct bio *);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
 extern bool bio_integrity_enabled(struct bio *bio);
 extern int bio_integrity_prep(struct bio *);
-extern void bio_integrity_endio(struct bio *, int);
+extern void bio_integrity_endio(struct bio *);
 extern void bio_integrity_advance(struct bio *, unsigned int);
 extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
 extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 7303b3405520..6164fb8a817b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -14,7 +14,7 @@ struct page;
 struct block_device;
 struct io_context;
 struct cgroup_subsys_state;
-typedef void (bio_end_io_t) (struct bio *, int);
+typedef void (bio_end_io_t) (struct bio *);
 typedef void (bio_destructor_t) (struct bio *);
 
 /*
@@ -53,6 +53,7 @@ struct bio {
 
 	struct bvec_iter	bi_iter;
 
+	int			bi_error;
 	/* Number of segments in this BIO after
 	 * physical address coalescing is performed.
 	 */
@@ -111,7 +112,6 @@ struct bio {
 /*
  * bio flags
  */
-#define BIO_UPTODATE	0	/* ok after I/O completion */
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38874729dc5f..31496d201fdc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -373,9 +373,9 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
-extern void end_swap_bio_write(struct bio *bio, int err);
+extern void end_swap_bio_write(struct bio *bio);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
-	void (*end_write_func)(struct bio *, int));
+	bio_end_io_t end_write_func);
 extern int swap_set_page_dirty(struct page *page);
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2f30ca91e4fa..b2066fb5b10f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb)
 	hb->error = 0;
 }
 
-static void hib_end_io(struct bio *bio, int error)
+static void hib_end_io(struct bio *bio)
 {
 	struct hib_bio_batch *hb = bio->bi_private;
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
-	if (!uptodate || error) {
+	if (bio->bi_error) {
 		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
 				imajor(bio->bi_bdev->bd_inode),
 				iminor(bio->bi_bdev->bd_inode),
 				(unsigned long long)bio->bi_iter.bi_sector);
-
-		if (!error)
-			error = -EIO;
 	}
 
 	if (bio_data_dir(bio) == WRITE)
 		put_page(page);
 
-	if (error && !hb->error)
-		hb->error = error;
+	if (bio->bi_error && !hb->error)
+		hb->error = bio->bi_error;
 	if (atomic_dec_and_test(&hb->count))
 		wake_up(&hb->wait);
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3e6b39b6cf9..90e72a0c3047 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 	if (likely(!bt))
 		return;
 
-	if (!error && !bio_flagged(bio, BIO_UPTODATE))
-		error = EIO;
-
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
 			bio->bi_rw, what, error, 0, NULL);
 }
@@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore,
 
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
 				bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
-				!bio_flagged(bio, BIO_UPTODATE),
-				sizeof(rpdu), &rpdu);
+				bio->bi_error, sizeof(rpdu), &rpdu);
 	}
 }
 
@@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio->bi_rw, BLK_TA_REMAP,
-			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+			bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+			sizeof(r), &r);
 }
 
 /**
diff --git a/mm/page_io.c b/mm/page_io.c
index 520baa4b04d7..338ce68942a0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -43,12 +43,11 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
 	return bio;
 }
 
-void end_swap_bio_write(struct bio *bio, int err)
+void end_swap_bio_write(struct bio *bio)
 {
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
-	if (!uptodate) {
+	if (bio->bi_error) {
 		SetPageError(page);
 		/*
 		 * We failed to write the page out to swap-space.
@@ -69,12 +68,11 @@ void end_swap_bio_write(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-static void end_swap_bio_read(struct bio *bio, int err)
+static void end_swap_bio_read(struct bio *bio)
 {
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
-	if (!uptodate) {
+	if (bio->bi_error) {
 		SetPageError(page);
 		ClearPageUptodate(page);
 		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
@@ -254,7 +252,7 @@ static sector_t swap_page_sector(struct page *page)
 }
 
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
-	void (*end_write_func)(struct bio *, int))
+		bio_end_io_t end_write_func)
 {
 	struct bio *bio;
 	int ret, rw = WRITE;
-- 
cgit v1.2.3-70-g09d2


From b7c44ed9d2fc6b461378c65eaf144ccc80a47772 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 24 Jul 2015 12:37:59 -0600
Subject: block: manipulate bio->bi_flags through helpers

Some places use helpers now, others don't. We only have the 'is set'
helper, add helpers for setting and clearing flags too.

It was a bit of a mess of atomic vs non-atomic access. With
BIO_UPTODATE gone, we don't have any risk of concurrent access to the
flags. So relax the restriction and don't make any of them atomic. The
flags that do have serialization issues (reffed and chained), we
already handle those separately.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c               | 14 +++++++-------
 block/blk-core.c          |  2 +-
 block/blk-map.c           |  2 +-
 block/blk-merge.c         |  2 +-
 block/bounce.c            |  2 +-
 drivers/md/raid1.c        |  4 ++--
 drivers/md/raid10.c       |  6 +++---
 drivers/md/raid5.c        |  2 +-
 fs/buffer.c               |  2 +-
 include/linux/bio.h       | 15 +++++++++++++++
 include/linux/blk_types.h |  2 --
 11 files changed, 33 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index a23f489f398f..911ae8f82752 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -311,7 +311,7 @@ static void bio_chain_endio(struct bio *bio)
  */
 static inline void bio_inc_remaining(struct bio *bio)
 {
-	bio->bi_flags |= (1 << BIO_CHAIN);
+	bio_set_flag(bio, BIO_CHAIN);
 	smp_mb__before_atomic();
 	atomic_inc(&bio->__bi_remaining);
 }
@@ -495,7 +495,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		if (unlikely(!bvl))
 			goto err_free;
 
-		bio->bi_flags |= 1 << BIO_OWNS_VEC;
+		bio_set_flag(bio, BIO_OWNS_VEC);
 	} else if (nr_iovecs) {
 		bvl = bio->bi_inline_vecs;
 	}
@@ -580,7 +580,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	 * so we don't set nor calculate new physical/hw segment counts here
 	 */
 	bio->bi_bdev = bio_src->bi_bdev;
-	bio->bi_flags |= 1 << BIO_CLONED;
+	bio_set_flag(bio, BIO_CLONED);
 	bio->bi_rw = bio_src->bi_rw;
 	bio->bi_iter = bio_src->bi_iter;
 	bio->bi_io_vec = bio_src->bi_io_vec;
@@ -829,7 +829,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 
 	/* If we may be able to merge these biovecs, force a recount */
 	if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
-		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+		bio_clear_flag(bio, BIO_SEG_VALID);
 
  done:
 	return len;
@@ -1390,7 +1390,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 	if (iter->type & WRITE)
 		bio->bi_rw |= REQ_WRITE;
 
-	bio->bi_flags |= (1 << BIO_USER_MAPPED);
+	bio_set_flag(bio, BIO_USER_MAPPED);
 
 	/*
 	 * subtle -- if __bio_map_user() ended up bouncing a bio,
@@ -1770,7 +1770,7 @@ static inline bool bio_remaining_done(struct bio *bio)
 	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
 
 	if (atomic_dec_and_test(&bio->__bi_remaining)) {
-		clear_bit(BIO_CHAIN, &bio->bi_flags);
+		bio_clear_flag(bio, BIO_CHAIN);
 		return true;
 	}
 
@@ -1866,7 +1866,7 @@ void bio_trim(struct bio *bio, int offset, int size)
 	if (offset == 0 && size == bio->bi_iter.bi_size)
 		return;
 
-	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+	bio_clear_flag(bio, BIO_SEG_VALID);
 
 	bio_advance(bio, offset << 9);
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 7ef15b947b91..d1796b54e97a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -146,7 +146,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 		bio->bi_error = error;
 
 	if (unlikely(rq->cmd_flags & REQ_QUIET))
-		set_bit(BIO_QUIET, &bio->bi_flags);
+		bio_set_flag(bio, BIO_QUIET);
 
 	bio_advance(bio, nbytes);
 
diff --git a/block/blk-map.c b/block/blk-map.c
index 5fe1c30bfba7..233841644c9d 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -94,7 +94,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		return PTR_ERR(bio);
 
 	if (map_data && map_data->null_mapped)
-		bio->bi_flags |= (1 << BIO_NULL_MAPPED);
+		bio_set_flag(bio, BIO_NULL_MAPPED);
 
 	if (bio->bi_iter.bi_size != iter->count) {
 		/*
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 30a0d9f89017..a455b9860143 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -116,7 +116,7 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
 		bio->bi_next = nxt;
 	}
 
-	bio->bi_flags |= (1 << BIO_SEG_VALID);
+	bio_set_flag(bio, BIO_SEG_VALID);
 }
 EXPORT_SYMBOL(blk_recount_segments);
 
diff --git a/block/bounce.c b/block/bounce.c
index f4db245b9f3a..2c310ea007ee 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -186,7 +186,7 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
 	if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
 		return 0;
 
-	return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
+	return bio_flagged(bio, BIO_SNAP_STABLE);
 }
 #else
 static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9aa7d1fb2bc1..60d0a8626e63 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1157,7 +1157,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	 * non-zero, then it is the number of not-completed requests.
 	 */
 	bio->bi_phys_segments = 0;
-	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+	bio_clear_flag(bio, BIO_SEG_VALID);
 
 	if (rw == READ) {
 		/*
@@ -2711,7 +2711,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 						/* remove last page from this bio */
 						bio->bi_vcnt--;
 						bio->bi_iter.bi_size -= len;
-						__clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+						bio_clear_flag(bio, BIO_SEG_VALID);
 					}
 					goto bio_full;
 				}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 929e9a26d81b..316ff6f611e9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1216,7 +1216,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	 * non-zero, then it is the number of not-completed requests.
 	 */
 	bio->bi_phys_segments = 0;
-	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+	bio_clear_flag(bio, BIO_SEG_VALID);
 
 	if (rw == READ) {
 		/*
@@ -3353,7 +3353,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				/* remove last page from this bio */
 				bio2->bi_vcnt--;
 				bio2->bi_iter.bi_size -= len;
-				__clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
+				bio_clear_flag(bio2, BIO_SEG_VALID);
 			}
 			goto bio_full;
 		}
@@ -4433,7 +4433,7 @@ read_more:
 				/* Remove last page from this bio */
 				bio2->bi_vcnt--;
 				bio2->bi_iter.bi_size -= len;
-				__clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
+				bio_clear_flag(bio2, BIO_SEG_VALID);
 			}
 			goto bio_full;
 		}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 84d6eec1033e..e3d48775c9df 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4850,7 +4850,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		rcu_read_unlock();
 		raid_bio->bi_next = (void*)rdev;
 		align_bi->bi_bdev =  rdev->bdev;
-		__clear_bit(BIO_SEG_VALID, &align_bi->bi_flags);
+		bio_clear_flag(align_bi, BIO_SEG_VALID);
 
 		if (!bio_fits_rdev(align_bi) ||
 		    is_badblock(rdev, align_bi->bi_iter.bi_sector,
diff --git a/fs/buffer.c b/fs/buffer.c
index 7a49bb84ecb5..7887bb466368 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2961,7 +2961,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
 {
 	struct buffer_head *bh = bio->bi_private;
 
-	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+	if (unlikely(bio_flagged(bio, BIO_QUIET)))
 		set_bit(BH_Quiet, &bh->b_state);
 
 	bh->b_end_io(bh, !bio->bi_error);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6b918177002d..986e6e19feb5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -304,6 +304,21 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
 	atomic_set(&bio->__bi_cnt, count);
 }
 
+static inline bool bio_flagged(struct bio *bio, unsigned int bit)
+{
+	return (bio->bi_flags & (1UL << bit)) != 0;
+}
+
+static inline void bio_set_flag(struct bio *bio, unsigned int bit)
+{
+	bio->bi_flags |= (1UL << bit);
+}
+
+static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
+{
+	bio->bi_flags &= ~(1UL << bit);
+}
+
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6164fb8a817b..a765a50e780f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -129,8 +129,6 @@ struct bio {
 #define BIO_RESET_BITS	13
 #define BIO_OWNS_VEC	13	/* bio_free() should free bvec */
 
-#define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
-
 /*
  * top 4 bits of bio flags indicate the pool this bio came from
  */
-- 
cgit v1.2.3-70-g09d2


From 2c68f6dc6e621153a708bef6c569805762da2020 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 28 Jul 2015 13:14:32 -0600
Subject: block: shrink struct bio down to 2 cache lines again

Commit bcf2843b3f8f added ->bi_error to cleanup the error passing
for struct bio, but that ended up adding 4 bytes and a 4 byte hole
to the size of struct bio. For a clean config, that bumped it from
128 bytes, to 136 bytes, on x86-64.

The ->bi_flags member is currently an unsigned long, but it fits
easily within an int. Change it to an unsigned int, adjust the
the pool offset code, and move ->bi_error into the new hole. Then
we end up with a 128 byte bio again.

Change the bio flag set/clear to use cmpxchg to ensure we don't
lose any flags when manipulating them.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bio.h       | 6 +++---
 include/linux/blk_types.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 986e6e19feb5..b7892a1906bd 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -306,17 +306,17 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
 
 static inline bool bio_flagged(struct bio *bio, unsigned int bit)
 {
-	return (bio->bi_flags & (1UL << bit)) != 0;
+	return (bio->bi_flags & (1U << bit)) != 0;
 }
 
 static inline void bio_set_flag(struct bio *bio, unsigned int bit)
 {
-	bio->bi_flags |= (1UL << bit);
+	bio->bi_flags |= (1U << bit);
 }
 
 static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
 {
-	bio->bi_flags &= ~(1UL << bit);
+	bio->bi_flags &= ~(1U << bit);
 }
 
 enum bip_flags {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a765a50e780f..4b7b4ebaa633 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -46,14 +46,14 @@ struct bvec_iter {
 struct bio {
 	struct bio		*bi_next;	/* request queue link */
 	struct block_device	*bi_bdev;
-	unsigned long		bi_flags;	/* status, command, etc */
+	unsigned int		bi_flags;	/* status, command, etc */
+	int			bi_error;
 	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
 						 * top bits priority
 						 */
 
 	struct bvec_iter	bi_iter;
 
-	int			bi_error;
 	/* Number of segments in this BIO after
 	 * physical address coalescing is performed.
 	 */
@@ -134,7 +134,7 @@ struct bio {
  */
 #define BIO_POOL_BITS		(4)
 #define BIO_POOL_NONE		((1UL << BIO_POOL_BITS) - 1)
-#define BIO_POOL_OFFSET		(BITS_PER_LONG - BIO_POOL_BITS)
+#define BIO_POOL_OFFSET		(32 - BIO_POOL_BITS)
 #define BIO_POOL_MASK		(1UL << BIO_POOL_OFFSET)
 #define BIO_POOL_IDX(bio)	((bio)->bi_flags >> BIO_POOL_OFFSET)
 
-- 
cgit v1.2.3-70-g09d2


From 83b7b67c780500a1d5d87c44ee8963166154adfa Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Wed, 1 Jul 2015 13:11:34 +0900
Subject: usb: phy: msm-usb: Replace deprecated API of extcon

This patch removes the deprecated notifier API of extcon framwork
and then use the new extcon API with the unique id to indicate
the each external connector (USB, USB-HOST).

Alter deprecated API as following:
- extcon_register_interest() -> extcon_register_notifier()
- extcon_get_cable_state(*edev, char *) -> extcon_get_cable_state_(*edev, id)

Cc: Felipe Balbi <balbi@ti.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/phy/phy-msm-usb.c | 20 ++++++++++----------
 include/linux/usb/msm_hsusb.h |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/phy/phy-msm-usb.c b/drivers/usb/phy/phy-msm-usb.c
index 00c49bb1bd29..61d86d8bf5b7 100644
--- a/drivers/usb/phy/phy-msm-usb.c
+++ b/drivers/usb/phy/phy-msm-usb.c
@@ -1561,15 +1561,16 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg)
 	}
 
 	if (!IS_ERR(ext_vbus)) {
+		motg->vbus.extcon = ext_vbus;
 		motg->vbus.nb.notifier_call = msm_otg_vbus_notifier;
-		ret = extcon_register_interest(&motg->vbus.conn, ext_vbus->name,
-					       "USB", &motg->vbus.nb);
+		ret = extcon_register_notifier(ext_vbus, EXTCON_USB,
+						&motg->vbus.nb);
 		if (ret < 0) {
 			dev_err(&pdev->dev, "register VBUS notifier failed\n");
 			return ret;
 		}
 
-		ret = extcon_get_cable_state(ext_vbus, "USB");
+		ret = extcon_get_cable_state_(ext_vbus, EXTCON_USB);
 		if (ret)
 			set_bit(B_SESS_VLD, &motg->inputs);
 		else
@@ -1577,15 +1578,16 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg)
 	}
 
 	if (!IS_ERR(ext_id)) {
+		motg->id.extcon = ext_id;
 		motg->id.nb.notifier_call = msm_otg_id_notifier;
-		ret = extcon_register_interest(&motg->id.conn, ext_id->name,
-					       "USB-HOST", &motg->id.nb);
+		ret = extcon_register_notifier(ext_id, EXTCON_USB_HOST,
+						&motg->id.nb);
 		if (ret < 0) {
 			dev_err(&pdev->dev, "register ID notifier failed\n");
 			return ret;
 		}
 
-		ret = extcon_get_cable_state(ext_id, "USB-HOST");
+		ret = extcon_get_cable_state_(ext_id, EXTCON_USB_HOST);
 		if (ret)
 			clear_bit(ID, &motg->inputs);
 		else
@@ -1805,10 +1807,8 @@ static int msm_otg_remove(struct platform_device *pdev)
 	if (phy->otg->host || phy->otg->gadget)
 		return -EBUSY;
 
-	if (motg->id.conn.edev)
-		extcon_unregister_interest(&motg->id.conn);
-	if (motg->vbus.conn.edev)
-		extcon_unregister_interest(&motg->vbus.conn);
+	extcon_unregister_notifier(motg->id.extcon, EXTCON_USB_HOST, &motg->id.nb);
+	extcon_unregister_notifier(motg->vbus.extcon, EXTCON_USB, &motg->vbus.nb);
 
 	msm_otg_debugfs_cleanup();
 	cancel_delayed_work_sync(&motg->chg_work);
diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h
index e55a1504266e..5df2c8f59aa0 100644
--- a/include/linux/usb/msm_hsusb.h
+++ b/include/linux/usb/msm_hsusb.h
@@ -128,7 +128,7 @@ struct msm_otg_platform_data {
  */
 struct msm_usb_cable {
 	struct notifier_block		nb;
-	struct extcon_specific_cable_nb conn;
+	struct extcon_dev		*extcon;
 };
 
 /**
-- 
cgit v1.2.3-70-g09d2


From ccdf138fe3e243c70301fcb6a101e366b7daef07 Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Mon, 4 May 2015 14:55:11 +0200
Subject: usb: gadget: add usb_gadget_activate/deactivate functions

These functions allows to deactivate gadget to make it not visible to
host and make it active again when gadget driver is finally ready.

They are needed to fix usb_function_activate() and usb_function_deactivate()
functions which currently are not working as usb_gadget_connect() is
called immediately after function bind regardless to previous calls of
usb_gadget_disconnect() function.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/linux/usb/gadget.h | 100 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 94 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 4f3dfb7d0654..15604bb3e524 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -526,6 +526,9 @@ struct usb_gadget_ops {
  * @quirk_ep_out_aligned_size: epout requires buffer size to be aligned to
  *	MaxPacketSize.
  * @is_selfpowered: if the gadget is self-powered.
+ * @deactivated: True if gadget is deactivated - in deactivated state it cannot
+ *	be connected.
+ * @connected: True if gadget is connected.
  *
  * Gadgets have a mostly-portable "gadget driver" implementing device
  * functions, handling all usb configurations and interfaces.  Gadget
@@ -568,6 +571,8 @@ struct usb_gadget {
 	unsigned			a_alt_hnp_support:1;
 	unsigned			quirk_ep_out_aligned_size:1;
 	unsigned			is_selfpowered:1;
+	unsigned			deactivated:1;
+	unsigned			connected:1;
 };
 #define work_to_gadget(w)	(container_of((w), struct usb_gadget, work))
 
@@ -771,9 +776,24 @@ static inline int usb_gadget_vbus_disconnect(struct usb_gadget *gadget)
  */
 static inline int usb_gadget_connect(struct usb_gadget *gadget)
 {
+	int ret;
+
 	if (!gadget->ops->pullup)
 		return -EOPNOTSUPP;
-	return gadget->ops->pullup(gadget, 1);
+
+	if (gadget->deactivated) {
+		/*
+		 * If gadget is deactivated we only save new state.
+		 * Gadget will be connected automatically after activation.
+		 */
+		gadget->connected = true;
+		return 0;
+	}
+
+	ret = gadget->ops->pullup(gadget, 1);
+	if (!ret)
+		gadget->connected = 1;
+	return ret;
 }
 
 /**
@@ -784,20 +804,88 @@ static inline int usb_gadget_connect(struct usb_gadget *gadget)
  * as a disconnect (when a VBUS session is active).  Not all systems
  * support software pullup controls.
  *
+ * Returns zero on success, else negative errno.
+ */
+static inline int usb_gadget_disconnect(struct usb_gadget *gadget)
+{
+	int ret;
+
+	if (!gadget->ops->pullup)
+		return -EOPNOTSUPP;
+
+	if (gadget->deactivated) {
+		/*
+		 * If gadget is deactivated we only save new state.
+		 * Gadget will stay disconnected after activation.
+		 */
+		gadget->connected = false;
+		return 0;
+	}
+
+	ret = gadget->ops->pullup(gadget, 0);
+	if (!ret)
+		gadget->connected = 0;
+	return ret;
+}
+
+/**
+ * usb_gadget_deactivate - deactivate function which is not ready to work
+ * @gadget: the peripheral being deactivated
+ *
  * This routine may be used during the gadget driver bind() call to prevent
  * the peripheral from ever being visible to the USB host, unless later
- * usb_gadget_connect() is called.  For example, user mode components may
+ * usb_gadget_activate() is called.  For example, user mode components may
  * need to be activated before the system can talk to hosts.
  *
  * Returns zero on success, else negative errno.
  */
-static inline int usb_gadget_disconnect(struct usb_gadget *gadget)
+static inline int usb_gadget_deactivate(struct usb_gadget *gadget)
 {
-	if (!gadget->ops->pullup)
-		return -EOPNOTSUPP;
-	return gadget->ops->pullup(gadget, 0);
+	int ret;
+
+	if (gadget->deactivated)
+		return 0;
+
+	if (gadget->connected) {
+		ret = usb_gadget_disconnect(gadget);
+		if (ret)
+			return ret;
+		/*
+		 * If gadget was being connected before deactivation, we want
+		 * to reconnect it in usb_gadget_activate().
+		 */
+		gadget->connected = true;
+	}
+	gadget->deactivated = true;
+
+	return 0;
 }
 
+/**
+ * usb_gadget_activate - activate function which is not ready to work
+ * @gadget: the peripheral being activated
+ *
+ * This routine activates gadget which was previously deactivated with
+ * usb_gadget_deactivate() call. It calls usb_gadget_connect() if needed.
+ *
+ * Returns zero on success, else negative errno.
+ */
+static inline int usb_gadget_activate(struct usb_gadget *gadget)
+{
+	if (!gadget->deactivated)
+		return 0;
+
+	gadget->deactivated = false;
+
+	/*
+	 * If gadget has been connected before deactivation, or became connected
+	 * while it was being deactivated, we call usb_gadget_connect().
+	 */
+	if (gadget->connected)
+		return usb_gadget_connect(gadget);
+
+	return 0;
+}
 
 /*-------------------------------------------------------------------------*/
 
-- 
cgit v1.2.3-70-g09d2


From d5bb9b81dbfa35d117ecb58022ee6e7e41e4772d Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Mon, 4 May 2015 14:55:13 +0200
Subject: usb: composite: add bind_deactivated flag to usb_function

This patch introduces 'bind_deactivated' flag in struct usb_function.
Functions which don't want to be activated automatically after bind should
set this flag, and when they start to be ready to work they should call
usb_function_activate().

When USB function sets 'bind_deactivated' flag, initial deactivation
counter is incremented automatically, so there is no need to call
usb_function_deactivate() in function bind.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/composite.c | 6 ++++++
 include/linux/usb/composite.h  | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 86d4e8fdf8d3..36c6f47642f8 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -209,6 +209,12 @@ int usb_add_function(struct usb_configuration *config,
 	function->config = config;
 	list_add_tail(&function->list, &config->functions);
 
+	if (function->bind_deactivated) {
+		value = usb_function_deactivate(function);
+		if (value)
+			goto done;
+	}
+
 	/* REVISIT *require* function->bind? */
 	if (function->bind) {
 		value = function->bind(config, function);
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index 2511469a9904..1074b8921a5d 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -228,6 +228,8 @@ struct usb_function {
 	struct list_head		list;
 	DECLARE_BITMAP(endpoints, 32);
 	const struct usb_function_instance *fi;
+
+	unsigned int		bind_deactivated:1;
 };
 
 int usb_add_function(struct usb_configuration *, struct usb_function *);
-- 
cgit v1.2.3-70-g09d2


From 6a88bbe8e30d4beb2320b5a7452242a1fe7889c5 Mon Sep 17 00:00:00 2001
From: Li Jun <jun.li@freescale.com>
Date: Thu, 9 Jul 2015 15:18:40 +0800
Subject: usb: otg: add usb_otg_caps structure for otg capabilities

This patch adds a structure usb_otg_caps to cover all otg related
capabilities of the device, including otg revision, and if hnp/srp/adp
is supported.

Signed-off-by: Li Jun <jun.li@freescale.com>
Reviewed-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/linux/usb/otg.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h
index 52661c5da690..bd1dcf816100 100644
--- a/include/linux/usb/otg.h
+++ b/include/linux/usb/otg.h
@@ -41,6 +41,21 @@ struct usb_otg {
 
 };
 
+/**
+ * struct usb_otg_caps - describes the otg capabilities of the device
+ * @otg_rev: The OTG revision number the device is compliant with, it's
+ *		in binary-coded decimal (i.e. 2.0 is 0200H).
+ * @hnp_support: Indicates if the device supports HNP.
+ * @srp_support: Indicates if the device supports SRP.
+ * @adp_support: Indicates if the device supports ADP.
+ */
+struct usb_otg_caps {
+	u16 otg_rev;
+	bool hnp_support;
+	bool srp_support;
+	bool adp_support;
+};
+
 extern const char *usb_otg_state_string(enum usb_otg_state state);
 
 /* Context: can sleep */
-- 
cgit v1.2.3-70-g09d2


From 84704bb3d183e55d042bf57043552f2649443a64 Mon Sep 17 00:00:00 2001
From: Macpaul Lin <macpaul@gmail.com>
Date: Thu, 9 Jul 2015 15:18:41 +0800
Subject: usb: add usb_otg_caps to usb_gadget structure.

Add usb_otg_caps pointer to usb_gadget structure to indicate its
otg capabilities.

Signed-off-by: Macpaul Lin <macpaul@gmail.com>
Signed-off-by: Li Jun <jun.li@freescale.com>
Reviewed-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/linux/usb/gadget.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 15604bb3e524..fffceafb6b8c 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -511,6 +511,7 @@ struct usb_gadget_ops {
  * @dev: Driver model state for this abstract device.
  * @out_epnum: last used out ep number
  * @in_epnum: last used in ep number
+ * @otg_caps: OTG capabilities of this gadget.
  * @sg_supported: true if we can handle scatter-gather
  * @is_otg: True if the USB device port uses a Mini-AB jack, so that the
  *	gadget driver must provide a USB OTG descriptor.
@@ -562,6 +563,7 @@ struct usb_gadget {
 	struct device			dev;
 	unsigned			out_epnum;
 	unsigned			in_epnum;
+	struct usb_otg_caps		*otg_caps;
 
 	unsigned			sg_supported:1;
 	unsigned			is_otg:1;
-- 
cgit v1.2.3-70-g09d2


From 929412d94f2b75fe2a662afa2977bfb6a233c1c3 Mon Sep 17 00:00:00 2001
From: Li Jun <jun.li@freescale.com>
Date: Thu, 9 Jul 2015 15:18:44 +0800
Subject: usb: common: add API to update usb otg capabilities by device tree

Check property of usb hardware to update otg version and disable SRP, HNP
and ADP if its disable flag is present.

Reviewed-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Li Jun <jun.li@freescale.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/common/common.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/of.h      |  7 ++++++
 2 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c
index b530fd403ffb..9e39286a4e5a 100644
--- a/drivers/usb/common/common.c
+++ b/drivers/usb/common/common.c
@@ -154,6 +154,62 @@ bool of_usb_host_tpl_support(struct device_node *np)
 	return false;
 }
 EXPORT_SYMBOL_GPL(of_usb_host_tpl_support);
+
+/**
+ * of_usb_update_otg_caps - to update usb otg capabilities according to
+ * the passed properties in DT.
+ * @np: Pointer to the given device_node
+ * @otg_caps: Pointer to the target usb_otg_caps to be set
+ *
+ * The function updates the otg capabilities
+ */
+int of_usb_update_otg_caps(struct device_node *np,
+			struct usb_otg_caps *otg_caps)
+{
+	u32 otg_rev;
+
+	if (!otg_caps)
+		return -EINVAL;
+
+	if (!of_property_read_u32(np, "otg-rev", &otg_rev)) {
+		switch (otg_rev) {
+		case 0x0100:
+		case 0x0120:
+		case 0x0130:
+		case 0x0200:
+			/* Choose the lesser one if it's already been set */
+			if (otg_caps->otg_rev)
+				otg_caps->otg_rev = min_t(u16, otg_rev,
+							otg_caps->otg_rev);
+			else
+				otg_caps->otg_rev = otg_rev;
+			break;
+		default:
+			pr_err("%s: unsupported otg-rev: 0x%x\n",
+						np->full_name, otg_rev);
+			return -EINVAL;
+		}
+	} else {
+		/*
+		 * otg-rev is mandatory for otg properties, if not passed
+		 * we set it to be 0 and assume it's a legacy otg device.
+		 * Non-dt platform can set it afterwards.
+		 */
+		otg_caps->otg_rev = 0;
+	}
+
+	if (of_find_property(np, "hnp-disable", NULL))
+		otg_caps->hnp_support = false;
+	if (of_find_property(np, "srp-disable", NULL))
+		otg_caps->srp_support = false;
+	if (of_find_property(np, "adp-disable", NULL) ||
+				(otg_caps->otg_rev < 0x0200))
+		otg_caps->adp_support = false;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(of_usb_update_otg_caps);
+
 #endif
 
 MODULE_LICENSE("GPL");
diff --git a/include/linux/usb/of.h b/include/linux/usb/of.h
index cfe0528cdbb1..8c5a818ec244 100644
--- a/include/linux/usb/of.h
+++ b/include/linux/usb/of.h
@@ -15,6 +15,8 @@
 enum usb_dr_mode of_usb_get_dr_mode(struct device_node *np);
 enum usb_device_speed of_usb_get_maximum_speed(struct device_node *np);
 bool of_usb_host_tpl_support(struct device_node *np);
+int of_usb_update_otg_caps(struct device_node *np,
+			struct usb_otg_caps *otg_caps);
 #else
 static inline enum usb_dr_mode of_usb_get_dr_mode(struct device_node *np)
 {
@@ -30,6 +32,11 @@ static inline bool of_usb_host_tpl_support(struct device_node *np)
 {
 	return false;
 }
+static inline int of_usb_update_otg_caps(struct device_node *np,
+				struct usb_otg_caps *otg_caps)
+{
+	return 0;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_USB_SUPPORT)
-- 
cgit v1.2.3-70-g09d2


From 79742351c89b76ebcf82b73103aed50f98ac2ee4 Mon Sep 17 00:00:00 2001
From: Li Jun <jun.li@freescale.com>
Date: Thu, 9 Jul 2015 15:18:45 +0800
Subject: usb: chipidea: set usb otg capabilities

Init and update otg capabilities by DT, set gadget's otg capabilities
accordingly.

Acked-by: Peter Chen <peter.chen@freescale.com>
Reviewed-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Li Jun <jun.li@freescale.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/chipidea/core.c  | 15 +++++++++++++++
 drivers/usb/chipidea/udc.c   |  7 ++++++-
 include/linux/usb/chipidea.h |  1 +
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index 74fea4fa41b1..1e6d5f0c18f2 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -560,6 +560,8 @@ static irqreturn_t ci_irq(int irq, void *data)
 static int ci_get_platdata(struct device *dev,
 		struct ci_hdrc_platform_data *platdata)
 {
+	int ret;
+
 	if (!platdata->phy_mode)
 		platdata->phy_mode = of_usb_get_phy_mode(dev->of_node);
 
@@ -588,6 +590,19 @@ static int ci_get_platdata(struct device *dev,
 				of_usb_host_tpl_support(dev->of_node);
 	}
 
+	if (platdata->dr_mode == USB_DR_MODE_OTG) {
+		/* We can support HNP and SRP of OTG 2.0 */
+		platdata->ci_otg_caps.otg_rev = 0x0200;
+		platdata->ci_otg_caps.hnp_support = true;
+		platdata->ci_otg_caps.srp_support = true;
+
+		/* Update otg capabilities by DT properties */
+		ret = of_usb_update_otg_caps(dev->of_node,
+					&platdata->ci_otg_caps);
+		if (ret)
+			return ret;
+	}
+
 	if (of_usb_get_maximum_speed(dev->of_node) == USB_SPEED_FULL)
 		platdata->flags |= CI_HDRC_FORCE_FULLSPEED;
 
diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c
index 764f668d45a9..b7cca3e597bf 100644
--- a/drivers/usb/chipidea/udc.c
+++ b/drivers/usb/chipidea/udc.c
@@ -1827,6 +1827,7 @@ static irqreturn_t udc_irq(struct ci_hdrc *ci)
 static int udc_start(struct ci_hdrc *ci)
 {
 	struct device *dev = ci->dev;
+	struct usb_otg_caps *otg_caps = &ci->platdata->ci_otg_caps;
 	int retval = 0;
 
 	spin_lock_init(&ci->lock);
@@ -1834,8 +1835,12 @@ static int udc_start(struct ci_hdrc *ci)
 	ci->gadget.ops          = &usb_gadget_ops;
 	ci->gadget.speed        = USB_SPEED_UNKNOWN;
 	ci->gadget.max_speed    = USB_SPEED_HIGH;
-	ci->gadget.is_otg       = ci->is_otg ? 1 : 0;
 	ci->gadget.name         = ci->platdata->name;
+	ci->gadget.otg_caps	= otg_caps;
+
+	if (otg_caps->hnp_support || otg_caps->srp_support ||
+					otg_caps->adp_support)
+		ci->gadget.is_otg = 1;
 
 	INIT_LIST_HEAD(&ci->gadget.ep_list);
 
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index ab94f78c4dd1..e10cefc721ad 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -34,6 +34,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
 	void	(*notify_event) (struct ci_hdrc *ci, unsigned event);
 	struct regulator	*reg_vbus;
+	struct usb_otg_caps	ci_otg_caps;
 	bool			tpl_support;
 };
 
-- 
cgit v1.2.3-70-g09d2


From d1606dfb98e59221332704c05f5908d9116456ab Mon Sep 17 00:00:00 2001
From: Li Jun <jun.li@freescale.com>
Date: Thu, 9 Jul 2015 15:18:47 +0800
Subject: usb: gadget: add usb otg descriptor allocate and init interface

Allocate usb otg descriptor and initialize it according to gadget's otg
capabilities, if usb_otg_caps is not set, keep settings as current gadget
drivers. With this 2 new interfaces, gadget can use usb_otg_descriptor
for OTG 1.x, and usb_otg20_descriptor for OTG 2.0 or above, and otg
features can be decided by the combination of usb hardware property
and driver config.

Signed-off-by: Li Jun <jun.li@freescale.com>
Reviewed-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/config.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/gadget.h  |  4 ++++
 2 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/config.c b/drivers/usb/gadget/config.c
index 34e12fc52c23..0fafa7a1b6f6 100644
--- a/drivers/usb/gadget/config.c
+++ b/drivers/usb/gadget/config.c
@@ -20,6 +20,7 @@
 #include <linux/usb/ch9.h>
 #include <linux/usb/gadget.h>
 #include <linux/usb/composite.h>
+#include <linux/usb/otg.h>
 
 /**
  * usb_descriptor_fillbuf - fill buffer with descriptors
@@ -195,3 +196,58 @@ void usb_free_all_descriptors(struct usb_function *f)
 	usb_free_descriptors(f->ss_descriptors);
 }
 EXPORT_SYMBOL_GPL(usb_free_all_descriptors);
+
+struct usb_descriptor_header *usb_otg_descriptor_alloc(
+				struct usb_gadget *gadget)
+{
+	struct usb_descriptor_header *otg_desc;
+	unsigned length = 0;
+
+	if (gadget->otg_caps && (gadget->otg_caps->otg_rev >= 0x0200))
+		length = sizeof(struct usb_otg20_descriptor);
+	else
+		length = sizeof(struct usb_otg_descriptor);
+
+	otg_desc = kzalloc(length, GFP_KERNEL);
+	return otg_desc;
+}
+EXPORT_SYMBOL_GPL(usb_otg_descriptor_alloc);
+
+int usb_otg_descriptor_init(struct usb_gadget *gadget,
+		struct usb_descriptor_header *otg_desc)
+{
+	struct usb_otg_descriptor *otg1x_desc;
+	struct usb_otg20_descriptor *otg20_desc;
+	struct usb_otg_caps *otg_caps = gadget->otg_caps;
+	u8 otg_attributes = 0;
+
+	if (!otg_desc)
+		return -EINVAL;
+
+	if (otg_caps && otg_caps->otg_rev) {
+		if (otg_caps->hnp_support)
+			otg_attributes |= USB_OTG_HNP;
+		if (otg_caps->srp_support)
+			otg_attributes |= USB_OTG_SRP;
+		if (otg_caps->adp_support && (otg_caps->otg_rev >= 0x0200))
+			otg_attributes |= USB_OTG_ADP;
+	} else {
+		otg_attributes = USB_OTG_SRP | USB_OTG_HNP;
+	}
+
+	if (otg_caps && (otg_caps->otg_rev >= 0x0200)) {
+		otg20_desc = (struct usb_otg20_descriptor *)otg_desc;
+		otg20_desc->bLength = sizeof(struct usb_otg20_descriptor);
+		otg20_desc->bDescriptorType = USB_DT_OTG;
+		otg20_desc->bmAttributes = otg_attributes;
+		otg20_desc->bcdOTG = cpu_to_le16(otg_caps->otg_rev);
+	} else {
+		otg1x_desc = (struct usb_otg_descriptor *)otg_desc;
+		otg1x_desc->bLength = sizeof(struct usb_otg_descriptor);
+		otg1x_desc->bDescriptorType = USB_DT_OTG;
+		otg1x_desc->bmAttributes = otg_attributes;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(usb_otg_descriptor_init);
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index fffceafb6b8c..cea0511a1bc9 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -1092,6 +1092,10 @@ int usb_assign_descriptors(struct usb_function *f,
 		struct usb_descriptor_header **ss);
 void usb_free_all_descriptors(struct usb_function *f);
 
+struct usb_descriptor_header *usb_otg_descriptor_alloc(
+				struct usb_gadget *gadget);
+int usb_otg_descriptor_init(struct usb_gadget *gadget,
+		struct usb_descriptor_header *otg_desc);
 /*-------------------------------------------------------------------------*/
 
 /* utility to simplify map/unmap of usb_requests to/from DMA */
-- 
cgit v1.2.3-70-g09d2


From f2ab3298fb4932358d27fc4c7ea1a1891ad7e042 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Date: Mon, 27 Jul 2015 20:20:30 -0700
Subject: soc: qcom: Add Shared Memory Driver

This adds the Qualcomm Shared Memory Driver (SMD) providing
communication channels to remote processors, ontop of SMEM.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Andy Gross <agross@codeaurora.org>
---
 drivers/soc/qcom/Kconfig     |    8 +
 drivers/soc/qcom/Makefile    |    1 +
 drivers/soc/qcom/smd.c       | 1319 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/smd.h |   46 ++
 4 files changed, 1374 insertions(+)
 create mode 100644 drivers/soc/qcom/smd.c
 create mode 100644 include/linux/soc/qcom/smd.h

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index 0d4faaf32662..188295e2c9ba 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -19,6 +19,14 @@ config QCOM_PM
 	  modes. It interface with various system drivers to put the cores in
 	  low power modes.
 
+config QCOM_SMD
+	tristate "Qualcomm Shared Memory Driver (SMD)"
+	depends on QCOM_SMEM
+	help
+	  Say y here to enable support for the Qualcomm Shared Memory Driver
+	  providing communication channels to remote processors in Qualcomm
+	  platforms.
+
 config QCOM_SMEM
 	tristate "Qualcomm Shared Memory Manager (SMEM)"
 	depends on ARCH_QCOM
diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index 3a033c43c0ef..f961a8796ed2 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_QCOM_GSBI)	+=	qcom_gsbi.o
 obj-$(CONFIG_QCOM_PM)	+=	spm.o
+obj-$(CONFIG_QCOM_SMD) +=	smd.o
 obj-$(CONFIG_QCOM_SMEM) +=	smem.o
diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c
new file mode 100644
index 000000000000..327adcf117c1
--- /dev/null
+++ b/drivers/soc/qcom/smd.c
@@ -0,0 +1,1319 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications AB.
+ * Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/soc/qcom/smd.h>
+#include <linux/soc/qcom/smem.h>
+#include <linux/wait.h>
+
+/*
+ * The Qualcomm Shared Memory communication solution provides point-to-point
+ * channels for clients to send and receive streaming or packet based data.
+ *
+ * Each channel consists of a control item (channel info) and a ring buffer
+ * pair. The channel info carry information related to channel state, flow
+ * control and the offsets within the ring buffer.
+ *
+ * All allocated channels are listed in an allocation table, identifying the
+ * pair of items by name, type and remote processor.
+ *
+ * Upon creating a new channel the remote processor allocates channel info and
+ * ring buffer items from the smem heap and populate the allocation table. An
+ * interrupt is sent to the other end of the channel and a scan for new
+ * channels should be done. A channel never goes away, it will only change
+ * state.
+ *
+ * The remote processor signals it intent for bring up the communication
+ * channel by setting the state of its end of the channel to "opening" and
+ * sends out an interrupt. We detect this change and register a smd device to
+ * consume the channel. Upon finding a consumer we finish the handshake and the
+ * channel is up.
+ *
+ * Upon closing a channel, the remote processor will update the state of its
+ * end of the channel and signal us, we will then unregister any attached
+ * device and close our end of the channel.
+ *
+ * Devices attached to a channel can use the qcom_smd_send function to push
+ * data to the channel, this is done by copying the data into the tx ring
+ * buffer, updating the pointers in the channel info and signaling the remote
+ * processor.
+ *
+ * The remote processor does the equivalent when it transfer data and upon
+ * receiving the interrupt we check the channel info for new data and delivers
+ * this to the attached device. If the device is not ready to receive the data
+ * we leave it in the ring buffer for now.
+ */
+
+struct smd_channel_info;
+struct smd_channel_info_word;
+
+#define SMD_ALLOC_TBL_COUNT	2
+#define SMD_ALLOC_TBL_SIZE	64
+
+/*
+ * This lists the various smem heap items relevant for the allocation table and
+ * smd channel entries.
+ */
+static const struct {
+	unsigned alloc_tbl_id;
+	unsigned info_base_id;
+	unsigned fifo_base_id;
+} smem_items[SMD_ALLOC_TBL_COUNT] = {
+	{
+		.alloc_tbl_id = 13,
+		.info_base_id = 14,
+		.fifo_base_id = 338
+	},
+	{
+		.alloc_tbl_id = 14,
+		.info_base_id = 266,
+		.fifo_base_id = 202,
+	},
+};
+
+/**
+ * struct qcom_smd_edge - representing a remote processor
+ * @smd:		handle to qcom_smd
+ * @of_node:		of_node handle for information related to this edge
+ * @edge_id:		identifier of this edge
+ * @irq:		interrupt for signals on this edge
+ * @ipc_regmap:		regmap handle holding the outgoing ipc register
+ * @ipc_offset:		offset within @ipc_regmap of the register for ipc
+ * @ipc_bit:		bit in the register at @ipc_offset of @ipc_regmap
+ * @channels:		list of all channels detected on this edge
+ * @channels_lock:	guard for modifications of @channels
+ * @allocated:		array of bitmaps representing already allocated channels
+ * @need_rescan:	flag that the @work needs to scan smem for new channels
+ * @smem_available:	last available amount of smem triggering a channel scan
+ * @work:		work item for edge house keeping
+ */
+struct qcom_smd_edge {
+	struct qcom_smd *smd;
+	struct device_node *of_node;
+	unsigned edge_id;
+
+	int irq;
+
+	struct regmap *ipc_regmap;
+	int ipc_offset;
+	int ipc_bit;
+
+	struct list_head channels;
+	spinlock_t channels_lock;
+
+	DECLARE_BITMAP(allocated[SMD_ALLOC_TBL_COUNT], SMD_ALLOC_TBL_SIZE);
+
+	bool need_rescan;
+	unsigned smem_available;
+
+	struct work_struct work;
+};
+
+/*
+ * SMD channel states.
+ */
+enum smd_channel_state {
+	SMD_CHANNEL_CLOSED,
+	SMD_CHANNEL_OPENING,
+	SMD_CHANNEL_OPENED,
+	SMD_CHANNEL_FLUSHING,
+	SMD_CHANNEL_CLOSING,
+	SMD_CHANNEL_RESET,
+	SMD_CHANNEL_RESET_OPENING
+};
+
+/**
+ * struct qcom_smd_channel - smd channel struct
+ * @edge:		qcom_smd_edge this channel is living on
+ * @qsdev:		reference to a associated smd client device
+ * @name:		name of the channel
+ * @state:		local state of the channel
+ * @remote_state:	remote state of the channel
+ * @tx_info:		byte aligned outgoing channel info
+ * @rx_info:		byte aligned incoming channel info
+ * @tx_info_word:	word aligned outgoing channel info
+ * @rx_info_word:	word aligned incoming channel info
+ * @tx_lock:		lock to make writes to the channel mutually exclusive
+ * @fblockread_event:	wakeup event tied to tx fBLOCKREADINTR
+ * @tx_fifo:		pointer to the outgoing ring buffer
+ * @rx_fifo:		pointer to the incoming ring buffer
+ * @fifo_size:		size of each ring buffer
+ * @bounce_buffer:	bounce buffer for reading wrapped packets
+ * @cb:			callback function registered for this channel
+ * @recv_lock:		guard for rx info modifications and cb pointer
+ * @pkt_size:		size of the currently handled packet
+ * @list:		lite entry for @channels in qcom_smd_edge
+ */
+struct qcom_smd_channel {
+	struct qcom_smd_edge *edge;
+
+	struct qcom_smd_device *qsdev;
+
+	char *name;
+	enum smd_channel_state state;
+	enum smd_channel_state remote_state;
+
+	struct smd_channel_info *tx_info;
+	struct smd_channel_info *rx_info;
+
+	struct smd_channel_info_word *tx_info_word;
+	struct smd_channel_info_word *rx_info_word;
+
+	struct mutex tx_lock;
+	wait_queue_head_t fblockread_event;
+
+	void *tx_fifo;
+	void *rx_fifo;
+	int fifo_size;
+
+	void *bounce_buffer;
+	int (*cb)(struct qcom_smd_device *, const void *, size_t);
+
+	spinlock_t recv_lock;
+
+	int pkt_size;
+
+	struct list_head list;
+};
+
+/**
+ * struct qcom_smd - smd struct
+ * @dev:	device struct
+ * @num_edges:	number of entries in @edges
+ * @edges:	array of edges to be handled
+ */
+struct qcom_smd {
+	struct device *dev;
+
+	unsigned num_edges;
+	struct qcom_smd_edge edges[0];
+};
+
+/*
+ * Format of the smd_info smem items, for byte aligned channels.
+ */
+struct smd_channel_info {
+	u32 state;
+	u8  fDSR;
+	u8  fCTS;
+	u8  fCD;
+	u8  fRI;
+	u8  fHEAD;
+	u8  fTAIL;
+	u8  fSTATE;
+	u8  fBLOCKREADINTR;
+	u32 tail;
+	u32 head;
+};
+
+/*
+ * Format of the smd_info smem items, for word aligned channels.
+ */
+struct smd_channel_info_word {
+	u32 state;
+	u32 fDSR;
+	u32 fCTS;
+	u32 fCD;
+	u32 fRI;
+	u32 fHEAD;
+	u32 fTAIL;
+	u32 fSTATE;
+	u32 fBLOCKREADINTR;
+	u32 tail;
+	u32 head;
+};
+
+#define GET_RX_CHANNEL_INFO(channel, param) \
+	(channel->rx_info_word ? \
+		channel->rx_info_word->param : \
+		channel->rx_info->param)
+
+#define SET_RX_CHANNEL_INFO(channel, param, value) \
+	(channel->rx_info_word ? \
+		(channel->rx_info_word->param = value) : \
+		(channel->rx_info->param = value))
+
+#define GET_TX_CHANNEL_INFO(channel, param) \
+	(channel->tx_info_word ? \
+		channel->tx_info_word->param : \
+		channel->tx_info->param)
+
+#define SET_TX_CHANNEL_INFO(channel, param, value) \
+	(channel->tx_info_word ? \
+		(channel->tx_info_word->param = value) : \
+		(channel->tx_info->param = value))
+
+/**
+ * struct qcom_smd_alloc_entry - channel allocation entry
+ * @name:	channel name
+ * @cid:	channel index
+ * @flags:	channel flags and edge id
+ * @ref_count:	reference count of the channel
+ */
+struct qcom_smd_alloc_entry {
+	u8 name[20];
+	u32 cid;
+	u32 flags;
+	u32 ref_count;
+} __packed;
+
+#define SMD_CHANNEL_FLAGS_EDGE_MASK	0xff
+#define SMD_CHANNEL_FLAGS_STREAM	BIT(8)
+#define SMD_CHANNEL_FLAGS_PACKET	BIT(9)
+
+/*
+ * Each smd packet contains a 20 byte header, with the first 4 being the length
+ * of the packet.
+ */
+#define SMD_PACKET_HEADER_LEN	20
+
+/*
+ * Signal the remote processor associated with 'channel'.
+ */
+static void qcom_smd_signal_channel(struct qcom_smd_channel *channel)
+{
+	struct qcom_smd_edge *edge = channel->edge;
+
+	regmap_write(edge->ipc_regmap, edge->ipc_offset, BIT(edge->ipc_bit));
+}
+
+/*
+ * Initialize the tx channel info
+ */
+static void qcom_smd_channel_reset(struct qcom_smd_channel *channel)
+{
+	SET_TX_CHANNEL_INFO(channel, state, SMD_CHANNEL_CLOSED);
+	SET_TX_CHANNEL_INFO(channel, fDSR, 0);
+	SET_TX_CHANNEL_INFO(channel, fCTS, 0);
+	SET_TX_CHANNEL_INFO(channel, fCD, 0);
+	SET_TX_CHANNEL_INFO(channel, fRI, 0);
+	SET_TX_CHANNEL_INFO(channel, fHEAD, 0);
+	SET_TX_CHANNEL_INFO(channel, fTAIL, 0);
+	SET_TX_CHANNEL_INFO(channel, fSTATE, 1);
+	SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 0);
+	SET_TX_CHANNEL_INFO(channel, head, 0);
+	SET_TX_CHANNEL_INFO(channel, tail, 0);
+
+	qcom_smd_signal_channel(channel);
+
+	channel->state = SMD_CHANNEL_CLOSED;
+	channel->pkt_size = 0;
+}
+
+/*
+ * Calculate the amount of data available in the rx fifo
+ */
+static size_t qcom_smd_channel_get_rx_avail(struct qcom_smd_channel *channel)
+{
+	unsigned head;
+	unsigned tail;
+
+	head = GET_RX_CHANNEL_INFO(channel, head);
+	tail = GET_RX_CHANNEL_INFO(channel, tail);
+
+	return (head - tail) & (channel->fifo_size - 1);
+}
+
+/*
+ * Set tx channel state and inform the remote processor
+ */
+static void qcom_smd_channel_set_state(struct qcom_smd_channel *channel,
+				       int state)
+{
+	struct qcom_smd_edge *edge = channel->edge;
+	bool is_open = state == SMD_CHANNEL_OPENED;
+
+	if (channel->state == state)
+		return;
+
+	dev_dbg(edge->smd->dev, "set_state(%s, %d)\n", channel->name, state);
+
+	SET_TX_CHANNEL_INFO(channel, fDSR, is_open);
+	SET_TX_CHANNEL_INFO(channel, fCTS, is_open);
+	SET_TX_CHANNEL_INFO(channel, fCD, is_open);
+
+	SET_TX_CHANNEL_INFO(channel, state, state);
+	SET_TX_CHANNEL_INFO(channel, fSTATE, 1);
+
+	channel->state = state;
+	qcom_smd_signal_channel(channel);
+}
+
+/*
+ * Copy count bytes of data using 32bit accesses, if that's required.
+ */
+static void smd_copy_to_fifo(void __iomem *_dst,
+			     const void *_src,
+			     size_t count,
+			     bool word_aligned)
+{
+	u32 *dst = (u32 *)_dst;
+	u32 *src = (u32 *)_src;
+
+	if (word_aligned) {
+		count /= sizeof(u32);
+		while (count--)
+			writel_relaxed(*src++, dst++);
+	} else {
+		memcpy_toio(_dst, _src, count);
+	}
+}
+
+/*
+ * Copy count bytes of data using 32bit accesses, if that is required.
+ */
+static void smd_copy_from_fifo(void *_dst,
+			       const void __iomem *_src,
+			       size_t count,
+			       bool word_aligned)
+{
+	u32 *dst = (u32 *)_dst;
+	u32 *src = (u32 *)_src;
+
+	if (word_aligned) {
+		count /= sizeof(u32);
+		while (count--)
+			*dst++ = readl_relaxed(src++);
+	} else {
+		memcpy_fromio(_dst, _src, count);
+	}
+}
+
+/*
+ * Read count bytes of data from the rx fifo into buf, but don't advance the
+ * tail.
+ */
+static size_t qcom_smd_channel_peek(struct qcom_smd_channel *channel,
+				    void *buf, size_t count)
+{
+	bool word_aligned;
+	unsigned tail;
+	size_t len;
+
+	word_aligned = channel->rx_info_word != NULL;
+	tail = GET_RX_CHANNEL_INFO(channel, tail);
+
+	len = min_t(size_t, count, channel->fifo_size - tail);
+	if (len) {
+		smd_copy_from_fifo(buf,
+				   channel->rx_fifo + tail,
+				   len,
+				   word_aligned);
+	}
+
+	if (len != count) {
+		smd_copy_from_fifo(buf + len,
+				   channel->rx_fifo,
+				   count - len,
+				   word_aligned);
+	}
+
+	return count;
+}
+
+/*
+ * Advance the rx tail by count bytes.
+ */
+static void qcom_smd_channel_advance(struct qcom_smd_channel *channel,
+				     size_t count)
+{
+	unsigned tail;
+
+	tail = GET_RX_CHANNEL_INFO(channel, tail);
+	tail += count;
+	tail &= (channel->fifo_size - 1);
+	SET_RX_CHANNEL_INFO(channel, tail, tail);
+}
+
+/*
+ * Read out a single packet from the rx fifo and deliver it to the device
+ */
+static int qcom_smd_channel_recv_single(struct qcom_smd_channel *channel)
+{
+	struct qcom_smd_device *qsdev = channel->qsdev;
+	unsigned tail;
+	size_t len;
+	void *ptr;
+	int ret;
+
+	if (!channel->cb)
+		return 0;
+
+	tail = GET_RX_CHANNEL_INFO(channel, tail);
+
+	/* Use bounce buffer if the data wraps */
+	if (tail + channel->pkt_size >= channel->fifo_size) {
+		ptr = channel->bounce_buffer;
+		len = qcom_smd_channel_peek(channel, ptr, channel->pkt_size);
+	} else {
+		ptr = channel->rx_fifo + tail;
+		len = channel->pkt_size;
+	}
+
+	ret = channel->cb(qsdev, ptr, len);
+	if (ret < 0)
+		return ret;
+
+	/* Only forward the tail if the client consumed the data */
+	qcom_smd_channel_advance(channel, len);
+
+	channel->pkt_size = 0;
+
+	return 0;
+}
+
+/*
+ * Per channel interrupt handling
+ */
+static bool qcom_smd_channel_intr(struct qcom_smd_channel *channel)
+{
+	bool need_state_scan = false;
+	int remote_state;
+	u32 pktlen;
+	int avail;
+	int ret;
+
+	/* Handle state changes */
+	remote_state = GET_RX_CHANNEL_INFO(channel, state);
+	if (remote_state != channel->remote_state) {
+		channel->remote_state = remote_state;
+		need_state_scan = true;
+	}
+	/* Indicate that we have seen any state change */
+	SET_RX_CHANNEL_INFO(channel, fSTATE, 0);
+
+	/* Signal waiting qcom_smd_send() about the interrupt */
+	if (!GET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR))
+		wake_up_interruptible(&channel->fblockread_event);
+
+	/* Don't consume any data until we've opened the channel */
+	if (channel->state != SMD_CHANNEL_OPENED)
+		goto out;
+
+	/* Indicate that we've seen the new data */
+	SET_RX_CHANNEL_INFO(channel, fHEAD, 0);
+
+	/* Consume data */
+	for (;;) {
+		avail = qcom_smd_channel_get_rx_avail(channel);
+
+		if (!channel->pkt_size && avail >= SMD_PACKET_HEADER_LEN) {
+			qcom_smd_channel_peek(channel, &pktlen, sizeof(pktlen));
+			qcom_smd_channel_advance(channel, SMD_PACKET_HEADER_LEN);
+			channel->pkt_size = pktlen;
+		} else if (channel->pkt_size && avail >= channel->pkt_size) {
+			ret = qcom_smd_channel_recv_single(channel);
+			if (ret)
+				break;
+		} else {
+			break;
+		}
+	}
+
+	/* Indicate that we have seen and updated tail */
+	SET_RX_CHANNEL_INFO(channel, fTAIL, 1);
+
+	/* Signal the remote that we've consumed the data (if requested) */
+	if (!GET_RX_CHANNEL_INFO(channel, fBLOCKREADINTR)) {
+		/* Ensure ordering of channel info updates */
+		wmb();
+
+		qcom_smd_signal_channel(channel);
+	}
+
+out:
+	return need_state_scan;
+}
+
+/*
+ * The edge interrupts are triggered by the remote processor on state changes,
+ * channel info updates or when new channels are created.
+ */
+static irqreturn_t qcom_smd_edge_intr(int irq, void *data)
+{
+	struct qcom_smd_edge *edge = data;
+	struct qcom_smd_channel *channel;
+	unsigned available;
+	bool kick_worker = false;
+
+	/*
+	 * Handle state changes or data on each of the channels on this edge
+	 */
+	spin_lock(&edge->channels_lock);
+	list_for_each_entry(channel, &edge->channels, list) {
+		spin_lock(&channel->recv_lock);
+		kick_worker |= qcom_smd_channel_intr(channel);
+		spin_unlock(&channel->recv_lock);
+	}
+	spin_unlock(&edge->channels_lock);
+
+	/*
+	 * Creating a new channel requires allocating an smem entry, so we only
+	 * have to scan if the amount of available space in smem have changed
+	 * since last scan.
+	 */
+	available = qcom_smem_get_free_space(edge->edge_id);
+	if (available != edge->smem_available) {
+		edge->smem_available = available;
+		edge->need_rescan = true;
+		kick_worker = true;
+	}
+
+	if (kick_worker)
+		schedule_work(&edge->work);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Delivers any outstanding packets in the rx fifo, can be used after probe of
+ * the clients to deliver any packets that wasn't delivered before the client
+ * was setup.
+ */
+static void qcom_smd_channel_resume(struct qcom_smd_channel *channel)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&channel->recv_lock, flags);
+	qcom_smd_channel_intr(channel);
+	spin_unlock_irqrestore(&channel->recv_lock, flags);
+}
+
+/*
+ * Calculate how much space is available in the tx fifo.
+ */
+static size_t qcom_smd_get_tx_avail(struct qcom_smd_channel *channel)
+{
+	unsigned head;
+	unsigned tail;
+	unsigned mask = channel->fifo_size - 1;
+
+	head = GET_TX_CHANNEL_INFO(channel, head);
+	tail = GET_TX_CHANNEL_INFO(channel, tail);
+
+	return mask - ((head - tail) & mask);
+}
+
+/*
+ * Write count bytes of data into channel, possibly wrapping in the ring buffer
+ */
+static int qcom_smd_write_fifo(struct qcom_smd_channel *channel,
+			       const void *data,
+			       size_t count)
+{
+	bool word_aligned;
+	unsigned head;
+	size_t len;
+
+	word_aligned = channel->tx_info_word != NULL;
+	head = GET_TX_CHANNEL_INFO(channel, head);
+
+	len = min_t(size_t, count, channel->fifo_size - head);
+	if (len) {
+		smd_copy_to_fifo(channel->tx_fifo + head,
+				 data,
+				 len,
+				 word_aligned);
+	}
+
+	if (len != count) {
+		smd_copy_to_fifo(channel->tx_fifo,
+				 data + len,
+				 count - len,
+				 word_aligned);
+	}
+
+	head += count;
+	head &= (channel->fifo_size - 1);
+	SET_TX_CHANNEL_INFO(channel, head, head);
+
+	return count;
+}
+
+/**
+ * qcom_smd_send - write data to smd channel
+ * @channel:	channel handle
+ * @data:	buffer of data to write
+ * @len:	number of bytes to write
+ *
+ * This is a blocking write of len bytes into the channel's tx ring buffer and
+ * signal the remote end. It will sleep until there is enough space available
+ * in the tx buffer, utilizing the fBLOCKREADINTR signaling mechanism to avoid
+ * polling.
+ */
+int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len)
+{
+	u32 hdr[5] = {len,};
+	int tlen = sizeof(hdr) + len;
+	int ret;
+
+	/* Word aligned channels only accept word size aligned data */
+	if (channel->rx_info_word != NULL && len % 4)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&channel->tx_lock);
+	if (ret)
+		return ret;
+
+	while (qcom_smd_get_tx_avail(channel) < tlen) {
+		if (channel->state != SMD_CHANNEL_OPENED) {
+			ret = -EPIPE;
+			goto out;
+		}
+
+		SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 1);
+
+		ret = wait_event_interruptible(channel->fblockread_event,
+				       qcom_smd_get_tx_avail(channel) >= tlen ||
+				       channel->state != SMD_CHANNEL_OPENED);
+		if (ret)
+			goto out;
+
+		SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 0);
+	}
+
+	SET_TX_CHANNEL_INFO(channel, fTAIL, 0);
+
+	qcom_smd_write_fifo(channel, hdr, sizeof(hdr));
+	qcom_smd_write_fifo(channel, data, len);
+
+	SET_TX_CHANNEL_INFO(channel, fHEAD, 1);
+
+	/* Ensure ordering of channel info updates */
+	wmb();
+
+	qcom_smd_signal_channel(channel);
+
+out:
+	mutex_unlock(&channel->tx_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(qcom_smd_send);
+
+static struct qcom_smd_device *to_smd_device(struct device *dev)
+{
+	return container_of(dev, struct qcom_smd_device, dev);
+}
+
+static struct qcom_smd_driver *to_smd_driver(struct device *dev)
+{
+	struct qcom_smd_device *qsdev = to_smd_device(dev);
+
+	return container_of(qsdev->dev.driver, struct qcom_smd_driver, driver);
+}
+
+static int qcom_smd_dev_match(struct device *dev, struct device_driver *drv)
+{
+	return of_driver_match_device(dev, drv);
+}
+
+/*
+ * Probe the smd client.
+ *
+ * The remote side have indicated that it want the channel to be opened, so
+ * complete the state handshake and probe our client driver.
+ */
+static int qcom_smd_dev_probe(struct device *dev)
+{
+	struct qcom_smd_device *qsdev = to_smd_device(dev);
+	struct qcom_smd_driver *qsdrv = to_smd_driver(dev);
+	struct qcom_smd_channel *channel = qsdev->channel;
+	size_t bb_size;
+	int ret;
+
+	/*
+	 * Packets are maximum 4k, but reduce if the fifo is smaller
+	 */
+	bb_size = min(channel->fifo_size, SZ_4K);
+	channel->bounce_buffer = kmalloc(bb_size, GFP_KERNEL);
+	if (!channel->bounce_buffer)
+		return -ENOMEM;
+
+	channel->cb = qsdrv->callback;
+
+	qcom_smd_channel_set_state(channel, SMD_CHANNEL_OPENING);
+
+	qcom_smd_channel_set_state(channel, SMD_CHANNEL_OPENED);
+
+	ret = qsdrv->probe(qsdev);
+	if (ret)
+		goto err;
+
+	qcom_smd_channel_resume(channel);
+
+	return 0;
+
+err:
+	dev_err(&qsdev->dev, "probe failed\n");
+
+	channel->cb = NULL;
+	kfree(channel->bounce_buffer);
+	channel->bounce_buffer = NULL;
+
+	qcom_smd_channel_set_state(channel, SMD_CHANNEL_CLOSED);
+	return ret;
+}
+
+/*
+ * Remove the smd client.
+ *
+ * The channel is going away, for some reason, so remove the smd client and
+ * reset the channel state.
+ */
+static int qcom_smd_dev_remove(struct device *dev)
+{
+	struct qcom_smd_device *qsdev = to_smd_device(dev);
+	struct qcom_smd_driver *qsdrv = to_smd_driver(dev);
+	struct qcom_smd_channel *channel = qsdev->channel;
+	unsigned long flags;
+
+	qcom_smd_channel_set_state(channel, SMD_CHANNEL_CLOSING);
+
+	/*
+	 * Make sure we don't race with the code receiving data.
+	 */
+	spin_lock_irqsave(&channel->recv_lock, flags);
+	channel->cb = NULL;
+	spin_unlock_irqrestore(&channel->recv_lock, flags);
+
+	/* Wake up any sleepers in qcom_smd_send() */
+	wake_up_interruptible(&channel->fblockread_event);
+
+	/*
+	 * We expect that the client might block in remove() waiting for any
+	 * outstanding calls to qcom_smd_send() to wake up and finish.
+	 */
+	if (qsdrv->remove)
+		qsdrv->remove(qsdev);
+
+	/*
+	 * The client is now gone, cleanup and reset the channel state.
+	 */
+	channel->qsdev = NULL;
+	kfree(channel->bounce_buffer);
+	channel->bounce_buffer = NULL;
+
+	qcom_smd_channel_set_state(channel, SMD_CHANNEL_CLOSED);
+
+	qcom_smd_channel_reset(channel);
+
+	return 0;
+}
+
+static struct bus_type qcom_smd_bus = {
+	.name = "qcom_smd",
+	.match = qcom_smd_dev_match,
+	.probe = qcom_smd_dev_probe,
+	.remove = qcom_smd_dev_remove,
+};
+
+/*
+ * Release function for the qcom_smd_device object.
+ */
+static void qcom_smd_release_device(struct device *dev)
+{
+	struct qcom_smd_device *qsdev = to_smd_device(dev);
+
+	kfree(qsdev);
+}
+
+/*
+ * Finds the device_node for the smd child interested in this channel.
+ */
+static struct device_node *qcom_smd_match_channel(struct device_node *edge_node,
+						  const char *channel)
+{
+	struct device_node *child;
+	const char *name;
+	const char *key;
+	int ret;
+
+	for_each_available_child_of_node(edge_node, child) {
+		key = "qcom,smd-channels";
+		ret = of_property_read_string(child, key, &name);
+		if (ret) {
+			of_node_put(child);
+			continue;
+		}
+
+		if (strcmp(name, channel) == 0)
+			return child;
+	}
+
+	return NULL;
+}
+
+/*
+ * Create a smd client device for channel that is being opened.
+ */
+static int qcom_smd_create_device(struct qcom_smd_channel *channel)
+{
+	struct qcom_smd_device *qsdev;
+	struct qcom_smd_edge *edge = channel->edge;
+	struct device_node *node;
+	struct qcom_smd *smd = edge->smd;
+	int ret;
+
+	if (channel->qsdev)
+		return -EEXIST;
+
+	node = qcom_smd_match_channel(edge->of_node, channel->name);
+	if (!node) {
+		dev_dbg(smd->dev, "no match for '%s'\n", channel->name);
+		return -ENXIO;
+	}
+
+	dev_dbg(smd->dev, "registering '%s'\n", channel->name);
+
+	qsdev = kzalloc(sizeof(*qsdev), GFP_KERNEL);
+	if (!qsdev)
+		return -ENOMEM;
+
+	dev_set_name(&qsdev->dev, "%s.%s", edge->of_node->name, node->name);
+	qsdev->dev.parent = smd->dev;
+	qsdev->dev.bus = &qcom_smd_bus;
+	qsdev->dev.release = qcom_smd_release_device;
+	qsdev->dev.of_node = node;
+
+	qsdev->channel = channel;
+
+	channel->qsdev = qsdev;
+
+	ret = device_register(&qsdev->dev);
+	if (ret) {
+		dev_err(smd->dev, "device_register failed: %d\n", ret);
+		put_device(&qsdev->dev);
+	}
+
+	return ret;
+}
+
+/*
+ * Destroy a smd client device for a channel that's going away.
+ */
+static void qcom_smd_destroy_device(struct qcom_smd_channel *channel)
+{
+	struct device *dev;
+
+	BUG_ON(!channel->qsdev);
+
+	dev = &channel->qsdev->dev;
+
+	device_unregister(dev);
+	of_node_put(dev->of_node);
+	put_device(dev);
+}
+
+/**
+ * qcom_smd_driver_register - register a smd driver
+ * @qsdrv:	qcom_smd_driver struct
+ */
+int qcom_smd_driver_register(struct qcom_smd_driver *qsdrv)
+{
+	qsdrv->driver.bus = &qcom_smd_bus;
+	return driver_register(&qsdrv->driver);
+}
+EXPORT_SYMBOL(qcom_smd_driver_register);
+
+/**
+ * qcom_smd_driver_unregister - unregister a smd driver
+ * @qsdrv:	qcom_smd_driver struct
+ */
+void qcom_smd_driver_unregister(struct qcom_smd_driver *qsdrv)
+{
+	driver_unregister(&qsdrv->driver);
+}
+EXPORT_SYMBOL(qcom_smd_driver_unregister);
+
+/*
+ * Allocate the qcom_smd_channel object for a newly found smd channel,
+ * retrieving and validating the smem items involved.
+ */
+static struct qcom_smd_channel *qcom_smd_create_channel(struct qcom_smd_edge *edge,
+							unsigned smem_info_item,
+							unsigned smem_fifo_item,
+							char *name)
+{
+	struct qcom_smd_channel *channel;
+	struct qcom_smd *smd = edge->smd;
+	size_t fifo_size;
+	size_t info_size;
+	void *fifo_base;
+	void *info;
+	int ret;
+
+	channel = devm_kzalloc(smd->dev, sizeof(*channel), GFP_KERNEL);
+	if (!channel)
+		return ERR_PTR(-ENOMEM);
+
+	channel->edge = edge;
+	channel->name = devm_kstrdup(smd->dev, name, GFP_KERNEL);
+	if (!channel->name)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&channel->tx_lock);
+	spin_lock_init(&channel->recv_lock);
+	init_waitqueue_head(&channel->fblockread_event);
+
+	ret = qcom_smem_get(edge->edge_id, smem_info_item, (void **)&info, &info_size);
+	if (ret)
+		goto free_name_and_channel;
+
+	/*
+	 * Use the size of the item to figure out which channel info struct to
+	 * use.
+	 */
+	if (info_size == 2 * sizeof(struct smd_channel_info_word)) {
+		channel->tx_info_word = info;
+		channel->rx_info_word = info + sizeof(struct smd_channel_info_word);
+	} else if (info_size == 2 * sizeof(struct smd_channel_info)) {
+		channel->tx_info = info;
+		channel->rx_info = info + sizeof(struct smd_channel_info);
+	} else {
+		dev_err(smd->dev,
+			"channel info of size %zu not supported\n", info_size);
+		ret = -EINVAL;
+		goto free_name_and_channel;
+	}
+
+	ret = qcom_smem_get(edge->edge_id, smem_fifo_item, &fifo_base, &fifo_size);
+	if (ret)
+		goto free_name_and_channel;
+
+	/* The channel consist of a rx and tx fifo of equal size */
+	fifo_size /= 2;
+
+	dev_dbg(smd->dev, "new channel '%s' info-size: %zu fifo-size: %zu\n",
+			  name, info_size, fifo_size);
+
+	channel->tx_fifo = fifo_base;
+	channel->rx_fifo = fifo_base + fifo_size;
+	channel->fifo_size = fifo_size;
+
+	qcom_smd_channel_reset(channel);
+
+	return channel;
+
+free_name_and_channel:
+	devm_kfree(smd->dev, channel->name);
+	devm_kfree(smd->dev, channel);
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * Scans the allocation table for any newly allocated channels, calls
+ * qcom_smd_create_channel() to create representations of these and add
+ * them to the edge's list of channels.
+ */
+static void qcom_discover_channels(struct qcom_smd_edge *edge)
+{
+	struct qcom_smd_alloc_entry *alloc_tbl;
+	struct qcom_smd_alloc_entry *entry;
+	struct qcom_smd_channel *channel;
+	struct qcom_smd *smd = edge->smd;
+	unsigned long flags;
+	unsigned fifo_id;
+	unsigned info_id;
+	int ret;
+	int tbl;
+	int i;
+
+	for (tbl = 0; tbl < SMD_ALLOC_TBL_COUNT; tbl++) {
+		ret = qcom_smem_get(edge->edge_id,
+				    smem_items[tbl].alloc_tbl_id,
+				    (void **)&alloc_tbl,
+				    NULL);
+		if (ret < 0)
+			continue;
+
+		for (i = 0; i < SMD_ALLOC_TBL_SIZE; i++) {
+			entry = &alloc_tbl[i];
+			if (test_bit(i, edge->allocated[tbl]))
+				continue;
+
+			if (entry->ref_count == 0)
+				continue;
+
+			if (!entry->name[0])
+				continue;
+
+			if (!(entry->flags & SMD_CHANNEL_FLAGS_PACKET))
+				continue;
+
+			if ((entry->flags & SMD_CHANNEL_FLAGS_EDGE_MASK) != edge->edge_id)
+				continue;
+
+			info_id = smem_items[tbl].info_base_id + entry->cid;
+			fifo_id = smem_items[tbl].fifo_base_id + entry->cid;
+
+			channel = qcom_smd_create_channel(edge, info_id, fifo_id, entry->name);
+			if (IS_ERR(channel))
+				continue;
+
+			spin_lock_irqsave(&edge->channels_lock, flags);
+			list_add(&channel->list, &edge->channels);
+			spin_unlock_irqrestore(&edge->channels_lock, flags);
+
+			dev_dbg(smd->dev, "new channel found: '%s'\n", channel->name);
+			set_bit(i, edge->allocated[tbl]);
+		}
+	}
+
+	schedule_work(&edge->work);
+}
+
+/*
+ * This per edge worker scans smem for any new channels and register these. It
+ * then scans all registered channels for state changes that should be handled
+ * by creating or destroying smd client devices for the registered channels.
+ *
+ * LOCKING: edge->channels_lock is not needed to be held during the traversal
+ * of the channels list as it's done synchronously with the only writer.
+ */
+static void qcom_channel_state_worker(struct work_struct *work)
+{
+	struct qcom_smd_channel *channel;
+	struct qcom_smd_edge *edge = container_of(work,
+						  struct qcom_smd_edge,
+						  work);
+	unsigned remote_state;
+
+	/*
+	 * Rescan smem if we have reason to belive that there are new channels.
+	 */
+	if (edge->need_rescan) {
+		edge->need_rescan = false;
+		qcom_discover_channels(edge);
+	}
+
+	/*
+	 * Register a device for any closed channel where the remote processor
+	 * is showing interest in opening the channel.
+	 */
+	list_for_each_entry(channel, &edge->channels, list) {
+		if (channel->state != SMD_CHANNEL_CLOSED)
+			continue;
+
+		remote_state = GET_RX_CHANNEL_INFO(channel, state);
+		if (remote_state != SMD_CHANNEL_OPENING &&
+		    remote_state != SMD_CHANNEL_OPENED)
+			continue;
+
+		qcom_smd_create_device(channel);
+	}
+
+	/*
+	 * Unregister the device for any channel that is opened where the
+	 * remote processor is closing the channel.
+	 */
+	list_for_each_entry(channel, &edge->channels, list) {
+		if (channel->state != SMD_CHANNEL_OPENING &&
+		    channel->state != SMD_CHANNEL_OPENED)
+			continue;
+
+		remote_state = GET_RX_CHANNEL_INFO(channel, state);
+		if (remote_state == SMD_CHANNEL_OPENING ||
+		    remote_state == SMD_CHANNEL_OPENED)
+			continue;
+
+		qcom_smd_destroy_device(channel);
+	}
+}
+
+/*
+ * Parses an of_node describing an edge.
+ */
+static int qcom_smd_parse_edge(struct device *dev,
+			       struct device_node *node,
+			       struct qcom_smd_edge *edge)
+{
+	struct device_node *syscon_np;
+	const char *key;
+	int irq;
+	int ret;
+
+	INIT_LIST_HEAD(&edge->channels);
+	spin_lock_init(&edge->channels_lock);
+
+	INIT_WORK(&edge->work, qcom_channel_state_worker);
+
+	edge->of_node = of_node_get(node);
+
+	irq = irq_of_parse_and_map(node, 0);
+	if (irq < 0) {
+		dev_err(dev, "required smd interrupt missing\n");
+		return -EINVAL;
+	}
+
+	ret = devm_request_irq(dev, irq,
+			       qcom_smd_edge_intr, IRQF_TRIGGER_RISING,
+			       node->name, edge);
+	if (ret) {
+		dev_err(dev, "failed to request smd irq\n");
+		return ret;
+	}
+
+	edge->irq = irq;
+
+	key = "qcom,smd-edge";
+	ret = of_property_read_u32(node, key, &edge->edge_id);
+	if (ret) {
+		dev_err(dev, "edge missing %s property\n", key);
+		return -EINVAL;
+	}
+
+	syscon_np = of_parse_phandle(node, "qcom,ipc", 0);
+	if (!syscon_np) {
+		dev_err(dev, "no qcom,ipc node\n");
+		return -ENODEV;
+	}
+
+	edge->ipc_regmap = syscon_node_to_regmap(syscon_np);
+	if (IS_ERR(edge->ipc_regmap))
+		return PTR_ERR(edge->ipc_regmap);
+
+	key = "qcom,ipc";
+	ret = of_property_read_u32_index(node, key, 1, &edge->ipc_offset);
+	if (ret < 0) {
+		dev_err(dev, "no offset in %s\n", key);
+		return -EINVAL;
+	}
+
+	ret = of_property_read_u32_index(node, key, 2, &edge->ipc_bit);
+	if (ret < 0) {
+		dev_err(dev, "no bit in %s\n", key);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int qcom_smd_probe(struct platform_device *pdev)
+{
+	struct qcom_smd_edge *edge;
+	struct device_node *node;
+	struct qcom_smd *smd;
+	size_t array_size;
+	int num_edges;
+	int ret;
+	int i = 0;
+
+	/* Wait for smem */
+	ret = qcom_smem_get(QCOM_SMEM_HOST_ANY, smem_items[0].alloc_tbl_id, NULL, NULL);
+	if (ret == -EPROBE_DEFER)
+		return ret;
+
+	num_edges = of_get_available_child_count(pdev->dev.of_node);
+	array_size = sizeof(*smd) + num_edges * sizeof(struct qcom_smd_edge);
+	smd = devm_kzalloc(&pdev->dev, array_size, GFP_KERNEL);
+	if (!smd)
+		return -ENOMEM;
+	smd->dev = &pdev->dev;
+
+	smd->num_edges = num_edges;
+	for_each_available_child_of_node(pdev->dev.of_node, node) {
+		edge = &smd->edges[i++];
+		edge->smd = smd;
+
+		ret = qcom_smd_parse_edge(&pdev->dev, node, edge);
+		if (ret)
+			continue;
+
+		edge->need_rescan = true;
+		schedule_work(&edge->work);
+	}
+
+	platform_set_drvdata(pdev, smd);
+
+	return 0;
+}
+
+/*
+ * Shut down all smd clients by making sure that each edge stops processing
+ * events and scanning for new channels, then call destroy on the devices.
+ */
+static int qcom_smd_remove(struct platform_device *pdev)
+{
+	struct qcom_smd_channel *channel;
+	struct qcom_smd_edge *edge;
+	struct qcom_smd *smd = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < smd->num_edges; i++) {
+		edge = &smd->edges[i];
+
+		disable_irq(edge->irq);
+		cancel_work_sync(&edge->work);
+
+		list_for_each_entry(channel, &edge->channels, list) {
+			if (!channel->qsdev)
+				continue;
+
+			qcom_smd_destroy_device(channel);
+		}
+	}
+
+	return 0;
+}
+
+static const struct of_device_id qcom_smd_of_match[] = {
+	{ .compatible = "qcom,smd" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, qcom_smd_of_match);
+
+static struct platform_driver qcom_smd_driver = {
+	.probe = qcom_smd_probe,
+	.remove = qcom_smd_remove,
+	.driver = {
+		.name = "qcom-smd",
+		.of_match_table = qcom_smd_of_match,
+	},
+};
+
+static int __init qcom_smd_init(void)
+{
+	int ret;
+
+	ret = bus_register(&qcom_smd_bus);
+	if (ret) {
+		pr_err("failed to register smd bus: %d\n", ret);
+		return ret;
+	}
+
+	return platform_driver_register(&qcom_smd_driver);
+}
+postcore_initcall(qcom_smd_init);
+
+static void __exit qcom_smd_exit(void)
+{
+	platform_driver_unregister(&qcom_smd_driver);
+	bus_unregister(&qcom_smd_bus);
+}
+module_exit(qcom_smd_exit);
+
+MODULE_AUTHOR("Bjorn Andersson <bjorn.andersson@sonymobile.com>");
+MODULE_DESCRIPTION("Qualcomm Shared Memory Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
new file mode 100644
index 000000000000..d7e50aa6a4ac
--- /dev/null
+++ b/include/linux/soc/qcom/smd.h
@@ -0,0 +1,46 @@
+#ifndef __QCOM_SMD_H__
+#define __QCOM_SMD_H__
+
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+
+struct qcom_smd;
+struct qcom_smd_channel;
+struct qcom_smd_lookup;
+
+/**
+ * struct qcom_smd_device - smd device struct
+ * @dev:	the device struct
+ * @channel:	handle to the smd channel for this device
+ */
+struct qcom_smd_device {
+	struct device dev;
+	struct qcom_smd_channel *channel;
+};
+
+/**
+ * struct qcom_smd_driver - smd driver struct
+ * @driver:	underlying device driver
+ * @probe:	invoked when the smd channel is found
+ * @remove:	invoked when the smd channel is closed
+ * @callback:	invoked when an inbound message is received on the channel,
+ *		should return 0 on success or -EBUSY if the data cannot be
+ *		consumed at this time
+ */
+struct qcom_smd_driver {
+	struct device_driver driver;
+	int (*probe)(struct qcom_smd_device *dev);
+	void (*remove)(struct qcom_smd_device *dev);
+	int (*callback)(struct qcom_smd_device *, const void *, size_t);
+};
+
+int qcom_smd_driver_register(struct qcom_smd_driver *drv);
+void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
+
+#define module_qcom_smd_driver(__smd_driver) \
+	module_driver(__smd_driver, qcom_smd_driver_register, \
+		      qcom_smd_driver_unregister)
+
+int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From 936f14cf4e67168fcd37f10cebf5a475f490fb6e Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Date: Mon, 27 Jul 2015 20:20:32 -0700
Subject: soc: qcom: Driver for the Qualcomm RPM over SMD

Driver for the Resource Power Manager (RPM) found in Qualcomm 8974 based
devices.
The driver exposes resources that child drivers can operate on; to
implementing regulator, clock and bus frequency drivers.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Andy Gross <agross@codeaurora.org>
---
 drivers/soc/qcom/Kconfig         |  14 +++
 drivers/soc/qcom/Makefile        |   1 +
 drivers/soc/qcom/smd-rpm.c       | 244 +++++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/smd-rpm.h |  35 ++++++
 4 files changed, 294 insertions(+)
 create mode 100644 drivers/soc/qcom/smd-rpm.c
 create mode 100644 include/linux/soc/qcom/smd-rpm.h

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index 188295e2c9ba..ba47b70f4d85 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -27,6 +27,20 @@ config QCOM_SMD
 	  providing communication channels to remote processors in Qualcomm
 	  platforms.
 
+config QCOM_SMD_RPM
+	tristate "Qualcomm Resource Power Manager (RPM) over SMD"
+	depends on QCOM_SMD && OF
+	help
+	  If you say yes to this option, support will be included for the
+	  Resource Power Manager system found in the Qualcomm 8974 based
+	  devices.
+
+	  This is required to access many regulators, clocks and bus
+	  frequencies controlled by the RPM on these devices.
+
+	  Say M here if you want to include support for the Qualcomm RPM as a
+	  module. This will build a module called "qcom-smd-rpm".
+
 config QCOM_SMEM
 	tristate "Qualcomm Shared Memory Manager (SMEM)"
 	depends on ARCH_QCOM
diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index f961a8796ed2..10a93d168e0e 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_QCOM_GSBI)	+=	qcom_gsbi.o
 obj-$(CONFIG_QCOM_PM)	+=	spm.o
 obj-$(CONFIG_QCOM_SMD) +=	smd.o
+obj-$(CONFIG_QCOM_SMD_RPM)	+= smd-rpm.o
 obj-$(CONFIG_QCOM_SMEM) +=	smem.o
diff --git a/drivers/soc/qcom/smd-rpm.c b/drivers/soc/qcom/smd-rpm.c
new file mode 100644
index 000000000000..1392ccf14a20
--- /dev/null
+++ b/drivers/soc/qcom/smd-rpm.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications AB.
+ * Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+
+#include <linux/soc/qcom/smd.h>
+#include <linux/soc/qcom/smd-rpm.h>
+
+#define RPM_REQUEST_TIMEOUT     (5 * HZ)
+
+/**
+ * struct qcom_smd_rpm - state of the rpm device driver
+ * @rpm_channel:	reference to the smd channel
+ * @ack:		completion for acks
+ * @lock:		mutual exclusion around the send/complete pair
+ * @ack_status:		result of the rpm request
+ */
+struct qcom_smd_rpm {
+	struct qcom_smd_channel *rpm_channel;
+
+	struct completion ack;
+	struct mutex lock;
+	int ack_status;
+};
+
+/**
+ * struct qcom_rpm_header - header for all rpm requests and responses
+ * @service_type:	identifier of the service
+ * @length:		length of the payload
+ */
+struct qcom_rpm_header {
+	u32 service_type;
+	u32 length;
+};
+
+/**
+ * struct qcom_rpm_request - request message to the rpm
+ * @msg_id:	identifier of the outgoing message
+ * @flags:	active/sleep state flags
+ * @type:	resource type
+ * @id:		resource id
+ * @data_len:	length of the payload following this header
+ */
+struct qcom_rpm_request {
+	u32 msg_id;
+	u32 flags;
+	u32 type;
+	u32 id;
+	u32 data_len;
+};
+
+/**
+ * struct qcom_rpm_message - response message from the rpm
+ * @msg_type:	indicator of the type of message
+ * @length:	the size of this message, including the message header
+ * @msg_id:	message id
+ * @message:	textual message from the rpm
+ *
+ * Multiple of these messages can be stacked in an rpm message.
+ */
+struct qcom_rpm_message {
+	u32 msg_type;
+	u32 length;
+	union {
+		u32 msg_id;
+		u8 message[0];
+	};
+};
+
+#define RPM_SERVICE_TYPE_REQUEST	0x00716572 /* "req\0" */
+
+#define RPM_MSG_TYPE_ERR		0x00727265 /* "err\0" */
+#define RPM_MSG_TYPE_MSG_ID		0x2367736d /* "msg#" */
+
+/**
+ * qcom_rpm_smd_write - write @buf to @type:@id
+ * @rpm:	rpm handle
+ * @type:	resource type
+ * @id:		resource identifier
+ * @buf:	the data to be written
+ * @count:	number of bytes in @buf
+ */
+int qcom_rpm_smd_write(struct qcom_smd_rpm *rpm,
+		       int state,
+		       u32 type, u32 id,
+		       void *buf,
+		       size_t count)
+{
+	static unsigned msg_id = 1;
+	int left;
+	int ret;
+
+	struct {
+		struct qcom_rpm_header hdr;
+		struct qcom_rpm_request req;
+		u8 payload[count];
+	} pkt;
+
+	/* SMD packets to the RPM may not exceed 256 bytes */
+	if (WARN_ON(sizeof(pkt) >= 256))
+		return -EINVAL;
+
+	mutex_lock(&rpm->lock);
+
+	pkt.hdr.service_type = RPM_SERVICE_TYPE_REQUEST;
+	pkt.hdr.length = sizeof(struct qcom_rpm_request) + count;
+
+	pkt.req.msg_id = msg_id++;
+	pkt.req.flags = BIT(state);
+	pkt.req.type = type;
+	pkt.req.id = id;
+	pkt.req.data_len = count;
+	memcpy(pkt.payload, buf, count);
+
+	ret = qcom_smd_send(rpm->rpm_channel, &pkt, sizeof(pkt));
+	if (ret)
+		goto out;
+
+	left = wait_for_completion_timeout(&rpm->ack, RPM_REQUEST_TIMEOUT);
+	if (!left)
+		ret = -ETIMEDOUT;
+	else
+		ret = rpm->ack_status;
+
+out:
+	mutex_unlock(&rpm->lock);
+	return ret;
+}
+EXPORT_SYMBOL(qcom_rpm_smd_write);
+
+static int qcom_smd_rpm_callback(struct qcom_smd_device *qsdev,
+				 const void *data,
+				 size_t count)
+{
+	const struct qcom_rpm_header *hdr = data;
+	const struct qcom_rpm_message *msg;
+	struct qcom_smd_rpm *rpm = dev_get_drvdata(&qsdev->dev);
+	const u8 *buf = data + sizeof(struct qcom_rpm_header);
+	const u8 *end = buf + hdr->length;
+	char msgbuf[32];
+	int status = 0;
+	u32 len;
+
+	if (hdr->service_type != RPM_SERVICE_TYPE_REQUEST ||
+	    hdr->length < sizeof(struct qcom_rpm_message)) {
+		dev_err(&qsdev->dev, "invalid request\n");
+		return 0;
+	}
+
+	while (buf < end) {
+		msg = (struct qcom_rpm_message *)buf;
+		switch (msg->msg_type) {
+		case RPM_MSG_TYPE_MSG_ID:
+			break;
+		case RPM_MSG_TYPE_ERR:
+			len = min_t(u32, ALIGN(msg->length, 4), sizeof(msgbuf));
+			memcpy_fromio(msgbuf, msg->message, len);
+			msgbuf[len - 1] = 0;
+
+			if (!strcmp(msgbuf, "resource does not exist"))
+				status = -ENXIO;
+			else
+				status = -EINVAL;
+			break;
+		}
+
+		buf = PTR_ALIGN(buf + 2 * sizeof(u32) + msg->length, 4);
+	}
+
+	rpm->ack_status = status;
+	complete(&rpm->ack);
+	return 0;
+}
+
+static int qcom_smd_rpm_probe(struct qcom_smd_device *sdev)
+{
+	struct qcom_smd_rpm *rpm;
+
+	rpm = devm_kzalloc(&sdev->dev, sizeof(*rpm), GFP_KERNEL);
+	if (!rpm)
+		return -ENOMEM;
+
+	mutex_init(&rpm->lock);
+	init_completion(&rpm->ack);
+
+	rpm->rpm_channel = sdev->channel;
+
+	dev_set_drvdata(&sdev->dev, rpm);
+
+	return of_platform_populate(sdev->dev.of_node, NULL, NULL, &sdev->dev);
+}
+
+static void qcom_smd_rpm_remove(struct qcom_smd_device *sdev)
+{
+	of_platform_depopulate(&sdev->dev);
+}
+
+static const struct of_device_id qcom_smd_rpm_of_match[] = {
+	{ .compatible = "qcom,rpm-msm8974" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, qcom_smd_rpm_of_match);
+
+static struct qcom_smd_driver qcom_smd_rpm_driver = {
+	.probe = qcom_smd_rpm_probe,
+	.remove = qcom_smd_rpm_remove,
+	.callback = qcom_smd_rpm_callback,
+	.driver  = {
+		.name  = "qcom_smd_rpm",
+		.owner = THIS_MODULE,
+		.of_match_table = qcom_smd_rpm_of_match,
+	},
+};
+
+static int __init qcom_smd_rpm_init(void)
+{
+	return qcom_smd_driver_register(&qcom_smd_rpm_driver);
+}
+arch_initcall(qcom_smd_rpm_init);
+
+static void __exit qcom_smd_rpm_exit(void)
+{
+	qcom_smd_driver_unregister(&qcom_smd_rpm_driver);
+}
+module_exit(qcom_smd_rpm_exit);
+
+MODULE_AUTHOR("Bjorn Andersson <bjorn.andersson@sonymobile.com>");
+MODULE_DESCRIPTION("Qualcomm SMD backed RPM driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h
new file mode 100644
index 000000000000..2a53dcaeeeed
--- /dev/null
+++ b/include/linux/soc/qcom/smd-rpm.h
@@ -0,0 +1,35 @@
+#ifndef __QCOM_SMD_RPM_H__
+#define __QCOM_SMD_RPM_H__
+
+struct qcom_smd_rpm;
+
+#define QCOM_SMD_RPM_ACTIVE_STATE        0
+#define QCOM_SMD_RPM_SLEEP_STATE         1
+
+/*
+ * Constants used for addressing resources in the RPM.
+ */
+#define QCOM_SMD_RPM_BOOST	0x61747362
+#define QCOM_SMD_RPM_BUS_CLK	0x316b6c63
+#define QCOM_SMD_RPM_BUS_MASTER	0x73616d62
+#define QCOM_SMD_RPM_BUS_SLAVE	0x766c7362
+#define QCOM_SMD_RPM_CLK_BUF_A	0x616B6C63
+#define QCOM_SMD_RPM_LDOA	0x616f646c
+#define QCOM_SMD_RPM_LDOB	0x626F646C
+#define QCOM_SMD_RPM_MEM_CLK	0x326b6c63
+#define QCOM_SMD_RPM_MISC_CLK	0x306b6c63
+#define QCOM_SMD_RPM_NCPA	0x6170636E
+#define QCOM_SMD_RPM_NCPB	0x6270636E
+#define QCOM_SMD_RPM_OCMEM_PWR	0x706d636f
+#define QCOM_SMD_RPM_QPIC_CLK	0x63697071
+#define QCOM_SMD_RPM_SMPA	0x61706d73
+#define QCOM_SMD_RPM_SMPB	0x62706d73
+#define QCOM_SMD_RPM_SPDM	0x63707362
+#define QCOM_SMD_RPM_VSA	0x00617376
+
+int qcom_rpm_smd_write(struct qcom_smd_rpm *rpm,
+		       int state,
+		       u32 resource_type, u32 resource_id,
+		       void *buf, size_t count);
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From ad3aedfbb04b3a2af54473cfe31f13953cfe9d84 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:08 +0100
Subject: genirq/irqdomain: Allow irq domain aliasing

It is not uncommon (at least with the ARM stuff) to have a piece
of hardware that implements different flavours of "interrupts".
A typical example of this is the GICv3 ITS, which implements
standard PCI/MSI support, but also some form of "generic MSI".

So far, the PCI/MSI domain is registered using the ITS device_node,
so that irq_find_host can return it. On the contrary, the raw MSI
domain is not registered with an device_node, making it impossible
to be looked up by another subsystem (obviously, using the same
device_node twice would only result in confusion, as it is not
defined which one irq_find_host would return).

A solution to this is to "type" domains that may be aliasing, and
to be able to lookup an device_node that matches a given type.
For this, we introduce irq_find_matching_host() as a superset
of irq_find_host:

struct irq_domain *irq_find_matching_host(struct device_node *node,
                                enum irq_domain_bus_token bus_token);

where bus_token is the "type" we want to match the domain against
(so far, only DOMAIN_BUS_ANY is defined). This result in some
moderately invasive changes on the PPC side (which is the only
user of the .match method).

This has otherwise no functionnal change.

Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-2-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/powerpc/platforms/512x/mpc5121_ads_cpld.c   |  3 ++-
 arch/powerpc/platforms/cell/interrupt.c          |  3 ++-
 arch/powerpc/platforms/embedded6xx/flipper-pic.c |  3 ++-
 arch/powerpc/platforms/powermac/pic.c            |  3 ++-
 arch/powerpc/platforms/powernv/opal-irqchip.c    |  3 ++-
 arch/powerpc/platforms/ps3/interrupt.c           |  3 ++-
 arch/powerpc/sysdev/ehv_pic.c                    |  3 ++-
 arch/powerpc/sysdev/i8259.c                      |  3 ++-
 arch/powerpc/sysdev/ipic.c                       |  3 ++-
 arch/powerpc/sysdev/mpic.c                       |  3 ++-
 arch/powerpc/sysdev/qe_lib/qe_ic.c               |  3 ++-
 arch/powerpc/sysdev/xics/xics-common.c           |  3 ++-
 include/linux/irqdomain.h                        | 23 +++++++++++++++++++++--
 kernel/irq/irqdomain.c                           | 18 +++++++++++++-----
 14 files changed, 58 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
index ca3a062ed1b9..11090ab4bf59 100644
--- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
+++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
@@ -123,7 +123,8 @@ cpld_pic_cascade(unsigned int irq, struct irq_desc *desc)
 }
 
 static int
-cpld_pic_host_match(struct irq_domain *h, struct device_node *node)
+cpld_pic_host_match(struct irq_domain *h, struct device_node *node,
+		    enum irq_domain_bus_token bus_token)
 {
 	return cpld_pic_node == node;
 }
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 3af8324c122e..a15f1efc295f 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -222,7 +222,8 @@ void iic_request_IPIs(void)
 #endif /* CONFIG_SMP */
 
 
-static int iic_host_match(struct irq_domain *h, struct device_node *node)
+static int iic_host_match(struct irq_domain *h, struct device_node *node,
+			  enum irq_domain_bus_token bus_token)
 {
 	return of_device_is_compatible(node,
 				    "IBM,CBEA-Internal-Interrupt-Controller");
diff --git a/arch/powerpc/platforms/embedded6xx/flipper-pic.c b/arch/powerpc/platforms/embedded6xx/flipper-pic.c
index 4cde8e7da4b8..b7866e01483d 100644
--- a/arch/powerpc/platforms/embedded6xx/flipper-pic.c
+++ b/arch/powerpc/platforms/embedded6xx/flipper-pic.c
@@ -108,7 +108,8 @@ static int flipper_pic_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static int flipper_pic_match(struct irq_domain *h, struct device_node *np)
+static int flipper_pic_match(struct irq_domain *h, struct device_node *np,
+			     enum irq_domain_bus_token bus_token)
 {
 	return 1;
 }
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 59cfc9d63c2d..6f4f8b060def 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -268,7 +268,8 @@ static struct irqaction gatwick_cascade_action = {
 	.name		= "cascade",
 };
 
-static int pmac_pic_host_match(struct irq_domain *h, struct device_node *node)
+static int pmac_pic_host_match(struct irq_domain *h, struct device_node *node,
+			       enum irq_domain_bus_token bus_token)
 {
 	/* We match all, we don't always have a node anyway */
 	return 1;
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index e2e7d75f52f3..2c91ee7800b9 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -134,7 +134,8 @@ static void opal_handle_irq_work(struct irq_work *work)
 	opal_handle_events(be64_to_cpu(last_outstanding_events));
 }
 
-static int opal_event_match(struct irq_domain *h, struct device_node *node)
+static int opal_event_match(struct irq_domain *h, struct device_node *node,
+			    enum irq_domain_bus_token bus_token)
 {
 	return h->of_node == node;
 }
diff --git a/arch/powerpc/platforms/ps3/interrupt.c b/arch/powerpc/platforms/ps3/interrupt.c
index a6c42f34303a..638c4060938e 100644
--- a/arch/powerpc/platforms/ps3/interrupt.c
+++ b/arch/powerpc/platforms/ps3/interrupt.c
@@ -678,7 +678,8 @@ static int ps3_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static int ps3_host_match(struct irq_domain *h, struct device_node *np)
+static int ps3_host_match(struct irq_domain *h, struct device_node *np,
+			  enum irq_domain_bus_token bus_token)
 {
 	/* Match all */
 	return 1;
diff --git a/arch/powerpc/sysdev/ehv_pic.c b/arch/powerpc/sysdev/ehv_pic.c
index 2d20f10a4203..eca0b00794fa 100644
--- a/arch/powerpc/sysdev/ehv_pic.c
+++ b/arch/powerpc/sysdev/ehv_pic.c
@@ -177,7 +177,8 @@ unsigned int ehv_pic_get_irq(void)
 	return irq_linear_revmap(global_ehv_pic->irqhost, irq);
 }
 
-static int ehv_pic_host_match(struct irq_domain *h, struct device_node *node)
+static int ehv_pic_host_match(struct irq_domain *h, struct device_node *node,
+			      enum irq_domain_bus_token bus_token)
 {
 	/* Exact match, unless ehv_pic node is NULL */
 	return h->of_node == NULL || h->of_node == node;
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index 31c33475c7b7..e1a9c2c2d5d3 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -162,7 +162,8 @@ static struct resource pic_edgectrl_iores = {
 	.flags = IORESOURCE_BUSY,
 };
 
-static int i8259_host_match(struct irq_domain *h, struct device_node *node)
+static int i8259_host_match(struct irq_domain *h, struct device_node *node,
+			    enum irq_domain_bus_token bus_token)
 {
 	return h->of_node == NULL || h->of_node == node;
 }
diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index d78f1364b639..6b2b68914810 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -671,7 +671,8 @@ static struct irq_chip ipic_edge_irq_chip = {
 	.irq_set_type	= ipic_set_irq_type,
 };
 
-static int ipic_host_match(struct irq_domain *h, struct device_node *node)
+static int ipic_host_match(struct irq_domain *h, struct device_node *node,
+			   enum irq_domain_bus_token bus_token)
 {
 	/* Exact match, unless ipic node is NULL */
 	return h->of_node == NULL || h->of_node == node;
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index c8e73332eaad..97a8ae8f94dd 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -1007,7 +1007,8 @@ static struct irq_chip mpic_irq_ht_chip = {
 #endif /* CONFIG_MPIC_U3_HT_IRQS */
 
 
-static int mpic_host_match(struct irq_domain *h, struct device_node *node)
+static int mpic_host_match(struct irq_domain *h, struct device_node *node,
+			   enum irq_domain_bus_token bus_token)
 {
 	/* Exact match, unless mpic node is NULL */
 	return h->of_node == NULL || h->of_node == node;
diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c
index 6512cd8caa51..47b352e4bc74 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_ic.c
+++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c
@@ -244,7 +244,8 @@ static struct irq_chip qe_ic_irq_chip = {
 	.irq_mask_ack = qe_ic_mask_irq,
 };
 
-static int qe_ic_host_match(struct irq_domain *h, struct device_node *node)
+static int qe_ic_host_match(struct irq_domain *h, struct device_node *node,
+			    enum irq_domain_bus_token bus_token)
 {
 	/* Exact match, unless qe_ic node is NULL */
 	return h->of_node == NULL || h->of_node == node;
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 08c248eb491b..47e43b7b076b 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -298,7 +298,8 @@ int xics_get_irq_server(unsigned int virq, const struct cpumask *cpumask,
 }
 #endif /* CONFIG_SMP */
 
-static int xics_host_match(struct irq_domain *h, struct device_node *node)
+static int xics_host_match(struct irq_domain *h, struct device_node *node,
+			   enum irq_domain_bus_token bus_token)
 {
 	struct ics *ics;
 
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 744ac0ec98eb..91a83adf5e45 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -45,6 +45,17 @@ struct irq_data;
 /* Number of irqs reserved for a legacy isa controller */
 #define NUM_ISA_INTERRUPTS	16
 
+/*
+ * Should several domains have the same device node, but serve
+ * different purposes (for example one domain is for PCI/MSI, and the
+ * other for wired IRQs), they can be distinguished using a
+ * bus-specific token. Most domains are expected to only carry
+ * DOMAIN_BUS_ANY.
+ */
+enum irq_domain_bus_token {
+	DOMAIN_BUS_ANY		= 0,
+};
+
 /**
  * struct irq_domain_ops - Methods for irq_domain objects
  * @match: Match an interrupt controller device node to a host, returns
@@ -61,7 +72,8 @@ struct irq_data;
  * to setup the irq_desc when returning from map().
  */
 struct irq_domain_ops {
-	int (*match)(struct irq_domain *d, struct device_node *node);
+	int (*match)(struct irq_domain *d, struct device_node *node,
+		     enum irq_domain_bus_token bus_token);
 	int (*map)(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw);
 	void (*unmap)(struct irq_domain *d, unsigned int virq);
 	int (*xlate)(struct irq_domain *d, struct device_node *node,
@@ -116,6 +128,7 @@ struct irq_domain {
 
 	/* Optional data */
 	struct device_node *of_node;
+	enum irq_domain_bus_token bus_token;
 	struct irq_domain_chip_generic *gc;
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 	struct irq_domain *parent;
@@ -161,9 +174,15 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 					 irq_hw_number_t first_hwirq,
 					 const struct irq_domain_ops *ops,
 					 void *host_data);
-extern struct irq_domain *irq_find_host(struct device_node *node);
+extern struct irq_domain *irq_find_matching_host(struct device_node *node,
+						 enum irq_domain_bus_token bus_token);
 extern void irq_set_default_host(struct irq_domain *host);
 
+static inline struct irq_domain *irq_find_host(struct device_node *node)
+{
+	return irq_find_matching_host(node, DOMAIN_BUS_ANY);
+}
+
 /**
  * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
  * @of_node: pointer to interrupt controller's device tree node.
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c3577fef78c..79baaf8a7813 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -187,10 +187,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
 
 /**
- * irq_find_host() - Locates a domain for a given device node
+ * irq_find_matching_host() - Locates a domain for a given device node
  * @node: device-tree node of the interrupt controller
+ * @bus_token: domain-specific data
  */
-struct irq_domain *irq_find_host(struct device_node *node)
+struct irq_domain *irq_find_matching_host(struct device_node *node,
+					  enum irq_domain_bus_token bus_token)
 {
 	struct irq_domain *h, *found = NULL;
 	int rc;
@@ -199,13 +201,19 @@ struct irq_domain *irq_find_host(struct device_node *node)
 	 * it might potentially be set to match all interrupts in
 	 * the absence of a device node. This isn't a problem so far
 	 * yet though...
+	 *
+	 * bus_token == DOMAIN_BUS_ANY matches any domain, any other
+	 * values must generate an exact match for the domain to be
+	 * selected.
 	 */
 	mutex_lock(&irq_domain_mutex);
 	list_for_each_entry(h, &irq_domain_list, link) {
 		if (h->ops->match)
-			rc = h->ops->match(h, node);
+			rc = h->ops->match(h, node, bus_token);
 		else
-			rc = (h->of_node != NULL) && (h->of_node == node);
+			rc = ((h->of_node != NULL) && (h->of_node == node) &&
+			      ((bus_token == DOMAIN_BUS_ANY) ||
+			       (h->bus_token == bus_token)));
 
 		if (rc) {
 			found = h;
@@ -215,7 +223,7 @@ struct irq_domain *irq_find_host(struct device_node *node)
 	mutex_unlock(&irq_domain_mutex);
 	return found;
 }
-EXPORT_SYMBOL_GPL(irq_find_host);
+EXPORT_SYMBOL_GPL(irq_find_matching_host);
 
 /**
  * irq_set_default_host() - Set a "default" irq domain
-- 
cgit v1.2.3-70-g09d2


From 0380839dc90c53e24ddfa0f17ad909c2ddc345c2 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:09 +0100
Subject: PCI/MSI: Register irq domain with specific token

When creating a PCI/MSI domain, tag it with DOMAIN_BUS_PCI_MSI so
that it can be looked-up using irq_find_matching_host().

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-3-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c         | 9 ++++++++-
 include/linux/irqdomain.h | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index cd4c78c193de..3aae7c9ad31c 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1273,12 +1273,19 @@ struct irq_domain *pci_msi_create_irq_domain(struct device_node *node,
 					     struct msi_domain_info *info,
 					     struct irq_domain *parent)
 {
+	struct irq_domain *domain;
+
 	if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
 		pci_msi_domain_update_dom_ops(info);
 	if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
 		pci_msi_domain_update_chip_ops(info);
 
-	return msi_create_irq_domain(node, info, parent);
+	domain = msi_create_irq_domain(node, info, parent);
+	if (!domain)
+		return NULL;
+
+	domain->bus_token = DOMAIN_BUS_PCI_MSI;
+	return domain;
 }
 
 /**
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 91a83adf5e45..25e9e6696a65 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -54,6 +54,7 @@ struct irq_data;
  */
 enum irq_domain_bus_token {
 	DOMAIN_BUS_ANY		= 0,
+	DOMAIN_BUS_PCI_MSI,
 };
 
 /**
-- 
cgit v1.2.3-70-g09d2


From f1421db8ca4c110144be97a5997ed83d34685db5 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:10 +0100
Subject: device core: Introduce per-device MSI domain pointer

As MSI-type features are creeping into non-PCI devices, it is
starting to make sense to give our struct device some form of
support for this, by allowing a pointer to an MSI irq domain to
be set/retrieved.

Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-4-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/device.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 3d3139ad5705..50e000576d3c 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -715,6 +715,7 @@ struct device_dma_parameters {
  * @pins:	For device pin management.
  *		See Documentation/pinctrl.txt for details.
  * @msi_list:	Hosts MSI descriptors
+ * @msi_domain: The generic MSI domain this device is using.
  * @numa_node:	NUMA node this device is close to.
  * @dma_mask:	Dma mask (if dma'ble device).
  * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
@@ -775,6 +776,9 @@ struct device {
 	struct dev_pm_info	power;
 	struct dev_pm_domain	*pm_domain;
 
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+	struct irq_domain	*msi_domain;
+#endif
 #ifdef CONFIG_PINCTRL
 	struct dev_pin_info	*pins;
 #endif
@@ -865,6 +869,22 @@ static inline void set_dev_node(struct device *dev, int node)
 }
 #endif
 
+static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
+{
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+	return dev->msi_domain;
+#else
+	return NULL;
+#endif
+}
+
+static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d)
+{
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+	dev->msi_domain = d;
+#endif
+}
+
 static inline void *dev_get_drvdata(const struct device *dev)
 {
 	return dev->driver_data;
-- 
cgit v1.2.3-70-g09d2


From b165e2b60b39888a7ff8efbc1de40137471dda41 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:12 +0100
Subject: PCI/MSI: Add support for OF-provided msi_domain

In order to populate the PCI host bridge msi_domain, use the
"msi-parent" attribute to lookup a corresponding irq domain.
If found, this is our MSI domain.

This gets plugged into the core PCI code.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-6-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/of.c    | 25 +++++++++++++++++++++++++
 drivers/pci/probe.c |  5 ++++-
 include/linux/pci.h |  4 ++++
 3 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index f0929934bb7a..85844d8c3efd 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/irqdomain.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/of.h>
@@ -59,3 +60,27 @@ struct device_node * __weak pcibios_get_phb_of_node(struct pci_bus *bus)
 		return of_node_get(bus->bridge->parent->of_node);
 	return NULL;
 }
+
+struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus)
+{
+#ifdef CONFIG_IRQ_DOMAIN
+	struct device_node *np;
+	struct irq_domain *d;
+
+	if (!bus->dev.of_node)
+		return NULL;
+
+	/* Start looking for a phandle to an MSI controller. */
+	np = of_parse_phandle(bus->dev.of_node, "msi-parent", 0);
+	if (!np)
+		return NULL;
+
+	d = irq_find_matching_host(np, DOMAIN_BUS_PCI_MSI);
+	if (d)
+		return d;
+
+	return irq_find_host(np);
+#else
+	return NULL;
+#endif
+}
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index c03ecbffc50b..a7afeacce7f1 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -663,12 +663,15 @@ static void pci_set_bus_speed(struct pci_bus *bus)
 
 static struct irq_domain *pci_host_bridge_msi_domain(struct pci_bus *bus)
 {
+	struct irq_domain *d;
+
 	/*
 	 * Any firmware interface that can resolve the msi_domain
 	 * should be called from here.
 	 */
+	d = pci_host_bridge_of_msi_domain(bus);
 
-	return NULL;
+	return d;
 }
 
 static void pci_set_bus_msi_domain(struct pci_bus *bus)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index fbf245f5eba7..772616d3b184 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1841,10 +1841,12 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off,
 /* PCI <-> OF binding helpers */
 #ifdef CONFIG_OF
 struct device_node;
+struct irq_domain;
 void pci_set_of_node(struct pci_dev *dev);
 void pci_release_of_node(struct pci_dev *dev);
 void pci_set_bus_of_node(struct pci_bus *bus);
 void pci_release_bus_of_node(struct pci_bus *bus);
+struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus);
 
 /* Arch may override this (weak) */
 struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus);
@@ -1867,6 +1869,8 @@ static inline void pci_set_bus_of_node(struct pci_bus *bus) { }
 static inline void pci_release_bus_of_node(struct pci_bus *bus) { }
 static inline struct device_node *
 pci_device_to_OF_node(const struct pci_dev *pdev) { return NULL; }
+static inline struct irq_domain *
+pci_host_bridge_of_msi_domain(struct pci_bus *bus) { return NULL; }
 #endif  /* CONFIG_OF */
 
 #ifdef CONFIG_EEH
-- 
cgit v1.2.3-70-g09d2


From c706c239af5bc297b5fbf1adc715632e1c222f7a Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:15 +0100
Subject: of/platform: Assign MSI domain to platform device

As for PCI, we're able to populate the msi_domain field at probe time,
provided that the device tree has an "msi-parent" property.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-9-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/of/irq.c          | 21 +++++++++++++++++++++
 drivers/of/platform.c     |  1 +
 include/linux/irqdomain.h |  1 +
 include/linux/of_irq.h    |  1 +
 4 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index 3cf7a01f557f..2956d725649f 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -18,6 +18,7 @@
  * driver.
  */
 
+#include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/list.h>
 #include <linux/module.h>
@@ -576,3 +577,23 @@ err:
 		kfree(desc);
 	}
 }
+
+/**
+ * of_msi_configure - Set the msi_domain field of a device
+ * @dev: device structure to associate with an MSI irq domain
+ * @np: device node for that device
+ */
+void of_msi_configure(struct device *dev, struct device_node *np)
+{
+	struct device_node *msi_np;
+	struct irq_domain *d;
+
+	msi_np = of_parse_phandle(np, "msi-parent", 0);
+	if (!msi_np)
+		return;
+
+	d = irq_find_matching_host(msi_np, DOMAIN_BUS_PLATFORM_MSI);
+	if (!d)
+		d = irq_find_host(msi_np);
+	dev_set_msi_domain(dev, d);
+}
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index ddf8e42c9367..8a002d6151f2 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -184,6 +184,7 @@ static struct platform_device *of_platform_device_create_pdata(
 	dev->dev.bus = &platform_bus_type;
 	dev->dev.platform_data = platform_data;
 	of_dma_configure(&dev->dev, dev->dev.of_node);
+	of_msi_configure(&dev->dev, dev->dev.of_node);
 
 	if (of_device_add(dev) != 0) {
 		of_dma_deconfigure(&dev->dev);
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 25e9e6696a65..b4a74f73a0c3 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -55,6 +55,7 @@ struct irq_data;
 enum irq_domain_bus_token {
 	DOMAIN_BUS_ANY		= 0,
 	DOMAIN_BUS_PCI_MSI,
+	DOMAIN_BUS_PLATFORM_MSI,
 };
 
 /**
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index d884929a7747..4bcbd586a672 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -74,6 +74,7 @@ static inline int of_irq_to_resource_table(struct device_node *dev,
  */
 extern unsigned int irq_of_parse_and_map(struct device_node *node, int index);
 extern struct device_node *of_irq_find_parent(struct device_node *child);
+extern void of_msi_configure(struct device *dev, struct device_node *np);
 
 #else /* !CONFIG_OF */
 static inline unsigned int irq_of_parse_and_map(struct device_node *dev,
-- 
cgit v1.2.3-70-g09d2


From c09fcc4b2b48d58d769a8cff5041533535ece449 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:16 +0100
Subject: drivers/base: Add MSI domain support for non-PCI devices

With the msi_list and the msi_domain properties now being at the
generic device level, it is starting to be relatively easy to offer
a generic way of providing non-PCI MSIs.

The two major hurdles with this idea are:

- Lack of global ID that identifies a device: this is worked around by
  having a global ID allocator for each device that gets enrolled in
  the platform MSI subsystem

- Lack of standard way to write the message in the generating device.
  This is solved by mandating driver code to provide a write_msg
  callback, so that everyone can have their own square wheel

Apart from that, the API is fairly straightforward:

- platform_msi_create_irq_domain creates an MSI domain that gets
  tagged with DOMAIN_BUS_PLATFORM_MSI

- platform_msi_domain_alloc_irqs allocate MSIs for a given device,
  populating the msi_list

- platform_msi_domain_free_irqs does what is written on the tin

[ tglx: Created a seperate struct platform_msi_desc and added
  	kerneldoc entries ]

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-10-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/base/Makefile       |   1 +
 drivers/base/platform-msi.c | 282 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/msi.h         |  22 ++++
 3 files changed, 305 insertions(+)
 create mode 100644 drivers/base/platform-msi.c

(limited to 'include/linux')

diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 527d291706e8..6b2a84e7f2be 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_REGMAP)	+= regmap/
 obj-$(CONFIG_SOC_BUS) += soc.o
 obj-$(CONFIG_PINCTRL) += pinctrl.o
 obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
+obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
 
 ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
 
diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
new file mode 100644
index 000000000000..1857a5dd0816
--- /dev/null
+++ b/drivers/base/platform-msi.c
@@ -0,0 +1,282 @@
+/*
+ * MSI framework for platform devices
+ *
+ * Copyright (C) 2015 ARM Limited, All Rights Reserved.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+#include <linux/slab.h>
+
+#define DEV_ID_SHIFT	24
+
+/*
+ * Internal data structure containing a (made up, but unique) devid
+ * and the callback to write the MSI message.
+ */
+struct platform_msi_priv_data {
+	irq_write_msi_msg_t	write_msg;
+	int			devid;
+};
+
+/* The devid allocator */
+static DEFINE_IDA(platform_msi_devid_ida);
+
+#ifdef GENERIC_MSI_DOMAIN_OPS
+/*
+ * Convert an msi_desc to a globaly unique identifier (per-device
+ * devid + msi_desc position in the msi_list).
+ */
+static irq_hw_number_t platform_msi_calc_hwirq(struct msi_desc *desc)
+{
+	u32 devid;
+
+	devid = desc->platform.msi_priv_data->devid;
+
+	return (devid << (32 - DEV_ID_SHIFT)) | desc->platform.msi_index;
+}
+
+static void platform_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+	arg->desc = desc;
+	arg->hwirq = platform_msi_calc_hwirq(desc);
+}
+
+static int platform_msi_init(struct irq_domain *domain,
+			     struct msi_domain_info *info,
+			     unsigned int virq, irq_hw_number_t hwirq,
+			     msi_alloc_info_t *arg)
+{
+	struct irq_data *data;
+
+	irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
+				      info->chip, info->chip_data);
+
+	/*
+	 * Save the MSI descriptor in handler_data so that the
+	 * irq_write_msi_msg callback can retrieve it (and the
+	 * associated device).
+	 */
+	data = irq_domain_get_irq_data(domain, virq);
+	data->handler_data = arg->desc;
+
+	return 0;
+}
+#else
+#define platform_msi_set_desc		NULL
+#define platform_msi_init		NULL
+#endif
+
+static void platform_msi_update_dom_ops(struct msi_domain_info *info)
+{
+	struct msi_domain_ops *ops = info->ops;
+
+	BUG_ON(!ops);
+
+	if (ops->msi_init == NULL)
+		ops->msi_init = platform_msi_init;
+	if (ops->set_desc == NULL)
+		ops->set_desc = platform_msi_set_desc;
+}
+
+static void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	struct msi_desc *desc = irq_data_get_irq_handler_data(data);
+	struct platform_msi_priv_data *priv_data;
+
+	priv_data = desc->platform.msi_priv_data;
+
+	priv_data->write_msg(desc, msg);
+}
+
+static void platform_msi_update_chip_ops(struct msi_domain_info *info)
+{
+	struct irq_chip *chip = info->chip;
+
+	BUG_ON(!chip);
+	if (!chip->irq_mask)
+		chip->irq_mask = irq_chip_mask_parent;
+	if (!chip->irq_unmask)
+		chip->irq_unmask = irq_chip_unmask_parent;
+	if (!chip->irq_eoi)
+		chip->irq_eoi = irq_chip_eoi_parent;
+	if (!chip->irq_set_affinity)
+		chip->irq_set_affinity = msi_domain_set_affinity;
+	if (!chip->irq_write_msi_msg)
+		chip->irq_write_msi_msg = platform_msi_write_msg;
+}
+
+static void platform_msi_free_descs(struct device *dev)
+{
+	struct msi_desc *desc, *tmp;
+
+	list_for_each_entry_safe(desc, tmp, dev_to_msi_list(dev), list) {
+		list_del(&desc->list);
+		free_msi_entry(desc);
+	}
+}
+
+static int platform_msi_alloc_descs(struct device *dev, int nvec,
+				    struct platform_msi_priv_data *data)
+
+{
+	int i;
+
+	for (i = 0; i < nvec; i++) {
+		struct msi_desc *desc;
+
+		desc = alloc_msi_entry(dev);
+		if (!desc)
+			break;
+
+		desc->platform.msi_priv_data = data;
+		desc->platform.msi_index = i;
+		desc->nvec_used = 1;
+
+		list_add_tail(&desc->list, dev_to_msi_list(dev));
+	}
+
+	if (i != nvec) {
+		/* Clean up the mess */
+		platform_msi_free_descs(dev);
+
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * platform_msi_create_irq_domain - Create a platform MSI interrupt domain
+ * @np:		Optional device-tree node of the interrupt controller
+ * @info:	MSI domain info
+ * @parent:	Parent irq domain
+ *
+ * Updates the domain and chip ops and creates a platform MSI
+ * interrupt domain.
+ *
+ * Returns:
+ * A domain pointer or NULL in case of failure.
+ */
+struct irq_domain *platform_msi_create_irq_domain(struct device_node *np,
+						  struct msi_domain_info *info,
+						  struct irq_domain *parent)
+{
+	struct irq_domain *domain;
+
+	if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
+		platform_msi_update_dom_ops(info);
+	if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+		platform_msi_update_chip_ops(info);
+
+	domain = msi_create_irq_domain(np, info, parent);
+	if (domain)
+		domain->bus_token = DOMAIN_BUS_PLATFORM_MSI;
+
+	return domain;
+}
+
+/**
+ * platform_msi_domain_alloc_irqs - Allocate MSI interrupts for @dev
+ * @dev:		The device for which to allocate interrupts
+ * @nvec:		The number of interrupts to allocate
+ * @write_msi_msg:	Callback to write an interrupt message for @dev
+ *
+ * Returns:
+ * Zero for success, or an error code in case of failure
+ */
+int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec,
+				   irq_write_msi_msg_t write_msi_msg)
+{
+	struct platform_msi_priv_data *priv_data;
+	int err;
+
+	/*
+	 * Limit the number of interrupts to 256 per device. Should we
+	 * need to bump this up, DEV_ID_SHIFT should be adjusted
+	 * accordingly (which would impact the max number of MSI
+	 * capable devices).
+	 */
+	if (!dev->msi_domain || !write_msi_msg || !nvec ||
+	    nvec > (1 << (32 - DEV_ID_SHIFT)))
+		return -EINVAL;
+
+	if (dev->msi_domain->bus_token != DOMAIN_BUS_PLATFORM_MSI) {
+		dev_err(dev, "Incompatible msi_domain, giving up\n");
+		return -EINVAL;
+	}
+
+	/* Already had a helping of MSI? Greed... */
+	if (!list_empty(dev_to_msi_list(dev)))
+		return -EBUSY;
+
+	priv_data = kzalloc(sizeof(*priv_data), GFP_KERNEL);
+	if (!priv_data)
+		return -ENOMEM;
+
+	priv_data->devid = ida_simple_get(&platform_msi_devid_ida,
+					  0, 1 << DEV_ID_SHIFT, GFP_KERNEL);
+	if (priv_data->devid < 0) {
+		err = priv_data->devid;
+		goto out_free_data;
+	}
+
+	priv_data->write_msg = write_msi_msg;
+
+	err = platform_msi_alloc_descs(dev, nvec, priv_data);
+	if (err)
+		goto out_free_id;
+
+	err = msi_domain_alloc_irqs(dev->msi_domain, dev, nvec);
+	if (err)
+		goto out_free_desc;
+
+	return 0;
+
+out_free_desc:
+	platform_msi_free_descs(dev);
+out_free_id:
+	ida_simple_remove(&platform_msi_devid_ida, priv_data->devid);
+out_free_data:
+	kfree(priv_data);
+
+	return err;
+}
+
+/**
+ * platform_msi_domain_free_irqs - Free MSI interrupts for @dev
+ * @dev:	The device for which to free interrupts
+ */
+void platform_msi_domain_free_irqs(struct device *dev)
+{
+	struct msi_desc *desc;
+
+	desc = first_msi_entry(dev);
+	if (desc) {
+		struct platform_msi_priv_data *data;
+
+		data = desc->platform.msi_priv_data;
+
+		ida_simple_remove(&platform_msi_devid_ida, data->devid);
+		kfree(data);
+	}
+
+	msi_domain_free_irqs(dev->msi_domain, dev);
+	platform_msi_free_descs(dev);
+}
diff --git a/include/linux/msi.h b/include/linux/msi.h
index f83c87e447bc..809b749f9300 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -15,9 +15,23 @@ extern int pci_msi_ignore_mask;
 struct irq_data;
 struct msi_desc;
 struct pci_dev;
+struct platform_msi_priv_data;
 void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
 
+typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc,
+				    struct msi_msg *msg);
+
+/**
+ * platform_msi_desc - Platform device specific msi descriptor data
+ * @msi_priv_data:	Pointer to platform private data
+ * @msi_index:		The index of the MSI descriptor for multi MSI
+ */
+struct platform_msi_desc {
+	struct platform_msi_priv_data	*msi_priv_data;
+	u16				msi_index;
+};
+
 /**
  * struct msi_desc - Descriptor structure for MSI based interrupts
  * @list:	List head for management
@@ -36,6 +50,7 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
  * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq
  * @mask_pos:	[PCI MSI]   Mask register position
  * @mask_base:	[PCI MSI-X] Mask register base address
+ * @platform:	[platform]  Platform device specific msi descriptor data
  */
 struct msi_desc {
 	/* Shared device/bus type independent data */
@@ -71,6 +86,7 @@ struct msi_desc {
 		 * anonymous for now as it would require an immediate
 		 * tree wide cleanup.
 		 */
+		struct platform_msi_desc platform;
 	};
 };
 
@@ -257,6 +273,12 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
 struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain);
 
+struct irq_domain *platform_msi_create_irq_domain(struct device_node *np,
+						  struct msi_domain_info *info,
+						  struct irq_domain *parent);
+int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec,
+				   irq_write_msi_msg_t write_msi_msg);
+void platform_msi_domain_free_irqs(struct device *dev);
 #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
 
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
-- 
cgit v1.2.3-70-g09d2


From a5716070d88cba1a0a8a18fea809ea6e3374e276 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:17 +0100
Subject: genirq: Add DOMAIN_BUS_NEXUS irqdomain property

Some IRQ domains are not designed to directly provide interrupts
to devices, but strictly to be used by other domains. An example
of this is the GICv3 ITS, which is completely bus agnostic, and
on which it is possible to implement a PCI/MSI domain.

Just introduce the irq_domain_bus_token property for now.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-11-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index b4a74f73a0c3..d3ca79236fb0 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -56,6 +56,7 @@ enum irq_domain_bus_token {
 	DOMAIN_BUS_ANY		= 0,
 	DOMAIN_BUS_PCI_MSI,
 	DOMAIN_BUS_PLATFORM_MSI,
+	DOMAIN_BUS_NEXUS,
 };
 
 /**
-- 
cgit v1.2.3-70-g09d2


From f130420e51df30891b55efcef24f5358b2fc2b97 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:18 +0100
Subject: irqchip/gicv3-its: Split PCI/MSI code from the core ITS driver

It is becoming obvious that having the PCI/MSI code in the same
file as the the core ITS code is giving people implementing non-PCI
MSI support the wrong kind of idea.

In order to make things a bit clearer, let's move the PCI/MSI code
out to its own file. Hopefully it will make it clear that whoever
thinks of hooking into the core ITS better have a very strong point.

We use a temporary entry point that will get removed in a subsequent
patch, once the proper infrastructure is added.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-12-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/Makefile                 |   2 +-
 drivers/irqchip/irq-gic-v3-its-pci-msi.c | 105 +++++++++++++++++++++++++++++++
 drivers/irqchip/irq-gic-v3-its.c         |  94 ++++-----------------------
 include/linux/irqchip/arm-gic-v3.h       |   6 ++
 4 files changed, 123 insertions(+), 84 deletions(-)
 create mode 100644 drivers/irqchip/irq-gic-v3-its-pci-msi.c

(limited to 'include/linux')

diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index b8d4e9691890..0d5f2a98a6ef 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -22,7 +22,7 @@ obj-$(CONFIG_ARCH_SPEAR3XX)		+= spear-shirq.o
 obj-$(CONFIG_ARM_GIC)			+= irq-gic.o irq-gic-common.o
 obj-$(CONFIG_ARM_GIC_V2M)		+= irq-gic-v2m.o
 obj-$(CONFIG_ARM_GIC_V3)		+= irq-gic-v3.o irq-gic-common.o
-obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o
+obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o
 obj-$(CONFIG_ARM_NVIC)			+= irq-nvic.o
 obj-$(CONFIG_ARM_VIC)			+= irq-vic.o
 obj-$(CONFIG_ATMEL_AIC_IRQ)		+= irq-atmel-aic-common.o irq-atmel-aic.o
diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
new file mode 100644
index 000000000000..76147219da7f
--- /dev/null
+++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2013-2015 ARM Limited, All Rights Reserved.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_pci.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+static void its_mask_msi_irq(struct irq_data *d)
+{
+	pci_msi_mask_irq(d);
+	irq_chip_mask_parent(d);
+}
+
+static void its_unmask_msi_irq(struct irq_data *d)
+{
+	pci_msi_unmask_irq(d);
+	irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip its_msi_irq_chip = {
+	.name			= "ITS-MSI",
+	.irq_unmask		= its_unmask_msi_irq,
+	.irq_mask		= its_mask_msi_irq,
+	.irq_eoi		= irq_chip_eoi_parent,
+	.irq_write_msi_msg	= pci_msi_domain_write_msg,
+};
+
+struct its_pci_alias {
+	struct pci_dev	*pdev;
+	u32		dev_id;
+	u32		count;
+};
+
+static int its_pci_msi_vec_count(struct pci_dev *pdev)
+{
+	int msi, msix;
+
+	msi = max(pci_msi_vec_count(pdev), 0);
+	msix = max(pci_msix_vec_count(pdev), 0);
+
+	return max(msi, msix);
+}
+
+static int its_get_pci_alias(struct pci_dev *pdev, u16 alias, void *data)
+{
+	struct its_pci_alias *dev_alias = data;
+
+	dev_alias->dev_id = alias;
+	if (pdev != dev_alias->pdev)
+		dev_alias->count += its_pci_msi_vec_count(dev_alias->pdev);
+
+	return 0;
+}
+
+static int its_pci_msi_prepare(struct irq_domain *domain, struct device *dev,
+			       int nvec, msi_alloc_info_t *info)
+{
+	struct pci_dev *pdev;
+	struct its_pci_alias dev_alias;
+
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	pdev = to_pci_dev(dev);
+	dev_alias.pdev = pdev;
+	dev_alias.count = nvec;
+
+	pci_for_each_dma_alias(pdev, its_get_pci_alias, &dev_alias);
+
+	return its_msi_prepare(domain, dev_alias.dev_id, dev_alias.count, info);
+}
+
+static struct msi_domain_ops its_pci_msi_ops = {
+	.msi_prepare	= its_pci_msi_prepare,
+};
+
+static struct msi_domain_info its_pci_msi_domain_info = {
+	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+		   MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
+	.ops	= &its_pci_msi_ops,
+	.chip	= &its_msi_irq_chip,
+};
+
+struct irq_domain *its_pci_msi_alloc_domain(struct device_node *np,
+					    struct irq_domain *parent)
+{
+	return pci_msi_create_irq_domain(np, &its_pci_msi_domain_info, parent);
+}
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 1df956afb937..7f995d876029 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -642,26 +642,6 @@ static struct irq_chip its_irq_chip = {
 	.irq_compose_msi_msg	= its_irq_compose_msi_msg,
 };
 
-static void its_mask_msi_irq(struct irq_data *d)
-{
-	pci_msi_mask_irq(d);
-	irq_chip_mask_parent(d);
-}
-
-static void its_unmask_msi_irq(struct irq_data *d)
-{
-	pci_msi_unmask_irq(d);
-	irq_chip_unmask_parent(d);
-}
-
-static struct irq_chip its_msi_irq_chip = {
-	.name			= "ITS-MSI",
-	.irq_unmask		= its_unmask_msi_irq,
-	.irq_mask		= its_mask_msi_irq,
-	.irq_eoi		= irq_chip_eoi_parent,
-	.irq_write_msi_msg	= pci_msi_domain_write_msg,
-};
-
 /*
  * How we allocate LPIs:
  *
@@ -1208,85 +1188,34 @@ static int its_alloc_device_irq(struct its_device *dev, irq_hw_number_t *hwirq)
 	return 0;
 }
 
-struct its_pci_alias {
-	struct pci_dev	*pdev;
-	u32		dev_id;
-	u32		count;
-};
-
-static int its_pci_msi_vec_count(struct pci_dev *pdev)
-{
-	int msi, msix;
-
-	msi = max(pci_msi_vec_count(pdev), 0);
-	msix = max(pci_msix_vec_count(pdev), 0);
-
-	return max(msi, msix);
-}
-
-static int its_get_pci_alias(struct pci_dev *pdev, u16 alias, void *data)
+int its_msi_prepare(struct irq_domain *domain, u32 dev_id,
+		    int nvec, msi_alloc_info_t *info)
 {
-	struct its_pci_alias *dev_alias = data;
-
-	dev_alias->dev_id = alias;
-	if (pdev != dev_alias->pdev)
-		dev_alias->count += its_pci_msi_vec_count(dev_alias->pdev);
-
-	return 0;
-}
-
-static int its_msi_prepare(struct irq_domain *domain, struct device *dev,
-			   int nvec, msi_alloc_info_t *info)
-{
-	struct pci_dev *pdev;
 	struct its_node *its;
 	struct its_device *its_dev;
-	struct its_pci_alias dev_alias;
-
-	if (!dev_is_pci(dev))
-		return -EINVAL;
-
-	pdev = to_pci_dev(dev);
-	dev_alias.pdev = pdev;
-	dev_alias.count = nvec;
 
-	pci_for_each_dma_alias(pdev, its_get_pci_alias, &dev_alias);
 	its = domain->parent->host_data;
-
-	its_dev = its_find_device(its, dev_alias.dev_id);
+	its_dev = its_find_device(its, dev_id);
 	if (its_dev) {
 		/*
 		 * We already have seen this ID, probably through
 		 * another alias (PCI bridge of some sort). No need to
 		 * create the device.
 		 */
-		dev_dbg(dev, "Reusing ITT for devID %x\n", dev_alias.dev_id);
+		pr_debug("Reusing ITT for devID %x\n", dev_id);
 		goto out;
 	}
 
-	its_dev = its_create_device(its, dev_alias.dev_id, dev_alias.count);
+	its_dev = its_create_device(its, dev_id, nvec);
 	if (!its_dev)
 		return -ENOMEM;
 
-	dev_dbg(&pdev->dev, "ITT %d entries, %d bits\n",
-		dev_alias.count, ilog2(dev_alias.count));
+	pr_debug("ITT %d entries, %d bits\n", nvec, ilog2(nvec));
 out:
 	info->scratchpad[0].ptr = its_dev;
-	info->scratchpad[1].ptr = dev;
 	return 0;
 }
 
-static struct msi_domain_ops its_pci_msi_ops = {
-	.msi_prepare	= its_msi_prepare,
-};
-
-static struct msi_domain_info its_pci_msi_domain_info = {
-	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-		   MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
-	.ops	= &its_pci_msi_ops,
-	.chip	= &its_msi_irq_chip,
-};
-
 static int its_irq_gic_domain_alloc(struct irq_domain *domain,
 				    unsigned int virq,
 				    irq_hw_number_t hwirq)
@@ -1322,9 +1251,9 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 
 		irq_domain_set_hwirq_and_chip(domain, virq + i,
 					      hwirq, &its_irq_chip, its_dev);
-		dev_dbg(info->scratchpad[1].ptr, "ID:%d pID:%d vID:%d\n",
-			(int)(hwirq - its_dev->event_map.lpi_base),
-			(int)hwirq, virq + i);
+		pr_debug("ID:%d pID:%d vID:%d\n",
+			 (int)(hwirq - its_dev->event_map.lpi_base),
+			 (int) hwirq, virq + i);
 	}
 
 	return 0;
@@ -1523,9 +1452,8 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 
 		its->domain->parent = parent;
 
-		its->msi_chip.domain = pci_msi_create_irq_domain(node,
-								 &its_pci_msi_domain_info,
-								 its->domain);
+		its->msi_chip.domain = its_pci_msi_alloc_domain(node,
+								its->domain);
 		if (!its->msi_chip.domain) {
 			err = -ENOMEM;
 			goto out_free_domains;
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index ffbc034c8810..d6149baaf643 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -360,6 +360,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/stringify.h>
+#include <asm/msi.h>
 
 /*
  * We need a value to serve as a irq-type for LPIs. Choose one that will
@@ -388,6 +389,11 @@ struct irq_domain;
 int its_cpu_init(void);
 int its_init(struct device_node *node, struct rdists *rdists,
 	     struct irq_domain *domain);
+int its_msi_prepare(struct irq_domain *domain, u32 dev_id,
+		    int nvec, msi_alloc_info_t *info);
+
+struct irq_domain *its_pci_msi_alloc_domain(struct device_node *node,
+					    struct irq_domain *parent);
 
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 54456db9a23753b87ce4d49adabe7da853bf13a2 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:21 +0100
Subject: irqchip/gicv3-its: Make the PCI/MSI code standalone

We can now lookup the base ITS domain, making it possible to
initialize the PCI/MSI code independently from the main ITS
subsystem.

This allows us to remove all the previously add hooks.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-15-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/irq-gic-v3-its-pci-msi.c | 47 +++++++++++++++++++++++++++----
 drivers/irqchip/irq-gic-v3-its.c         | 48 +++++++++++++++++++++-----------
 include/linux/irqchip/arm-gic-v3.h       |  5 ----
 3 files changed, 73 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
index 76147219da7f..cf351c637464 100644
--- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
@@ -20,8 +20,6 @@
 #include <linux/of_irq.h>
 #include <linux/of_pci.h>
 
-#include <linux/irqchip/arm-gic-v3.h>
-
 static void its_mask_msi_irq(struct irq_data *d)
 {
 	pci_msi_mask_irq(d);
@@ -74,17 +72,24 @@ static int its_pci_msi_prepare(struct irq_domain *domain, struct device *dev,
 {
 	struct pci_dev *pdev;
 	struct its_pci_alias dev_alias;
+	struct msi_domain_info *msi_info;
 
 	if (!dev_is_pci(dev))
 		return -EINVAL;
 
+	msi_info = msi_get_domain_info(domain->parent);
+
 	pdev = to_pci_dev(dev);
 	dev_alias.pdev = pdev;
 	dev_alias.count = nvec;
 
 	pci_for_each_dma_alias(pdev, its_get_pci_alias, &dev_alias);
 
-	return its_msi_prepare(domain, dev_alias.dev_id, dev_alias.count, info);
+	/* ITS specific DeviceID, as the core ITS ignores dev. */
+	info->scratchpad[0].ul = dev_alias.dev_id;
+
+	return msi_info->ops->msi_prepare(domain->parent,
+					  dev, dev_alias.count, info);
 }
 
 static struct msi_domain_ops its_pci_msi_ops = {
@@ -98,8 +103,38 @@ static struct msi_domain_info its_pci_msi_domain_info = {
 	.chip	= &its_msi_irq_chip,
 };
 
-struct irq_domain *its_pci_msi_alloc_domain(struct device_node *np,
-					    struct irq_domain *parent)
+static struct of_device_id its_device_id[] = {
+	{	.compatible	= "arm,gic-v3-its",	},
+	{},
+};
+
+static int __init its_pci_msi_init(void)
 {
-	return pci_msi_create_irq_domain(np, &its_pci_msi_domain_info, parent);
+	struct device_node *np;
+	struct irq_domain *parent;
+
+	for (np = of_find_matching_node(NULL, its_device_id); np;
+	     np = of_find_matching_node(np, its_device_id)) {
+		if (!of_property_read_bool(np, "msi-controller"))
+			continue;
+
+		parent = irq_find_matching_host(np, DOMAIN_BUS_NEXUS);
+		if (!parent || !msi_get_domain_info(parent)) {
+			pr_err("%s: unable to locate ITS domain\n",
+			       np->full_name);
+			continue;
+		}
+
+		if (!pci_msi_create_irq_domain(np, &its_pci_msi_domain_info,
+					       parent)) {
+			pr_err("%s: unable to create PCI domain\n",
+			       np->full_name);
+			continue;
+		}
+
+		pr_info("PCI/MSI: %s domain created\n", np->full_name);
+	}
+
+	return 0;
 }
+early_initcall(its_pci_msi_init);
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index dc4fbbfa0212..26b55c53755f 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -59,7 +59,6 @@ struct its_collection {
 struct its_node {
 	raw_spinlock_t		lock;
 	struct list_head	entry;
-	struct irq_domain	*domain;
 	void __iomem		*base;
 	unsigned long		phys_base;
 	struct its_cmd_block	*cmd_base;
@@ -1187,13 +1186,25 @@ static int its_alloc_device_irq(struct its_device *dev, irq_hw_number_t *hwirq)
 	return 0;
 }
 
-int its_msi_prepare(struct irq_domain *domain, u32 dev_id,
-		    int nvec, msi_alloc_info_t *info)
+static int its_msi_prepare(struct irq_domain *domain, struct device *dev,
+			   int nvec, msi_alloc_info_t *info)
 {
 	struct its_node *its;
 	struct its_device *its_dev;
+	struct msi_domain_info *msi_info;
+	u32 dev_id;
+
+	/*
+	 * We ignore "dev" entierely, and rely on the dev_id that has
+	 * been passed via the scratchpad. This limits this domain's
+	 * usefulness to upper layers that definitely know that they
+	 * are built on top of the ITS.
+	 */
+	dev_id = info->scratchpad[0].ul;
+
+	msi_info = msi_get_domain_info(domain);
+	its = msi_info->data;
 
-	its = domain->parent->host_data;
 	its_dev = its_find_device(its, dev_id);
 	if (its_dev) {
 		/*
@@ -1215,6 +1226,10 @@ out:
 	return 0;
 }
 
+static struct msi_domain_ops its_msi_domain_ops = {
+	.msi_prepare	= its_msi_prepare,
+};
+
 static int its_irq_gic_domain_alloc(struct irq_domain *domain,
 				    unsigned int virq,
 				    irq_hw_number_t hwirq)
@@ -1353,7 +1368,7 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 	struct resource res;
 	struct its_node *its;
 	void __iomem *its_base;
-	struct irq_domain *inner_domain = NULL;
+	struct irq_domain *inner_domain;
 	u32 val;
 	u64 baser, tmp;
 	int err;
@@ -1443,20 +1458,26 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 	writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR);
 
 	if (of_property_read_bool(node, "msi-controller")) {
+		struct msi_domain_info *info;
+
+		info = kzalloc(sizeof(*info), GFP_KERNEL);
+		if (!info) {
+			err = -ENOMEM;
+			goto out_free_tables;
+		}
+
 		inner_domain = irq_domain_add_tree(node, &its_domain_ops, its);
 		if (!inner_domain) {
 			err = -ENOMEM;
+			kfree(info);
 			goto out_free_tables;
 		}
 
 		inner_domain->parent = parent;
 		inner_domain->bus_token = DOMAIN_BUS_NEXUS;
-
-		its->domain = its_pci_msi_alloc_domain(node, inner_domain);
-		if (!its->domain) {
-			err = -ENOMEM;
-			goto out_free_domains;
-		}
+		info->ops = &its_msi_domain_ops;
+		info->data = its;
+		inner_domain->host_data = info;
 	}
 
 	spin_lock(&its_lock);
@@ -1465,11 +1486,6 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 
 	return 0;
 
-out_free_domains:
-	if (its->domain)
-		irq_domain_remove(its->domain);
-	if (inner_domain)
-		irq_domain_remove(inner_domain);
 out_free_tables:
 	its_free_tables(its);
 out_free_cmd:
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index d6149baaf643..bf982e021fbd 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -389,11 +389,6 @@ struct irq_domain;
 int its_cpu_init(void);
 int its_init(struct device_node *node, struct rdists *rdists,
 	     struct irq_domain *domain);
-int its_msi_prepare(struct irq_domain *domain, u32 dev_id,
-		    int nvec, msi_alloc_info_t *info);
-
-struct irq_domain *its_pci_msi_alloc_domain(struct device_node *node,
-					    struct irq_domain *parent);
 
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From f075915ac0b11847fcfc8c4d55526a317e71c4d1 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 28 Jul 2015 14:46:26 +0100
Subject: PCI/MSI: Drop domain field from msi_controller

The only three users of that field are not using the msi_controller
structure anymore, so drop it altogether.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Yijing Wang <wangyijing@huawei.com>
Cc: Ma Jun <majun258@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438091186-10244-20-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c   | 3 ---
 include/linux/msi.h | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 460334409794..f70aa0f5cbaf 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -45,9 +45,6 @@ static struct irq_domain *pci_msi_get_domain(struct pci_dev *dev)
 	if (domain)
 		return domain;
 
-	if (dev->bus->msi && (domain = dev->bus->msi->domain))
-		return domain;
-
 	return arch_get_pci_msi_domain(dev);
 }
 
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 809b749f9300..ad939d0ba816 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -160,9 +160,6 @@ struct msi_controller {
 	struct device *dev;
 	struct device_node *of_node;
 	struct list_head list;
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
-	struct irq_domain *domain;
-#endif
 
 	int (*setup_irq)(struct msi_controller *chip, struct pci_dev *dev,
 			 struct msi_desc *desc);
-- 
cgit v1.2.3-70-g09d2


From 72b1e5e4cac72efa6b739b47e41f53e4520b4194 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 16:21:30 +0200
Subject: netfilter: bridge: reduce nf_bridge_info to 32 bytes again

We can use union for most of the temporary cruft (original ipv4/ipv6
address, source mac, physoutdev) since they're used during different
stages of br netfilter traversal.

Also get rid of the last two ->mask users.

Shrinks struct from 48 to 32 on 64bit arch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h          | 12 +++++++++---
 include/linux/skbuff.h                    | 19 +++++++++++++------
 net/bridge/br_netfilter_hooks.c           | 14 ++++++--------
 net/bridge/br_netfilter_ipv6.c            |  2 +-
 net/ipv4/netfilter/nf_defrag_ipv4.c       |  7 ++-----
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c |  7 ++-----
 6 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index 6d80fc686323..2437b8a5d7a9 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -17,9 +17,6 @@ enum nf_br_hook_priorities {
 
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 
-#define BRNF_BRIDGED_DNAT		0x02
-#define BRNF_NF_BRIDGE_PREROUTING	0x08
-
 int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb);
 
 static inline void br_drop_fake_rtable(struct sk_buff *skb)
@@ -63,8 +60,17 @@ nf_bridge_get_physoutdev(const struct sk_buff *skb)
 {
 	return skb->nf_bridge ? skb->nf_bridge->physoutdev : NULL;
 }
+
+static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb)
+{
+	return skb->nf_bridge && skb->nf_bridge->in_prerouting;
+}
 #else
 #define br_drop_fake_rtable(skb)	        do { } while (0)
+static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb)
+{
+	return false;
+}
 #endif /* CONFIG_BRIDGE_NETFILTER */
 
 #endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d6cdd6e87d53..ac732e67a6c8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -173,17 +173,24 @@ struct nf_bridge_info {
 		BRNF_PROTO_8021Q,
 		BRNF_PROTO_PPPOE
 	} orig_proto:8;
-	bool			pkt_otherhost;
+	u8			pkt_otherhost:1;
+	u8			in_prerouting:1;
+	u8			bridged_dnat:1;
 	__u16			frag_max_size;
-	unsigned int		mask;
 	struct net_device	*physindev;
 	union {
-		struct net_device *physoutdev;
-		char neigh_header[8];
-	};
-	union {
+		/* prerouting: detect dnat in orig/reply direction */
 		__be32          ipv4_daddr;
 		struct in6_addr ipv6_daddr;
+
+		/* after prerouting + nat detected: store original source
+		 * mac since neigh resolution overwrites it, only used while
+		 * skb is out in neigh layer.
+		 */
+		char neigh_header[8];
+
+		/* always valid & non-NULL from FORWARD on, for physdev match */
+		struct net_device *physoutdev;
 	};
 };
 #endif
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index c8b9bcfe997e..ec51c2ba30e9 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -284,7 +284,7 @@ int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
 							 nf_bridge->neigh_header,
 							 ETH_HLEN-ETH_ALEN);
 			/* tell br_dev_xmit to continue with forwarding */
-			nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+			nf_bridge->bridged_dnat = 1;
 			/* FIXME Need to refragment */
 			ret = neigh->output(neigh, skb);
 		}
@@ -356,7 +356,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->pkt_otherhost = false;
 	}
-	nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
+	nf_bridge->in_prerouting = 0;
 	if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
 		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
 			struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -444,7 +444,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
 		nf_bridge->pkt_otherhost = true;
 	}
 
-	nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
+	nf_bridge->in_prerouting = 1;
 	nf_bridge->physindev = skb->dev;
 	skb->dev = brnf_get_logical_dev(skb, skb->dev);
 
@@ -850,10 +850,8 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
 				   struct sk_buff *skb,
 				   const struct nf_hook_state *state)
 {
-	if (skb->nf_bridge &&
-	    !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
+	if (skb->nf_bridge && !skb->nf_bridge->in_prerouting)
 		return NF_STOP;
-	}
 
 	return NF_ACCEPT;
 }
@@ -872,7 +870,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
 	skb_pull(skb, ETH_HLEN);
-	nf_bridge->mask &= ~BRNF_BRIDGED_DNAT;
+	nf_bridge->bridged_dnat = 0;
 
 	BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));
 
@@ -887,7 +885,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
 
 static int br_nf_dev_xmit(struct sk_buff *skb)
 {
-	if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
+	if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) {
 		br_nf_pre_routing_finish_bridge_slow(skb);
 		return 1;
 	}
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 13b7d1e3d185..77383bfe7ea3 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -174,7 +174,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->pkt_otherhost = false;
 	}
-	nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
+	nf_bridge->in_prerouting = 0;
 	if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) {
 		skb_dst_drop(skb);
 		v6ops->route_input(skb);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index c88b7d434718..b69e82bda215 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -49,12 +49,9 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 	if (skb->nfct)
 		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
 #endif
-
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	if (skb->nf_bridge &&
-	    skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+	if (nf_bridge_in_prerouting(skb))
 		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
+
 	if (hooknum == NF_INET_PRE_ROUTING)
 		return IP_DEFRAG_CONNTRACK_IN + zone;
 	else
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index a45db0b4785c..267fb8d5876e 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -39,12 +39,9 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 	if (skb->nfct)
 		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
 #endif
-
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	if (skb->nf_bridge &&
-	    skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+	if (nf_bridge_in_prerouting(skb))
 		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
+
 	if (hooknum == NF_INET_PRE_ROUTING)
 		return IP6_DEFRAG_CONNTRACK_IN + zone;
 	else
-- 
cgit v1.2.3-70-g09d2


From dd489240a21afc3ff3962aba5d987229536cae63 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 29 Jul 2015 11:32:20 +0200
Subject: KVM: document memory barriers for kvm->vcpus/kvm->online_vcpus

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 4 ++++
 virt/kvm/kvm_main.c      | 5 +++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bd1097a95704..81089cf1f0c1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -427,6 +427,10 @@ struct kvm {
 
 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 {
+	/* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case
+	 * the caller has read kvm->online_vcpus before (as is the case
+	 * for kvm_for_each_vcpu, for example).
+	 */
 	smp_rmb();
 	return kvm->vcpus[i];
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8dc4828f623f..d8db2f8fce9c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2206,6 +2206,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	}
 
 	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+
+	/*
+	 * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
+	 * before kvm->online_vcpu's incremented value.
+	 */
 	smp_wmb();
 	atomic_inc(&kvm->online_vcpus);
 
-- 
cgit v1.2.3-70-g09d2


From ffd9a0fcbbed300b55f84e8397e96c2edd06cbdf Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Tue, 28 Jul 2015 07:19:58 +0200
Subject: usb: gadget: add 'quirk_altset_not_supp' to usb_gadget

Due to some UDC controllers may not support altsettings, usb gadget layer
needs to provide a generic way to inform gadget functions about non-standard
hardware limitations.

This patch adds 'quirk_altset_not_supp' field to struct usb_gadget and helper
function gadget_is_altset_supported(). It also sets 'quirk_altset_not_supp'
to 1 in pxa25x_udc and pxa27x_udc drivers, which have such limitation.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/udc/pxa25x_udc.c |  1 +
 drivers/usb/gadget/udc/pxa27x_udc.c |  1 +
 include/linux/usb/gadget.h          | 11 +++++++++++
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/udc/pxa25x_udc.c b/drivers/usb/gadget/udc/pxa25x_udc.c
index f6cbe667ce39..27f944231477 100644
--- a/drivers/usb/gadget/udc/pxa25x_udc.c
+++ b/drivers/usb/gadget/udc/pxa25x_udc.c
@@ -1176,6 +1176,7 @@ static void udc_reinit(struct pxa25x_udc *dev)
 	INIT_LIST_HEAD (&dev->gadget.ep_list);
 	INIT_LIST_HEAD (&dev->gadget.ep0->ep_list);
 	dev->ep0state = EP0_IDLE;
+	dev->gadget.quirk_altset_not_supp = 1;
 
 	/* basic endpoint records init */
 	for (i = 0; i < PXA_UDC_NUM_ENDPOINTS; i++) {
diff --git a/drivers/usb/gadget/udc/pxa27x_udc.c b/drivers/usb/gadget/udc/pxa27x_udc.c
index 042f06b52677..670ac0b12f00 100644
--- a/drivers/usb/gadget/udc/pxa27x_udc.c
+++ b/drivers/usb/gadget/udc/pxa27x_udc.c
@@ -1710,6 +1710,7 @@ static void udc_init_data(struct pxa_udc *dev)
 	INIT_LIST_HEAD(&dev->gadget.ep_list);
 	INIT_LIST_HEAD(&dev->gadget.ep0->ep_list);
 	dev->udc_usb_ep[0].pxa_ep = &dev->pxa_ep[0];
+	dev->gadget.quirk_altset_not_supp = 1;
 	ep0_idle(dev);
 
 	/* PXA endpoints init */
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index cea0511a1bc9..31be84b7e645 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -572,6 +572,7 @@ struct usb_gadget {
 	unsigned			a_hnp_support:1;
 	unsigned			a_alt_hnp_support:1;
 	unsigned			quirk_ep_out_aligned_size:1;
+	unsigned			quirk_altset_not_supp:1;
 	unsigned			is_selfpowered:1;
 	unsigned			deactivated:1;
 	unsigned			connected:1;
@@ -609,6 +610,16 @@ usb_ep_align_maybe(struct usb_gadget *g, struct usb_ep *ep, size_t len)
 			round_up(len, (size_t)ep->desc->wMaxPacketSize);
 }
 
+/**
+ * gadget_is_altset_supported - return true iff the hardware supports
+ *	altsettings
+ * @g: controller to check for quirk
+ */
+static inline int gadget_is_altset_supported(struct usb_gadget *g)
+{
+	return !g->quirk_altset_not_supp;
+}
+
 /**
  * gadget_is_dualspeed - return true iff the hardware handles high speed
  * @g: controller that might support both high and full speeds
-- 
cgit v1.2.3-70-g09d2


From 02ded1b0d8e73dad7d2626c960ef20fb7dc30753 Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Tue, 28 Jul 2015 07:19:59 +0200
Subject: usb: gadget: add 'quirk_stall_not_supp' to usb_gadget

Due to some UDC controllers may not support stalling, usb gadget layer
needs to provide a generic way to inform gadget functions about non-standard
hardware limitations.

This patch adds 'quirk_stall_not_supp' field to struct usb_gadget and helper
function gadget_is_stall_supported(). It also sets 'quirk_stall_not_supp'
to 1 in at91_udc driver, which has such limitation.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/udc/at91_udc.c |  1 +
 include/linux/usb/gadget.h        | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/udc/at91_udc.c b/drivers/usb/gadget/udc/at91_udc.c
index fc4226462f8f..32f50a7944dd 100644
--- a/drivers/usb/gadget/udc/at91_udc.c
+++ b/drivers/usb/gadget/udc/at91_udc.c
@@ -825,6 +825,7 @@ static void udc_reinit(struct at91_udc *udc)
 
 	INIT_LIST_HEAD(&udc->gadget.ep_list);
 	INIT_LIST_HEAD(&udc->gadget.ep0->ep_list);
+	udc->gadget.quirk_stall_not_supp = 1;
 
 	for (i = 0; i < NUM_ENDPOINTS; i++) {
 		struct at91_ep *ep = &udc->ep[i];
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 31be84b7e645..f195a76548f6 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -573,6 +573,7 @@ struct usb_gadget {
 	unsigned			a_alt_hnp_support:1;
 	unsigned			quirk_ep_out_aligned_size:1;
 	unsigned			quirk_altset_not_supp:1;
+	unsigned			quirk_stall_not_supp:1;
 	unsigned			is_selfpowered:1;
 	unsigned			deactivated:1;
 	unsigned			connected:1;
@@ -620,6 +621,15 @@ static inline int gadget_is_altset_supported(struct usb_gadget *g)
 	return !g->quirk_altset_not_supp;
 }
 
+/**
+ * gadget_is_stall_supported - return true iff the hardware supports stalling
+ * @g: controller to check for quirk
+ */
+static inline int gadget_is_stall_supported(struct usb_gadget *g)
+{
+	return !g->quirk_stall_not_supp;
+}
+
 /**
  * gadget_is_dualspeed - return true iff the hardware handles high speed
  * @g: controller that might support both high and full speeds
-- 
cgit v1.2.3-70-g09d2


From ca1023c81dd10f76a5d0a8be2fdbe724fe7a126a Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Tue, 28 Jul 2015 07:20:00 +0200
Subject: usb: gadget: add 'quirk_zlp_not_supp' to usb_gadget

Due to some UDC controllers may not support zlp, usb gadget layer
needs to provide a generic way to inform gadget functions about non-standard
hardware limitations.

This patch adds 'quirk_zlp_not_supp' field to struct usb_gadget and helper
function gadget_is_zlp_supported(). It also sets 'quirk_zlp_not_supp'
to 1 in musb UDC driver, which has such limitation.

[ balbi@ti.com : make it build ]

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/musb/musb_gadget.c |  1 +
 include/linux/usb/gadget.h     | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/musb/musb_gadget.c b/drivers/usb/musb/musb_gadget.c
index 625d482f1a97..9e18178f1d45 100644
--- a/drivers/usb/musb/musb_gadget.c
+++ b/drivers/usb/musb/musb_gadget.c
@@ -2075,6 +2075,7 @@ __acquires(musb->lock)
 	musb->g.b_hnp_enable = 0;
 	musb->g.a_alt_hnp_support = 0;
 	musb->g.a_hnp_support = 0;
+	musb->g.quirk_zlp_not_supp = 1;
 
 	/* Normal reset, as B-Device;
 	 * or else after HNP, as A-Device
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index f195a76548f6..353a72096dda 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -574,6 +574,7 @@ struct usb_gadget {
 	unsigned			quirk_ep_out_aligned_size:1;
 	unsigned			quirk_altset_not_supp:1;
 	unsigned			quirk_stall_not_supp:1;
+	unsigned			quirk_zlp_not_supp:1;
 	unsigned			is_selfpowered:1;
 	unsigned			deactivated:1;
 	unsigned			connected:1;
@@ -630,6 +631,15 @@ static inline int gadget_is_stall_supported(struct usb_gadget *g)
 	return !g->quirk_stall_not_supp;
 }
 
+/**
+ * gadget_is_zlp_supported - return true iff the hardware supports zlp
+ * @g: controller to check for quirk
+ */
+static inline int gadget_is_zlp_supported(struct usb_gadget *g)
+{
+	return !g->quirk_zlp_not_supp;
+}
+
 /**
  * gadget_is_dualspeed - return true iff the hardware handles high speed
  * @g: controller that might support both high and full speeds
-- 
cgit v1.2.3-70-g09d2


From 6f98f545b0b4effdf67e83e214a4eb13cd41fba2 Mon Sep 17 00:00:00 2001
From: "Ivan T. Ivanov" <ivan.ivanov@linaro.org>
Date: Tue, 28 Jul 2015 11:10:22 +0300
Subject: usb: phy: msm: Add D+/D- lines route control

apq8016-sbc board is using Dual SPDT USB Switch (TC7USB40MU),
witch is controlled by GPIO to de/multiplex D+/D- USB lines to
USB2513B Hub and uB connector. Add support for this.

Signed-off-by: Ivan T. Ivanov <ivan.ivanov@linaro.org>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 .../devicetree/bindings/usb/msm-hsusb.txt          |  4 ++
 drivers/usb/phy/phy-msm-usb.c                      | 47 ++++++++++++++++++++++
 include/linux/usb/msm_hsusb.h                      |  7 ++++
 3 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/usb/msm-hsusb.txt b/Documentation/devicetree/bindings/usb/msm-hsusb.txt
index bd8d9e753029..8654a3ec23e4 100644
--- a/Documentation/devicetree/bindings/usb/msm-hsusb.txt
+++ b/Documentation/devicetree/bindings/usb/msm-hsusb.txt
@@ -52,6 +52,10 @@ Required properties:
 Optional properties:
 - dr_mode:      One of "host", "peripheral" or "otg". Defaults to "otg"
 
+- switch-gpio:  A phandle + gpio-specifier pair. Some boards are using Dual
+                SPDT USB Switch, witch is cotrolled by GPIO to de/multiplex
+                D+/D- USB lines between connectors.
+
 - qcom,phy-init-sequence: PHY configuration sequence values. This is related to Device
                 Mode Eye Diagram test. Start address at which these values will be
                 written is ULPI_EXT_VENDOR_SPECIFIC. Value of -1 is reserved as
diff --git a/drivers/usb/phy/phy-msm-usb.c b/drivers/usb/phy/phy-msm-usb.c
index 61d86d8bf5b7..c58c3c0dbe35 100644
--- a/drivers/usb/phy/phy-msm-usb.c
+++ b/drivers/usb/phy/phy-msm-usb.c
@@ -18,6 +18,7 @@
 
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/gpio/consumer.h>
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/slab.h>
@@ -32,6 +33,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
+#include <linux/reboot.h>
 #include <linux/reset.h>
 
 #include <linux/usb.h>
@@ -1471,6 +1473,14 @@ static int msm_otg_vbus_notifier(struct notifier_block *nb, unsigned long event,
 	else
 		clear_bit(B_SESS_VLD, &motg->inputs);
 
+	if (test_bit(B_SESS_VLD, &motg->inputs)) {
+		/* Switch D+/D- lines to Device connector */
+		gpiod_set_value_cansleep(motg->switch_gpio, 0);
+	} else {
+		/* Switch D+/D- lines to Hub */
+		gpiod_set_value_cansleep(motg->switch_gpio, 1);
+	}
+
 	schedule_work(&motg->sm_work);
 
 	return NOTIFY_DONE;
@@ -1546,6 +1556,11 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg)
 
 	motg->manual_pullup = of_property_read_bool(node, "qcom,manual-pullup");
 
+	motg->switch_gpio = devm_gpiod_get_optional(&pdev->dev, "switch",
+						    GPIOD_OUT_LOW);
+	if (IS_ERR(motg->switch_gpio))
+		return PTR_ERR(motg->switch_gpio);
+
 	ext_id = ERR_PTR(-ENODEV);
 	ext_vbus = ERR_PTR(-ENODEV);
 	if (of_property_read_bool(node, "extcon")) {
@@ -1617,6 +1632,19 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg)
 	return 0;
 }
 
+static int msm_otg_reboot_notify(struct notifier_block *this,
+				 unsigned long code, void *unused)
+{
+	struct msm_otg *motg = container_of(this, struct msm_otg, reboot);
+
+	/*
+	 * Ensure that D+/D- lines are routed to uB connector, so
+	 * we could load bootloader/kernel at next reboot
+	 */
+	gpiod_set_value_cansleep(motg->switch_gpio, 0);
+	return NOTIFY_DONE;
+}
+
 static int msm_otg_probe(struct platform_device *pdev)
 {
 	struct regulator_bulk_data regs[3];
@@ -1781,6 +1809,17 @@ static int msm_otg_probe(struct platform_device *pdev)
 			dev_dbg(&pdev->dev, "Can not create mode change file\n");
 	}
 
+	if (test_bit(B_SESS_VLD, &motg->inputs)) {
+		/* Switch D+/D- lines to Device connector */
+		gpiod_set_value_cansleep(motg->switch_gpio, 0);
+	} else {
+		/* Switch D+/D- lines to Hub */
+		gpiod_set_value_cansleep(motg->switch_gpio, 1);
+	}
+
+	motg->reboot.notifier_call = msm_otg_reboot_notify;
+	register_reboot_notifier(&motg->reboot);
+
 	pm_runtime_set_active(&pdev->dev);
 	pm_runtime_enable(&pdev->dev);
 
@@ -1807,6 +1846,14 @@ static int msm_otg_remove(struct platform_device *pdev)
 	if (phy->otg->host || phy->otg->gadget)
 		return -EBUSY;
 
+	unregister_reboot_notifier(&motg->reboot);
+
+	/*
+	 * Ensure that D+/D- lines are routed to uB connector, so
+	 * we could load bootloader/kernel at next reboot
+	 */
+	gpiod_set_value_cansleep(motg->switch_gpio, 0);
+
 	extcon_unregister_notifier(motg->id.extcon, EXTCON_USB_HOST, &motg->id.nb);
 	extcon_unregister_notifier(motg->vbus.extcon, EXTCON_USB, &motg->vbus.nb);
 
diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h
index 5df2c8f59aa0..8c8f6854c993 100644
--- a/include/linux/usb/msm_hsusb.h
+++ b/include/linux/usb/msm_hsusb.h
@@ -155,6 +155,10 @@ struct msm_usb_cable {
  *	starting controller using usbcmd run/stop bit.
  * @vbus: VBUS signal state trakining, using extcon framework
  * @id: ID signal state trakining, using extcon framework
+ * @switch_gpio: Descriptor for GPIO used to control external Dual
+ *               SPDT USB Switch.
+ * @reboot: Used to inform the driver to route USB D+/D- line to Device
+ *	    connector
  */
 struct msm_otg {
 	struct usb_phy phy;
@@ -188,6 +192,9 @@ struct msm_otg {
 
 	struct msm_usb_cable vbus;
 	struct msm_usb_cable id;
+
+	struct gpio_desc *switch_gpio;
+	struct notifier_block reboot;
 };
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 7b36f92934e40d1ee24e5617ddedb852e10086ca Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 30 Jul 2015 12:42:47 +0200
Subject: bpf: provide helper that indicates eBPF was migrated

During recent discussions we had with Michael, we found that it would
be useful to have an indicator that tells the JIT that an eBPF program
had been migrated from classic instructions into eBPF instructions, as
only in that case A and X need to be cleared in the prologue. Such eBPF
programs do not set a particular type, but all have BPF_PROG_TYPE_UNSPEC.
Thus, introduce a small helper for cde66c2d88da ("s390/bpf: Only clear
A and X for converted BPF programs") and possibly others in future.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/s390/net/bpf_jit_comp.c |  2 +-
 include/linux/filter.h       | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index bbbac6da37af..9f4bbc09bf07 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1245,7 +1245,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
 	jit->lit = jit->lit_start;
 	jit->prg = 0;
 
-	bpf_jit_prologue(jit, fp->type == BPF_PROG_TYPE_UNSPEC);
+	bpf_jit_prologue(jit, bpf_prog_was_classic(fp));
 	for (i = 0; i < fp->len; i += insn_count) {
 		insn_count = bpf_jit_insn(jit, fp, i);
 		if (insn_count < 0)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 69d00555ce35..6b025491120d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -354,6 +354,16 @@ static inline unsigned int bpf_prog_size(unsigned int proglen)
 		   offsetof(struct bpf_prog, insns[proglen]));
 }
 
+static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
+{
+	/* When classic BPF programs have been loaded and the arch
+	 * does not have a classic BPF JIT (anymore), they have been
+	 * converted via bpf_migrate_filter() to eBPF and thus always
+	 * have an unspec program type.
+	 */
+	return prog->type == BPF_PROG_TYPE_UNSPEC;
+}
+
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
 
 #ifdef CONFIG_DEBUG_SET_MODULE_RONX
-- 
cgit v1.2.3-70-g09d2


From b13138ef72178a13f34e33883f9f093f9e3b1bda Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 30 Jul 2015 12:42:49 +0200
Subject: bpf: also show process name/pid in bpf_jit_dump

It can be useful for testing to see the actual process/pid who is loading
a given filter. I was running some BPF test program and noticed unusual
filter loads from time to time, triggered by some other application in the
background. bpf_jit_disasm is still working after this change.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6b025491120d..fa2cab985e57 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -12,6 +12,7 @@
 #include <linux/linkage.h>
 #include <linux/printk.h>
 #include <linux/workqueue.h>
+#include <linux/sched.h>
 
 #include <asm/cacheflush.h>
 
@@ -438,8 +439,9 @@ void bpf_jit_free(struct bpf_prog *fp);
 static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 				u32 pass, void *image)
 {
-	pr_err("flen=%u proglen=%u pass=%u image=%pK\n",
-	       flen, proglen, pass, image);
+	pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen,
+	       proglen, pass, image, current->comm, task_pid_nr(current));
+
 	if (image)
 		print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
 			       16, 1, image, proglen, false);
-- 
cgit v1.2.3-70-g09d2


From 890e4847587fcff5eb0438e90992ad7d2a261f33 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Wed, 10 Jun 2015 16:54:58 +0800
Subject: PCI: Add pcibios_alloc_irq() and pcibios_free_irq()

Add pcibios_alloc_irq() and pcibios_free_irq(), which are called when
binding/unbinding PCI device drivers.

PCI arch code may implement these to manage IRQ resources for hotplugged
devices.

[bhelgaas: changelog]
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/pci-driver.c | 26 ++++++++++++++++++++------
 include/linux/pci.h      |  2 ++
 2 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 3cb2210de553..52a880ca1768 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -388,18 +388,31 @@ static int __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev)
 	return error;
 }
 
+int __weak pcibios_alloc_irq(struct pci_dev *dev)
+{
+	return 0;
+}
+
+void __weak pcibios_free_irq(struct pci_dev *dev)
+{
+}
+
 static int pci_device_probe(struct device *dev)
 {
-	int error = 0;
-	struct pci_driver *drv;
-	struct pci_dev *pci_dev;
+	int error;
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	struct pci_driver *drv = to_pci_driver(dev->driver);
+
+	error = pcibios_alloc_irq(pci_dev);
+	if (error < 0)
+		return error;
 
-	drv = to_pci_driver(dev->driver);
-	pci_dev = to_pci_dev(dev);
 	pci_dev_get(pci_dev);
 	error = __pci_device_probe(drv, pci_dev);
-	if (error)
+	if (error) {
+		pcibios_free_irq(pci_dev);
 		pci_dev_put(pci_dev);
+	}
 
 	return error;
 }
@@ -415,6 +428,7 @@ static int pci_device_remove(struct device *dev)
 			drv->remove(pci_dev);
 			pm_runtime_put_noidle(dev);
 		}
+		pcibios_free_irq(pci_dev);
 		pci_dev->driver = NULL;
 	}
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a0321a8fb59..b4832c92f23e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1645,6 +1645,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev,
 int pcibios_add_device(struct pci_dev *dev);
 void pcibios_release_device(struct pci_dev *dev);
 void pcibios_penalize_isa_irq(int irq, int active);
+int pcibios_alloc_irq(struct pci_dev *dev);
+void pcibios_free_irq(struct pci_dev *dev);
 
 #ifdef CONFIG_HIBERNATE_CALLBACKS
 extern struct dev_pm_ops pcibios_pm_ops;
-- 
cgit v1.2.3-70-g09d2


From 811a4e6fce09bc9239c664c6a1a53645a678c303 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Wed, 10 Jun 2015 16:55:00 +0800
Subject: PCI: Add helpers to manage pci_dev->irq and pci_dev->irq_managed

Add pci_has_managed_irq(), pci_set_managed_irq(), and
pci_reset_managed_irq() to simplify code.  No functional change.

[bhelgaas: changelog]
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/pci/intel_mid_pci.c |  4 ++--
 arch/x86/pci/irq.c           | 10 ++++------
 drivers/acpi/pci_irq.c       | 10 ++++------
 include/linux/pci.h          | 17 +++++++++++++++++
 4 files changed, 27 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index fb7a1f96d80c..22aaefb4f1ca 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -211,7 +211,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 	struct irq_alloc_info info;
 	int polarity;
 
-	if (dev->irq_managed && dev->irq > 0)
+	if (pci_has_managed_irq(dev))
 		return 0;
 
 	if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
@@ -234,7 +234,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 
 static void intel_mid_pci_irq_disable(struct pci_dev *dev)
 {
-	if (dev->irq_managed && dev->irq > 0) {
+	if (pci_has_managed_irq(dev)) {
 		mp_unmap_irq(dev->irq);
 		dev->irq_managed = 0;
 		/*
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 72108f0b66b1..32e70343e6fd 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1202,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
 			struct pci_dev *temp_dev;
 			int irq;
 
-			if (dev->irq_managed && dev->irq > 0)
+			if (pci_has_managed_irq(dev))
 				return 0;
 
 			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
@@ -1230,8 +1230,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
 			}
 			dev = temp_dev;
 			if (irq >= 0) {
-				dev->irq_managed = 1;
-				dev->irq = irq;
+				pci_set_managed_irq(dev, irq);
 				dev_info(&dev->dev, "PCI->APIC IRQ transform: "
 					 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
 				return 0;
@@ -1259,9 +1258,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
 
 static void pirq_disable_irq(struct pci_dev *dev)
 {
-	if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) {
+	if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) {
 		mp_unmap_irq(dev->irq);
-		dev->irq = 0;
-		dev->irq_managed = 0;
+		pci_reset_managed_irq(dev);
 	}
 }
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index d1aad6900b4c..afa16c557c17 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -412,7 +412,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
 		return 0;
 	}
 
-	if (dev->irq_managed && dev->irq > 0)
+	if (pci_has_managed_irq(dev))
 		return 0;
 
 	entry = acpi_pci_irq_lookup(dev, pin);
@@ -457,8 +457,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
 		kfree(entry);
 		return rc;
 	}
-	dev->irq = rc;
-	dev->irq_managed = 1;
+	pci_set_managed_irq(dev, rc);
 
 	if (link)
 		snprintf(link_desc, sizeof(link_desc), " -> Link[%s]", link);
@@ -481,7 +480,7 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
 	u8 pin;
 
 	pin = dev->pin;
-	if (!pin || !dev->irq_managed || dev->irq <= 0)
+	if (!pin || !pci_has_managed_irq(dev))
 		return;
 
 	entry = acpi_pci_irq_lookup(dev, pin);
@@ -503,7 +502,6 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
 	dev_dbg(&dev->dev, "PCI INT %c disabled\n", pin_name(pin));
 	if (gsi >= 0) {
 		acpi_unregister_gsi(gsi);
-		dev->irq_managed = 0;
-		dev->irq = 0;
+		pci_reset_managed_irq(dev);
 	}
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b4832c92f23e..b7ab0c424ed6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -963,6 +963,23 @@ static inline int pci_is_managed(struct pci_dev *pdev)
 	return pdev->is_managed;
 }
 
+static inline void pci_set_managed_irq(struct pci_dev *pdev, unsigned int irq)
+{
+	pdev->irq = irq;
+	pdev->irq_managed = 1;
+}
+
+static inline void pci_reset_managed_irq(struct pci_dev *pdev)
+{
+	pdev->irq = 0;
+	pdev->irq_managed = 0;
+}
+
+static inline bool pci_has_managed_irq(struct pci_dev *pdev)
+{
+	return pdev->irq_managed && pdev->irq > 0;
+}
+
 void pci_disable_device(struct pci_dev *dev);
 
 extern unsigned int pcibios_max_latency;
-- 
cgit v1.2.3-70-g09d2


From 67546762978f523749eac157903e0b01c18e083a Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Fri, 17 Jul 2015 17:16:31 +0800
Subject: PCI: Protect pci_bus->slots with pci_slot_mutex, not pci_bus_sem

Rajat Jain reported a deadlock when PCIe hot-add and AER recovery happen at
the same time:

thread 1:

  pciehp_enable_slot
    pciehp_configure_device
      pci_bus_add_devices
        pci_bus_add_device
          device_attach
            device_lock(dev)                       # acquire device lock
            ...
            pciehp_probe
              init_slot
                pci_hp_register
                  pci_create_slot
                    down_write(pci_bus_sem)        # deadlock here

thread 2:

  aer_isr_one_error
    aer_process_err_device
      do_recovery
        broadcast_error_message(..., report_error_detected)
          pci_walk_bus(..., cb=report_error_detected, ...)
            down_read(&pci_bus_sem)                # acquire pci_bus_sem
            report_error_detected(dev)             # cb()
              device_lock(dev)                     # deadlock here

Previously, the bus->devices and bus->slots list were protected by
pci_bus_sem.  In pci_create_slot(), we held it for writing so we could
add to the bus->slots list.

Add a new local pci_slot_mutex to protect bus->slots.  Hold pci_bus_sem for
reading while searching the bus->devices list.

[bhelgaas: changelog]
Link: http://lkml.kernel.org/r/CAA93t1qpPqbih+UB0McA_d_+2rVaNkXsinAUxYzK9+JXSS+L-g@mail.gmail.com
Reported-by: Rajat Jain <rajatja@google.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/slot.c  | 18 +++++++++++-------
 include/linux/pci.h |  3 ++-
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index 396c200b9ddb..4bd3fce93fa4 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -14,6 +14,7 @@
 
 struct kset *pci_slots_kset;
 EXPORT_SYMBOL_GPL(pci_slots_kset);
+static DEFINE_MUTEX(pci_slot_mutex);
 
 static ssize_t pci_slot_attr_show(struct kobject *kobj,
 					struct attribute *attr, char *buf)
@@ -106,9 +107,11 @@ static void pci_slot_release(struct kobject *kobj)
 	dev_dbg(&slot->bus->dev, "dev %02x, released physical slot %s\n",
 		slot->number, pci_slot_name(slot));
 
+	down_read(&pci_bus_sem);
 	list_for_each_entry(dev, &slot->bus->devices, bus_list)
 		if (PCI_SLOT(dev->devfn) == slot->number)
 			dev->slot = NULL;
+	up_read(&pci_bus_sem);
 
 	list_del(&slot->list);
 
@@ -194,9 +197,8 @@ static int rename_slot(struct pci_slot *slot, const char *name)
 static struct pci_slot *get_slot(struct pci_bus *parent, int slot_nr)
 {
 	struct pci_slot *slot;
-	/*
-	 * We already hold pci_bus_sem so don't worry
-	 */
+
+	/* We already hold pci_slot_mutex */
 	list_for_each_entry(slot, &parent->slots, list)
 		if (slot->number == slot_nr) {
 			kobject_get(&slot->kobj);
@@ -253,7 +255,7 @@ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
 	int err = 0;
 	char *slot_name = NULL;
 
-	down_write(&pci_bus_sem);
+	mutex_lock(&pci_slot_mutex);
 
 	if (slot_nr == -1)
 		goto placeholder;
@@ -301,16 +303,18 @@ placeholder:
 	INIT_LIST_HEAD(&slot->list);
 	list_add(&slot->list, &parent->slots);
 
+	down_read(&pci_bus_sem);
 	list_for_each_entry(dev, &parent->devices, bus_list)
 		if (PCI_SLOT(dev->devfn) == slot_nr)
 			dev->slot = slot;
+	up_read(&pci_bus_sem);
 
 	dev_dbg(&parent->dev, "dev %02x, created physical slot %s\n",
 		slot_nr, pci_slot_name(slot));
 
 out:
 	kfree(slot_name);
-	up_write(&pci_bus_sem);
+	mutex_unlock(&pci_slot_mutex);
 	return slot;
 err:
 	kfree(slot);
@@ -332,9 +336,9 @@ void pci_destroy_slot(struct pci_slot *slot)
 	dev_dbg(&slot->bus->dev, "dev %02x, dec refcount to %d\n",
 		slot->number, atomic_read(&slot->kobj.kref.refcount) - 1);
 
-	down_write(&pci_bus_sem);
+	mutex_lock(&pci_slot_mutex);
 	kobject_put(&slot->kobj);
-	up_write(&pci_bus_sem);
+	mutex_unlock(&pci_slot_mutex);
 }
 EXPORT_SYMBOL_GPL(pci_destroy_slot);
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a0321a8fb59..aaee493174e2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -446,7 +446,8 @@ struct pci_bus {
 	struct list_head children;	/* list of child buses */
 	struct list_head devices;	/* list of devices on this bus */
 	struct pci_dev	*self;		/* bridge device as seen by parent */
-	struct list_head slots;		/* list of slots on this bus */
+	struct list_head slots;		/* list of slots on this bus;
+					   protected by pci_slot_mutex */
 	struct resource *resource[PCI_BRIDGE_RESOURCE_NUM];
 	struct list_head resources;	/* address space routed to this bus */
 	struct resource busn_res;	/* bus numbers routed to this bus */
-- 
cgit v1.2.3-70-g09d2


From 017ffe64e8b8c8db0f50433a71da41c6a4e12710 Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Fri, 17 Jul 2015 17:16:32 +0800
Subject: PCI: Hold pci_slot_mutex while searching bus->slots list

Previously, pci_setup_device() and similar functions searched the
pci_bus->slots list without any locking.  It was possible for another
thread to update the list while we searched it.

Add pci_dev_assign_slot() to search the list while holding pci_slot_mutex.

[bhelgaas: changelog, fold in CONFIG_SYSFS fix]
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/powerpc/kernel/pci_of_scan.c |  6 +-----
 arch/sparc/kernel/pci.c           |  6 +-----
 drivers/pci/probe.c               |  6 +-----
 drivers/pci/slot.c                | 11 +++++++++++
 include/linux/pci.h               |  5 +++++
 5 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
index 42e02a2d570b..5e2debfc6ce5 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -126,7 +126,6 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
 {
 	struct pci_dev *dev;
 	const char *type;
-	struct pci_slot *slot;
 
 	dev = pci_alloc_dev(bus);
 	if (!dev)
@@ -145,10 +144,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
 	dev->needs_freset = 0;		/* pcie fundamental reset required */
 	set_pcie_port_type(dev);
 
-	list_for_each_entry(slot, &dev->bus->slots, list)
-		if (PCI_SLOT(dev->devfn) == slot->number)
-			dev->slot = slot;
-
+	pci_dev_assign_slot(dev);
 	dev->vendor = get_int_prop(node, "vendor-id", 0xffff);
 	dev->device = get_int_prop(node, "device-id", 0xffff);
 	dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0);
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index c928bc64b4ba..3a0e1a986bfe 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -249,7 +249,6 @@ static struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm,
 					 struct pci_bus *bus, int devfn)
 {
 	struct dev_archdata *sd;
-	struct pci_slot *slot;
 	struct platform_device *op;
 	struct pci_dev *dev;
 	const char *type;
@@ -290,10 +289,7 @@ static struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm,
 	dev->multifunction = 0;		/* maybe a lie? */
 	set_pcie_port_type(dev);
 
-	list_for_each_entry(slot, &dev->bus->slots, list)
-		if (PCI_SLOT(dev->devfn) == slot->number)
-			dev->slot = slot;
-
+	pci_dev_assign_slot(dev);
 	dev->vendor = of_getintprop_default(node, "vendor-id", 0xffff);
 	dev->device = of_getintprop_default(node, "device-id", 0xffff);
 	dev->subsystem_vendor =
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index cefd636681b6..2a9ce16cb374 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1133,7 +1133,6 @@ int pci_setup_device(struct pci_dev *dev)
 {
 	u32 class;
 	u8 hdr_type;
-	struct pci_slot *slot;
 	int pos = 0;
 	struct pci_bus_region region;
 	struct resource *res;
@@ -1149,10 +1148,7 @@ int pci_setup_device(struct pci_dev *dev)
 	dev->error_state = pci_channel_io_normal;
 	set_pcie_port_type(dev);
 
-	list_for_each_entry(slot, &dev->bus->slots, list)
-		if (PCI_SLOT(dev->devfn) == slot->number)
-			dev->slot = slot;
-
+	pci_dev_assign_slot(dev);
 	/* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer)
 	   set this higher, assuming the system even supports it.  */
 	dev->dma_mask = 0xffffffff;
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index 4bd3fce93fa4..429d34c348b9 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -194,6 +194,17 @@ static int rename_slot(struct pci_slot *slot, const char *name)
 	return result;
 }
 
+void pci_dev_assign_slot(struct pci_dev *dev)
+{
+	struct pci_slot *slot;
+
+	mutex_lock(&pci_slot_mutex);
+	list_for_each_entry(slot, &dev->bus->slots, list)
+		if (PCI_SLOT(dev->devfn) == slot->number)
+			dev->slot = slot;
+	mutex_unlock(&pci_slot_mutex);
+}
+
 static struct pci_slot *get_slot(struct pci_bus *parent, int slot_nr)
 {
 	struct pci_slot *slot;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index aaee493174e2..b3ba7fef2916 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -798,6 +798,11 @@ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
 				 const char *name,
 				 struct hotplug_slot *hotplug);
 void pci_destroy_slot(struct pci_slot *slot);
+#ifdef CONFIG_SYSFS
+void pci_dev_assign_slot(struct pci_dev *dev);
+#else
+static inline void pci_dev_assign_slot(struct pci_dev *dev) { }
+#endif
 int pci_scan_slot(struct pci_bus *bus, int devfn);
 struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn);
 void pci_device_add(struct pci_dev *dev, struct pci_bus *bus);
-- 
cgit v1.2.3-70-g09d2


From 8013d1d7eafb0589ca766db6b74026f76b7f5cb4 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 30 Jul 2015 14:28:42 +0800
Subject: net/ipv6: add sysctl option accept_ra_min_hop_limit

Commit 6fd99094de2b ("ipv6: Don't reduce hop limit for an interface")
disabled accept hop limit from RA if it is smaller than the current hop
limit for security stuff. But this behavior kind of break the RFC definition.

RFC 4861, 6.3.4.  Processing Received Router Advertisements
   A Router Advertisement field (e.g., Cur Hop Limit, Reachable Time,
   and Retrans Timer) may contain a value denoting that it is
   unspecified.  In such cases, the parameter should be ignored and the
   host should continue using whatever value it is already using.

   If the received Cur Hop Limit value is non-zero, the host SHOULD set
   its CurHopLimit variable to the received value.

So add sysctl option accept_ra_min_hop_limit to let user choose the minimum
hop limit value they can accept from RA. And set default to 1 to meet RFC
standards.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: YOSHIFUJI Hideaki <hideaki.yoshifuji@miraclelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  8 ++++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 10 ++++++++++
 net/ipv6/ndisc.c                       | 16 +++++++---------
 5 files changed, 27 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1a5ab21bcca5..00d26d919459 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1340,6 +1340,14 @@ accept_ra_from_local - BOOLEAN
 	   disabled if accept_ra_from_local is disabled
                on a specific interface.
 
+accept_ra_min_hop_limit - INTEGER
+	Minimum hop limit Information in Router Advertisement.
+
+	Hop limit Information in Router Advertisement less than this
+	variable shall be ignored.
+
+	Default: 1
+
 accept_ra_pinfo - BOOLEAN
 	Learn Prefix Information in Router Advertisement.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 06ed637225b8..cb9dcad72372 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -29,6 +29,7 @@ struct ipv6_devconf {
 	__s32		max_desync_factor;
 	__s32		max_addresses;
 	__s32		accept_ra_defrtr;
+	__s32		accept_ra_min_hop_limit;
 	__s32		accept_ra_pinfo;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	__s32		accept_ra_rtr_pref;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 641a146ead7d..80f3b74446a1 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -172,6 +172,7 @@ enum {
 	DEVCONF_ACCEPT_RA_MTU,
 	DEVCONF_STABLE_SECRET,
 	DEVCONF_USE_OIF_ADDRS_ONLY,
+	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index eb0c6a3a8a00..53e3a9d756b0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
 	.accept_ra_from_local	= 0,
+	.accept_ra_min_hop_limit= 1,
 	.accept_ra_pinfo	= 1,
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	.accept_ra_rtr_pref	= 1,
@@ -237,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
 	.accept_ra_from_local	= 0,
+	.accept_ra_min_hop_limit= 1,
 	.accept_ra_pinfo	= 1,
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	.accept_ra_rtr_pref	= 1,
@@ -4588,6 +4590,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
 	array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
 	array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+	array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit;
 	array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
@@ -5484,6 +5487,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname	= "accept_ra_min_hop_limit",
+			.data		= &ipv6_devconf.accept_ra_min_hop_limit,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			.procname	= "accept_ra_pinfo",
 			.data		= &ipv6_devconf.accept_ra_pinfo,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0a05b35a90fc..6e184e02fd3c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1225,18 +1225,16 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 
 	if (rt)
 		rt6_set_expires(rt, jiffies + (HZ * lifetime));
-	if (ra_msg->icmph.icmp6_hop_limit) {
-		/* Only set hop_limit on the interface if it is higher than
-		 * the current hop_limit.
-		 */
-		if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) {
+	if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
+	    ra_msg->icmph.icmp6_hop_limit) {
+		if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
 			in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+			if (rt)
+				dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
+					       ra_msg->icmph.icmp6_hop_limit);
 		} else {
-			ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n");
+			ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
 		}
-		if (rt)
-			dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
-				       ra_msg->icmph.icmp6_hop_limit);
 	}
 
 skip_defrtr:
-- 
cgit v1.2.3-70-g09d2


From 97da89767d398c1dfa1f34e5f312eb8ebb382f7f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 15:40:16 +0200
Subject: uprobes: Export 'struct return_instance', introduce
 arch_uretprobe_is_alive()

Add the new "weak" helper, arch_uretprobe_is_alive(), used by
the next patches. It should return true if this return_instance
is still valid. The arch agnostic version just always returns
true.

The patch exports "struct return_instance" for the architectures
which want to override this hook. We can also cleanup
prepare_uretprobe() if we pass the new return_instance to
arch_uretprobe_hijack_return_addr().

Tested-by: Pratyush Anand <panand@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Anton Arapov <arapov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150721134016.GA4762@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/uprobes.h | 10 ++++++++++
 kernel/events/uprobes.c | 14 +++++---------
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 60beb5dc7977..50d2764d66a8 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -92,6 +92,15 @@ struct uprobe_task {
 	unsigned int			depth;
 };
 
+struct return_instance {
+	struct uprobe		*uprobe;
+	unsigned long		func;
+	unsigned long		orig_ret_vaddr; /* original return address */
+	bool			chained;	/* true, if instance is nested */
+
+	struct return_instance	*next;		/* keep as stack */
+};
+
 struct xol_area;
 
 struct uprobes_state {
@@ -128,6 +137,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
 extern int  arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data);
 extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs);
+extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs);
 extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 					 void *src, unsigned long len);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 98e4d97b8c31..1c71b6242a7e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -86,15 +86,6 @@ struct uprobe {
 	struct arch_uprobe	arch;
 };
 
-struct return_instance {
-	struct uprobe		*uprobe;
-	unsigned long		func;
-	unsigned long		orig_ret_vaddr; /* original return address */
-	bool			chained;	/* true, if instance is nested */
-
-	struct return_instance	*next;		/* keep as stack */
-};
-
 /*
  * Execute out of line area: anonymous executable mapping installed
  * by the probed task to execute the copy of the original instruction
@@ -1818,6 +1809,11 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
 	return false;
 }
 
+bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs)
+{
+	return true;
+}
+
 /*
  * Run handler and ask thread to singlestep.
  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
-- 
cgit v1.2.3-70-g09d2


From 7b868e4802a86d867aad1be0471b5767d9c20e10 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 15:40:18 +0200
Subject: uprobes/x86: Reimplement arch_uretprobe_is_alive()

Add the x86 specific version of arch_uretprobe_is_alive()
helper. It returns true if the stack frame mangled by
prepare_uretprobe() is still on stack. So if it returns false,
we know that the probed function has already returned.

We add the new return_instance->stack member and change the
generic code to initialize it in prepare_uretprobe, but it
should be equally useful for other architectures.

TODO: this assumes that the probed application can't use
      multiple stacks (say sigaltstack). We will try to improve
      this logic later.

Tested-by: Pratyush Anand <panand@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Anton Arapov <arapov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150721134018.GA4766@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/uprobes.c | 5 +++++
 include/linux/uprobes.h   | 1 +
 kernel/events/uprobes.c   | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 66476244731e..58e9b842633f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -985,3 +985,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 
 	return -1;
 }
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs)
+{
+	return regs->sp <= ret->stack;
+}
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 50d2764d66a8..7ab6d2c8be49 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -95,6 +95,7 @@ struct uprobe_task {
 struct return_instance {
 	struct uprobe		*uprobe;
 	unsigned long		func;
+	unsigned long		stack;		/* stack pointer */
 	unsigned long		orig_ret_vaddr; /* original return address */
 	bool			chained;	/* true, if instance is nested */
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1c71b6242a7e..c5f316e06dc0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1562,6 +1562,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 
 	ri->uprobe = get_uprobe(uprobe);
 	ri->func = instruction_pointer(regs);
+	ri->stack = user_stack_pointer(regs);
 	ri->orig_ret_vaddr = orig_ret_vaddr;
 	ri->chained = chained;
 
-- 
cgit v1.2.3-70-g09d2


From 86dcb702e74b8ab7d3b2d36984ef00671cea73b9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 15:40:26 +0200
Subject: uprobes: Add the "enum rp_check ctx" arg to arch_uretprobe_is_alive()

arch/x86 doesn't care (so far), but as Pratyush Anand pointed
out other architectures might want why arch_uretprobe_is_alive()
was called and use different checks depending on the context.
Add the new argument to distinguish 2 callers.

Tested-by: Pratyush Anand <panand@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Anton Arapov <arapov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150721134026.GA4779@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/uprobes.c | 3 ++-
 include/linux/uprobes.h   | 7 ++++++-
 kernel/events/uprobes.c   | 9 ++++++---
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 58e9b842633f..acf8b9010bbf 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -986,7 +986,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 	return -1;
 }
 
-bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs)
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+				struct pt_regs *regs)
 {
 	return regs->sp <= ret->stack;
 }
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 7ab6d2c8be49..c0a540239ab6 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -102,6 +102,11 @@ struct return_instance {
 	struct return_instance	*next;		/* keep as stack */
 };
 
+enum rp_check {
+	RP_CHECK_CALL,
+	RP_CHECK_RET,
+};
+
 struct xol_area;
 
 struct uprobes_state {
@@ -138,7 +143,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
 extern int  arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data);
 extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs);
-extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs);
+extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs);
 extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 					 void *src, unsigned long len);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7e61c8ca27e0..df5661a44e35 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1514,7 +1514,9 @@ static unsigned long get_trampoline_vaddr(void)
 static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs)
 {
 	struct return_instance *ri = utask->return_instances;
-	while (ri && !arch_uretprobe_is_alive(ri, regs)) {
+	enum rp_check ctx = RP_CHECK_CALL;
+
+	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
 		ri = free_ret_instance(ri);
 		utask->depth--;
 	}
@@ -1805,7 +1807,7 @@ static void handle_trampoline(struct pt_regs *regs)
 		 * could hit this trampoline on return. TODO: sigaltstack().
 		 */
 		next = find_next_ret_chain(ri);
-		valid = !next || arch_uretprobe_is_alive(next, regs);
+		valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
 
 		instruction_pointer_set(regs, ri->orig_ret_vaddr);
 		do {
@@ -1830,7 +1832,8 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
 	return false;
 }
 
-bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs)
+bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+					struct pt_regs *regs)
 {
 	return true;
 }
-- 
cgit v1.2.3-70-g09d2


From db087ef69a2b155ae001665bf0b3806abde7ee34 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 15:40:28 +0200
Subject: uprobes/x86: Make arch_uretprobe_is_alive(RP_CHECK_CALL) more clever

The previous change documents that cleanup_return_instances()
can't always detect the dead frames, the stack can grow. But
there is one special case which imho worth fixing:
arch_uretprobe_is_alive() can return true when the stack didn't
actually grow, but the next "call" insn uses the already
invalidated frame.

Test-case:

	#include <stdio.h>
	#include <setjmp.h>

	jmp_buf jmp;
	int nr = 1024;

	void func_2(void)
	{
		if (--nr == 0)
			return;
		longjmp(jmp, 1);
	}

	void func_1(void)
	{
		setjmp(jmp);
		func_2();
	}

	int main(void)
	{
		func_1();
		return 0;
	}

If you ret-probe func_1() and func_2() prepare_uretprobe() hits
the MAX_URETPROBE_DEPTH limit and "return" from func_2() is not
reported.

When we know that the new call is not chained, we can do the
more strict check. In this case "sp" points to the new ret-addr,
so every frame which uses the same "sp" must be dead. The only
complication is that arch_uretprobe_is_alive() needs to know was
it chained or not, so we add the new RP_CHECK_CHAIN_CALL enum
and change prepare_uretprobe() to pass RP_CHECK_CALL only if
!chained.

Note: arch_uretprobe_is_alive() could also re-read *sp and check
if this word is still trampoline_vaddr. This could obviously
improve the logic, but I would like to avoid another
copy_from_user() especially in the case when we can't avoid the
false "alive == T" positives.

Tested-by: Pratyush Anand <panand@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Anton Arapov <arapov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150721134028.GA4786@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/uprobes.c |  5 ++++-
 include/linux/uprobes.h   |  1 +
 kernel/events/uprobes.c   | 14 +++++++-------
 3 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index acf8b9010bbf..bf4db6eaec8f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -989,5 +989,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
 				struct pt_regs *regs)
 {
-	return regs->sp <= ret->stack;
+	if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */
+		return regs->sp < ret->stack;
+	else
+		return regs->sp <= ret->stack;
 }
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index c0a540239ab6..0bdc72f36905 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -104,6 +104,7 @@ struct return_instance {
 
 enum rp_check {
 	RP_CHECK_CALL,
+	RP_CHECK_CHAIN_CALL,
 	RP_CHECK_RET,
 };
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index df5661a44e35..0f370ef57a02 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1511,10 +1511,11 @@ static unsigned long get_trampoline_vaddr(void)
 	return trampoline_vaddr;
 }
 
-static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs)
+static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
+					struct pt_regs *regs)
 {
 	struct return_instance *ri = utask->return_instances;
-	enum rp_check ctx = RP_CHECK_CALL;
+	enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
 
 	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
 		ri = free_ret_instance(ri);
@@ -1528,7 +1529,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 	struct return_instance *ri;
 	struct uprobe_task *utask;
 	unsigned long orig_ret_vaddr, trampoline_vaddr;
-	bool chained = false;
+	bool chained;
 
 	if (!get_xol_area())
 		return;
@@ -1554,14 +1555,15 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 		goto fail;
 
 	/* drop the entries invalidated by longjmp() */
-	cleanup_return_instances(utask, regs);
+	chained = (orig_ret_vaddr == trampoline_vaddr);
+	cleanup_return_instances(utask, chained, regs);
 
 	/*
 	 * We don't want to keep trampoline address in stack, rather keep the
 	 * original return address of first caller thru all the consequent
 	 * instances. This also makes breakpoint unwrapping easier.
 	 */
-	if (orig_ret_vaddr == trampoline_vaddr) {
+	if (chained) {
 		if (!utask->return_instances) {
 			/*
 			 * This situation is not possible. Likely we have an
@@ -1570,8 +1572,6 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 			uprobe_warn(current, "handle tail call");
 			goto fail;
 		}
-
-		chained = true;
 		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From fa8ad7889d83bcf0a6cdbf6d3622f3ec019cde14 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 6 Jul 2015 12:23:53 +0100
Subject: arm: perf: factor arm_pmu core out to drivers

To enable sharing of the arm_pmu code with arm64, this patch factors it
out to drivers/perf/. A new drivers/perf directory is added for
performance monitor drivers to live under.

MAINTAINERS is updated accordingly. Files added previously without a
corresponsing MAINTAINERS update (perf_regs.c, perf_callchain.c, and
perf_event.h) are also added.

Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
[will: augmented Kconfig help slightly]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 MAINTAINERS                         |   6 +-
 arch/arm/Kconfig                    |   8 +-
 arch/arm/include/asm/pmu.h          | 154 ------
 arch/arm/kernel/Makefile            |   3 +-
 arch/arm/kernel/perf_event.c        | 921 ------------------------------------
 arch/arm/kernel/perf_event_v6.c     |   2 +-
 arch/arm/kernel/perf_event_v7.c     |   2 +-
 arch/arm/kernel/perf_event_xscale.c |   2 +-
 arch/arm/mach-ux500/cpu-db8500.c    |   2 +-
 drivers/Kconfig                     |   2 +
 drivers/Makefile                    |   1 +
 drivers/perf/Kconfig                |  15 +
 drivers/perf/Makefile               |   1 +
 drivers/perf/arm_pmu.c              | 921 ++++++++++++++++++++++++++++++++++++
 include/linux/perf/arm_pmu.h        | 154 ++++++
 15 files changed, 1105 insertions(+), 1089 deletions(-)
 delete mode 100644 arch/arm/include/asm/pmu.h
 delete mode 100644 arch/arm/kernel/perf_event.c
 create mode 100644 drivers/perf/Kconfig
 create mode 100644 drivers/perf/Makefile
 create mode 100644 drivers/perf/arm_pmu.c
 create mode 100644 include/linux/perf/arm_pmu.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index fd6078443083..485c92ced47d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -799,11 +799,13 @@ F:	arch/arm/include/asm/floppy.h
 ARM PMU PROFILING AND DEBUGGING
 M:	Will Deacon <will.deacon@arm.com>
 S:	Maintained
-F:	arch/arm/kernel/perf_event*
+F:	arch/arm/kernel/perf_*
 F:	arch/arm/oprofile/common.c
-F:	arch/arm/include/asm/pmu.h
 F:	arch/arm/kernel/hw_breakpoint.c
 F:	arch/arm/include/asm/hw_breakpoint.h
+F:	arch/arm/include/asm/perf_event.h
+F:	drivers/perf/arm_pmu.c
+F:	include/linux/perf/arm_pmu.h
 
 ARM PORT
 M:	Russell King <linux@arm.linux.org.uk>
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1c5021002fe4..4f7bc3d4b186 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1701,12 +1701,8 @@ config HIGHPTE
 	  user-space 2nd level page tables to reside in high memory.
 
 config HW_PERF_EVENTS
-	bool "Enable hardware performance counter support for perf events"
-	depends on PERF_EVENTS
-	default y
-	help
-	  Enable hardware performance counter support for perf events. If
-	  disabled, perf events will use software events only.
+	def_bool y
+	depends on ARM_PMU
 
 config SYS_SUPPORTS_HUGETLBFS
        def_bool y
diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
deleted file mode 100644
index 3fc87dfd77e6..000000000000
--- a/arch/arm/include/asm/pmu.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- *  linux/arch/arm/include/asm/pmu.h
- *
- *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#ifndef __ARM_PMU_H__
-#define __ARM_PMU_H__
-
-#include <linux/interrupt.h>
-#include <linux/perf_event.h>
-
-#include <asm/cputype.h>
-
-/*
- * struct arm_pmu_platdata - ARM PMU platform data
- *
- * @handle_irq: an optional handler which will be called from the
- *	interrupt and passed the address of the low level handler,
- *	and can be used to implement any platform specific handling
- *	before or after calling it.
- */
-struct arm_pmu_platdata {
-	irqreturn_t (*handle_irq)(int irq, void *dev,
-				  irq_handler_t pmu_handler);
-};
-
-#ifdef CONFIG_HW_PERF_EVENTS
-
-/*
- * The ARMv7 CPU PMU supports up to 32 event counters.
- */
-#define ARMPMU_MAX_HWEVENTS		32
-
-#define HW_OP_UNSUPPORTED		0xFFFF
-#define C(_x)				PERF_COUNT_HW_CACHE_##_x
-#define CACHE_OP_UNSUPPORTED		0xFFFF
-
-#define PERF_MAP_ALL_UNSUPPORTED					\
-	[0 ... PERF_COUNT_HW_MAX - 1] = HW_OP_UNSUPPORTED
-
-#define PERF_CACHE_MAP_ALL_UNSUPPORTED					\
-[0 ... C(MAX) - 1] = {							\
-	[0 ... C(OP_MAX) - 1] = {					\
-		[0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,	\
-	},								\
-}
-
-/* The events for a given PMU register set. */
-struct pmu_hw_events {
-	/*
-	 * The events that are active on the PMU for the given index.
-	 */
-	struct perf_event	*events[ARMPMU_MAX_HWEVENTS];
-
-	/*
-	 * A 1 bit for an index indicates that the counter is being used for
-	 * an event. A 0 means that the counter can be used.
-	 */
-	DECLARE_BITMAP(used_mask, ARMPMU_MAX_HWEVENTS);
-
-	/*
-	 * Hardware lock to serialize accesses to PMU registers. Needed for the
-	 * read/modify/write sequences.
-	 */
-	raw_spinlock_t		pmu_lock;
-
-	/*
-	 * When using percpu IRQs, we need a percpu dev_id. Place it here as we
-	 * already have to allocate this struct per cpu.
-	 */
-	struct arm_pmu		*percpu_pmu;
-};
-
-struct arm_pmu {
-	struct pmu	pmu;
-	cpumask_t	active_irqs;
-	cpumask_t	supported_cpus;
-	int		*irq_affinity;
-	char		*name;
-	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
-	void		(*enable)(struct perf_event *event);
-	void		(*disable)(struct perf_event *event);
-	int		(*get_event_idx)(struct pmu_hw_events *hw_events,
-					 struct perf_event *event);
-	void		(*clear_event_idx)(struct pmu_hw_events *hw_events,
-					 struct perf_event *event);
-	int		(*set_event_filter)(struct hw_perf_event *evt,
-					    struct perf_event_attr *attr);
-	u32		(*read_counter)(struct perf_event *event);
-	void		(*write_counter)(struct perf_event *event, u32 val);
-	void		(*start)(struct arm_pmu *);
-	void		(*stop)(struct arm_pmu *);
-	void		(*reset)(void *);
-	int		(*request_irq)(struct arm_pmu *, irq_handler_t handler);
-	void		(*free_irq)(struct arm_pmu *);
-	int		(*map_event)(struct perf_event *event);
-	int		num_events;
-	atomic_t	active_events;
-	struct mutex	reserve_mutex;
-	u64		max_period;
-	struct platform_device	*plat_device;
-	struct pmu_hw_events	__percpu *hw_events;
-	struct notifier_block	hotplug_nb;
-};
-
-#define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
-
-int armpmu_register(struct arm_pmu *armpmu, int type);
-
-u64 armpmu_event_update(struct perf_event *event);
-
-int armpmu_event_set_period(struct perf_event *event);
-
-int armpmu_map_event(struct perf_event *event,
-		     const unsigned (*event_map)[PERF_COUNT_HW_MAX],
-		     const unsigned (*cache_map)[PERF_COUNT_HW_CACHE_MAX]
-						[PERF_COUNT_HW_CACHE_OP_MAX]
-						[PERF_COUNT_HW_CACHE_RESULT_MAX],
-		     u32 raw_event_mask);
-
-struct pmu_probe_info {
-	unsigned int cpuid;
-	unsigned int mask;
-	int (*init)(struct arm_pmu *);
-};
-
-#define PMU_PROBE(_cpuid, _mask, _fn)	\
-{					\
-	.cpuid = (_cpuid),		\
-	.mask = (_mask),		\
-	.init = (_fn),			\
-}
-
-#define ARM_PMU_PROBE(_cpuid, _fn) \
-	PMU_PROBE(_cpuid, ARM_CPU_PART_MASK, _fn)
-
-#define ARM_PMU_XSCALE_MASK	((0xff << 24) | ARM_CPU_XSCALE_ARCH_MASK)
-
-#define XSCALE_PMU_PROBE(_version, _fn) \
-	PMU_PROBE(ARM_CPU_IMP_INTEL << 24 | _version, ARM_PMU_XSCALE_MASK, _fn)
-
-int arm_pmu_device_probe(struct platform_device *pdev,
-			 const struct of_device_id *of_table,
-			 const struct pmu_probe_info *probe_table);
-
-#endif /* CONFIG_HW_PERF_EVENTS */
-
-#endif /* __ARM_PMU_H__ */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index e69f7a19735d..fcb25c1c5c21 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -71,8 +71,7 @@ obj-$(CONFIG_CPU_PJ4)		+= pj4-cp0.o
 obj-$(CONFIG_CPU_PJ4B)		+= pj4-cp0.o
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_regs.o perf_callchain.o
-obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o \
-				   perf_event_xscale.o perf_event_v6.o \
+obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event_xscale.o perf_event_v6.o \
 				   perf_event_v7.o
 CFLAGS_pj4-cp0.o		:= -marm
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
deleted file mode 100644
index 1cb40651d783..000000000000
--- a/arch/arm/kernel/perf_event.c
+++ /dev/null
@@ -1,921 +0,0 @@
-#undef DEBUG
-
-/*
- * ARM performance counter support.
- *
- * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
- * Copyright (C) 2010 ARM Ltd., Will Deacon <will.deacon@arm.com>
- *
- * This code is based on the sparc64 perf event code, which is in turn based
- * on the x86 code.
- */
-#define pr_fmt(fmt) "hw perfevents: " fmt
-
-#include <linux/bitmap.h>
-#include <linux/cpumask.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/of_device.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/irq.h>
-#include <linux/irqdesc.h>
-
-#include <asm/cputype.h>
-#include <asm/irq_regs.h>
-#include <asm/pmu.h>
-
-static int
-armpmu_map_cache_event(const unsigned (*cache_map)
-				      [PERF_COUNT_HW_CACHE_MAX]
-				      [PERF_COUNT_HW_CACHE_OP_MAX]
-				      [PERF_COUNT_HW_CACHE_RESULT_MAX],
-		       u64 config)
-{
-	unsigned int cache_type, cache_op, cache_result, ret;
-
-	cache_type = (config >>  0) & 0xff;
-	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
-		return -EINVAL;
-
-	cache_op = (config >>  8) & 0xff;
-	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
-		return -EINVAL;
-
-	cache_result = (config >> 16) & 0xff;
-	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
-		return -EINVAL;
-
-	ret = (int)(*cache_map)[cache_type][cache_op][cache_result];
-
-	if (ret == CACHE_OP_UNSUPPORTED)
-		return -ENOENT;
-
-	return ret;
-}
-
-static int
-armpmu_map_hw_event(const unsigned (*event_map)[PERF_COUNT_HW_MAX], u64 config)
-{
-	int mapping;
-
-	if (config >= PERF_COUNT_HW_MAX)
-		return -EINVAL;
-
-	mapping = (*event_map)[config];
-	return mapping == HW_OP_UNSUPPORTED ? -ENOENT : mapping;
-}
-
-static int
-armpmu_map_raw_event(u32 raw_event_mask, u64 config)
-{
-	return (int)(config & raw_event_mask);
-}
-
-int
-armpmu_map_event(struct perf_event *event,
-		 const unsigned (*event_map)[PERF_COUNT_HW_MAX],
-		 const unsigned (*cache_map)
-				[PERF_COUNT_HW_CACHE_MAX]
-				[PERF_COUNT_HW_CACHE_OP_MAX]
-				[PERF_COUNT_HW_CACHE_RESULT_MAX],
-		 u32 raw_event_mask)
-{
-	u64 config = event->attr.config;
-	int type = event->attr.type;
-
-	if (type == event->pmu->type)
-		return armpmu_map_raw_event(raw_event_mask, config);
-
-	switch (type) {
-	case PERF_TYPE_HARDWARE:
-		return armpmu_map_hw_event(event_map, config);
-	case PERF_TYPE_HW_CACHE:
-		return armpmu_map_cache_event(cache_map, config);
-	case PERF_TYPE_RAW:
-		return armpmu_map_raw_event(raw_event_mask, config);
-	}
-
-	return -ENOENT;
-}
-
-int armpmu_event_set_period(struct perf_event *event)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct hw_perf_event *hwc = &event->hw;
-	s64 left = local64_read(&hwc->period_left);
-	s64 period = hwc->sample_period;
-	int ret = 0;
-
-	if (unlikely(left <= -period)) {
-		left = period;
-		local64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-		ret = 1;
-	}
-
-	if (unlikely(left <= 0)) {
-		left += period;
-		local64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-		ret = 1;
-	}
-
-	/*
-	 * Limit the maximum period to prevent the counter value
-	 * from overtaking the one we are about to program. In
-	 * effect we are reducing max_period to account for
-	 * interrupt latency (and we are being very conservative).
-	 */
-	if (left > (armpmu->max_period >> 1))
-		left = armpmu->max_period >> 1;
-
-	local64_set(&hwc->prev_count, (u64)-left);
-
-	armpmu->write_counter(event, (u64)(-left) & 0xffffffff);
-
-	perf_event_update_userpage(event);
-
-	return ret;
-}
-
-u64 armpmu_event_update(struct perf_event *event)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct hw_perf_event *hwc = &event->hw;
-	u64 delta, prev_raw_count, new_raw_count;
-
-again:
-	prev_raw_count = local64_read(&hwc->prev_count);
-	new_raw_count = armpmu->read_counter(event);
-
-	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-			     new_raw_count) != prev_raw_count)
-		goto again;
-
-	delta = (new_raw_count - prev_raw_count) & armpmu->max_period;
-
-	local64_add(delta, &event->count);
-	local64_sub(delta, &hwc->period_left);
-
-	return new_raw_count;
-}
-
-static void
-armpmu_read(struct perf_event *event)
-{
-	armpmu_event_update(event);
-}
-
-static void
-armpmu_stop(struct perf_event *event, int flags)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct hw_perf_event *hwc = &event->hw;
-
-	/*
-	 * ARM pmu always has to update the counter, so ignore
-	 * PERF_EF_UPDATE, see comments in armpmu_start().
-	 */
-	if (!(hwc->state & PERF_HES_STOPPED)) {
-		armpmu->disable(event);
-		armpmu_event_update(event);
-		hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
-	}
-}
-
-static void armpmu_start(struct perf_event *event, int flags)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct hw_perf_event *hwc = &event->hw;
-
-	/*
-	 * ARM pmu always has to reprogram the period, so ignore
-	 * PERF_EF_RELOAD, see the comment below.
-	 */
-	if (flags & PERF_EF_RELOAD)
-		WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-
-	hwc->state = 0;
-	/*
-	 * Set the period again. Some counters can't be stopped, so when we
-	 * were stopped we simply disabled the IRQ source and the counter
-	 * may have been left counting. If we don't do this step then we may
-	 * get an interrupt too soon or *way* too late if the overflow has
-	 * happened since disabling.
-	 */
-	armpmu_event_set_period(event);
-	armpmu->enable(event);
-}
-
-static void
-armpmu_del(struct perf_event *event, int flags)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
-	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-
-	armpmu_stop(event, PERF_EF_UPDATE);
-	hw_events->events[idx] = NULL;
-	clear_bit(idx, hw_events->used_mask);
-	if (armpmu->clear_event_idx)
-		armpmu->clear_event_idx(hw_events, event);
-
-	perf_event_update_userpage(event);
-}
-
-static int
-armpmu_add(struct perf_event *event, int flags)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
-	struct hw_perf_event *hwc = &event->hw;
-	int idx;
-	int err = 0;
-
-	/* An event following a process won't be stopped earlier */
-	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
-		return -ENOENT;
-
-	perf_pmu_disable(event->pmu);
-
-	/* If we don't have a space for the counter then finish early. */
-	idx = armpmu->get_event_idx(hw_events, event);
-	if (idx < 0) {
-		err = idx;
-		goto out;
-	}
-
-	/*
-	 * If there is an event in the counter we are going to use then make
-	 * sure it is disabled.
-	 */
-	event->hw.idx = idx;
-	armpmu->disable(event);
-	hw_events->events[idx] = event;
-
-	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
-	if (flags & PERF_EF_START)
-		armpmu_start(event, PERF_EF_RELOAD);
-
-	/* Propagate our changes to the userspace mapping. */
-	perf_event_update_userpage(event);
-
-out:
-	perf_pmu_enable(event->pmu);
-	return err;
-}
-
-static int
-validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events,
-			       struct perf_event *event)
-{
-	struct arm_pmu *armpmu;
-
-	if (is_software_event(event))
-		return 1;
-
-	/*
-	 * Reject groups spanning multiple HW PMUs (e.g. CPU + CCI). The
-	 * core perf code won't check that the pmu->ctx == leader->ctx
-	 * until after pmu->event_init(event).
-	 */
-	if (event->pmu != pmu)
-		return 0;
-
-	if (event->state < PERF_EVENT_STATE_OFF)
-		return 1;
-
-	if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
-		return 1;
-
-	armpmu = to_arm_pmu(event->pmu);
-	return armpmu->get_event_idx(hw_events, event) >= 0;
-}
-
-static int
-validate_group(struct perf_event *event)
-{
-	struct perf_event *sibling, *leader = event->group_leader;
-	struct pmu_hw_events fake_pmu;
-
-	/*
-	 * Initialise the fake PMU. We only need to populate the
-	 * used_mask for the purposes of validation.
-	 */
-	memset(&fake_pmu.used_mask, 0, sizeof(fake_pmu.used_mask));
-
-	if (!validate_event(event->pmu, &fake_pmu, leader))
-		return -EINVAL;
-
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
-		if (!validate_event(event->pmu, &fake_pmu, sibling))
-			return -EINVAL;
-	}
-
-	if (!validate_event(event->pmu, &fake_pmu, event))
-		return -EINVAL;
-
-	return 0;
-}
-
-static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
-{
-	struct arm_pmu *armpmu;
-	struct platform_device *plat_device;
-	struct arm_pmu_platdata *plat;
-	int ret;
-	u64 start_clock, finish_clock;
-
-	/*
-	 * we request the IRQ with a (possibly percpu) struct arm_pmu**, but
-	 * the handlers expect a struct arm_pmu*. The percpu_irq framework will
-	 * do any necessary shifting, we just need to perform the first
-	 * dereference.
-	 */
-	armpmu = *(void **)dev;
-	plat_device = armpmu->plat_device;
-	plat = dev_get_platdata(&plat_device->dev);
-
-	start_clock = sched_clock();
-	if (plat && plat->handle_irq)
-		ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
-	else
-		ret = armpmu->handle_irq(irq, armpmu);
-	finish_clock = sched_clock();
-
-	perf_sample_event_took(finish_clock - start_clock);
-	return ret;
-}
-
-static void
-armpmu_release_hardware(struct arm_pmu *armpmu)
-{
-	armpmu->free_irq(armpmu);
-}
-
-static int
-armpmu_reserve_hardware(struct arm_pmu *armpmu)
-{
-	int err = armpmu->request_irq(armpmu, armpmu_dispatch_irq);
-	if (err) {
-		armpmu_release_hardware(armpmu);
-		return err;
-	}
-
-	return 0;
-}
-
-static void
-hw_perf_event_destroy(struct perf_event *event)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	atomic_t *active_events	 = &armpmu->active_events;
-	struct mutex *pmu_reserve_mutex = &armpmu->reserve_mutex;
-
-	if (atomic_dec_and_mutex_lock(active_events, pmu_reserve_mutex)) {
-		armpmu_release_hardware(armpmu);
-		mutex_unlock(pmu_reserve_mutex);
-	}
-}
-
-static int
-event_requires_mode_exclusion(struct perf_event_attr *attr)
-{
-	return attr->exclude_idle || attr->exclude_user ||
-	       attr->exclude_kernel || attr->exclude_hv;
-}
-
-static int
-__hw_perf_event_init(struct perf_event *event)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct hw_perf_event *hwc = &event->hw;
-	int mapping;
-
-	mapping = armpmu->map_event(event);
-
-	if (mapping < 0) {
-		pr_debug("event %x:%llx not supported\n", event->attr.type,
-			 event->attr.config);
-		return mapping;
-	}
-
-	/*
-	 * We don't assign an index until we actually place the event onto
-	 * hardware. Use -1 to signify that we haven't decided where to put it
-	 * yet. For SMP systems, each core has it's own PMU so we can't do any
-	 * clever allocation or constraints checking at this point.
-	 */
-	hwc->idx		= -1;
-	hwc->config_base	= 0;
-	hwc->config		= 0;
-	hwc->event_base		= 0;
-
-	/*
-	 * Check whether we need to exclude the counter from certain modes.
-	 */
-	if ((!armpmu->set_event_filter ||
-	     armpmu->set_event_filter(hwc, &event->attr)) &&
-	     event_requires_mode_exclusion(&event->attr)) {
-		pr_debug("ARM performance counters do not support "
-			 "mode exclusion\n");
-		return -EOPNOTSUPP;
-	}
-
-	/*
-	 * Store the event encoding into the config_base field.
-	 */
-	hwc->config_base	    |= (unsigned long)mapping;
-
-	if (!is_sampling_event(event)) {
-		/*
-		 * For non-sampling runs, limit the sample_period to half
-		 * of the counter width. That way, the new counter value
-		 * is far less likely to overtake the previous one unless
-		 * you have some serious IRQ latency issues.
-		 */
-		hwc->sample_period  = armpmu->max_period >> 1;
-		hwc->last_period    = hwc->sample_period;
-		local64_set(&hwc->period_left, hwc->sample_period);
-	}
-
-	if (event->group_leader != event) {
-		if (validate_group(event) != 0)
-			return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int armpmu_event_init(struct perf_event *event)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	int err = 0;
-	atomic_t *active_events = &armpmu->active_events;
-
-	/*
-	 * Reject CPU-affine events for CPUs that are of a different class to
-	 * that which this PMU handles. Process-following events (where
-	 * event->cpu == -1) can be migrated between CPUs, and thus we have to
-	 * reject them later (in armpmu_add) if they're scheduled on a
-	 * different class of CPU.
-	 */
-	if (event->cpu != -1 &&
-		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
-		return -ENOENT;
-
-	/* does not support taken branch sampling */
-	if (has_branch_stack(event))
-		return -EOPNOTSUPP;
-
-	if (armpmu->map_event(event) == -ENOENT)
-		return -ENOENT;
-
-	event->destroy = hw_perf_event_destroy;
-
-	if (!atomic_inc_not_zero(active_events)) {
-		mutex_lock(&armpmu->reserve_mutex);
-		if (atomic_read(active_events) == 0)
-			err = armpmu_reserve_hardware(armpmu);
-
-		if (!err)
-			atomic_inc(active_events);
-		mutex_unlock(&armpmu->reserve_mutex);
-	}
-
-	if (err)
-		return err;
-
-	err = __hw_perf_event_init(event);
-	if (err)
-		hw_perf_event_destroy(event);
-
-	return err;
-}
-
-static void armpmu_enable(struct pmu *pmu)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(pmu);
-	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
-	int enabled = bitmap_weight(hw_events->used_mask, armpmu->num_events);
-
-	/* For task-bound events we may be called on other CPUs */
-	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
-		return;
-
-	if (enabled)
-		armpmu->start(armpmu);
-}
-
-static void armpmu_disable(struct pmu *pmu)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(pmu);
-
-	/* For task-bound events we may be called on other CPUs */
-	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
-		return;
-
-	armpmu->stop(armpmu);
-}
-
-/*
- * In heterogeneous systems, events are specific to a particular
- * microarchitecture, and aren't suitable for another. Thus, only match CPUs of
- * the same microarchitecture.
- */
-static int armpmu_filter_match(struct perf_event *event)
-{
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	unsigned int cpu = smp_processor_id();
-	return cpumask_test_cpu(cpu, &armpmu->supported_cpus);
-}
-
-static void armpmu_init(struct arm_pmu *armpmu)
-{
-	atomic_set(&armpmu->active_events, 0);
-	mutex_init(&armpmu->reserve_mutex);
-
-	armpmu->pmu = (struct pmu) {
-		.pmu_enable	= armpmu_enable,
-		.pmu_disable	= armpmu_disable,
-		.event_init	= armpmu_event_init,
-		.add		= armpmu_add,
-		.del		= armpmu_del,
-		.start		= armpmu_start,
-		.stop		= armpmu_stop,
-		.read		= armpmu_read,
-		.filter_match	= armpmu_filter_match,
-	};
-}
-
-int armpmu_register(struct arm_pmu *armpmu, int type)
-{
-	armpmu_init(armpmu);
-	pr_info("enabled with %s PMU driver, %d counters available\n",
-			armpmu->name, armpmu->num_events);
-	return perf_pmu_register(&armpmu->pmu, armpmu->name, type);
-}
-
-/* Set at runtime when we know what CPU type we are. */
-static struct arm_pmu *__oprofile_cpu_pmu;
-
-/*
- * Despite the names, these two functions are CPU-specific and are used
- * by the OProfile/perf code.
- */
-const char *perf_pmu_name(void)
-{
-	if (!__oprofile_cpu_pmu)
-		return NULL;
-
-	return __oprofile_cpu_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-	int max_events = 0;
-
-	if (__oprofile_cpu_pmu != NULL)
-		max_events = __oprofile_cpu_pmu->num_events;
-
-	return max_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
-static void cpu_pmu_enable_percpu_irq(void *data)
-{
-	int irq = *(int *)data;
-
-	enable_percpu_irq(irq, IRQ_TYPE_NONE);
-}
-
-static void cpu_pmu_disable_percpu_irq(void *data)
-{
-	int irq = *(int *)data;
-
-	disable_percpu_irq(irq);
-}
-
-static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
-{
-	int i, irq, irqs;
-	struct platform_device *pmu_device = cpu_pmu->plat_device;
-	struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events;
-
-	irqs = min(pmu_device->num_resources, num_possible_cpus());
-
-	irq = platform_get_irq(pmu_device, 0);
-	if (irq >= 0 && irq_is_percpu(irq)) {
-		on_each_cpu(cpu_pmu_disable_percpu_irq, &irq, 1);
-		free_percpu_irq(irq, &hw_events->percpu_pmu);
-	} else {
-		for (i = 0; i < irqs; ++i) {
-			int cpu = i;
-
-			if (cpu_pmu->irq_affinity)
-				cpu = cpu_pmu->irq_affinity[i];
-
-			if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs))
-				continue;
-			irq = platform_get_irq(pmu_device, i);
-			if (irq >= 0)
-				free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu));
-		}
-	}
-}
-
-static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
-{
-	int i, err, irq, irqs;
-	struct platform_device *pmu_device = cpu_pmu->plat_device;
-	struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events;
-
-	if (!pmu_device)
-		return -ENODEV;
-
-	irqs = min(pmu_device->num_resources, num_possible_cpus());
-	if (irqs < 1) {
-		pr_warn_once("perf/ARM: No irqs for PMU defined, sampling events not supported\n");
-		return 0;
-	}
-
-	irq = platform_get_irq(pmu_device, 0);
-	if (irq >= 0 && irq_is_percpu(irq)) {
-		err = request_percpu_irq(irq, handler, "arm-pmu",
-					 &hw_events->percpu_pmu);
-		if (err) {
-			pr_err("unable to request IRQ%d for ARM PMU counters\n",
-				irq);
-			return err;
-		}
-		on_each_cpu(cpu_pmu_enable_percpu_irq, &irq, 1);
-	} else {
-		for (i = 0; i < irqs; ++i) {
-			int cpu = i;
-
-			err = 0;
-			irq = platform_get_irq(pmu_device, i);
-			if (irq < 0)
-				continue;
-
-			if (cpu_pmu->irq_affinity)
-				cpu = cpu_pmu->irq_affinity[i];
-
-			/*
-			 * If we have a single PMU interrupt that we can't shift,
-			 * assume that we're running on a uniprocessor machine and
-			 * continue. Otherwise, continue without this interrupt.
-			 */
-			if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) {
-				pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n",
-					irq, cpu);
-				continue;
-			}
-
-			err = request_irq(irq, handler,
-					  IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu",
-					  per_cpu_ptr(&hw_events->percpu_pmu, cpu));
-			if (err) {
-				pr_err("unable to request IRQ%d for ARM PMU counters\n",
-					irq);
-				return err;
-			}
-
-			cpumask_set_cpu(cpu, &cpu_pmu->active_irqs);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * PMU hardware loses all context when a CPU goes offline.
- * When a CPU is hotplugged back in, since some hardware registers are
- * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading
- * junk values out of them.
- */
-static int cpu_pmu_notify(struct notifier_block *b, unsigned long action,
-			  void *hcpu)
-{
-	int cpu = (unsigned long)hcpu;
-	struct arm_pmu *pmu = container_of(b, struct arm_pmu, hotplug_nb);
-
-	if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING)
-		return NOTIFY_DONE;
-
-	if (!cpumask_test_cpu(cpu, &pmu->supported_cpus))
-		return NOTIFY_DONE;
-
-	if (pmu->reset)
-		pmu->reset(pmu);
-	else
-		return NOTIFY_DONE;
-
-	return NOTIFY_OK;
-}
-
-static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	int err;
-	int cpu;
-	struct pmu_hw_events __percpu *cpu_hw_events;
-
-	cpu_hw_events = alloc_percpu(struct pmu_hw_events);
-	if (!cpu_hw_events)
-		return -ENOMEM;
-
-	cpu_pmu->hotplug_nb.notifier_call = cpu_pmu_notify;
-	err = register_cpu_notifier(&cpu_pmu->hotplug_nb);
-	if (err)
-		goto out_hw_events;
-
-	for_each_possible_cpu(cpu) {
-		struct pmu_hw_events *events = per_cpu_ptr(cpu_hw_events, cpu);
-		raw_spin_lock_init(&events->pmu_lock);
-		events->percpu_pmu = cpu_pmu;
-	}
-
-	cpu_pmu->hw_events	= cpu_hw_events;
-	cpu_pmu->request_irq	= cpu_pmu_request_irq;
-	cpu_pmu->free_irq	= cpu_pmu_free_irq;
-
-	/* Ensure the PMU has sane values out of reset. */
-	if (cpu_pmu->reset)
-		on_each_cpu_mask(&cpu_pmu->supported_cpus, cpu_pmu->reset,
-			 cpu_pmu, 1);
-
-	/* If no interrupts available, set the corresponding capability flag */
-	if (!platform_get_irq(cpu_pmu->plat_device, 0))
-		cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
-
-	return 0;
-
-out_hw_events:
-	free_percpu(cpu_hw_events);
-	return err;
-}
-
-static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
-{
-	unregister_cpu_notifier(&cpu_pmu->hotplug_nb);
-	free_percpu(cpu_pmu->hw_events);
-}
-
-/*
- * CPU PMU identification and probing.
- */
-static int probe_current_pmu(struct arm_pmu *pmu,
-			     const struct pmu_probe_info *info)
-{
-	int cpu = get_cpu();
-	unsigned int cpuid = read_cpuid_id();
-	int ret = -ENODEV;
-
-	pr_info("probing PMU on CPU %d\n", cpu);
-
-	for (; info->init != NULL; info++) {
-		if ((cpuid & info->mask) != info->cpuid)
-			continue;
-		ret = info->init(pmu);
-		break;
-	}
-
-	put_cpu();
-	return ret;
-}
-
-static int of_pmu_irq_cfg(struct arm_pmu *pmu)
-{
-	int *irqs, i = 0;
-	bool using_spi = false;
-	struct platform_device *pdev = pmu->plat_device;
-
-	irqs = kcalloc(pdev->num_resources, sizeof(*irqs), GFP_KERNEL);
-	if (!irqs)
-		return -ENOMEM;
-
-	do {
-		struct device_node *dn;
-		int cpu, irq;
-
-		/* See if we have an affinity entry */
-		dn = of_parse_phandle(pdev->dev.of_node, "interrupt-affinity", i);
-		if (!dn)
-			break;
-
-		/* Check the IRQ type and prohibit a mix of PPIs and SPIs */
-		irq = platform_get_irq(pdev, i);
-		if (irq >= 0) {
-			bool spi = !irq_is_percpu(irq);
-
-			if (i > 0 && spi != using_spi) {
-				pr_err("PPI/SPI IRQ type mismatch for %s!\n",
-					dn->name);
-				kfree(irqs);
-				return -EINVAL;
-			}
-
-			using_spi = spi;
-		}
-
-		/* Now look up the logical CPU number */
-		for_each_possible_cpu(cpu)
-			if (dn == of_cpu_device_node_get(cpu))
-				break;
-
-		if (cpu >= nr_cpu_ids) {
-			pr_warn("Failed to find logical CPU for %s\n",
-				dn->name);
-			of_node_put(dn);
-			cpumask_setall(&pmu->supported_cpus);
-			break;
-		}
-		of_node_put(dn);
-
-		/* For SPIs, we need to track the affinity per IRQ */
-		if (using_spi) {
-			if (i >= pdev->num_resources) {
-				of_node_put(dn);
-				break;
-			}
-
-			irqs[i] = cpu;
-		}
-
-		/* Keep track of the CPUs containing this PMU type */
-		cpumask_set_cpu(cpu, &pmu->supported_cpus);
-		of_node_put(dn);
-		i++;
-	} while (1);
-
-	/* If we didn't manage to parse anything, claim to support all CPUs */
-	if (cpumask_weight(&pmu->supported_cpus) == 0)
-		cpumask_setall(&pmu->supported_cpus);
-
-	/* If we matched up the IRQ affinities, use them to route the SPIs */
-	if (using_spi && i == pdev->num_resources)
-		pmu->irq_affinity = irqs;
-	else
-		kfree(irqs);
-
-	return 0;
-}
-
-int arm_pmu_device_probe(struct platform_device *pdev,
-			 const struct of_device_id *of_table,
-			 const struct pmu_probe_info *probe_table)
-{
-	const struct of_device_id *of_id;
-	const int (*init_fn)(struct arm_pmu *);
-	struct device_node *node = pdev->dev.of_node;
-	struct arm_pmu *pmu;
-	int ret = -ENODEV;
-
-	pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL);
-	if (!pmu) {
-		pr_info("failed to allocate PMU device!\n");
-		return -ENOMEM;
-	}
-
-	if (!__oprofile_cpu_pmu)
-		__oprofile_cpu_pmu = pmu;
-
-	pmu->plat_device = pdev;
-
-	if (node && (of_id = of_match_node(of_table, pdev->dev.of_node))) {
-		init_fn = of_id->data;
-
-		ret = of_pmu_irq_cfg(pmu);
-		if (!ret)
-			ret = init_fn(pmu);
-	} else {
-		ret = probe_current_pmu(pmu, probe_table);
-		cpumask_setall(&pmu->supported_cpus);
-	}
-
-	if (ret) {
-		pr_info("failed to probe PMU!\n");
-		goto out_free;
-	}
-
-	ret = cpu_pmu_init(pmu);
-	if (ret)
-		goto out_free;
-
-	ret = armpmu_register(pmu, -1);
-	if (ret)
-		goto out_destroy;
-
-	return 0;
-
-out_destroy:
-	cpu_pmu_destroy(pmu);
-out_free:
-	pr_info("failed to register PMU devices!\n");
-	kfree(pmu);
-	return ret;
-}
diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index 09f83e414a72..09413e7b49aa 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -34,9 +34,9 @@
 
 #include <asm/cputype.h>
 #include <asm/irq_regs.h>
-#include <asm/pmu.h>
 
 #include <linux/of.h>
+#include <linux/perf/arm_pmu.h>
 #include <linux/platform_device.h>
 
 enum armv6_perf_types {
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index f9b37f876e20..126dc679b230 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -21,11 +21,11 @@
 #include <asm/cp15.h>
 #include <asm/cputype.h>
 #include <asm/irq_regs.h>
-#include <asm/pmu.h>
 #include <asm/vfp.h>
 #include "../vfp/vfpinstr.h"
 
 #include <linux/of.h>
+#include <linux/perf/arm_pmu.h>
 #include <linux/platform_device.h>
 
 /*
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index 304d056d5b25..aa0499e2eef7 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -16,9 +16,9 @@
 
 #include <asm/cputype.h>
 #include <asm/irq_regs.h>
-#include <asm/pmu.h>
 
 #include <linux/of.h>
+#include <linux/perf/arm_pmu.h>
 #include <linux/platform_device.h>
 
 enum xscale_perf_types {
diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c
index 16913800bbf9..5578dc1ab52b 100644
--- a/arch/arm/mach-ux500/cpu-db8500.c
+++ b/arch/arm/mach-ux500/cpu-db8500.c
@@ -20,10 +20,10 @@
 #include <linux/mfd/dbx500-prcmu.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
+#include <linux/perf/arm_pmu.h>
 #include <linux/regulator/machine.h>
 #include <linux/random.h>
 
-#include <asm/pmu.h>
 #include <asm/mach/map.h>
 
 #include "setup.h"
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 6e973b8e3a3b..3497485f5eab 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -176,6 +176,8 @@ source "drivers/powercap/Kconfig"
 
 source "drivers/mcb/Kconfig"
 
+source "drivers/perf/Kconfig"
+
 source "drivers/ras/Kconfig"
 
 source "drivers/thunderbolt/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index b64b49f6e01b..f245f2291b8a 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -161,6 +161,7 @@ obj-$(CONFIG_NTB)		+= ntb/
 obj-$(CONFIG_FMC)		+= fmc/
 obj-$(CONFIG_POWERCAP)		+= powercap/
 obj-$(CONFIG_MCB)		+= mcb/
+obj-$(CONFIG_PERF_EVENTS)	+= perf/
 obj-$(CONFIG_RAS)		+= ras/
 obj-$(CONFIG_THUNDERBOLT)	+= thunderbolt/
 obj-$(CONFIG_CORESIGHT)		+= hwtracing/coresight/
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
new file mode 100644
index 000000000000..d9de36ee165d
--- /dev/null
+++ b/drivers/perf/Kconfig
@@ -0,0 +1,15 @@
+#
+# Performance Monitor Drivers
+#
+
+menu "Performance monitor support"
+
+config ARM_PMU
+	depends on PERF_EVENTS && ARM
+	bool "ARM PMU framework"
+	default y
+	help
+	  Say y if you want to use CPU performance monitors on ARM-based
+	  systems.
+
+endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
new file mode 100644
index 000000000000..acd2397ded94
--- /dev/null
+++ b/drivers/perf/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_ARM_PMU) += arm_pmu.o
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
new file mode 100644
index 000000000000..2365a32a595e
--- /dev/null
+++ b/drivers/perf/arm_pmu.c
@@ -0,0 +1,921 @@
+#undef DEBUG
+
+/*
+ * ARM performance counter support.
+ *
+ * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
+ * Copyright (C) 2010 ARM Ltd., Will Deacon <will.deacon@arm.com>
+ *
+ * This code is based on the sparc64 perf event code, which is in turn based
+ * on the x86 code.
+ */
+#define pr_fmt(fmt) "hw perfevents: " fmt
+
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/of_device.h>
+#include <linux/perf/arm_pmu.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+
+#include <asm/cputype.h>
+#include <asm/irq_regs.h>
+
+static int
+armpmu_map_cache_event(const unsigned (*cache_map)
+				      [PERF_COUNT_HW_CACHE_MAX]
+				      [PERF_COUNT_HW_CACHE_OP_MAX]
+				      [PERF_COUNT_HW_CACHE_RESULT_MAX],
+		       u64 config)
+{
+	unsigned int cache_type, cache_op, cache_result, ret;
+
+	cache_type = (config >>  0) & 0xff;
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+		return -EINVAL;
+
+	cache_op = (config >>  8) & 0xff;
+	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+		return -EINVAL;
+
+	cache_result = (config >> 16) & 0xff;
+	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	ret = (int)(*cache_map)[cache_type][cache_op][cache_result];
+
+	if (ret == CACHE_OP_UNSUPPORTED)
+		return -ENOENT;
+
+	return ret;
+}
+
+static int
+armpmu_map_hw_event(const unsigned (*event_map)[PERF_COUNT_HW_MAX], u64 config)
+{
+	int mapping;
+
+	if (config >= PERF_COUNT_HW_MAX)
+		return -EINVAL;
+
+	mapping = (*event_map)[config];
+	return mapping == HW_OP_UNSUPPORTED ? -ENOENT : mapping;
+}
+
+static int
+armpmu_map_raw_event(u32 raw_event_mask, u64 config)
+{
+	return (int)(config & raw_event_mask);
+}
+
+int
+armpmu_map_event(struct perf_event *event,
+		 const unsigned (*event_map)[PERF_COUNT_HW_MAX],
+		 const unsigned (*cache_map)
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX],
+		 u32 raw_event_mask)
+{
+	u64 config = event->attr.config;
+	int type = event->attr.type;
+
+	if (type == event->pmu->type)
+		return armpmu_map_raw_event(raw_event_mask, config);
+
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		return armpmu_map_hw_event(event_map, config);
+	case PERF_TYPE_HW_CACHE:
+		return armpmu_map_cache_event(cache_map, config);
+	case PERF_TYPE_RAW:
+		return armpmu_map_raw_event(raw_event_mask, config);
+	}
+
+	return -ENOENT;
+}
+
+int armpmu_event_set_period(struct perf_event *event)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	s64 left = local64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int ret = 0;
+
+	if (unlikely(left <= -period)) {
+		left = period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	/*
+	 * Limit the maximum period to prevent the counter value
+	 * from overtaking the one we are about to program. In
+	 * effect we are reducing max_period to account for
+	 * interrupt latency (and we are being very conservative).
+	 */
+	if (left > (armpmu->max_period >> 1))
+		left = armpmu->max_period >> 1;
+
+	local64_set(&hwc->prev_count, (u64)-left);
+
+	armpmu->write_counter(event, (u64)(-left) & 0xffffffff);
+
+	perf_event_update_userpage(event);
+
+	return ret;
+}
+
+u64 armpmu_event_update(struct perf_event *event)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u64 delta, prev_raw_count, new_raw_count;
+
+again:
+	prev_raw_count = local64_read(&hwc->prev_count);
+	new_raw_count = armpmu->read_counter(event);
+
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+			     new_raw_count) != prev_raw_count)
+		goto again;
+
+	delta = (new_raw_count - prev_raw_count) & armpmu->max_period;
+
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
+}
+
+static void
+armpmu_read(struct perf_event *event)
+{
+	armpmu_event_update(event);
+}
+
+static void
+armpmu_stop(struct perf_event *event, int flags)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	/*
+	 * ARM pmu always has to update the counter, so ignore
+	 * PERF_EF_UPDATE, see comments in armpmu_start().
+	 */
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		armpmu->disable(event);
+		armpmu_event_update(event);
+		hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	}
+}
+
+static void armpmu_start(struct perf_event *event, int flags)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	/*
+	 * ARM pmu always has to reprogram the period, so ignore
+	 * PERF_EF_RELOAD, see the comment below.
+	 */
+	if (flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+
+	hwc->state = 0;
+	/*
+	 * Set the period again. Some counters can't be stopped, so when we
+	 * were stopped we simply disabled the IRQ source and the counter
+	 * may have been left counting. If we don't do this step then we may
+	 * get an interrupt too soon or *way* too late if the overflow has
+	 * happened since disabling.
+	 */
+	armpmu_event_set_period(event);
+	armpmu->enable(event);
+}
+
+static void
+armpmu_del(struct perf_event *event, int flags)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	armpmu_stop(event, PERF_EF_UPDATE);
+	hw_events->events[idx] = NULL;
+	clear_bit(idx, hw_events->used_mask);
+	if (armpmu->clear_event_idx)
+		armpmu->clear_event_idx(hw_events, event);
+
+	perf_event_update_userpage(event);
+}
+
+static int
+armpmu_add(struct perf_event *event, int flags)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+	int err = 0;
+
+	/* An event following a process won't be stopped earlier */
+	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
+		return -ENOENT;
+
+	perf_pmu_disable(event->pmu);
+
+	/* If we don't have a space for the counter then finish early. */
+	idx = armpmu->get_event_idx(hw_events, event);
+	if (idx < 0) {
+		err = idx;
+		goto out;
+	}
+
+	/*
+	 * If there is an event in the counter we are going to use then make
+	 * sure it is disabled.
+	 */
+	event->hw.idx = idx;
+	armpmu->disable(event);
+	hw_events->events[idx] = event;
+
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	if (flags & PERF_EF_START)
+		armpmu_start(event, PERF_EF_RELOAD);
+
+	/* Propagate our changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+out:
+	perf_pmu_enable(event->pmu);
+	return err;
+}
+
+static int
+validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events,
+			       struct perf_event *event)
+{
+	struct arm_pmu *armpmu;
+
+	if (is_software_event(event))
+		return 1;
+
+	/*
+	 * Reject groups spanning multiple HW PMUs (e.g. CPU + CCI). The
+	 * core perf code won't check that the pmu->ctx == leader->ctx
+	 * until after pmu->event_init(event).
+	 */
+	if (event->pmu != pmu)
+		return 0;
+
+	if (event->state < PERF_EVENT_STATE_OFF)
+		return 1;
+
+	if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
+		return 1;
+
+	armpmu = to_arm_pmu(event->pmu);
+	return armpmu->get_event_idx(hw_events, event) >= 0;
+}
+
+static int
+validate_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct pmu_hw_events fake_pmu;
+
+	/*
+	 * Initialise the fake PMU. We only need to populate the
+	 * used_mask for the purposes of validation.
+	 */
+	memset(&fake_pmu.used_mask, 0, sizeof(fake_pmu.used_mask));
+
+	if (!validate_event(event->pmu, &fake_pmu, leader))
+		return -EINVAL;
+
+	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+		if (!validate_event(event->pmu, &fake_pmu, sibling))
+			return -EINVAL;
+	}
+
+	if (!validate_event(event->pmu, &fake_pmu, event))
+		return -EINVAL;
+
+	return 0;
+}
+
+static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
+{
+	struct arm_pmu *armpmu;
+	struct platform_device *plat_device;
+	struct arm_pmu_platdata *plat;
+	int ret;
+	u64 start_clock, finish_clock;
+
+	/*
+	 * we request the IRQ with a (possibly percpu) struct arm_pmu**, but
+	 * the handlers expect a struct arm_pmu*. The percpu_irq framework will
+	 * do any necessary shifting, we just need to perform the first
+	 * dereference.
+	 */
+	armpmu = *(void **)dev;
+	plat_device = armpmu->plat_device;
+	plat = dev_get_platdata(&plat_device->dev);
+
+	start_clock = sched_clock();
+	if (plat && plat->handle_irq)
+		ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
+	else
+		ret = armpmu->handle_irq(irq, armpmu);
+	finish_clock = sched_clock();
+
+	perf_sample_event_took(finish_clock - start_clock);
+	return ret;
+}
+
+static void
+armpmu_release_hardware(struct arm_pmu *armpmu)
+{
+	armpmu->free_irq(armpmu);
+}
+
+static int
+armpmu_reserve_hardware(struct arm_pmu *armpmu)
+{
+	int err = armpmu->request_irq(armpmu, armpmu_dispatch_irq);
+	if (err) {
+		armpmu_release_hardware(armpmu);
+		return err;
+	}
+
+	return 0;
+}
+
+static void
+hw_perf_event_destroy(struct perf_event *event)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	atomic_t *active_events	 = &armpmu->active_events;
+	struct mutex *pmu_reserve_mutex = &armpmu->reserve_mutex;
+
+	if (atomic_dec_and_mutex_lock(active_events, pmu_reserve_mutex)) {
+		armpmu_release_hardware(armpmu);
+		mutex_unlock(pmu_reserve_mutex);
+	}
+}
+
+static int
+event_requires_mode_exclusion(struct perf_event_attr *attr)
+{
+	return attr->exclude_idle || attr->exclude_user ||
+	       attr->exclude_kernel || attr->exclude_hv;
+}
+
+static int
+__hw_perf_event_init(struct perf_event *event)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int mapping;
+
+	mapping = armpmu->map_event(event);
+
+	if (mapping < 0) {
+		pr_debug("event %x:%llx not supported\n", event->attr.type,
+			 event->attr.config);
+		return mapping;
+	}
+
+	/*
+	 * We don't assign an index until we actually place the event onto
+	 * hardware. Use -1 to signify that we haven't decided where to put it
+	 * yet. For SMP systems, each core has it's own PMU so we can't do any
+	 * clever allocation or constraints checking at this point.
+	 */
+	hwc->idx		= -1;
+	hwc->config_base	= 0;
+	hwc->config		= 0;
+	hwc->event_base		= 0;
+
+	/*
+	 * Check whether we need to exclude the counter from certain modes.
+	 */
+	if ((!armpmu->set_event_filter ||
+	     armpmu->set_event_filter(hwc, &event->attr)) &&
+	     event_requires_mode_exclusion(&event->attr)) {
+		pr_debug("ARM performance counters do not support "
+			 "mode exclusion\n");
+		return -EOPNOTSUPP;
+	}
+
+	/*
+	 * Store the event encoding into the config_base field.
+	 */
+	hwc->config_base	    |= (unsigned long)mapping;
+
+	if (!is_sampling_event(event)) {
+		/*
+		 * For non-sampling runs, limit the sample_period to half
+		 * of the counter width. That way, the new counter value
+		 * is far less likely to overtake the previous one unless
+		 * you have some serious IRQ latency issues.
+		 */
+		hwc->sample_period  = armpmu->max_period >> 1;
+		hwc->last_period    = hwc->sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+	}
+
+	if (event->group_leader != event) {
+		if (validate_group(event) != 0)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int armpmu_event_init(struct perf_event *event)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	int err = 0;
+	atomic_t *active_events = &armpmu->active_events;
+
+	/*
+	 * Reject CPU-affine events for CPUs that are of a different class to
+	 * that which this PMU handles. Process-following events (where
+	 * event->cpu == -1) can be migrated between CPUs, and thus we have to
+	 * reject them later (in armpmu_add) if they're scheduled on a
+	 * different class of CPU.
+	 */
+	if (event->cpu != -1 &&
+		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
+		return -ENOENT;
+
+	/* does not support taken branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
+	if (armpmu->map_event(event) == -ENOENT)
+		return -ENOENT;
+
+	event->destroy = hw_perf_event_destroy;
+
+	if (!atomic_inc_not_zero(active_events)) {
+		mutex_lock(&armpmu->reserve_mutex);
+		if (atomic_read(active_events) == 0)
+			err = armpmu_reserve_hardware(armpmu);
+
+		if (!err)
+			atomic_inc(active_events);
+		mutex_unlock(&armpmu->reserve_mutex);
+	}
+
+	if (err)
+		return err;
+
+	err = __hw_perf_event_init(event);
+	if (err)
+		hw_perf_event_destroy(event);
+
+	return err;
+}
+
+static void armpmu_enable(struct pmu *pmu)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(pmu);
+	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
+	int enabled = bitmap_weight(hw_events->used_mask, armpmu->num_events);
+
+	/* For task-bound events we may be called on other CPUs */
+	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
+		return;
+
+	if (enabled)
+		armpmu->start(armpmu);
+}
+
+static void armpmu_disable(struct pmu *pmu)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(pmu);
+
+	/* For task-bound events we may be called on other CPUs */
+	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
+		return;
+
+	armpmu->stop(armpmu);
+}
+
+/*
+ * In heterogeneous systems, events are specific to a particular
+ * microarchitecture, and aren't suitable for another. Thus, only match CPUs of
+ * the same microarchitecture.
+ */
+static int armpmu_filter_match(struct perf_event *event)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	unsigned int cpu = smp_processor_id();
+	return cpumask_test_cpu(cpu, &armpmu->supported_cpus);
+}
+
+static void armpmu_init(struct arm_pmu *armpmu)
+{
+	atomic_set(&armpmu->active_events, 0);
+	mutex_init(&armpmu->reserve_mutex);
+
+	armpmu->pmu = (struct pmu) {
+		.pmu_enable	= armpmu_enable,
+		.pmu_disable	= armpmu_disable,
+		.event_init	= armpmu_event_init,
+		.add		= armpmu_add,
+		.del		= armpmu_del,
+		.start		= armpmu_start,
+		.stop		= armpmu_stop,
+		.read		= armpmu_read,
+		.filter_match	= armpmu_filter_match,
+	};
+}
+
+int armpmu_register(struct arm_pmu *armpmu, int type)
+{
+	armpmu_init(armpmu);
+	pr_info("enabled with %s PMU driver, %d counters available\n",
+			armpmu->name, armpmu->num_events);
+	return perf_pmu_register(&armpmu->pmu, armpmu->name, type);
+}
+
+/* Set at runtime when we know what CPU type we are. */
+static struct arm_pmu *__oprofile_cpu_pmu;
+
+/*
+ * Despite the names, these two functions are CPU-specific and are used
+ * by the OProfile/perf code.
+ */
+const char *perf_pmu_name(void)
+{
+	if (!__oprofile_cpu_pmu)
+		return NULL;
+
+	return __oprofile_cpu_pmu->name;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_name);
+
+int perf_num_counters(void)
+{
+	int max_events = 0;
+
+	if (__oprofile_cpu_pmu != NULL)
+		max_events = __oprofile_cpu_pmu->num_events;
+
+	return max_events;
+}
+EXPORT_SYMBOL_GPL(perf_num_counters);
+
+static void cpu_pmu_enable_percpu_irq(void *data)
+{
+	int irq = *(int *)data;
+
+	enable_percpu_irq(irq, IRQ_TYPE_NONE);
+}
+
+static void cpu_pmu_disable_percpu_irq(void *data)
+{
+	int irq = *(int *)data;
+
+	disable_percpu_irq(irq);
+}
+
+static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
+{
+	int i, irq, irqs;
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+	struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events;
+
+	irqs = min(pmu_device->num_resources, num_possible_cpus());
+
+	irq = platform_get_irq(pmu_device, 0);
+	if (irq >= 0 && irq_is_percpu(irq)) {
+		on_each_cpu(cpu_pmu_disable_percpu_irq, &irq, 1);
+		free_percpu_irq(irq, &hw_events->percpu_pmu);
+	} else {
+		for (i = 0; i < irqs; ++i) {
+			int cpu = i;
+
+			if (cpu_pmu->irq_affinity)
+				cpu = cpu_pmu->irq_affinity[i];
+
+			if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs))
+				continue;
+			irq = platform_get_irq(pmu_device, i);
+			if (irq >= 0)
+				free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu));
+		}
+	}
+}
+
+static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
+{
+	int i, err, irq, irqs;
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+	struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events;
+
+	if (!pmu_device)
+		return -ENODEV;
+
+	irqs = min(pmu_device->num_resources, num_possible_cpus());
+	if (irqs < 1) {
+		pr_warn_once("perf/ARM: No irqs for PMU defined, sampling events not supported\n");
+		return 0;
+	}
+
+	irq = platform_get_irq(pmu_device, 0);
+	if (irq >= 0 && irq_is_percpu(irq)) {
+		err = request_percpu_irq(irq, handler, "arm-pmu",
+					 &hw_events->percpu_pmu);
+		if (err) {
+			pr_err("unable to request IRQ%d for ARM PMU counters\n",
+				irq);
+			return err;
+		}
+		on_each_cpu(cpu_pmu_enable_percpu_irq, &irq, 1);
+	} else {
+		for (i = 0; i < irqs; ++i) {
+			int cpu = i;
+
+			err = 0;
+			irq = platform_get_irq(pmu_device, i);
+			if (irq < 0)
+				continue;
+
+			if (cpu_pmu->irq_affinity)
+				cpu = cpu_pmu->irq_affinity[i];
+
+			/*
+			 * If we have a single PMU interrupt that we can't shift,
+			 * assume that we're running on a uniprocessor machine and
+			 * continue. Otherwise, continue without this interrupt.
+			 */
+			if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) {
+				pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n",
+					irq, cpu);
+				continue;
+			}
+
+			err = request_irq(irq, handler,
+					  IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu",
+					  per_cpu_ptr(&hw_events->percpu_pmu, cpu));
+			if (err) {
+				pr_err("unable to request IRQ%d for ARM PMU counters\n",
+					irq);
+				return err;
+			}
+
+			cpumask_set_cpu(cpu, &cpu_pmu->active_irqs);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * PMU hardware loses all context when a CPU goes offline.
+ * When a CPU is hotplugged back in, since some hardware registers are
+ * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading
+ * junk values out of them.
+ */
+static int cpu_pmu_notify(struct notifier_block *b, unsigned long action,
+			  void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct arm_pmu *pmu = container_of(b, struct arm_pmu, hotplug_nb);
+
+	if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING)
+		return NOTIFY_DONE;
+
+	if (!cpumask_test_cpu(cpu, &pmu->supported_cpus))
+		return NOTIFY_DONE;
+
+	if (pmu->reset)
+		pmu->reset(pmu);
+	else
+		return NOTIFY_DONE;
+
+	return NOTIFY_OK;
+}
+
+static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	int err;
+	int cpu;
+	struct pmu_hw_events __percpu *cpu_hw_events;
+
+	cpu_hw_events = alloc_percpu(struct pmu_hw_events);
+	if (!cpu_hw_events)
+		return -ENOMEM;
+
+	cpu_pmu->hotplug_nb.notifier_call = cpu_pmu_notify;
+	err = register_cpu_notifier(&cpu_pmu->hotplug_nb);
+	if (err)
+		goto out_hw_events;
+
+	for_each_possible_cpu(cpu) {
+		struct pmu_hw_events *events = per_cpu_ptr(cpu_hw_events, cpu);
+		raw_spin_lock_init(&events->pmu_lock);
+		events->percpu_pmu = cpu_pmu;
+	}
+
+	cpu_pmu->hw_events	= cpu_hw_events;
+	cpu_pmu->request_irq	= cpu_pmu_request_irq;
+	cpu_pmu->free_irq	= cpu_pmu_free_irq;
+
+	/* Ensure the PMU has sane values out of reset. */
+	if (cpu_pmu->reset)
+		on_each_cpu_mask(&cpu_pmu->supported_cpus, cpu_pmu->reset,
+			 cpu_pmu, 1);
+
+	/* If no interrupts available, set the corresponding capability flag */
+	if (!platform_get_irq(cpu_pmu->plat_device, 0))
+		cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
+	return 0;
+
+out_hw_events:
+	free_percpu(cpu_hw_events);
+	return err;
+}
+
+static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
+{
+	unregister_cpu_notifier(&cpu_pmu->hotplug_nb);
+	free_percpu(cpu_pmu->hw_events);
+}
+
+/*
+ * CPU PMU identification and probing.
+ */
+static int probe_current_pmu(struct arm_pmu *pmu,
+			     const struct pmu_probe_info *info)
+{
+	int cpu = get_cpu();
+	unsigned int cpuid = read_cpuid_id();
+	int ret = -ENODEV;
+
+	pr_info("probing PMU on CPU %d\n", cpu);
+
+	for (; info->init != NULL; info++) {
+		if ((cpuid & info->mask) != info->cpuid)
+			continue;
+		ret = info->init(pmu);
+		break;
+	}
+
+	put_cpu();
+	return ret;
+}
+
+static int of_pmu_irq_cfg(struct arm_pmu *pmu)
+{
+	int *irqs, i = 0;
+	bool using_spi = false;
+	struct platform_device *pdev = pmu->plat_device;
+
+	irqs = kcalloc(pdev->num_resources, sizeof(*irqs), GFP_KERNEL);
+	if (!irqs)
+		return -ENOMEM;
+
+	do {
+		struct device_node *dn;
+		int cpu, irq;
+
+		/* See if we have an affinity entry */
+		dn = of_parse_phandle(pdev->dev.of_node, "interrupt-affinity", i);
+		if (!dn)
+			break;
+
+		/* Check the IRQ type and prohibit a mix of PPIs and SPIs */
+		irq = platform_get_irq(pdev, i);
+		if (irq >= 0) {
+			bool spi = !irq_is_percpu(irq);
+
+			if (i > 0 && spi != using_spi) {
+				pr_err("PPI/SPI IRQ type mismatch for %s!\n",
+					dn->name);
+				kfree(irqs);
+				return -EINVAL;
+			}
+
+			using_spi = spi;
+		}
+
+		/* Now look up the logical CPU number */
+		for_each_possible_cpu(cpu)
+			if (dn == of_cpu_device_node_get(cpu))
+				break;
+
+		if (cpu >= nr_cpu_ids) {
+			pr_warn("Failed to find logical CPU for %s\n",
+				dn->name);
+			of_node_put(dn);
+			cpumask_setall(&pmu->supported_cpus);
+			break;
+		}
+		of_node_put(dn);
+
+		/* For SPIs, we need to track the affinity per IRQ */
+		if (using_spi) {
+			if (i >= pdev->num_resources) {
+				of_node_put(dn);
+				break;
+			}
+
+			irqs[i] = cpu;
+		}
+
+		/* Keep track of the CPUs containing this PMU type */
+		cpumask_set_cpu(cpu, &pmu->supported_cpus);
+		of_node_put(dn);
+		i++;
+	} while (1);
+
+	/* If we didn't manage to parse anything, claim to support all CPUs */
+	if (cpumask_weight(&pmu->supported_cpus) == 0)
+		cpumask_setall(&pmu->supported_cpus);
+
+	/* If we matched up the IRQ affinities, use them to route the SPIs */
+	if (using_spi && i == pdev->num_resources)
+		pmu->irq_affinity = irqs;
+	else
+		kfree(irqs);
+
+	return 0;
+}
+
+int arm_pmu_device_probe(struct platform_device *pdev,
+			 const struct of_device_id *of_table,
+			 const struct pmu_probe_info *probe_table)
+{
+	const struct of_device_id *of_id;
+	const int (*init_fn)(struct arm_pmu *);
+	struct device_node *node = pdev->dev.of_node;
+	struct arm_pmu *pmu;
+	int ret = -ENODEV;
+
+	pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL);
+	if (!pmu) {
+		pr_info("failed to allocate PMU device!\n");
+		return -ENOMEM;
+	}
+
+	if (!__oprofile_cpu_pmu)
+		__oprofile_cpu_pmu = pmu;
+
+	pmu->plat_device = pdev;
+
+	if (node && (of_id = of_match_node(of_table, pdev->dev.of_node))) {
+		init_fn = of_id->data;
+
+		ret = of_pmu_irq_cfg(pmu);
+		if (!ret)
+			ret = init_fn(pmu);
+	} else {
+		ret = probe_current_pmu(pmu, probe_table);
+		cpumask_setall(&pmu->supported_cpus);
+	}
+
+	if (ret) {
+		pr_info("failed to probe PMU!\n");
+		goto out_free;
+	}
+
+	ret = cpu_pmu_init(pmu);
+	if (ret)
+		goto out_free;
+
+	ret = armpmu_register(pmu, -1);
+	if (ret)
+		goto out_destroy;
+
+	return 0;
+
+out_destroy:
+	cpu_pmu_destroy(pmu);
+out_free:
+	pr_info("failed to register PMU devices!\n");
+	kfree(pmu);
+	return ret;
+}
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
new file mode 100644
index 000000000000..bfa673bb822d
--- /dev/null
+++ b/include/linux/perf/arm_pmu.h
@@ -0,0 +1,154 @@
+/*
+ *  linux/arch/arm/include/asm/pmu.h
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __ARM_PMU_H__
+#define __ARM_PMU_H__
+
+#include <linux/interrupt.h>
+#include <linux/perf_event.h>
+
+#include <asm/cputype.h>
+
+/*
+ * struct arm_pmu_platdata - ARM PMU platform data
+ *
+ * @handle_irq: an optional handler which will be called from the
+ *	interrupt and passed the address of the low level handler,
+ *	and can be used to implement any platform specific handling
+ *	before or after calling it.
+ */
+struct arm_pmu_platdata {
+	irqreturn_t (*handle_irq)(int irq, void *dev,
+				  irq_handler_t pmu_handler);
+};
+
+#ifdef CONFIG_ARM_PMU
+
+/*
+ * The ARMv7 CPU PMU supports up to 32 event counters.
+ */
+#define ARMPMU_MAX_HWEVENTS		32
+
+#define HW_OP_UNSUPPORTED		0xFFFF
+#define C(_x)				PERF_COUNT_HW_CACHE_##_x
+#define CACHE_OP_UNSUPPORTED		0xFFFF
+
+#define PERF_MAP_ALL_UNSUPPORTED					\
+	[0 ... PERF_COUNT_HW_MAX - 1] = HW_OP_UNSUPPORTED
+
+#define PERF_CACHE_MAP_ALL_UNSUPPORTED					\
+[0 ... C(MAX) - 1] = {							\
+	[0 ... C(OP_MAX) - 1] = {					\
+		[0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,	\
+	},								\
+}
+
+/* The events for a given PMU register set. */
+struct pmu_hw_events {
+	/*
+	 * The events that are active on the PMU for the given index.
+	 */
+	struct perf_event	*events[ARMPMU_MAX_HWEVENTS];
+
+	/*
+	 * A 1 bit for an index indicates that the counter is being used for
+	 * an event. A 0 means that the counter can be used.
+	 */
+	DECLARE_BITMAP(used_mask, ARMPMU_MAX_HWEVENTS);
+
+	/*
+	 * Hardware lock to serialize accesses to PMU registers. Needed for the
+	 * read/modify/write sequences.
+	 */
+	raw_spinlock_t		pmu_lock;
+
+	/*
+	 * When using percpu IRQs, we need a percpu dev_id. Place it here as we
+	 * already have to allocate this struct per cpu.
+	 */
+	struct arm_pmu		*percpu_pmu;
+};
+
+struct arm_pmu {
+	struct pmu	pmu;
+	cpumask_t	active_irqs;
+	cpumask_t	supported_cpus;
+	int		*irq_affinity;
+	char		*name;
+	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
+	void		(*enable)(struct perf_event *event);
+	void		(*disable)(struct perf_event *event);
+	int		(*get_event_idx)(struct pmu_hw_events *hw_events,
+					 struct perf_event *event);
+	void		(*clear_event_idx)(struct pmu_hw_events *hw_events,
+					 struct perf_event *event);
+	int		(*set_event_filter)(struct hw_perf_event *evt,
+					    struct perf_event_attr *attr);
+	u32		(*read_counter)(struct perf_event *event);
+	void		(*write_counter)(struct perf_event *event, u32 val);
+	void		(*start)(struct arm_pmu *);
+	void		(*stop)(struct arm_pmu *);
+	void		(*reset)(void *);
+	int		(*request_irq)(struct arm_pmu *, irq_handler_t handler);
+	void		(*free_irq)(struct arm_pmu *);
+	int		(*map_event)(struct perf_event *event);
+	int		num_events;
+	atomic_t	active_events;
+	struct mutex	reserve_mutex;
+	u64		max_period;
+	struct platform_device	*plat_device;
+	struct pmu_hw_events	__percpu *hw_events;
+	struct notifier_block	hotplug_nb;
+};
+
+#define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
+
+int armpmu_register(struct arm_pmu *armpmu, int type);
+
+u64 armpmu_event_update(struct perf_event *event);
+
+int armpmu_event_set_period(struct perf_event *event);
+
+int armpmu_map_event(struct perf_event *event,
+		     const unsigned (*event_map)[PERF_COUNT_HW_MAX],
+		     const unsigned (*cache_map)[PERF_COUNT_HW_CACHE_MAX]
+						[PERF_COUNT_HW_CACHE_OP_MAX]
+						[PERF_COUNT_HW_CACHE_RESULT_MAX],
+		     u32 raw_event_mask);
+
+struct pmu_probe_info {
+	unsigned int cpuid;
+	unsigned int mask;
+	int (*init)(struct arm_pmu *);
+};
+
+#define PMU_PROBE(_cpuid, _mask, _fn)	\
+{					\
+	.cpuid = (_cpuid),		\
+	.mask = (_mask),		\
+	.init = (_fn),			\
+}
+
+#define ARM_PMU_PROBE(_cpuid, _fn) \
+	PMU_PROBE(_cpuid, ARM_CPU_PART_MASK, _fn)
+
+#define ARM_PMU_XSCALE_MASK	((0xff << 24) | ARM_CPU_XSCALE_ARCH_MASK)
+
+#define XSCALE_PMU_PROBE(_version, _fn) \
+	PMU_PROBE(ARM_CPU_IMP_INTEL << 24 | _version, ARM_PMU_XSCALE_MASK, _fn)
+
+int arm_pmu_device_probe(struct platform_device *pdev,
+			 const struct of_device_id *of_table,
+			 const struct pmu_probe_info *probe_table);
+
+#endif /* CONFIG_ARM_PMU */
+
+#endif /* __ARM_PMU_H__ */
-- 
cgit v1.2.3-70-g09d2


From 34cadd9c1bcbd5ad5a1f379b013526a8046d4aed Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Thu, 30 Jul 2015 16:30:07 +0300
Subject: spi: pxa2xx: Add support for Intel Sunrisepoint

Major difference in LPSS SPI between Intel Sunrisepoint PCH and earlier
platforms is an integrated DMA (iDMA) engine. iDMA is an IP that is private
for each LPSS host controller (UART/SPI/I2C). Other differences are private
register space offset, a few private registers that are in different
location and FIFO thresholds.

Intel Sunrisepoint LPSS SPI and iDMA devices are probed and registered in
MFD layer as platform devices. Here these compound devices are detected by
matching against known PCI IDs. This allows us to share
pxa2xx_spi_acpi_get_pdata() for setting up the platform data instead of
duplicating it in MFD part.

This patch adds configuration for Intel Sunrisepoint LPSS SPI, above
detection and DMA filter function that picks the DMA channel only from an
associated iDMA block.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-pxa2xx.c   | 59 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/pxa2xx_ssp.h |  1 +
 2 files changed, 56 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index 7293d6d875c5..2c9fa409d2bf 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -21,6 +21,7 @@
 #include <linux/err.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
+#include <linux/pci.h>
 #include <linux/platform_device.h>
 #include <linux/spi/pxa2xx_spi.h>
 #include <linux/spi/spi.h>
@@ -97,6 +98,15 @@ static const struct lpss_config lpss_platforms[] = {
 		.tx_threshold_lo = 160,
 		.tx_threshold_hi = 224,
 	},
+	{	/* LPSS_SPT_SSP */
+		.offset = 0x200,
+		.reg_general = -1,
+		.reg_ssp = 0x20,
+		.reg_cs_ctrl = 0x24,
+		.rx_threshold = 1,
+		.tx_threshold_lo = 32,
+		.tx_threshold_hi = 56,
+	},
 };
 
 static inline const struct lpss_config
@@ -110,6 +120,7 @@ static bool is_lpss_ssp(const struct driver_data *drv_data)
 	switch (drv_data->ssp_type) {
 	case LPSS_LPT_SSP:
 	case LPSS_BYT_SSP:
+	case LPSS_SPT_SSP:
 		return true;
 	default:
 		return false;
@@ -1107,6 +1118,7 @@ static int setup(struct spi_device *spi)
 		break;
 	case LPSS_LPT_SSP:
 	case LPSS_BYT_SSP:
+	case LPSS_SPT_SSP:
 		config = lpss_get_config(drv_data);
 		tx_thres = config->tx_threshold_lo;
 		tx_hi_thres = config->tx_threshold_hi;
@@ -1276,6 +1288,30 @@ static const struct acpi_device_id pxa2xx_spi_acpi_match[] = {
 };
 MODULE_DEVICE_TABLE(acpi, pxa2xx_spi_acpi_match);
 
+/*
+ * PCI IDs of compound devices that integrate both host controller and private
+ * integrated DMA engine. Please note these are not used in module
+ * autoloading and probing in this module but matching the LPSS SSP type.
+ */
+static const struct pci_device_id pxa2xx_spi_pci_compound_match[] = {
+	/* SPT-LP */
+	{ PCI_VDEVICE(INTEL, 0x9d29), LPSS_SPT_SSP },
+	{ PCI_VDEVICE(INTEL, 0x9d2a), LPSS_SPT_SSP },
+	/* SPT-H */
+	{ PCI_VDEVICE(INTEL, 0xa129), LPSS_SPT_SSP },
+	{ PCI_VDEVICE(INTEL, 0xa12a), LPSS_SPT_SSP },
+};
+
+static bool pxa2xx_spi_idma_filter(struct dma_chan *chan, void *param)
+{
+	struct device *dev = param;
+
+	if (dev != chan->device->dev->parent)
+		return false;
+
+	return true;
+}
+
 static struct pxa2xx_spi_master *
 pxa2xx_spi_acpi_get_pdata(struct platform_device *pdev)
 {
@@ -1283,16 +1319,25 @@ pxa2xx_spi_acpi_get_pdata(struct platform_device *pdev)
 	struct acpi_device *adev;
 	struct ssp_device *ssp;
 	struct resource *res;
-	const struct acpi_device_id *id;
+	const struct acpi_device_id *adev_id = NULL;
+	const struct pci_device_id *pcidev_id = NULL;
 	int devid, type;
 
 	if (!ACPI_HANDLE(&pdev->dev) ||
 	    acpi_bus_get_device(ACPI_HANDLE(&pdev->dev), &adev))
 		return NULL;
 
-	id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev);
-	if (id)
-		type = (int)id->driver_data;
+	if (dev_is_pci(pdev->dev.parent))
+		pcidev_id = pci_match_id(pxa2xx_spi_pci_compound_match,
+					 to_pci_dev(pdev->dev.parent));
+	else
+		adev_id = acpi_match_device(pdev->dev.driver->acpi_match_table,
+					    &pdev->dev);
+
+	if (adev_id)
+		type = (int)adev_id->driver_data;
+	else if (pcidev_id)
+		type = (int)pcidev_id->driver_data;
 	else
 		return NULL;
 
@@ -1311,6 +1356,12 @@ pxa2xx_spi_acpi_get_pdata(struct platform_device *pdev)
 	if (IS_ERR(ssp->mmio_base))
 		return NULL;
 
+	if (pcidev_id) {
+		pdata->tx_param = pdev->dev.parent;
+		pdata->rx_param = pdev->dev.parent;
+		pdata->dma_filter = pxa2xx_spi_idma_filter;
+	}
+
 	ssp->clk = devm_clk_get(&pdev->dev, NULL);
 	ssp->irq = platform_get_irq(pdev, 0);
 	ssp->type = type;
diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 0485bab061fd..92273776bce6 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -197,6 +197,7 @@ enum pxa_ssp_type {
 	QUARK_X1000_SSP,
 	LPSS_LPT_SSP, /* Keep LPSS types sorted with lpss_platforms[] */
 	LPSS_BYT_SSP,
+	LPSS_SPT_SSP,
 };
 
 struct ssp_device {
-- 
cgit v1.2.3-70-g09d2


From ba2bbfbf63075850bb523e2adb815d45e3509995 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Thu, 18 Jun 2015 15:17:53 +0200
Subject: PM / Domains: Remove intermediate states from the power off sequence

Genpd's ->runtime_suspend() (assigned to pm_genpd_runtime_suspend())
doesn't immediately walk the hierarchy of ->runtime_suspend() callbacks.
Instead, pm_genpd_runtime_suspend() calls pm_genpd_poweroff() which
postpones that until *all* the devices in the genpd are runtime suspended.

When pm_genpd_poweroff() discovers that the last device in the genpd is
about to be runtime suspended, it calls __pm_genpd_save_device() for *all*
the devices in the genpd sequentially. Furthermore,
__pm_genpd_save_device() invokes the ->start() callback, walks the
hierarchy of the ->runtime_suspend() callbacks and invokes the ->stop()
callback. This causes a "thundering herd" problem.

Let's address this issue by having pm_genpd_runtime_suspend() immediately
walk the hierarchy of the ->runtime_suspend() callbacks, instead of
postponing that to the power off sequence via pm_genpd_poweroff(). If the
selected ->runtime_suspend() callback doesn't return an error code, call
pm_genpd_poweroff() to see if it's feasible to also power off the PM
domain.

Adopting this change enables us to simplify parts of the code in genpd,
for example the locking mechanism. Additionally, it gives some positive
side effects, as described below.

i)
One device's ->runtime_resume() latency is no longer affected by other
devices' latencies in a genpd.

The complexity genpd has to support the option to abort the power off
sequence suffers from latency issues. More precisely, a device that is
requested to be runtime resumed, may end up waiting for
__pm_genpd_save_device() to complete its operations for *another* device.
That's because pm_genpd_poweroff() can't confirm an abort request while it
waits for __pm_genpd_save_device() to return.

As this patch removes the intermediate states in pm_genpd_poweroff() while
powering off the PM domain, we no longer need the ability to abort that
sequence.

ii)
Make pm_runtime[_status]_suspended() reliable when used with genpd.

Until the last device in a genpd becomes idle, pm_genpd_runtime_suspend()
will return 0 without actually walking the hierarchy of the
->runtime_suspend() callbacks. However, by returning 0 the runtime PM core
considers the device as runtime_suspended, so
pm_runtime[_status]_suspended() will return true, even though the device
isn't (yet) runtime suspended.

After this patch, since pm_genpd_runtime_suspend() immediately walks the
hierarchy of the ->runtime_suspend() callbacks,
pm_runtime[_status]_suspended() will accurately reflect the status of the
device.

iii)
Enable fine-grained PM through runtime PM callbacks in drivers/subsystems.

There are currently cases were drivers/subsystems implements runtime PM
callbacks to deploy fine-grained PM (e.g. gate clocks, move pinctrl to
power-save state, etc.). While using the genpd, pm_genpd_runtime_suspend()
postpones invoking these callbacks until *all* the devices in the genpd
are runtime suspended. In essence, one runtime resumed device prevents
fine-grained PM for other devices within the same genpd.

After this patch, since pm_genpd_runtime_suspend() immediately walks the
hierarchy of the ->runtime_suspend() callbacks, fine-grained PM is enabled
throughout all the levels of runtime PM callbacks.

iiii)
Enable fine-grained PM for IRQ safe devices

Per the definition for an IRQ safe device, its runtime PM callbacks must
be able to execute in atomic context. In the path while genpd walks the
hierarchy of the ->runtime_suspend() callbacks for the device, it uses a
mutex. Therefore, genpd prevents that path to be executed for IRQ safe
devices.

As this patch changes pm_genpd_runtime_suspend() to immediately walk the
hierarchy of the ->runtime_suspend() callbacks and without needing to use
a mutex, fine-grained PM is enabled throughout all the levels of runtime
PM callbacks for IRQ safe devices.

Unfortunately this patch also comes with a drawback, as described in the
summary below.

Driver's/subsystem's runtime PM callbacks may be invoked even when the
genpd hasn't actually powered off the PM domain, potentially introducing
unnecessary latency.

However, in most cases, saving/restoring register contexts for devices are
typically fast operations or can be optimized in device specific ways
(e.g. shadow copies of register contents in memory, device-specific checks
to see if context has been lost before restoring context, etc.).

Still, in some cases the driver/subsystem may suffer from latency if
runtime PM is used in a very fine-grained manner (e.g. for each IO request
or xfer). To prevent that extra overhead, the driver/subsystem may deploy
the runtime PM autosuspend feature.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Kevin Hilman <khilman@linaro.org>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Tested-by: Lina Iyer <lina.iyer@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 363 ++++++++------------------------------------
 include/linux/pm_domain.h   |   7 -
 2 files changed, 62 insertions(+), 308 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 0ee43c1056e0..a1abe16dfe16 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -114,8 +114,12 @@ static int genpd_stop_dev(struct generic_pm_domain *genpd, struct device *dev)
 					stop_latency_ns, "stop");
 }
 
-static int genpd_start_dev(struct generic_pm_domain *genpd, struct device *dev)
+static int genpd_start_dev(struct generic_pm_domain *genpd, struct device *dev,
+			bool timed)
 {
+	if (!timed)
+		return GENPD_DEV_CALLBACK(genpd, int, start, dev);
+
 	return GENPD_DEV_TIMED_CALLBACK(genpd, int, start, dev,
 					start_latency_ns, "start");
 }
@@ -136,41 +140,6 @@ static void genpd_sd_counter_inc(struct generic_pm_domain *genpd)
 	smp_mb__after_atomic();
 }
 
-static void genpd_acquire_lock(struct generic_pm_domain *genpd)
-{
-	DEFINE_WAIT(wait);
-
-	mutex_lock(&genpd->lock);
-	/*
-	 * Wait for the domain to transition into either the active,
-	 * or the power off state.
-	 */
-	for (;;) {
-		prepare_to_wait(&genpd->status_wait_queue, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (genpd->status == GPD_STATE_ACTIVE
-		    || genpd->status == GPD_STATE_POWER_OFF)
-			break;
-		mutex_unlock(&genpd->lock);
-
-		schedule();
-
-		mutex_lock(&genpd->lock);
-	}
-	finish_wait(&genpd->status_wait_queue, &wait);
-}
-
-static void genpd_release_lock(struct generic_pm_domain *genpd)
-{
-	mutex_unlock(&genpd->lock);
-}
-
-static void genpd_set_active(struct generic_pm_domain *genpd)
-{
-	if (genpd->resume_count == 0)
-		genpd->status = GPD_STATE_ACTIVE;
-}
-
 static void genpd_recalc_cpu_exit_latency(struct generic_pm_domain *genpd)
 {
 	s64 usecs64;
@@ -251,35 +220,14 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool timed)
  * resume a device belonging to it.
  */
 static int __pm_genpd_poweron(struct generic_pm_domain *genpd)
-	__releases(&genpd->lock) __acquires(&genpd->lock)
 {
 	struct gpd_link *link;
-	DEFINE_WAIT(wait);
 	int ret = 0;
 
-	/* If the domain's master is being waited for, we have to wait too. */
-	for (;;) {
-		prepare_to_wait(&genpd->status_wait_queue, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (genpd->status != GPD_STATE_WAIT_MASTER)
-			break;
-		mutex_unlock(&genpd->lock);
-
-		schedule();
-
-		mutex_lock(&genpd->lock);
-	}
-	finish_wait(&genpd->status_wait_queue, &wait);
-
 	if (genpd->status == GPD_STATE_ACTIVE
 	    || (genpd->prepared_count > 0 && genpd->suspend_power_off))
 		return 0;
 
-	if (genpd->status != GPD_STATE_POWER_OFF) {
-		genpd_set_active(genpd);
-		return 0;
-	}
-
 	if (genpd->cpuidle_data) {
 		cpuidle_pause_and_lock();
 		genpd->cpuidle_data->idle_state->disabled = true;
@@ -294,20 +242,8 @@ static int __pm_genpd_poweron(struct generic_pm_domain *genpd)
 	 */
 	list_for_each_entry(link, &genpd->slave_links, slave_node) {
 		genpd_sd_counter_inc(link->master);
-		genpd->status = GPD_STATE_WAIT_MASTER;
-
-		mutex_unlock(&genpd->lock);
 
 		ret = pm_genpd_poweron(link->master);
-
-		mutex_lock(&genpd->lock);
-
-		/*
-		 * The "wait for parent" status is guaranteed not to change
-		 * while the master is powering on.
-		 */
-		genpd->status = GPD_STATE_POWER_OFF;
-		wake_up_all(&genpd->status_wait_queue);
 		if (ret) {
 			genpd_sd_counter_dec(link->master);
 			goto err;
@@ -319,8 +255,7 @@ static int __pm_genpd_poweron(struct generic_pm_domain *genpd)
 		goto err;
 
  out:
-	genpd_set_active(genpd);
-
+	genpd->status = GPD_STATE_ACTIVE;
 	return 0;
 
  err:
@@ -356,20 +291,18 @@ int pm_genpd_name_poweron(const char *domain_name)
 	return genpd ? pm_genpd_poweron(genpd) : -EINVAL;
 }
 
-static int genpd_start_dev_no_timing(struct generic_pm_domain *genpd,
-				     struct device *dev)
-{
-	return GENPD_DEV_CALLBACK(genpd, int, start, dev);
-}
-
 static int genpd_save_dev(struct generic_pm_domain *genpd, struct device *dev)
 {
 	return GENPD_DEV_TIMED_CALLBACK(genpd, int, save_state, dev,
 					save_state_latency_ns, "state save");
 }
 
-static int genpd_restore_dev(struct generic_pm_domain *genpd, struct device *dev)
+static int genpd_restore_dev(struct generic_pm_domain *genpd,
+			struct device *dev, bool timed)
 {
+	if (!timed)
+		return GENPD_DEV_CALLBACK(genpd, int, restore_state, dev);
+
 	return GENPD_DEV_TIMED_CALLBACK(genpd, int, restore_state, dev,
 					restore_state_latency_ns,
 					"state restore");
@@ -415,89 +348,6 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb,
 	return NOTIFY_DONE;
 }
 
-/**
- * __pm_genpd_save_device - Save the pre-suspend state of a device.
- * @pdd: Domain data of the device to save the state of.
- * @genpd: PM domain the device belongs to.
- */
-static int __pm_genpd_save_device(struct pm_domain_data *pdd,
-				  struct generic_pm_domain *genpd)
-	__releases(&genpd->lock) __acquires(&genpd->lock)
-{
-	struct generic_pm_domain_data *gpd_data = to_gpd_data(pdd);
-	struct device *dev = pdd->dev;
-	int ret = 0;
-
-	if (gpd_data->need_restore > 0)
-		return 0;
-
-	/*
-	 * If the value of the need_restore flag is still unknown at this point,
-	 * we trust that pm_genpd_poweroff() has verified that the device is
-	 * already runtime PM suspended.
-	 */
-	if (gpd_data->need_restore < 0) {
-		gpd_data->need_restore = 1;
-		return 0;
-	}
-
-	mutex_unlock(&genpd->lock);
-
-	genpd_start_dev(genpd, dev);
-	ret = genpd_save_dev(genpd, dev);
-	genpd_stop_dev(genpd, dev);
-
-	mutex_lock(&genpd->lock);
-
-	if (!ret)
-		gpd_data->need_restore = 1;
-
-	return ret;
-}
-
-/**
- * __pm_genpd_restore_device - Restore the pre-suspend state of a device.
- * @pdd: Domain data of the device to restore the state of.
- * @genpd: PM domain the device belongs to.
- */
-static void __pm_genpd_restore_device(struct pm_domain_data *pdd,
-				      struct generic_pm_domain *genpd)
-	__releases(&genpd->lock) __acquires(&genpd->lock)
-{
-	struct generic_pm_domain_data *gpd_data = to_gpd_data(pdd);
-	struct device *dev = pdd->dev;
-	int need_restore = gpd_data->need_restore;
-
-	gpd_data->need_restore = 0;
-	mutex_unlock(&genpd->lock);
-
-	genpd_start_dev(genpd, dev);
-
-	/*
-	 * Call genpd_restore_dev() for recently added devices too (need_restore
-	 * is negative then).
-	 */
-	if (need_restore)
-		genpd_restore_dev(genpd, dev);
-
-	mutex_lock(&genpd->lock);
-}
-
-/**
- * genpd_abort_poweroff - Check if a PM domain power off should be aborted.
- * @genpd: PM domain to check.
- *
- * Return true if a PM domain's status changed to GPD_STATE_ACTIVE during
- * a "power off" operation, which means that a "power on" has occured in the
- * meantime, or if its resume_count field is different from zero, which means
- * that one of its devices has been resumed in the meantime.
- */
-static bool genpd_abort_poweroff(struct generic_pm_domain *genpd)
-{
-	return genpd->status == GPD_STATE_WAIT_MASTER
-		|| genpd->status == GPD_STATE_ACTIVE || genpd->resume_count > 0;
-}
-
 /**
  * genpd_queue_power_off_work - Queue up the execution of pm_genpd_poweroff().
  * @genpd: PM domait to power off.
@@ -515,34 +365,26 @@ static void genpd_queue_power_off_work(struct generic_pm_domain *genpd)
  * @genpd: PM domain to power down.
  *
  * If all of the @genpd's devices have been suspended and all of its subdomains
- * have been powered down, run the runtime suspend callbacks provided by all of
- * the @genpd's devices' drivers and remove power from @genpd.
+ * have been powered down, remove power from @genpd.
  */
 static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
-	__releases(&genpd->lock) __acquires(&genpd->lock)
 {
 	struct pm_domain_data *pdd;
 	struct gpd_link *link;
-	unsigned int not_suspended;
-	int ret = 0;
+	unsigned int not_suspended = 0;
 
- start:
 	/*
 	 * Do not try to power off the domain in the following situations:
 	 * (1) The domain is already in the "power off" state.
-	 * (2) The domain is waiting for its master to power up.
-	 * (3) One of the domain's devices is being resumed right now.
-	 * (4) System suspend is in progress.
+	 * (2) System suspend is in progress.
 	 */
 	if (genpd->status == GPD_STATE_POWER_OFF
-	    || genpd->status == GPD_STATE_WAIT_MASTER
-	    || genpd->resume_count > 0 || genpd->prepared_count > 0)
+	    || genpd->prepared_count > 0)
 		return 0;
 
 	if (atomic_read(&genpd->sd_count) > 0)
 		return -EBUSY;
 
-	not_suspended = 0;
 	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
 		enum pm_qos_flags_status stat;
 
@@ -560,41 +402,11 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 	if (not_suspended > genpd->in_progress)
 		return -EBUSY;
 
-	if (genpd->poweroff_task) {
-		/*
-		 * Another instance of pm_genpd_poweroff() is executing
-		 * callbacks, so tell it to start over and return.
-		 */
-		genpd->status = GPD_STATE_REPEAT;
-		return 0;
-	}
-
 	if (genpd->gov && genpd->gov->power_down_ok) {
 		if (!genpd->gov->power_down_ok(&genpd->domain))
 			return -EAGAIN;
 	}
 
-	genpd->status = GPD_STATE_BUSY;
-	genpd->poweroff_task = current;
-
-	list_for_each_entry_reverse(pdd, &genpd->dev_list, list_node) {
-		ret = atomic_read(&genpd->sd_count) == 0 ?
-			__pm_genpd_save_device(pdd, genpd) : -EBUSY;
-
-		if (genpd_abort_poweroff(genpd))
-			goto out;
-
-		if (ret) {
-			genpd_set_active(genpd);
-			goto out;
-		}
-
-		if (genpd->status == GPD_STATE_REPEAT) {
-			genpd->poweroff_task = NULL;
-			goto start;
-		}
-	}
-
 	if (genpd->cpuidle_data) {
 		/*
 		 * If cpuidle_data is set, cpuidle should turn the domain off
@@ -607,14 +419,14 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 		cpuidle_pause_and_lock();
 		genpd->cpuidle_data->idle_state->disabled = false;
 		cpuidle_resume_and_unlock();
-		goto out;
+		return 0;
 	}
 
 	if (genpd->power_off) {
-		if (atomic_read(&genpd->sd_count) > 0) {
-			ret = -EBUSY;
-			goto out;
-		}
+		int ret;
+
+		if (atomic_read(&genpd->sd_count) > 0)
+			return -EBUSY;
 
 		/*
 		 * If sd_count > 0 at this point, one of the subdomains hasn't
@@ -625,10 +437,8 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 		 * happen very often).
 		 */
 		ret = genpd_power_off(genpd, true);
-		if (ret == -EBUSY) {
-			genpd_set_active(genpd);
-			goto out;
-		}
+		if (ret)
+			return ret;
 	}
 
 	genpd->status = GPD_STATE_POWER_OFF;
@@ -638,10 +448,7 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 		genpd_queue_power_off_work(link->master);
 	}
 
- out:
-	genpd->poweroff_task = NULL;
-	wake_up_all(&genpd->status_wait_queue);
-	return ret;
+	return 0;
 }
 
 /**
@@ -654,9 +461,9 @@ static void genpd_power_off_work_fn(struct work_struct *work)
 
 	genpd = container_of(work, struct generic_pm_domain, power_off_work);
 
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 	pm_genpd_poweroff(genpd);
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 }
 
 /**
@@ -670,7 +477,6 @@ static void genpd_power_off_work_fn(struct work_struct *work)
 static int pm_genpd_runtime_suspend(struct device *dev)
 {
 	struct generic_pm_domain *genpd;
-	struct generic_pm_domain_data *gpd_data;
 	bool (*stop_ok)(struct device *__dev);
 	int ret;
 
@@ -684,10 +490,16 @@ static int pm_genpd_runtime_suspend(struct device *dev)
 	if (stop_ok && !stop_ok(dev))
 		return -EBUSY;
 
-	ret = genpd_stop_dev(genpd, dev);
+	ret = genpd_save_dev(genpd, dev);
 	if (ret)
 		return ret;
 
+	ret = genpd_stop_dev(genpd, dev);
+	if (ret) {
+		genpd_restore_dev(genpd, dev, true);
+		return ret;
+	}
+
 	/*
 	 * If power.irq_safe is set, this routine will be run with interrupts
 	 * off, so it can't use mutexes.
@@ -696,16 +508,6 @@ static int pm_genpd_runtime_suspend(struct device *dev)
 		return 0;
 
 	mutex_lock(&genpd->lock);
-
-	/*
-	 * If we have an unknown state of the need_restore flag, it means none
-	 * of the runtime PM callbacks has been invoked yet. Let's update the
-	 * flag to reflect that the current state is active.
-	 */
-	gpd_data = to_gpd_data(dev->power.subsys_data->domain_data);
-	if (gpd_data->need_restore < 0)
-		gpd_data->need_restore = 0;
-
 	genpd->in_progress++;
 	pm_genpd_poweroff(genpd);
 	genpd->in_progress--;
@@ -725,8 +527,8 @@ static int pm_genpd_runtime_suspend(struct device *dev)
 static int pm_genpd_runtime_resume(struct device *dev)
 {
 	struct generic_pm_domain *genpd;
-	DEFINE_WAIT(wait);
 	int ret;
+	bool timed = true;
 
 	dev_dbg(dev, "%s()\n", __func__);
 
@@ -735,39 +537,21 @@ static int pm_genpd_runtime_resume(struct device *dev)
 		return -EINVAL;
 
 	/* If power.irq_safe, the PM domain is never powered off. */
-	if (dev->power.irq_safe)
-		return genpd_start_dev_no_timing(genpd, dev);
+	if (dev->power.irq_safe) {
+		timed = false;
+		goto out;
+	}
 
 	mutex_lock(&genpd->lock);
 	ret = __pm_genpd_poweron(genpd);
-	if (ret) {
-		mutex_unlock(&genpd->lock);
-		return ret;
-	}
-	genpd->status = GPD_STATE_BUSY;
-	genpd->resume_count++;
-	for (;;) {
-		prepare_to_wait(&genpd->status_wait_queue, &wait,
-				TASK_UNINTERRUPTIBLE);
-		/*
-		 * If current is the powering off task, we have been called
-		 * reentrantly from one of the device callbacks, so we should
-		 * not wait.
-		 */
-		if (!genpd->poweroff_task || genpd->poweroff_task == current)
-			break;
-		mutex_unlock(&genpd->lock);
+	mutex_unlock(&genpd->lock);
 
-		schedule();
+	if (ret)
+		return ret;
 
-		mutex_lock(&genpd->lock);
-	}
-	finish_wait(&genpd->status_wait_queue, &wait);
-	__pm_genpd_restore_device(dev->power.subsys_data->domain_data, genpd);
-	genpd->resume_count--;
-	genpd_set_active(genpd);
-	wake_up_all(&genpd->status_wait_queue);
-	mutex_unlock(&genpd->lock);
+ out:
+	genpd_start_dev(genpd, dev, timed);
+	genpd_restore_dev(genpd, dev, timed);
 
 	return 0;
 }
@@ -883,7 +667,7 @@ static void pm_genpd_sync_poweron(struct generic_pm_domain *genpd,
 {
 	struct gpd_link *link;
 
-	if (genpd->status != GPD_STATE_POWER_OFF)
+	if (genpd->status == GPD_STATE_ACTIVE)
 		return;
 
 	list_for_each_entry(link, &genpd->slave_links, slave_node) {
@@ -960,14 +744,14 @@ static int pm_genpd_prepare(struct device *dev)
 	if (resume_needed(dev, genpd))
 		pm_runtime_resume(dev);
 
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 
 	if (genpd->prepared_count++ == 0) {
 		genpd->suspended_count = 0;
 		genpd->suspend_power_off = genpd->status == GPD_STATE_POWER_OFF;
 	}
 
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 
 	if (genpd->suspend_power_off) {
 		pm_runtime_put_noidle(dev);
@@ -1102,7 +886,7 @@ static int pm_genpd_resume_noirq(struct device *dev)
 	pm_genpd_sync_poweron(genpd, true);
 	genpd->suspended_count--;
 
-	return genpd_start_dev(genpd, dev);
+	return genpd_start_dev(genpd, dev, true);
 }
 
 /**
@@ -1230,7 +1014,7 @@ static int pm_genpd_thaw_noirq(struct device *dev)
 	if (IS_ERR(genpd))
 		return -EINVAL;
 
-	return genpd->suspend_power_off ? 0 : genpd_start_dev(genpd, dev);
+	return genpd->suspend_power_off ? 0 : genpd_start_dev(genpd, dev, true);
 }
 
 /**
@@ -1324,7 +1108,7 @@ static int pm_genpd_restore_noirq(struct device *dev)
 
 	pm_genpd_sync_poweron(genpd, true);
 
-	return genpd_start_dev(genpd, dev);
+	return genpd_start_dev(genpd, dev, true);
 }
 
 /**
@@ -1440,7 +1224,6 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev,
 		gpd_data->td = *td;
 
 	gpd_data->base.dev = dev;
-	gpd_data->need_restore = -1;
 	gpd_data->td.constraint_changed = true;
 	gpd_data->td.effective_constraint_ns = -1;
 	gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier;
@@ -1502,7 +1285,7 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
 	if (IS_ERR(gpd_data))
 		return PTR_ERR(gpd_data);
 
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 
 	if (genpd->prepared_count > 0) {
 		ret = -EAGAIN;
@@ -1519,7 +1302,7 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
 	list_add_tail(&gpd_data->base.list_node, &genpd->dev_list);
 
  out:
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 
 	if (ret)
 		genpd_free_dev_data(dev, gpd_data);
@@ -1563,7 +1346,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd,
 	gpd_data = to_gpd_data(pdd);
 	dev_pm_qos_remove_notifier(dev, &gpd_data->nb);
 
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 
 	if (genpd->prepared_count > 0) {
 		ret = -EAGAIN;
@@ -1578,14 +1361,14 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd,
 
 	list_del_init(&pdd->list_node);
 
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 
 	genpd_free_dev_data(dev, gpd_data);
 
 	return 0;
 
  out:
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 	dev_pm_qos_add_notifier(dev, &gpd_data->nb);
 
 	return ret;
@@ -1606,17 +1389,9 @@ int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
 	    || genpd == subdomain)
 		return -EINVAL;
 
- start:
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 	mutex_lock_nested(&subdomain->lock, SINGLE_DEPTH_NESTING);
 
-	if (subdomain->status != GPD_STATE_POWER_OFF
-	    && subdomain->status != GPD_STATE_ACTIVE) {
-		mutex_unlock(&subdomain->lock);
-		genpd_release_lock(genpd);
-		goto start;
-	}
-
 	if (genpd->status == GPD_STATE_POWER_OFF
 	    &&  subdomain->status != GPD_STATE_POWER_OFF) {
 		ret = -EINVAL;
@@ -1644,7 +1419,7 @@ int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
 
  out:
 	mutex_unlock(&subdomain->lock);
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 
 	return ret;
 }
@@ -1692,8 +1467,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(subdomain))
 		return -EINVAL;
 
- start:
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 
 	list_for_each_entry(link, &genpd->master_links, master_node) {
 		if (link->slave != subdomain)
@@ -1701,13 +1475,6 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 
 		mutex_lock_nested(&subdomain->lock, SINGLE_DEPTH_NESTING);
 
-		if (subdomain->status != GPD_STATE_POWER_OFF
-		    && subdomain->status != GPD_STATE_ACTIVE) {
-			mutex_unlock(&subdomain->lock);
-			genpd_release_lock(genpd);
-			goto start;
-		}
-
 		list_del(&link->master_node);
 		list_del(&link->slave_node);
 		kfree(link);
@@ -1720,7 +1487,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 		break;
 	}
 
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 
 	return ret;
 }
@@ -1744,7 +1511,7 @@ int pm_genpd_attach_cpuidle(struct generic_pm_domain *genpd, int state)
 	if (IS_ERR_OR_NULL(genpd) || state < 0)
 		return -EINVAL;
 
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 
 	if (genpd->cpuidle_data) {
 		ret = -EEXIST;
@@ -1775,7 +1542,7 @@ int pm_genpd_attach_cpuidle(struct generic_pm_domain *genpd, int state)
 	genpd_recalc_cpu_exit_latency(genpd);
 
  out:
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 	return ret;
 
  err:
@@ -1812,7 +1579,7 @@ int pm_genpd_detach_cpuidle(struct generic_pm_domain *genpd)
 	if (IS_ERR_OR_NULL(genpd))
 		return -EINVAL;
 
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 
 	cpuidle_data = genpd->cpuidle_data;
 	if (!cpuidle_data) {
@@ -1830,7 +1597,7 @@ int pm_genpd_detach_cpuidle(struct generic_pm_domain *genpd)
 	kfree(cpuidle_data);
 
  out:
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 	return ret;
 }
 
@@ -1912,9 +1679,6 @@ void pm_genpd_init(struct generic_pm_domain *genpd,
 	genpd->in_progress = 0;
 	atomic_set(&genpd->sd_count, 0);
 	genpd->status = is_off ? GPD_STATE_POWER_OFF : GPD_STATE_ACTIVE;
-	init_waitqueue_head(&genpd->status_wait_queue);
-	genpd->poweroff_task = NULL;
-	genpd->resume_count = 0;
 	genpd->device_count = 0;
 	genpd->max_off_time_ns = -1;
 	genpd->max_off_time_changed = true;
@@ -2293,9 +2057,6 @@ static int pm_genpd_summary_one(struct seq_file *s,
 {
 	static const char * const status_lookup[] = {
 		[GPD_STATE_ACTIVE] = "on",
-		[GPD_STATE_WAIT_MASTER] = "wait-master",
-		[GPD_STATE_BUSY] = "busy",
-		[GPD_STATE_REPEAT] = "off-in-progress",
 		[GPD_STATE_POWER_OFF] = "off"
 	};
 	struct pm_domain_data *pm_data;
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 681ccb053f72..b2725e6e8e7b 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -22,9 +22,6 @@
 
 enum gpd_status {
 	GPD_STATE_ACTIVE = 0,	/* PM domain is active */
-	GPD_STATE_WAIT_MASTER,	/* PM domain's master is being waited for */
-	GPD_STATE_BUSY,		/* Something is happening to the PM domain */
-	GPD_STATE_REPEAT,	/* Power off in progress, to be repeated */
 	GPD_STATE_POWER_OFF,	/* PM domain is off */
 };
 
@@ -59,9 +56,6 @@ struct generic_pm_domain {
 	unsigned int in_progress;	/* Number of devices being suspended now */
 	atomic_t sd_count;	/* Number of subdomains with power "on" */
 	enum gpd_status status;	/* Current state of the domain */
-	wait_queue_head_t status_wait_queue;
-	struct task_struct *poweroff_task;	/* Powering off task */
-	unsigned int resume_count;	/* Number of devices being resumed */
 	unsigned int device_count;	/* Number of devices */
 	unsigned int suspended_count;	/* System suspend device counter */
 	unsigned int prepared_count;	/* Suspend counter of prepared devices */
@@ -113,7 +107,6 @@ struct generic_pm_domain_data {
 	struct pm_domain_data base;
 	struct gpd_timing_data td;
 	struct notifier_block nb;
-	int need_restore;
 };
 
 #ifdef CONFIG_PM_GENERIC_DOMAINS
-- 
cgit v1.2.3-70-g09d2


From f70ea018da0631e10c26a02f5a82d626ffef5bd7 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 31 Jul 2015 16:52:10 -0700
Subject: net: Add functions to get skb->hash based on flow structures

Add skb_get_hash_flowi6 and skb_get_hash_flowi4 which derive an sk_buff
hash from flowi6 and flowi4 structures respectively. These functions
can be called when creating a packet in the output path where the new
sk_buff does not yet contain a fully formed packet that is parsable by
flow dissector.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    | 21 +++++++++++++++++
 net/core/flow_dissector.c | 58 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 75 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 648a2c241993..b7c1286e247d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -37,6 +37,7 @@
 #include <net/flow_dissector.h>
 #include <linux/splice.h>
 #include <linux/in6.h>
+#include <net/flow.h>
 
 /* A. Checksumming of received packets by device.
  *
@@ -945,6 +946,26 @@ static inline __u32 skb_get_hash(struct sk_buff *skb)
 	return skb->hash;
 }
 
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6);
+
+static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+{
+	if (!skb->l4_hash && !skb->sw_hash)
+		__skb_get_hash_flowi6(skb, fl6);
+
+	return skb->hash;
+}
+
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl);
+
+static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+{
+	if (!skb->l4_hash && !skb->sw_hash)
+		__skb_get_hash_flowi4(skb, fl4);
+
+	return skb->hash;
+}
+
 __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb);
 
 static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2a834c6179b9..11e6540fa386 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -590,6 +590,15 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
+static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash,
+				     struct flow_keys *keys)
+{
+	if (keys->ports.ports)
+		skb->l4_hash = 1;
+	skb->sw_hash = 1;
+	skb->hash = hash;
+}
+
 /**
  * __skb_get_hash: calculate a flow hash
  * @skb: sk_buff to calculate flow hash from
@@ -609,10 +618,8 @@ void __skb_get_hash(struct sk_buff *skb)
 	hash = ___skb_get_hash(skb, &keys, hashrnd);
 	if (!hash)
 		return;
-	if (keys.ports.ports)
-		skb->l4_hash = 1;
-	skb->sw_hash = 1;
-	skb->hash = hash;
+
+	__skb_set_sw_hash(skb, hash, &keys);
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
@@ -624,6 +631,49 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
 }
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+{
+	struct flow_keys keys;
+
+	memset(&keys, 0, sizeof(keys));
+
+	memcpy(&keys.addrs.v6addrs.src, &fl6->saddr,
+	       sizeof(keys.addrs.v6addrs.src));
+	memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr,
+	       sizeof(keys.addrs.v6addrs.dst));
+	keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	keys.ports.src = fl6->fl6_sport;
+	keys.ports.dst = fl6->fl6_dport;
+	keys.keyid.keyid = fl6->fl6_gre_key;
+	keys.tags.flow_label = (__force u32)fl6->flowlabel;
+	keys.basic.ip_proto = fl6->flowi6_proto;
+
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+
+	return skb->hash;
+}
+EXPORT_SYMBOL(__skb_get_hash_flowi6);
+
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+{
+	struct flow_keys keys;
+
+	memset(&keys, 0, sizeof(keys));
+
+	keys.addrs.v4addrs.src = fl4->saddr;
+	keys.addrs.v4addrs.dst = fl4->daddr;
+	keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	keys.ports.src = fl4->fl4_sport;
+	keys.ports.dst = fl4->fl4_dport;
+	keys.keyid.keyid = fl4->fl4_gre_key;
+	keys.basic.ip_proto = fl4->flowi4_proto;
+
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+
+	return skb->hash;
+}
+EXPORT_SYMBOL(__skb_get_hash_flowi4);
+
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen)
 {
-- 
cgit v1.2.3-70-g09d2


From e52e95199d0c1aa8a06dbbc07b30562fded8b298 Mon Sep 17 00:00:00 2001
From: Cristina Opriceana <cristina.opriceana@gmail.com>
Date: Fri, 24 Jul 2015 16:23:43 +0300
Subject: include: linux: iio: Fix function parameter name in kernel doc

Fix buffer name from kernel doc according to the function parameter.

Signed-off-by: Cristina Opriceana <cristina.opriceana@gmail.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/consumer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h
index 26fb8f6342bb..fad58671c49e 100644
--- a/include/linux/iio/consumer.h
+++ b/include/linux/iio/consumer.h
@@ -100,7 +100,7 @@ void iio_channel_stop_all_cb(struct iio_cb_buffer *cb_buff);
 
 /**
  * iio_channel_cb_get_channels() - get access to the underlying channels.
- * @cb_buff:		The callback buffer from whom we want the channel
+ * @cb_buffer:		The callback buffer from whom we want the channel
  *			information.
  *
  * This function allows one to obtain information about the channels.
-- 
cgit v1.2.3-70-g09d2


From 2854c098e222a706b91cc83b4a91dbfd97212765 Mon Sep 17 00:00:00 2001
From: Cristina Opriceana <cristina.opriceana@gmail.com>
Date: Fri, 24 Jul 2015 16:26:09 +0300
Subject: include: linux: iio: Add missing kernel doc field

Fix kernel doc for the iio_dev_attr structure by adding its missing field.

Signed-off-by: Cristina Opriceana <cristina.opriceana@gmail.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/sysfs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/sysfs.h b/include/linux/iio/sysfs.h
index 8a1d18640ab9..9cd8f747212f 100644
--- a/include/linux/iio/sysfs.h
+++ b/include/linux/iio/sysfs.h
@@ -18,7 +18,8 @@ struct iio_chan_spec;
  * struct iio_dev_attr - iio specific device attribute
  * @dev_attr:	underlying device attribute
  * @address:	associated register address
- * @l:		list head for maintaining list of dynamically created attrs.
+ * @l:		list head for maintaining list of dynamically created attrs
+ * @c:		specification for the underlying channel
  */
 struct iio_dev_attr {
 	struct device_attribute dev_attr;
-- 
cgit v1.2.3-70-g09d2


From 76b235c6bcb16062d663e2ee96db0b69f2e6bc14 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 24 Jul 2015 14:45:44 +0200
Subject: jump_label: Rename JUMP_LABEL_{EN,DIS}ABLE to JUMP_LABEL_{JMP,NOP}

Since we've already stepped away from ENABLE is a JMP and DISABLE is a
NOP with the branch_default bits, and are going to make it even worse,
rename it to make it all clearer.

This way we don't mix multiple levels of logic attributes, but have a
plain 'physical' name for what the current instruction patching status
of a jump label is.

This is a first step in removing the naming confusion that has led to
a stream of avoidable bugs such as:

  a833581e372a ("x86, perf: Fix static_key bug in load_mm_cr4()")

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
[ Beefed up the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/jump_label.c     |  2 +-
 arch/arm64/kernel/jump_label.c   |  2 +-
 arch/mips/kernel/jump_label.c    |  2 +-
 arch/powerpc/kernel/jump_label.c |  2 +-
 arch/s390/kernel/jump_label.c    |  2 +-
 arch/sparc/kernel/jump_label.c   |  2 +-
 arch/x86/kernel/jump_label.c     |  2 +-
 include/linux/jump_label.h       |  4 ++--
 kernel/jump_label.c              | 18 +++++++++---------
 9 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index e39cbf488cfe..845a5dd9c42b 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -12,7 +12,7 @@ static void __arch_jump_label_transform(struct jump_entry *entry,
 	void *addr = (void *)entry->code;
 	unsigned int insn;
 
-	if (type == JUMP_LABEL_ENABLE)
+	if (type == JUMP_LABEL_JMP)
 		insn = arm_gen_branch(entry->code, entry->target);
 	else
 		insn = arm_gen_nop();
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index 4f1fec7a46db..c2dd1ad3e648 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -28,7 +28,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	void *addr = (void *)entry->code;
 	u32 insn;
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		insn = aarch64_insn_gen_branch_imm(entry->code,
 						   entry->target,
 						   AARCH64_INSN_BRANCH_NOLINK);
diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c
index dda800e9e731..3e586daa3a32 100644
--- a/arch/mips/kernel/jump_label.c
+++ b/arch/mips/kernel/jump_label.c
@@ -51,7 +51,7 @@ void arch_jump_label_transform(struct jump_entry *e,
 	/* Target must have the right alignment and ISA must be preserved. */
 	BUG_ON((e->target & J_ALIGN_MASK) != J_ISA_BIT);
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		insn.j_format.opcode = J_ISA_BIT ? mm_j32_op : j_op;
 		insn.j_format.target = e->target >> J_RANGE_SHIFT;
 	} else {
diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
index a1ed8a8c7cb4..6472472093d0 100644
--- a/arch/powerpc/kernel/jump_label.c
+++ b/arch/powerpc/kernel/jump_label.c
@@ -17,7 +17,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
 {
 	u32 *addr = (u32 *)(unsigned long)entry->code;
 
-	if (type == JUMP_LABEL_ENABLE)
+	if (type == JUMP_LABEL_JMP)
 		patch_branch(addr, entry->target, 0);
 	else
 		patch_instruction(addr, PPC_INST_NOP);
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index a90299600483..a83d2248fea9 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -64,7 +64,7 @@ static void __jump_label_transform(struct jump_entry *entry,
 {
 	struct insn old, new;
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		jump_label_make_nop(entry, &old);
 		jump_label_make_branch(entry, &new);
 	} else {
diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c
index 48565c11e82a..59bbeff55024 100644
--- a/arch/sparc/kernel/jump_label.c
+++ b/arch/sparc/kernel/jump_label.c
@@ -16,7 +16,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	u32 val;
 	u32 *insn = (u32 *) (unsigned long) entry->code;
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		s32 off = (s32)entry->target - (s32)entry->code;
 
 #ifdef CONFIG_SPARC64
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 26d5a55a2736..e565e0e4d216 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -45,7 +45,7 @@ static void __jump_label_transform(struct jump_entry *entry,
 	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
 	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
 
-	if (type == JUMP_LABEL_ENABLE) {
+	if (type == JUMP_LABEL_JMP) {
 		if (init) {
 			/*
 			 * Jump label is enabled for the first time.
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index f4de473f226b..6a8b4fe10ad8 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -86,8 +86,8 @@ struct static_key {
 #ifndef __ASSEMBLY__
 
 enum jump_label_type {
-	JUMP_LABEL_DISABLE = 0,
-	JUMP_LABEL_ENABLE,
+	JUMP_LABEL_NOP = 0,
+	JUMP_LABEL_JMP,
 };
 
 struct module;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52ebaca1b9fc..96d8945c8bf3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -65,9 +65,9 @@ void static_key_slow_inc(struct static_key *key)
 	jump_label_lock();
 	if (atomic_read(&key->enabled) == 0) {
 		if (!jump_label_get_branch_default(key))
-			jump_label_update(key, JUMP_LABEL_ENABLE);
+			jump_label_update(key, JUMP_LABEL_JMP);
 		else
-			jump_label_update(key, JUMP_LABEL_DISABLE);
+			jump_label_update(key, JUMP_LABEL_NOP);
 	}
 	atomic_inc(&key->enabled);
 	jump_label_unlock();
@@ -88,9 +88,9 @@ static void __static_key_slow_dec(struct static_key *key,
 		schedule_delayed_work(work, rate_limit);
 	} else {
 		if (!jump_label_get_branch_default(key))
-			jump_label_update(key, JUMP_LABEL_DISABLE);
+			jump_label_update(key, JUMP_LABEL_NOP);
 		else
-			jump_label_update(key, JUMP_LABEL_ENABLE);
+			jump_label_update(key, JUMP_LABEL_JMP);
 	}
 	jump_label_unlock();
 }
@@ -184,9 +184,9 @@ static enum jump_label_type jump_label_type(struct static_key *key)
 	bool state = static_key_enabled(key);
 
 	if ((!true_branch && state) || (true_branch && !state))
-		return JUMP_LABEL_ENABLE;
+		return JUMP_LABEL_JMP;
 
-	return JUMP_LABEL_DISABLE;
+	return JUMP_LABEL_NOP;
 }
 
 void __init jump_label_init(void)
@@ -276,7 +276,7 @@ void jump_label_apply_nops(struct module *mod)
 		return;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+		arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
 	}
 }
 
@@ -318,8 +318,8 @@ static int jump_label_add_module(struct module *mod)
 		jlm->next = key->next;
 		key->next = jlm;
 
-		if (jump_label_type(key) == JUMP_LABEL_ENABLE)
-			__jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+		if (jump_label_type(key) == JUMP_LABEL_JMP)
+			__jump_label_update(key, iter, iter_stop, JUMP_LABEL_JMP);
 	}
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From a1efb01feca597b2abbc89873b40ef8ec6690168 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 24 Jul 2015 14:55:40 +0200
Subject: jump_label, locking/static_keys: Rename JUMP_LABEL_TYPE_* and related
 helpers to the static_key* pattern

Rename the JUMP_LABEL_TYPE_* macros to be JUMP_TYPE_* and move the
inline helpers into kernel/jump_label.c, since that's the only place
they're ever used.

Also rename the helpers where it's all about static keys.

This is the second step in removing the naming confusion that has led to
a stream of avoidable bugs such as:

  a833581e372a ("x86, perf: Fix static_key bug in load_mm_cr4()")

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 25 +++++--------------------
 kernel/jump_label.c        | 25 ++++++++++++++++---------
 2 files changed, 21 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 6a8b4fe10ad8..0ddb208b8449 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -101,24 +101,9 @@ static inline int static_key_count(struct static_key *key)
 
 #ifdef HAVE_JUMP_LABEL
 
-#define JUMP_LABEL_TYPE_FALSE_BRANCH	0UL
-#define JUMP_LABEL_TYPE_TRUE_BRANCH	1UL
-#define JUMP_LABEL_TYPE_MASK		1UL
-
-static
-inline struct jump_entry *jump_label_get_entries(struct static_key *key)
-{
-	return (struct jump_entry *)((unsigned long)key->entries
-						& ~JUMP_LABEL_TYPE_MASK);
-}
-
-static inline bool jump_label_get_branch_default(struct static_key *key)
-{
-	if (((unsigned long)key->entries & JUMP_LABEL_TYPE_MASK) ==
-	    JUMP_LABEL_TYPE_TRUE_BRANCH)
-		return true;
-	return false;
-}
+#define JUMP_TYPE_FALSE	0UL
+#define JUMP_TYPE_TRUE	1UL
+#define JUMP_TYPE_MASK	1UL
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
@@ -147,10 +132,10 @@ extern void jump_label_apply_nops(struct module *mod);
 
 #define STATIC_KEY_INIT_TRUE ((struct static_key)		\
 	{ .enabled = ATOMIC_INIT(1),				\
-	  .entries = (void *)JUMP_LABEL_TYPE_TRUE_BRANCH })
+	  .entries = (void *)JUMP_TYPE_TRUE })
 #define STATIC_KEY_INIT_FALSE ((struct static_key)		\
 	{ .enabled = ATOMIC_INIT(0),				\
-	  .entries = (void *)JUMP_LABEL_TYPE_FALSE_BRANCH })
+	  .entries = (void *)JUMP_TYPE_FALSE })
 
 #else  /* !HAVE_JUMP_LABEL */
 
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 96d8945c8bf3..85a2a0086c67 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -56,6 +56,11 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
 
 static void jump_label_update(struct static_key *key, int enable);
 
+static inline bool static_key_type(struct static_key *key)
+{
+	return (unsigned long)key->entries & JUMP_TYPE_MASK;
+}
+
 void static_key_slow_inc(struct static_key *key)
 {
 	STATIC_KEY_CHECK_USE();
@@ -64,7 +69,7 @@ void static_key_slow_inc(struct static_key *key)
 
 	jump_label_lock();
 	if (atomic_read(&key->enabled) == 0) {
-		if (!jump_label_get_branch_default(key))
+		if (!static_key_type(key))
 			jump_label_update(key, JUMP_LABEL_JMP);
 		else
 			jump_label_update(key, JUMP_LABEL_NOP);
@@ -87,7 +92,7 @@ static void __static_key_slow_dec(struct static_key *key,
 		atomic_inc(&key->enabled);
 		schedule_delayed_work(work, rate_limit);
 	} else {
-		if (!jump_label_get_branch_default(key))
+		if (!static_key_type(key))
 			jump_label_update(key, JUMP_LABEL_NOP);
 		else
 			jump_label_update(key, JUMP_LABEL_JMP);
@@ -178,15 +183,17 @@ static void __jump_label_update(struct static_key *key,
 	}
 }
 
-static enum jump_label_type jump_label_type(struct static_key *key)
+static inline struct jump_entry *static_key_entries(struct static_key *key)
 {
-	bool true_branch = jump_label_get_branch_default(key);
-	bool state = static_key_enabled(key);
+	return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+}
 
-	if ((!true_branch && state) || (true_branch && !state))
-		return JUMP_LABEL_JMP;
+static enum jump_label_type jump_label_type(struct static_key *key)
+{
+	bool enabled = static_key_enabled(key);
+	bool type = static_key_type(key);
 
-	return JUMP_LABEL_NOP;
+	return enabled ^ type;
 }
 
 void __init jump_label_init(void)
@@ -442,7 +449,7 @@ int jump_label_text_reserved(void *start, void *end)
 static void jump_label_update(struct static_key *key, int enable)
 {
 	struct jump_entry *stop = __stop___jump_table;
-	struct jump_entry *entry = jump_label_get_entries(key);
+	struct jump_entry *entry = static_key_entries(key);
 #ifdef CONFIG_MODULES
 	struct module *mod;
 
-- 
cgit v1.2.3-70-g09d2


From e33886b38cc82a9fc3b2d655dfc7f50467594138 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 24 Jul 2015 15:03:40 +0200
Subject: locking/static_keys: Add static_key_{en,dis}able() helpers

Add two helpers to make it easier to treat the refcount as boolean.

Suggested-by: Jason Baron <jasonbaron0@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 20 ++++++++++++++++++++
 kernel/sched/core.c        |  6 ++----
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 0ddb208b8449..65f0ebac63cf 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -198,6 +198,26 @@ static inline bool static_key_enabled(struct static_key *key)
 	return static_key_count(key) > 0;
 }
 
+static inline void static_key_enable(struct static_key *key)
+{
+	int count = static_key_count(key);
+
+	WARN_ON_ONCE(count < 0 || count > 1);
+
+	if (!count)
+		static_key_slow_inc(key);
+}
+
+static inline void static_key_disable(struct static_key *key)
+{
+	int count = static_key_count(key);
+
+	WARN_ON_ONCE(count < 0 || count > 1);
+
+	if (count)
+		static_key_slow_dec(key);
+}
+
 #endif	/* _LINUX_JUMP_LABEL_H */
 
 #endif /* __ASSEMBLY__ */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..66ae8baf42fe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 
 static void sched_feat_disable(int i)
 {
-	if (static_key_enabled(&sched_feat_keys[i]))
-		static_key_slow_dec(&sched_feat_keys[i]);
+	static_key_disable(&sched_feat_keys[i]);
 }
 
 static void sched_feat_enable(int i)
 {
-	if (!static_key_enabled(&sched_feat_keys[i]))
-		static_key_slow_inc(&sched_feat_keys[i]);
+	static_key_enable(&sched_feat_keys[i]);
 }
 #else
 static void sched_feat_disable(int i) { };
-- 
cgit v1.2.3-70-g09d2


From 11276d5306b8e5b438a36bbff855fe792d7eaa61 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 24 Jul 2015 15:09:55 +0200
Subject: locking/static_keys: Add a new static_key interface

There are various problems and short-comings with the current
static_key interface:

 - static_key_{true,false}() read like a branch depending on the key
   value, instead of the actual likely/unlikely branch depending on
   init value.

 - static_key_{true,false}() are, as stated above, tied to the
   static_key init values STATIC_KEY_INIT_{TRUE,FALSE}.

 - we're limited to the 2 (out of 4) possible options that compile to
   a default NOP because that's what our arch_static_branch() assembly
   emits.

So provide a new static_key interface:

  DEFINE_STATIC_KEY_TRUE(name);
  DEFINE_STATIC_KEY_FALSE(name);

Which define a key of different types with an initial true/false
value.

Then allow:

   static_branch_likely()
   static_branch_unlikely()

to take a key of either type and emit the right instruction for the
case.

This means adding a second arch_static_branch_jump() assembly helper
which emits a JMP per default.

In order to determine the right instruction for the right state,
encode the branch type in the LSB of jump_entry::key.

This is the final step in removing the naming confusion that has led to
a stream of avoidable bugs such as:

  a833581e372a ("x86, perf: Fix static_key bug in load_mm_cr4()")

... but it also allows new static key combinations that will give us
performance enhancements in the subsequent patches.

Tested-by: Rabin Vincent <rabin@rab.in> # arm
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> # ppc
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com> # s390
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/include/asm/jump_label.h     |  25 ++++--
 arch/arm64/include/asm/jump_label.h   |  18 +++-
 arch/mips/include/asm/jump_label.h    |  19 ++++-
 arch/powerpc/include/asm/jump_label.h |  19 ++++-
 arch/s390/include/asm/jump_label.h    |  19 ++++-
 arch/sparc/include/asm/jump_label.h   |  35 ++++++--
 arch/x86/include/asm/jump_label.h     |  21 ++++-
 include/linux/jump_label.h            | 149 +++++++++++++++++++++++++++++++---
 kernel/jump_label.c                   |  37 +++++++--
 9 files changed, 298 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h
index 5f337dc5c108..34f7b6980d21 100644
--- a/arch/arm/include/asm/jump_label.h
+++ b/arch/arm/include/asm/jump_label.h
@@ -4,23 +4,32 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <asm/unified.h>
 
 #define JUMP_LABEL_NOP_SIZE 4
 
-#ifdef CONFIG_THUMB2_KERNEL
-#define JUMP_LABEL_NOP	"nop.w"
-#else
-#define JUMP_LABEL_NOP	"nop"
-#endif
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
+{
+	asm_volatile_goto("1:\n\t"
+		 WASM(nop) "\n\t"
+		 ".pushsection __jump_table,  \"aw\"\n\t"
+		 ".word 1b, %l[l_yes], %c0\n\t"
+		 ".popsection\n\t"
+		 : :  "i" (&((char *)key)[branch]) :  : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
 
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:\n\t"
-		 JUMP_LABEL_NOP "\n\t"
+		 WASM(b) " %l[l_yes]\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 ".word 1b, %l[l_yes], %c0\n\t"
 		 ".popsection\n\t"
-		 : :  "i" (key) :  : l_yes);
+		 : :  "i" (&((char *)key)[branch]) :  : l_yes);
 
 	return false;
 l_yes:
diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h
index c0e5165c2f76..1b5e0e843c3a 100644
--- a/arch/arm64/include/asm/jump_label.h
+++ b/arch/arm64/include/asm/jump_label.h
@@ -26,14 +26,28 @@
 
 #define JUMP_LABEL_NOP_SIZE		AARCH64_INSN_SIZE
 
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm goto("1: nop\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 ".align 3\n\t"
 		 ".quad 1b, %l[l_yes], %c0\n\t"
 		 ".popsection\n\t"
-		 :  :  "i"(key) :  : l_yes);
+		 :  :  "i"(&((char *)key)[branch]) :  : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+	asm goto("1: b %l[l_yes]\n\t"
+		 ".pushsection __jump_table,  \"aw\"\n\t"
+		 ".align 3\n\t"
+		 ".quad 1b, %l[l_yes], %c0\n\t"
+		 ".popsection\n\t"
+		 :  :  "i"(&((char *)key)[branch]) :  : l_yes);
 
 	return false;
 l_yes:
diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h
index 608aa57799c8..e77672539e8e 100644
--- a/arch/mips/include/asm/jump_label.h
+++ b/arch/mips/include/asm/jump_label.h
@@ -26,14 +26,29 @@
 #define NOP_INSN "nop"
 #endif
 
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:\t" NOP_INSN "\n\t"
 		"nop\n\t"
 		".pushsection __jump_table,  \"aw\"\n\t"
 		WORD_INSN " 1b, %l[l_yes], %0\n\t"
 		".popsection\n\t"
-		: :  "i" (key) : : l_yes);
+		: :  "i" (&((char *)key)[branch]) : : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+	asm_volatile_goto("1:\tj %l[l_yes]\n\t"
+		"nop\n\t"
+		".pushsection __jump_table,  \"aw\"\n\t"
+		WORD_INSN " 1b, %l[l_yes], %0\n\t"
+		".popsection\n\t"
+		: :  "i" (&((char *)key)[branch]) : : l_yes);
+
 	return false;
 l_yes:
 	return true;
diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h
index efbf9a322a23..47e155f15433 100644
--- a/arch/powerpc/include/asm/jump_label.h
+++ b/arch/powerpc/include/asm/jump_label.h
@@ -18,14 +18,29 @@
 #define JUMP_ENTRY_TYPE		stringify_in_c(FTR_ENTRY_LONG)
 #define JUMP_LABEL_NOP_SIZE	4
 
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:\n\t"
 		 "nop\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t"
 		 ".popsection \n\t"
-		 : :  "i" (key) : : l_yes);
+		 : :  "i" (&((char *)key)[branch]) : : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+	asm_volatile_goto("1:\n\t"
+		 "b %l[l_yes]\n\t"
+		 ".pushsection __jump_table,  \"aw\"\n\t"
+		 JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t"
+		 ".popsection \n\t"
+		 : :  "i" (&((char *)key)[branch]) : : l_yes);
+
 	return false;
 l_yes:
 	return true;
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index 69972b7957ee..7f9fd5e3f1bf 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -12,14 +12,29 @@
  * We use a brcl 0,2 instruction for jump labels at compile time so it
  * can be easily distinguished from a hotpatch generated instruction.
  */
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("0:	brcl 0,"__stringify(JUMP_LABEL_NOP_OFFSET)"\n"
 		".pushsection __jump_table, \"aw\"\n"
 		".balign 8\n"
 		".quad 0b, %l[label], %0\n"
 		".popsection\n"
-		: : "X" (key) : : label);
+		: : "X" (&((char *)key)[branch]) : : label);
+
+	return false;
+label:
+	return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+	asm_volatile_goto("0:	brcl 15, %l[label]\n"
+		".pushsection __jump_table, \"aw\"\n"
+		".balign 8\n"
+		".quad 0b, %l[label], %0\n"
+		".popsection\n"
+		: : "X" (&((char *)key)[branch]) : : label);
+
 	return false;
 label:
 	return true;
diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h
index cc9b04a2b11b..62d0354d1727 100644
--- a/arch/sparc/include/asm/jump_label.h
+++ b/arch/sparc/include/asm/jump_label.h
@@ -7,16 +7,33 @@
 
 #define JUMP_LABEL_NOP_SIZE 4
 
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-		asm_volatile_goto("1:\n\t"
-			 "nop\n\t"
-			 "nop\n\t"
-			 ".pushsection __jump_table,  \"aw\"\n\t"
-			 ".align 4\n\t"
-			 ".word 1b, %l[l_yes], %c0\n\t"
-			 ".popsection \n\t"
-			 : :  "i" (key) : : l_yes);
+	asm_volatile_goto("1:\n\t"
+		 "nop\n\t"
+		 "nop\n\t"
+		 ".pushsection __jump_table,  \"aw\"\n\t"
+		 ".align 4\n\t"
+		 ".word 1b, %l[l_yes], %c0\n\t"
+		 ".popsection \n\t"
+		 : :  "i" (&((char *)key)[branch]) : : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+	asm_volatile_goto("1:\n\t"
+		 "b %l[l_yes]\n\t"
+		 "nop\n\t"
+		 ".pushsection __jump_table,  \"aw\"\n\t"
+		 ".align 4\n\t"
+		 ".word 1b, %l[l_yes], %c0\n\t"
+		 ".popsection \n\t"
+		 : :  "i" (&((char *)key)[branch]) : : l_yes);
+
 	return false;
 l_yes:
 	return true;
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index a4c1cf7e93f8..28d7a857f9d1 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -16,7 +16,7 @@
 # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
 #endif
 
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:"
 		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
@@ -24,7 +24,24 @@ static __always_inline bool arch_static_branch(struct static_key *key)
 		_ASM_ALIGN "\n\t"
 		_ASM_PTR "1b, %l[l_yes], %c0 \n\t"
 		".popsection \n\t"
-		: :  "i" (key) : : l_yes);
+		: :  "i" (&((char *)key)[branch]) : : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+	asm_volatile_goto("1:"
+		".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
+		"2:\n\t"
+		".pushsection __jump_table,  \"aw\" \n\t"
+		_ASM_ALIGN "\n\t"
+		_ASM_PTR "1b, %l[l_yes], %c0 \n\t"
+		".popsection \n\t"
+		: :  "i" (&((char *)key)[branch]) : : l_yes);
+
 	return false;
 l_yes:
 	return true;
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 65f0ebac63cf..e337a1961933 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -107,12 +107,12 @@ static inline int static_key_count(struct static_key *key)
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
-	return arch_static_branch(key);
+	return arch_static_branch(key, false);
 }
 
 static __always_inline bool static_key_true(struct static_key *key)
 {
-	return !static_key_false(key);
+	return !arch_static_branch(key, true);
 }
 
 extern struct jump_entry __start___jump_table[];
@@ -130,12 +130,12 @@ extern void static_key_slow_inc(struct static_key *key);
 extern void static_key_slow_dec(struct static_key *key);
 extern void jump_label_apply_nops(struct module *mod);
 
-#define STATIC_KEY_INIT_TRUE ((struct static_key)		\
+#define STATIC_KEY_INIT_TRUE					\
 	{ .enabled = ATOMIC_INIT(1),				\
-	  .entries = (void *)JUMP_TYPE_TRUE })
-#define STATIC_KEY_INIT_FALSE ((struct static_key)		\
+	  .entries = (void *)JUMP_TYPE_TRUE }
+#define STATIC_KEY_INIT_FALSE					\
 	{ .enabled = ATOMIC_INIT(0),				\
-	  .entries = (void *)JUMP_TYPE_FALSE })
+	  .entries = (void *)JUMP_TYPE_FALSE }
 
 #else  /* !HAVE_JUMP_LABEL */
 
@@ -183,10 +183,8 @@ static inline int jump_label_apply_nops(struct module *mod)
 	return 0;
 }
 
-#define STATIC_KEY_INIT_TRUE ((struct static_key) \
-		{ .enabled = ATOMIC_INIT(1) })
-#define STATIC_KEY_INIT_FALSE ((struct static_key) \
-		{ .enabled = ATOMIC_INIT(0) })
+#define STATIC_KEY_INIT_TRUE	{ .enabled = ATOMIC_INIT(1) }
+#define STATIC_KEY_INIT_FALSE	{ .enabled = ATOMIC_INIT(0) }
 
 #endif	/* HAVE_JUMP_LABEL */
 
@@ -218,6 +216,137 @@ static inline void static_key_disable(struct static_key *key)
 		static_key_slow_dec(key);
 }
 
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Two type wrappers around static_key, such that we can use compile time
+ * type differentiation to emit the right code.
+ *
+ * All the below code is macros in order to play type games.
+ */
+
+struct static_key_true {
+	struct static_key key;
+};
+
+struct static_key_false {
+	struct static_key key;
+};
+
+#define STATIC_KEY_TRUE_INIT  (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE,  }
+#define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, }
+
+#define DEFINE_STATIC_KEY_TRUE(name)	\
+	struct static_key_true name = STATIC_KEY_TRUE_INIT
+
+#define DEFINE_STATIC_KEY_FALSE(name)	\
+	struct static_key_false name = STATIC_KEY_FALSE_INIT
+
+#ifdef HAVE_JUMP_LABEL
+
+/*
+ * Combine the right initial value (type) with the right branch order
+ * to generate the desired result.
+ *
+ *
+ * type\branch|	likely (1)	      |	unlikely (0)
+ * -----------+-----------------------+------------------
+ *            |                       |
+ *  true (1)  |	   ...		      |	   ...
+ *            |    NOP		      |	   JMP L
+ *            |    <br-stmts>	      |	1: ...
+ *            |	L: ...		      |
+ *            |			      |
+ *            |			      |	L: <br-stmts>
+ *            |			      |	   jmp 1b
+ *            |                       |
+ * -----------+-----------------------+------------------
+ *            |                       |
+ *  false (0) |	   ...		      |	   ...
+ *            |    JMP L	      |	   NOP
+ *            |    <br-stmts>	      |	1: ...
+ *            |	L: ...		      |
+ *            |			      |
+ *            |			      |	L: <br-stmts>
+ *            |			      |	   jmp 1b
+ *            |                       |
+ * -----------+-----------------------+------------------
+ *
+ * The initial value is encoded in the LSB of static_key::entries,
+ * type: 0 = false, 1 = true.
+ *
+ * The branch type is encoded in the LSB of jump_entry::key,
+ * branch: 0 = unlikely, 1 = likely.
+ *
+ * This gives the following logic table:
+ *
+ *	enabled	type	branch	  instuction
+ * -----------------------------+-----------
+ *	0	0	0	| NOP
+ *	0	0	1	| JMP
+ *	0	1	0	| NOP
+ *	0	1	1	| JMP
+ *
+ *	1	0	0	| JMP
+ *	1	0	1	| NOP
+ *	1	1	0	| JMP
+ *	1	1	1	| NOP
+ *
+ * Which gives the following functions:
+ *
+ *   dynamic: instruction = enabled ^ branch
+ *   static:  instruction = type ^ branch
+ *
+ * See jump_label_type() / jump_label_init_type().
+ */
+
+extern bool ____wrong_branch_error(void);
+
+#define static_branch_likely(x)							\
+({										\
+	bool branch;								\
+	if (__builtin_types_compatible_p(typeof(*x), struct static_key_true))	\
+		branch = !arch_static_branch(&(x)->key, true);			\
+	else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
+		branch = !arch_static_branch_jump(&(x)->key, true);		\
+	else									\
+		branch = ____wrong_branch_error();				\
+	branch;									\
+})
+
+#define static_branch_unlikely(x)						\
+({										\
+	bool branch;								\
+	if (__builtin_types_compatible_p(typeof(*x), struct static_key_true))	\
+		branch = arch_static_branch_jump(&(x)->key, false);		\
+	else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
+		branch = arch_static_branch(&(x)->key, false);			\
+	else									\
+		branch = ____wrong_branch_error();				\
+	branch;									\
+})
+
+#else /* !HAVE_JUMP_LABEL */
+
+#define static_branch_likely(x)		likely(static_key_enabled(&(x)->key))
+#define static_branch_unlikely(x)	unlikely(static_key_enabled(&(x)->key))
+
+#endif /* HAVE_JUMP_LABEL */
+
+/*
+ * Advanced usage; refcount, branch is enabled when: count != 0
+ */
+
+#define static_branch_inc(x)		static_key_slow_inc(&(x)->key)
+#define static_branch_dec(x)		static_key_slow_dec(&(x)->key)
+
+/*
+ * Normal usage; boolean enable/disable.
+ */
+
+#define static_branch_enable(x)		static_key_enable(&(x)->key)
+#define static_branch_disable(x)	static_key_disable(&(x)->key)
+
 #endif	/* _LINUX_JUMP_LABEL_H */
 
 #endif /* __ASSEMBLY__ */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 2e7cc1e4b4b5..8fd00d892286 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -165,16 +165,22 @@ static inline bool static_key_type(struct static_key *key)
 
 static inline struct static_key *jump_entry_key(struct jump_entry *entry)
 {
-	return (struct static_key *)((unsigned long)entry->key);
+	return (struct static_key *)((unsigned long)entry->key & ~1UL);
+}
+
+static bool jump_entry_branch(struct jump_entry *entry)
+{
+	return (unsigned long)entry->key & 1UL;
 }
 
 static enum jump_label_type jump_label_type(struct jump_entry *entry)
 {
 	struct static_key *key = jump_entry_key(entry);
 	bool enabled = static_key_enabled(key);
-	bool type = static_key_type(key);
+	bool branch = jump_entry_branch(entry);
 
-	return enabled ^ type;
+	/* See the comment in linux/jump_label.h */
+	return enabled ^ branch;
 }
 
 static void __jump_label_update(struct static_key *key,
@@ -205,7 +211,10 @@ void __init jump_label_init(void)
 	for (iter = iter_start; iter < iter_stop; iter++) {
 		struct static_key *iterk;
 
-		arch_jump_label_transform_static(iter, jump_label_type(iter));
+		/* rewrite NOPs */
+		if (jump_label_type(iter) == JUMP_LABEL_NOP)
+			arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+
 		iterk = jump_entry_key(iter);
 		if (iterk == key)
 			continue;
@@ -225,6 +234,16 @@ void __init jump_label_init(void)
 
 #ifdef CONFIG_MODULES
 
+static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+{
+	struct static_key *key = jump_entry_key(entry);
+	bool type = static_key_type(key);
+	bool branch = jump_entry_branch(entry);
+
+	/* See the comment in linux/jump_label.h */
+	return type ^ branch;
+}
+
 struct static_key_mod {
 	struct static_key_mod *next;
 	struct jump_entry *entries;
@@ -276,8 +295,11 @@ void jump_label_apply_nops(struct module *mod)
 	if (iter_start == iter_stop)
 		return;
 
-	for (iter = iter_start; iter < iter_stop; iter++)
-		arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+	for (iter = iter_start; iter < iter_stop; iter++) {
+		/* Only write NOPs for arch_branch_static(). */
+		if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
+			arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+	}
 }
 
 static int jump_label_add_module(struct module *mod)
@@ -318,7 +340,8 @@ static int jump_label_add_module(struct module *mod)
 		jlm->next = key->next;
 		key->next = jlm;
 
-		if (jump_label_type(iter) == JUMP_LABEL_JMP)
+		/* Only update if we've changed from our initial state */
+		if (jump_label_type(iter) != jump_label_init_type(iter))
 			__jump_label_update(key, iter, iter_stop);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 412758cb26704e5087ca2976ec3b28fb2bdbfad4 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Thu, 30 Jul 2015 03:59:48 +0000
Subject: jump label, locking/static_keys: Update docs

Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: benh@kernel.crashing.org
Cc: bp@alien8.de
Cc: davem@davemloft.net
Cc: ddaney@caviumnetworks.com
Cc: heiko.carstens@de.ibm.com
Cc: linux-kernel@vger.kernel.org
Cc: liuj97@gmail.com
Cc: luto@amacapital.net
Cc: michael@ellerman.id.au
Cc: rabin@rab.in
Cc: ralf@linux-mips.org
Cc: rostedt@goodmis.org
Cc: vbabka@suse.cz
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/6b50f2f6423a2244f37f4b1d2d6c211b9dcdf4f8.1438227999.git.jbaron@akamai.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/static-keys.txt | 99 +++++++++++++++++++++++--------------------
 include/linux/jump_label.h    | 67 ++++++++++++++++++++---------
 2 files changed, 98 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index c4407a41b0fc..f4cb0b2d5cd7 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -1,7 +1,22 @@
 			Static Keys
 			-----------
 
-By: Jason Baron <jbaron@redhat.com>
+DEPRECATED API:
+
+The use of 'struct static_key' directly, is now DEPRECATED. In addition
+static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following:
+
+struct static_key false = STATIC_KEY_INIT_FALSE;
+struct static_key true = STATIC_KEY_INIT_TRUE;
+static_key_true()
+static_key_false()
+
+The updated API replacements are:
+
+DEFINE_STATIC_KEY_TRUE(key);
+DEFINE_STATIC_KEY_FALSE(key);
+static_key_likely()
+statick_key_unlikely()
 
 0) Abstract
 
@@ -9,22 +24,22 @@ Static keys allows the inclusion of seldom used features in
 performance-sensitive fast-path kernel code, via a GCC feature and a code
 patching technique. A quick example:
 
-	struct static_key key = STATIC_KEY_INIT_FALSE;
+	DEFINE_STATIC_KEY_FALSE(key);
 
 	...
 
-        if (static_key_false(&key))
+        if (static_branch_unlikely(&key))
                 do unlikely code
         else
                 do likely code
 
 	...
-	static_key_slow_inc();
+	static_branch_enable(&key);
 	...
-	static_key_slow_inc();
+	static_branch_disable(&key);
 	...
 
-The static_key_false() branch will be generated into the code with as little
+The static_branch_unlikely() branch will be generated into the code with as little
 impact to the likely code path as possible.
 
 
@@ -56,7 +71,7 @@ the branch site to change the branch direction.
 
 For example, if we have a simple branch that is disabled by default:
 
-	if (static_key_false(&key))
+	if (static_branch_unlikely(&key))
 		printk("I am the true branch\n");
 
 Thus, by default the 'printk' will not be emitted. And the code generated will
@@ -75,68 +90,55 @@ the basis for the static keys facility.
 
 In order to make use of this optimization you must first define a key:
 
-	struct static_key key;
-
-Which is initialized as:
-
-	struct static_key key = STATIC_KEY_INIT_TRUE;
+	DEFINE_STATIC_KEY_TRUE(key);
 
 or:
 
-	struct static_key key = STATIC_KEY_INIT_FALSE;
+	DEFINE_STATIC_KEY_FALSE(key);
+
 
-If the key is not initialized, it is default false. The 'struct static_key',
-must be a 'global'. That is, it can't be allocated on the stack or dynamically
+The key must be global, that is, it can't be allocated on the stack or dynamically
 allocated at run-time.
 
 The key is then used in code as:
 
-        if (static_key_false(&key))
+        if (static_branch_unlikely(&key))
                 do unlikely code
         else
                 do likely code
 
 Or:
 
-        if (static_key_true(&key))
+        if (static_branch_likely(&key))
                 do likely code
         else
                 do unlikely code
 
-A key that is initialized via 'STATIC_KEY_INIT_FALSE', must be used in a
-'static_key_false()' construct. Likewise, a key initialized via
-'STATIC_KEY_INIT_TRUE' must be used in a 'static_key_true()' construct. A
-single key can be used in many branches, but all the branches must match the
-way that the key has been initialized.
+Keys defined via DEFINE_STATIC_KEY_TRUE(), or DEFINE_STATIC_KEY_FALSE, may
+be used in either static_branch_likely() or static_branch_unlikely()
+statemnts.
 
-The branch(es) can then be switched via:
+Branch(es) can be set true via:
 
-	static_key_slow_inc(&key);
-	...
-	static_key_slow_dec(&key);
+static_branch_enable(&key);
 
-Thus, 'static_key_slow_inc()' means 'make the branch true', and
-'static_key_slow_dec()' means 'make the branch false' with appropriate
-reference counting. For example, if the key is initialized true, a
-static_key_slow_dec(), will switch the branch to false. And a subsequent
-static_key_slow_inc(), will change the branch back to true. Likewise, if the
-key is initialized false, a 'static_key_slow_inc()', will change the branch to
-true. And then a 'static_key_slow_dec()', will again make the branch false.
+or false via:
+
+static_branch_disable(&key);
 
-An example usage in the kernel is the implementation of tracepoints:
+The branch(es) can then be switched via reference counts:
 
-        static inline void trace_##name(proto)                          \
-        {                                                               \
-                if (static_key_false(&__tracepoint_##name.key))		\
-                        __DO_TRACE(&__tracepoint_##name,                \
-                                TP_PROTO(data_proto),                   \
-                                TP_ARGS(data_args),                     \
-                                TP_CONDITION(cond));                    \
-        }
+	static_branch_inc(&key);
+	...
+	static_branch_dec(&key);
 
-Tracepoints are disabled by default, and can be placed in performance critical
-pieces of the kernel. Thus, by using a static key, the tracepoints can have
-absolutely minimal impact when not in use.
+Thus, 'static_branch_inc()' means 'make the branch true', and
+'static_branch_dec()' means 'make the branch false' with appropriate
+reference counting. For example, if the key is initialized true, a
+static_branch_dec(), will switch the branch to false. And a subsequent
+static_branch_inc(), will change the branch back to true. Likewise, if the
+key is initialized false, a 'static_branch_inc()', will change the branch to
+true. And then a 'static_branch_dec()', will again make the branch false.
 
 
 4) Architecture level code patching interface, 'jump labels'
@@ -150,9 +152,12 @@ simply fall back to a traditional, load, test, and jump sequence.
 
 * #define JUMP_LABEL_NOP_SIZE, see: arch/x86/include/asm/jump_label.h
 
-* __always_inline bool arch_static_branch(struct static_key *key), see:
+* __always_inline bool arch_static_branch(struct static_key *key, bool branch), see:
 					arch/x86/include/asm/jump_label.h
 
+* __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch),
+					see: arch/x86/include/asm/jump_label.h
+
 * void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type),
 					see: arch/x86/kernel/jump_label.c
 
@@ -173,7 +178,7 @@ SYSCALL_DEFINE0(getppid)
 {
         int pid;
 
-+       if (static_key_false(&key))
++       if (static_branch_unlikely(&key))
 +               printk("I am the true branch\n");
 
         rcu_read_lock();
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index e337a1961933..7f653e8f6690 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -7,17 +7,52 @@
  * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com>
  * Copyright (C) 2011-2012 Peter Zijlstra <pzijlstr@redhat.com>
  *
+ * DEPRECATED API:
+ *
+ * The use of 'struct static_key' directly, is now DEPRECATED. In addition
+ * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following:
+ *
+ * struct static_key false = STATIC_KEY_INIT_FALSE;
+ * struct static_key true = STATIC_KEY_INIT_TRUE;
+ * static_key_true()
+ * static_key_false()
+ *
+ * The updated API replacements are:
+ *
+ * DEFINE_STATIC_KEY_TRUE(key);
+ * DEFINE_STATIC_KEY_FALSE(key);
+ * static_key_likely()
+ * statick_key_unlikely()
+ *
  * Jump labels provide an interface to generate dynamic branches using
- * self-modifying code. Assuming toolchain and architecture support, the result
- * of a "if (static_key_false(&key))" statement is an unconditional branch (which
- * defaults to false - and the true block is placed out of line).
+ * self-modifying code. Assuming toolchain and architecture support, if we
+ * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)",
+ * an "if (static_branch_unlikely(&key))" statement is an unconditional branch
+ * (which defaults to false - and the true block is placed out of line).
+ * Similarly, we can define an initially true key via
+ * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same
+ * "if (static_branch_unlikely(&key))", in which case we will generate an
+ * unconditional branch to the out-of-line true branch. Keys that are
+ * initially true or false can be using in both static_branch_unlikely()
+ * and static_branch_likely() statements.
  *
- * However at runtime we can change the branch target using
- * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key
- * object, and for as long as there are references all branches referring to
- * that particular key will point to the (out of line) true block.
+ * At runtime we can change the branch target by setting the key
+ * to true via a call to static_branch_enable(), or false using
+ * static_branch_disable(). If the direction of the branch is switched by
+ * these calls then we run-time modify the branch target via a
+ * no-op -> jump or jump -> no-op conversion. For example, for an
+ * initially false key that is used in an "if (static_branch_unlikely(&key))"
+ * statement, setting the key to true requires us to patch in a jump
+ * to the out-of-line of true branch.
  *
- * Since this relies on modifying code, the static_key_slow_{inc,dec}() functions
+ * In addtion to static_branch_{enable,disable}, we can also reference count
+ * the key or branch direction via static_branch_{inc,dec}. Thus,
+ * static_branch_inc() can be thought of as a 'make more true' and
+ * static_branch_dec() as a 'make more false'. The inc()/dec()
+ * interface is meant to be used exclusively from the inc()/dec() for a given
+ * key.
+ *
+ * Since this relies on modifying code, the branch modifying functions
  * must be considered absolute slow paths (machine wide synchronization etc.).
  * OTOH, since the affected branches are unconditional, their runtime overhead
  * will be absolutely minimal, esp. in the default (off) case where the total
@@ -29,20 +64,10 @@
  * cause significant performance degradation. Struct static_key_deferred and
  * static_key_slow_dec_deferred() provide for this.
  *
- * Lacking toolchain and or architecture support, jump labels fall back to a simple
- * conditional branch.
- *
- * struct static_key my_key = STATIC_KEY_INIT_TRUE;
- *
- *   if (static_key_true(&my_key)) {
- *   }
- *
- * will result in the true case being in-line and starts the key with a single
- * reference. Mixing static_key_true() and static_key_false() on the same key is not
- * allowed.
+ * Lacking toolchain and or architecture support, static keys fall back to a
+ * simple conditional branch.
  *
- * Not initializing the key (static data is initialized to 0s anyway) is the
- * same as using STATIC_KEY_INIT_FALSE.
+ * Additional babbling in: Documentation/static-keys.txt
  */
 
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
-- 
cgit v1.2.3-70-g09d2


From 9d7fb04276481c59610983362d8e023d262b58ca Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 30 Jun 2015 11:30:54 +0200
Subject: sched/cputime: Guarantee stime + utime == rtime

While the current code guarantees monotonicity for stime and utime
independently of one another, it does not guarantee that the sum of
both is equal to the total time we started out with.

This confuses things (and peoples) who look at this sum, like top, and
will report >100% usage followed by a matching period of 0%.

Rework the code to provide both individual monotonicity and a coherent
sum.

Suggested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Reported-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Tested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jason.low2@hp.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h |  10 +++++
 include/linux/sched.h     |  40 ++++++++++--------
 kernel/fork.c             |   7 ++--
 kernel/sched/cputime.c    | 101 +++++++++++++++++++++++++++-------------------
 4 files changed, 97 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e8493fee8160..d0b380ee7d67 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -32,6 +32,14 @@ extern struct fs_struct init_fs;
 #define INIT_CPUSET_SEQ(tsk)
 #endif
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#define INIT_PREV_CPUTIME(x)	.prev_cputime = {			\
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock),		\
+},
+#else
+#define INIT_PREV_CPUTIME(x)
+#endif
+
 #define INIT_SIGNALS(sig) {						\
 	.nr_threads	= 1,						\
 	.thread_head	= LIST_HEAD_INIT(init_task.thread_node),	\
@@ -46,6 +54,7 @@ extern struct fs_struct init_fs;
 		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
 		.running	= 0,					\
 	},								\
+	INIT_PREV_CPUTIME(sig)						\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
 }
@@ -246,6 +255,7 @@ extern struct task_group root_task_group;
 	INIT_TASK_RCU_TASKS(tsk)					\
 	INIT_CPUSET_SEQ(tsk)						\
 	INIT_RT_MUTEXES(tsk)						\
+	INIT_PREV_CPUTIME(tsk)						\
 	INIT_VTIME(tsk)							\
 	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae21f1591615..7412070a25cc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -530,39 +530,49 @@ struct cpu_itimer {
 };
 
 /**
- * struct cputime - snaphsot of system and user cputime
+ * struct prev_cputime - snaphsot of system and user cputime
  * @utime: time spent in user mode
  * @stime: time spent in system mode
+ * @lock: protects the above two fields
  *
- * Gathers a generic snapshot of user and system time.
+ * Stores previous user/system time values such that we can guarantee
+ * monotonicity.
  */
-struct cputime {
+struct prev_cputime {
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	cputime_t utime;
 	cputime_t stime;
+	raw_spinlock_t lock;
+#endif
 };
 
+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	prev->utime = prev->stime = 0;
+	raw_spin_lock_init(&prev->lock);
+#endif
+}
+
 /**
  * struct task_cputime - collected CPU time counts
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
  *
- * This is an extension of struct cputime that includes the total runtime
- * spent by the task from the scheduler point of view.
- *
- * As a result, this structure groups together three kinds of CPU time
- * that are tracked for threads and thread groups.  Most things considering
- * CPU time want to group these counts together and treat all three
- * of them in parallel.
+ * This structure groups together three kinds of CPU time that are tracked for
+ * threads and thread groups.  Most things considering CPU time want to group
+ * these counts together and treat all three of them in parallel.
  */
 struct task_cputime {
 	cputime_t utime;
 	cputime_t stime;
 	unsigned long long sum_exec_runtime;
 };
+
 /* Alternate field names when used to cache expirations. */
-#define prof_exp	stime
 #define virt_exp	utime
+#define prof_exp	stime
 #define sched_exp	sum_exec_runtime
 
 #define INIT_CPUTIME	\
@@ -715,9 +725,7 @@ struct signal_struct {
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	struct cputime prev_cputime;
-#endif
+	struct prev_cputime prev_cputime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -1481,9 +1489,7 @@ struct task_struct {
 
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	struct cputime prev_cputime;
-#endif
+	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqlock_t vtime_seqlock;
 	unsigned long long vtime_snap;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..6e8f807c5716 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1067,6 +1067,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
+
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	return 0;
@@ -1128,6 +1129,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 	seqlock_init(&sig->stats_lock);
+	prev_cputime_init(&sig->prev_cputime);
 
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	sig->real_timer.function = it_real_fn;
@@ -1335,9 +1337,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->utime = p->stime = p->gtime = 0;
 	p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+	prev_cputime_init(&p->prev_cputime);
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqlock_init(&p->vtime_seqlock);
 	p->vtime_snap = 0;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..8cbc3db671df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
 }
 
 /*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
  *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-	cputime_t old;
-
-	while (new > (old = READ_ONCE(*counter)))
-		cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer.  Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
  */
 static void cputime_adjust(struct task_cputime *curr,
-			   struct cputime *prev,
+			   struct prev_cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, stime, utime;
+	unsigned long flags;
 
-	/*
-	 * Tick based cputime accounting depend on random scheduling
-	 * timeslices of a task to be interrupted or not by the timer.
-	 * Depending on these circumstances, the number of these interrupts
-	 * may be over or under-optimistic, matching the real user and system
-	 * cputime with a variable precision.
-	 *
-	 * Fix this by scaling these tick based values against the total
-	 * runtime accounted by the CFS scheduler.
-	 */
+	/* Serialize concurrent callers such that we can honour our guarantees */
+	raw_spin_lock_irqsave(&prev->lock, flags);
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	/*
-	 * Update userspace visible utime/stime values only if actual execution
-	 * time is bigger than already exported. Note that can happen, that we
-	 * provided bigger values due to scaling inaccuracy on big numbers.
+	 * This is possible under two circumstances:
+	 *  - rtime isn't monotonic after all (a bug);
+	 *  - we got reordered by the lock.
+	 *
+	 * In both cases this acts as a filter such that the rest of the code
+	 * can assume it is monotonic regardless of anything else.
 	 */
 	if (prev->stime + prev->utime >= rtime)
 		goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
 
 	if (utime == 0) {
 		stime = rtime;
-	} else if (stime == 0) {
-		utime = rtime;
-	} else {
-		cputime_t total = stime + utime;
+		goto update;
+	}
 
-		stime = scale_stime((__force u64)stime,
-				    (__force u64)rtime, (__force u64)total);
-		utime = rtime - stime;
+	if (stime == 0) {
+		utime = rtime;
+		goto update;
 	}
 
-	cputime_advance(&prev->stime, stime);
-	cputime_advance(&prev->utime, utime);
+	stime = scale_stime((__force u64)stime, (__force u64)rtime,
+			    (__force u64)(stime + utime));
+
+	/*
+	 * Make sure stime doesn't go backwards; this preserves monotonicity
+	 * for utime because rtime is monotonic.
+	 *
+	 *  utime_i+1 = rtime_i+1 - stime_i
+	 *            = rtime_i+1 - (rtime_i - utime_i)
+	 *            = (rtime_i+1 - rtime_i) + utime_i
+	 *            >= utime_i
+	 */
+	if (stime < prev->stime)
+		stime = prev->stime;
+	utime = rtime - stime;
+
+	/*
+	 * Make sure utime doesn't go backwards; this still preserves
+	 * monotonicity for stime, analogous argument to above.
+	 */
+	if (utime < prev->utime) {
+		utime = prev->utime;
+		stime = rtime - utime;
+	}
 
+update:
+	prev->stime = stime;
+	prev->utime = utime;
 out:
 	*ut = prev->utime;
 	*st = prev->stime;
+	raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-- 
cgit v1.2.3-70-g09d2


From 63b0e9edceec10fa41ec33393a1515a5ff444277 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <umgwanakikbuti@gmail.com>
Date: Tue, 14 Jul 2015 17:39:50 +0200
Subject: sched/fair: Beef up wake_wide()

Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU.  While looking at wake_wide(),
I noticed that it doesn't pay attention to the wakeup of a many partner
waker, returning 1 only when waking one of its many partners.

Correct that, letting explicit domain flags override the heuristic.

While at it, adjust task_struct bits, we don't need a 64-bit counter.

Tested-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
[ Tidy things up. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team<Kernel-team@fb.com>
Cc: morten.rasmussen@arm.com
Cc: riel@redhat.com
Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  4 +--
 kernel/sched/fair.c   | 67 +++++++++++++++++++++++++--------------------------
 2 files changed, 35 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7412070a25cc..65a8a8651596 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1359,9 +1359,9 @@ struct task_struct {
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
 	int on_cpu;
-	struct task_struct *last_wakee;
-	unsigned long wakee_flips;
+	unsigned int wakee_flips;
 	unsigned long wakee_flip_decay_ts;
+	struct task_struct *last_wakee;
 
 	int wake_cpu;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8b384b8d2f1d..ea23f9f1b51b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4726,26 +4726,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
 #endif
 
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.  In order
+ * to determine whether we should let the load spread vs consolodating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other.  With
+ * both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.  Waker/wakee
+ * being client/server, worker/dispatcher, interrupt source or whatever is
+ * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ */
 static int wake_wide(struct task_struct *p)
 {
+	unsigned int master = current->wakee_flips;
+	unsigned int slave = p->wakee_flips;
 	int factor = this_cpu_read(sd_llc_size);
 
-	/*
-	 * Yeah, it's the switching-frequency, could means many wakee or
-	 * rapidly switch, use factor here will just help to automatically
-	 * adjust the loose-degree, so bigger node will lead to more pull.
-	 */
-	if (p->wakee_flips > factor) {
-		/*
-		 * wakee is somewhat hot, it needs certain amount of cpu
-		 * resource, so if waker is far more hot, prefer to leave
-		 * it alone.
-		 */
-		if (current->wakee_flips > (factor * p->wakee_flips))
-			return 1;
-	}
-
-	return 0;
+	if (master < slave)
+		swap(master, slave);
+	if (slave < factor || master < slave * factor)
+		return 0;
+	return 1;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4757,13 +4760,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	unsigned long weight;
 	int balanced;
 
-	/*
-	 * If we wake multiple tasks be careful to not bounce
-	 * ourselves around too much.
-	 */
-	if (wake_wide(p))
-		return 0;
-
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
@@ -5017,17 +5013,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
-	int new_cpu = cpu;
+	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE)
-		want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 
 	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
-			continue;
+			break;
 
 		/*
 		 * If both cpu and prev_cpu are part of this domain,
@@ -5041,17 +5037,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 		if (tmp->flags & sd_flag)
 			sd = tmp;
+		else if (!want_affine)
+			break;
 	}
 
-	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-		prev_cpu = cpu;
-
-	if (sd_flag & SD_BALANCE_WAKE) {
-		new_cpu = select_idle_sibling(p, prev_cpu);
-		goto unlock;
+	if (affine_sd) {
+		sd = NULL; /* Prefer wake_affine over balance flags */
+		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+			new_cpu = cpu;
 	}
 
-	while (sd) {
+	if (!sd) {
+		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+			new_cpu = select_idle_sibling(p, new_cpu);
+
+	} else while (sd) {
 		struct sched_group *group;
 		int weight;
 
@@ -5085,7 +5085,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		}
 		/* while loop will break here if sd == NULL */
 	}
-unlock:
 	rcu_read_unlock();
 
 	return new_cpu;
-- 
cgit v1.2.3-70-g09d2


From fe32d3cd5e8eb0f82e459763374aa80797023403 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Wed, 15 Jul 2015 12:52:04 +0300
Subject: sched/preempt: Fix cond_resched_lock() and cond_resched_softirq()

These functions check should_resched() before unlocking spinlock/bh-enable:
preempt_count always non-zero => should_resched() always returns false.
cond_resched_lock() worked iff spin_needbreak is set.

This patch adds argument "preempt_offset" to should_resched().

preempt_count offset constants for that:

  PREEMPT_DISABLE_OFFSET  - offset after preempt_disable()
  PREEMPT_LOCK_OFFSET     - offset after spin_lock()
  SOFTIRQ_DISABLE_OFFSET  - offset after local_bh_distable()
  SOFTIRQ_LOCK_OFFSET     - offset after spin_lock_bh()

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Graf <agraf@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: bdb438065890 ("sched: Extract the basic add/sub preempt_count modifiers")
Link: http://lkml.kernel.org/r/20150715095204.12246.98268.stgit@buzz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/preempt.h |  4 ++--
 include/asm-generic/preempt.h  |  5 +++--
 include/linux/preempt.h        | 19 ++++++++++++++-----
 include/linux/sched.h          |  6 ------
 kernel/sched/core.c            |  6 +++---
 5 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index dca71714f860..b12f81022a6b 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -90,9 +90,9 @@ static __always_inline bool __preempt_count_dec_and_test(void)
 /*
  * Returns true when we need to resched and can (barring IRQ state).
  */
-static __always_inline bool should_resched(void)
+static __always_inline bool should_resched(int preempt_offset)
 {
-	return unlikely(!raw_cpu_read_4(__preempt_count));
+	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
 }
 
 #ifdef CONFIG_PREEMPT
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index d0a7a4753db2..0bec580a4885 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -71,9 +71,10 @@ static __always_inline bool __preempt_count_dec_and_test(void)
 /*
  * Returns true when we need to resched and can (barring IRQ state).
  */
-static __always_inline bool should_resched(void)
+static __always_inline bool should_resched(int preempt_offset)
 {
-	return unlikely(!preempt_count() && tif_need_resched());
+	return unlikely(preempt_count() == preempt_offset &&
+			tif_need_resched());
 }
 
 #ifdef CONFIG_PREEMPT
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 84991f185173..bea8dd8ff5e0 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -84,12 +84,20 @@
  */
 #define in_nmi()	(preempt_count() & NMI_MASK)
 
+/*
+ * The preempt_count offset after preempt_disable();
+ */
 #if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_DISABLE_OFFSET 1
+# define PREEMPT_DISABLE_OFFSET	PREEMPT_OFFSET
 #else
-# define PREEMPT_DISABLE_OFFSET 0
+# define PREEMPT_DISABLE_OFFSET	0
 #endif
 
+/*
+ * The preempt_count offset after spin_lock()
+ */
+#define PREEMPT_LOCK_OFFSET	PREEMPT_DISABLE_OFFSET
+
 /*
  * The preempt_count offset needed for things like:
  *
@@ -103,7 +111,7 @@
  *
  * Work as expected.
  */
-#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET)
+#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)
 
 /*
  * Are we running in atomic context?  WARNING: this macro cannot
@@ -124,7 +132,8 @@
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
 extern void preempt_count_sub(int val);
-#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); })
+#define preempt_count_dec_and_test() \
+	({ preempt_count_sub(1); should_resched(0); })
 #else
 #define preempt_count_add(val)	__preempt_count_add(val)
 #define preempt_count_sub(val)	__preempt_count_sub(val)
@@ -184,7 +193,7 @@ do { \
 
 #define preempt_check_resched() \
 do { \
-	if (should_resched()) \
+	if (should_resched(0)) \
 		__preempt_schedule(); \
 } while (0)
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 65a8a8651596..9c144657aace 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2891,12 +2891,6 @@ extern int _cond_resched(void);
 
 extern int __cond_resched_lock(spinlock_t *lock);
 
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_LOCK_OFFSET	PREEMPT_OFFSET
-#else
-#define PREEMPT_LOCK_OFFSET	0
-#endif
-
 #define cond_resched_lock(lock) ({				\
 	___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
 	__cond_resched_lock(lock);				\
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa5826cc612f..f5fad2b12baf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4496,7 +4496,7 @@ SYSCALL_DEFINE0(sched_yield)
 
 int __sched _cond_resched(void)
 {
-	if (should_resched()) {
+	if (should_resched(0)) {
 		preempt_schedule_common();
 		return 1;
 	}
@@ -4514,7 +4514,7 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
-	int resched = should_resched();
+	int resched = should_resched(PREEMPT_LOCK_OFFSET);
 	int ret = 0;
 
 	lockdep_assert_held(lock);
@@ -4536,7 +4536,7 @@ int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
-	if (should_resched()) {
+	if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
 		local_bh_enable();
 		preempt_schedule_common();
 		local_bh_disable();
-- 
cgit v1.2.3-70-g09d2


From 7eeb088e72048bf4660f64fc3824c8066cf17591 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 30 Jun 2015 03:29:51 +0200
Subject: stop_machine: Unexport __stop_machine()

The only caller outside of stop_machine.c is _cpu_down(), it can use
stop_machine(). get_online_cpus() is fine under cpu_hotplug_begin().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Cc: der.herr@hofr.at
Cc: paulmck@linux.vnet.ibm.com
Cc: riel@redhat.com
Cc: viro@ZenIV.linux.org.uk
Link: http://lkml.kernel.org/r/20150630012951.GA23934@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/stop_machine.h | 22 ++--------------------
 kernel/cpu.c                 |  2 +-
 kernel/stop_machine.c        |  2 +-
 3 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index d2abbdb8c6aa..0fca276a0537 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -114,23 +114,11 @@ static inline int try_stop_cpus(const struct cpumask *cpumask,
  * grabbing every spinlock in the kernel. */
 int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
-/**
- * __stop_machine: freeze the machine on all CPUs and run this function
- * @fn: the function to run
- * @data: the data ptr for the @fn
- * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
- *
- * Description: This is a special version of the above, which assumes cpus
- * won't come or go while it's being called.  Used by hotplug cpu.
- */
-int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
-
 int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
 				   const struct cpumask *cpus);
-
 #else	 /* CONFIG_STOP_MACHINE && CONFIG_SMP */
 
-static inline int __stop_machine(int (*fn)(void *), void *data,
+static inline int stop_machine(int (*fn)(void *), void *data,
 				 const struct cpumask *cpus)
 {
 	unsigned long flags;
@@ -141,16 +129,10 @@ static inline int __stop_machine(int (*fn)(void *), void *data,
 	return ret;
 }
 
-static inline int stop_machine(int (*fn)(void *), void *data,
-			       const struct cpumask *cpus)
-{
-	return __stop_machine(fn, data, cpus);
-}
-
 static inline int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
 						 const struct cpumask *cpus)
 {
-	return __stop_machine(fn, data, cpus);
+	return stop_machine(fn, data, cpus);
 }
 
 #endif	/* CONFIG_STOP_MACHINE && CONFIG_SMP */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9c9c9fab16cc..664ce5299334 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -395,7 +395,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	 * So now all preempt/rcu users must observe !cpu_active().
 	 */
 
-	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+	err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 621220852df0..b50910dbf030 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -513,7 +513,7 @@ early_initcall(cpu_stop_init);
 
 #ifdef CONFIG_STOP_MACHINE
 
-int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+static int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
 	struct multi_stop_data msdata = {
 		.fn = fn,
-- 
cgit v1.2.3-70-g09d2


From 9a301f22faac7fc2207ee49c1855a6b4ba9c5a52 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 30 Jun 2015 03:29:55 +0200
Subject: stop_machine: Use 'cpu_stop_fn_t' where possible

Cosmetic, but 'cpu_stop_fn_t' actually makes the code more readable and
it doesn't break cscope. And most of the declarations already use it.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Cc: der.herr@hofr.at
Cc: paulmck@linux.vnet.ibm.com
Cc: riel@redhat.com
Cc: viro@ZenIV.linux.org.uk
Link: http://lkml.kernel.org/r/20150630012955.GA23937@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/stop_machine.h | 8 ++++----
 kernel/stop_machine.c        | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 0fca276a0537..414d924318ce 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -112,13 +112,13 @@ static inline int try_stop_cpus(const struct cpumask *cpumask,
  *
  * This can be thought of as a very heavy write lock, equivalent to
  * grabbing every spinlock in the kernel. */
-int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
+int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
 
-int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 				   const struct cpumask *cpus);
 #else	 /* CONFIG_STOP_MACHINE && CONFIG_SMP */
 
-static inline int stop_machine(int (*fn)(void *), void *data,
+static inline int stop_machine(cpu_stop_fn_t fn, void *data,
 				 const struct cpumask *cpus)
 {
 	unsigned long flags;
@@ -129,7 +129,7 @@ static inline int stop_machine(int (*fn)(void *), void *data,
 	return ret;
 }
 
-static inline int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 						 const struct cpumask *cpus)
 {
 	return stop_machine(fn, data, cpus);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b50910dbf030..9a70defe9f1f 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -141,7 +141,7 @@ enum multi_stop_state {
 };
 
 struct multi_stop_data {
-	int			(*fn)(void *);
+	cpu_stop_fn_t		fn;
 	void			*data;
 	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
 	unsigned int		num_threads;
@@ -513,7 +513,7 @@ early_initcall(cpu_stop_init);
 
 #ifdef CONFIG_STOP_MACHINE
 
-static int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
 {
 	struct multi_stop_data msdata = {
 		.fn = fn,
@@ -546,7 +546,7 @@ static int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *c
 	return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
 }
 
-int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
 {
 	int ret;
 
@@ -580,7 +580,7 @@ EXPORT_SYMBOL_GPL(stop_machine);
  * 0 if all executions of @fn returned 0, any non zero return value if any
  * returned non zero.
  */
-int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 				  const struct cpumask *cpus)
 {
 	struct multi_stop_data msdata = { .fn = fn, .data = data,
-- 
cgit v1.2.3-70-g09d2


From 9d89c257dfb9c51a532d69397f6eed75e5168c35 Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Wed, 15 Jul 2015 08:04:37 +0800
Subject: sched/fair: Rewrite runnable load and utilization average tracking

The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:

1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
   updated at the granularity of an entity at a time, which results in the
   cfs_rq's load average is stale or partially updated: at any time, only
   one entity is up to date, all other entities are effectively lagging
   behind. This is undesirable.

   To illustrate, if we have n runnable entities in the cfs_rq, as time
   elapses, they certainly become outdated:

     t0: cfs_rq { e1_old, e2_old, ..., en_old }

   and when we update:

     t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }

     t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }

     ...

   We solve this by combining all runnable entities' load averages together
   in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
   on the fact that if we regard the update as a function, then:

   w * update(e) = update(w * e) and

   update(e1) + update(e2) = update(e1 + e2), then

   w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)

   therefore, by this rewrite, we have an entirely updated cfs_rq at the
   time we update it:

     t1: update cfs_rq { e1_new, e2_new, ..., en_new }

     t2: update cfs_rq { e1_new, e2_new, ..., en_new }

     ...

2. cfs_rq's load average is different between top rq->cfs_rq and other
   task_group's per CPU cfs_rqs in whether or not blocked_load_average
   contributes to the load.

   The basic idea behind runnable load average (the same for utilization)
   is that the blocked state is taken into account as opposed to only
   accounting for the currently runnable state. Therefore, the average
   should include both the runnable/running and blocked load averages.
   This rewrite does that.

   In addition, we also combine runnable/running and blocked averages
   of all entities into the cfs_rq's average, and update it together at
   once. This is based on the fact that:

     update(runnable) + update(blocked) = update(runnable + blocked)

   This significantly reduces the code as we don't need to separately
   maintain/update runnable/running load and blocked load.

3. How task_group entities' share is calculated is complex and imprecise.

   We reduce the complexity in this rewrite to allow a very simple rule:
   the task_group's load_avg is aggregated from its per CPU cfs_rqs's
   load_avgs. Then group entity's weight is simply proportional to its
   own cfs_rq's load_avg / task_group's load_avg. To illustrate,

   if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,

   task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then

   cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share

To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.

As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  41 ++--
 kernel/sched/core.c   |   3 -
 kernel/sched/debug.c  |  41 ++--
 kernel/sched/fair.c   | 630 ++++++++++++++++----------------------------------
 kernel/sched/sched.h  |  28 +--
 5 files changed, 249 insertions(+), 494 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c144657aace..44dca5b35de6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1175,29 +1175,24 @@ struct load_weight {
 	u32 inv_weight;
 };
 
+/*
+ * The load_avg/util_avg accumulates an infinite geometric series.
+ * 1) load_avg factors the amount of time that a sched_entity is
+ * runnable on a rq into its weight. For cfs_rq, it is the aggregated
+ * such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency scaling into the amount of time
+ * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
+ * For cfs_rq, it is the aggregated such times of all runnable and
+ * blocked sched_entities.
+ * The 64 bit load_sum can:
+ * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
+ * the highest weight (=88761) always runnable, we should not overflow
+ * 2) for entity, support any load.weight always runnable
+ */
 struct sched_avg {
-	u64 last_runnable_update;
-	s64 decay_count;
-	/*
-	 * utilization_avg_contrib describes the amount of time that a
-	 * sched_entity is running on a CPU. It is based on running_avg_sum
-	 * and is scaled in the range [0..SCHED_LOAD_SCALE].
-	 * load_avg_contrib described the amount of time that a sched_entity
-	 * is runnable on a rq. It is based on both runnable_avg_sum and the
-	 * weight of the task.
-	 */
-	unsigned long load_avg_contrib, utilization_avg_contrib;
-	/*
-	 * These sums represent an infinite geometric series and so are bound
-	 * above by 1024/(1-y).  Thus we only need a u32 to store them for all
-	 * choices of y < 1-2^(-32)*1024.
-	 * running_avg_sum reflects the time that the sched_entity is
-	 * effectively running on the CPU.
-	 * runnable_avg_sum represents the amount of time a sched_entity is on
-	 * a runqueue which includes the running time that is monitored by
-	 * running_avg_sum.
-	 */
-	u32 runnable_avg_sum, avg_period, running_avg_sum;
+	u64 last_update_time, load_sum;
+	u32 util_sum, period_contrib;
+	unsigned long load_avg, util_avg;
 };
 
 #ifdef CONFIG_SCHEDSTATS
@@ -1263,7 +1258,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_SMP
-	/* Per-entity load-tracking */
+	/* Per entity load average tracking */
 	struct sched_avg	avg;
 #endif
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5fad2b12baf..3981526539c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2020,9 +2020,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
-#ifdef CONFIG_SMP
-	p->se.avg.decay_count		= 0;
-#endif
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 363b7e82554b..74f276f5568c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -88,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #endif
 	P(se->load.weight);
 #ifdef CONFIG_SMP
-	P(se->avg.runnable_avg_sum);
-	P(se->avg.running_avg_sum);
-	P(se->avg.avg_period);
-	P(se->avg.load_avg_contrib);
-	P(se->avg.utilization_avg_contrib);
-	P(se->avg.decay_count);
+	P(se->avg.load_avg);
+	P(se->avg.util_avg);
 #endif
 #undef PN
 #undef P
@@ -209,21 +205,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SMP
-	SEQ_printf(m, "  .%-30s: %ld\n", "runnable_load_avg",
-			cfs_rq->runnable_load_avg);
-	SEQ_printf(m, "  .%-30s: %ld\n", "blocked_load_avg",
-			cfs_rq->blocked_load_avg);
-	SEQ_printf(m, "  .%-30s: %ld\n", "utilization_load_avg",
-			cfs_rq->utilization_load_avg);
+	SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
+			cfs_rq->avg.load_avg);
+	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
+			cfs_rq->avg.util_avg);
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed_load_avg",
+			atomic_long_read(&cfs_rq->removed_load_avg));
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
+			atomic_long_read(&cfs_rq->removed_util_avg));
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_contrib",
-			cfs_rq->tg_load_contrib);
-	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
-			cfs_rq->tg_runnable_contrib);
+	SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
+			cfs_rq->tg_load_avg_contrib);
 	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg",
 			atomic_long_read(&cfs_rq->tg->load_avg));
-	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
-			atomic_read(&cfs_rq->tg->runnable_avg));
 #endif
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -631,12 +625,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
 	P(se.load.weight);
 #ifdef CONFIG_SMP
-	P(se.avg.runnable_avg_sum);
-	P(se.avg.running_avg_sum);
-	P(se.avg.avg_period);
-	P(se.avg.load_avg_contrib);
-	P(se.avg.utilization_avg_contrib);
-	P(se.avg.decay_count);
+	P(se.avg.load_sum);
+	P(se.avg.util_sum);
+	P(se.avg.load_avg);
+	P(se.avg.util_avg);
+	P(se.avg.last_update_time);
 #endif
 	P(policy);
 	P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 90292c672a3b..01ffa9509c23 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return grp->my_q;
 }
 
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
-				       int force_update);
-
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list) {
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 		}
 
 		cfs_rq->on_list = 1;
-		/* We should have no load, but we need to update last_decay. */
-		update_cfs_rq_blocked_load(cfs_rq, 0);
 	}
 }
 
@@ -664,19 +659,31 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int select_idle_sibling(struct task_struct *p, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
-static inline void __update_task_entity_contrib(struct sched_entity *se);
-static inline void __update_task_entity_utilization(struct sched_entity *se);
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
 
 /* Give new task start runnable values to heavy its load in infant time */
 void init_task_runnable_average(struct task_struct *p)
 {
-	u32 slice;
+	struct sched_avg *sa = &p->se.avg;
 
-	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
-	p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
-	p->se.avg.avg_period = slice;
-	__update_task_entity_contrib(&p->se);
-	__update_task_entity_utilization(&p->se);
+	sa->last_update_time = 0;
+	/*
+	 * sched_avg's period_contrib should be strictly less then 1024, so
+	 * we give it 1023 to make sure it is almost a period (1024us), and
+	 * will definitely be update (after enqueue).
+	 */
+	sa->period_contrib = 1023;
+	sa->load_avg = scale_load_down(p->se.load.weight);
+	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+	sa->util_sum = LOAD_AVG_MAX;
+	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 #else
 void init_task_runnable_average(struct task_struct *p)
@@ -1698,8 +1705,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 		delta = runtime - p->last_sum_exec_runtime;
 		*period = now - p->last_task_numa_placement;
 	} else {
-		delta = p->se.avg.runnable_avg_sum;
-		*period = p->se.avg.avg_period;
+		delta = p->se.avg.load_sum / p->se.load.weight;
+		*period = LOAD_AVG_MAX;
 	}
 
 	p->last_sum_exec_runtime = runtime;
@@ -2347,13 +2354,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 	long tg_weight;
 
 	/*
-	 * Use this CPU's actual weight instead of the last load_contribution
-	 * to gain a more accurate current total weight. See
-	 * __update_cfs_rq_tg_load_contrib().
+	 * Use this CPU's real-time load instead of the last load contribution
+	 * as the updating of the contribution is delayed, and we will use the
+	 * the real-time load to calc the share. See update_tg_load_avg().
 	 */
 	tg_weight = atomic_long_read(&tg->load_avg);
-	tg_weight -= cfs_rq->tg_load_contrib;
-	tg_weight += cfs_rq->load.weight;
+	tg_weight -= cfs_rq->tg_load_avg_contrib;
+	tg_weight += cfs_rq->avg.load_avg;
 
 	return tg_weight;
 }
@@ -2363,7 +2370,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	long tg_weight, load, shares;
 
 	tg_weight = calc_tg_weight(tg, cfs_rq);
-	load = cfs_rq->load.weight;
+	load = cfs_rq->avg.load_avg;
 
 	shares = (tg->shares * load);
 	if (tg_weight)
@@ -2425,14 +2432,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-
 /* Precomputed fixed inverse multiplies for multiplication by y^n */
 static const u32 runnable_avg_yN_inv[] = {
 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -2481,9 +2480,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
 		local_n %= LOAD_AVG_PERIOD;
 	}
 
-	val *= runnable_avg_yN_inv[local_n];
-	/* We don't use SRR here since we always want to round down. */
-	return val >> 32;
+	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+	return val;
 }
 
 /*
@@ -2542,23 +2540,22 @@ static u32 __compute_runnable_contrib(u64 n)
  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
-static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
-							struct sched_avg *sa,
-							int runnable,
-							int running)
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+		  unsigned long weight, int running)
 {
 	u64 delta, periods;
-	u32 runnable_contrib;
+	u32 contrib;
 	int delta_w, decayed = 0;
 	unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
 
-	delta = now - sa->last_runnable_update;
+	delta = now - sa->last_update_time;
 	/*
 	 * This should only happen when time goes backwards, which it
 	 * unfortunately does during sched clock init when we swap over to TSC.
 	 */
 	if ((s64)delta < 0) {
-		sa->last_runnable_update = now;
+		sa->last_update_time = now;
 		return 0;
 	}
 
@@ -2569,26 +2566,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
 	delta >>= 10;
 	if (!delta)
 		return 0;
-	sa->last_runnable_update = now;
+	sa->last_update_time = now;
 
 	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->avg_period % 1024;
+	delta_w = sa->period_contrib;
 	if (delta + delta_w >= 1024) {
-		/* period roll-over */
 		decayed = 1;
 
+		/* how much left for next period will start over, we don't know yet */
+		sa->period_contrib = 0;
+
 		/*
 		 * Now that we know we're crossing a period boundary, figure
 		 * out how much from delta we need to complete the current
 		 * period and accrue it.
 		 */
 		delta_w = 1024 - delta_w;
-		if (runnable)
-			sa->runnable_avg_sum += delta_w;
+		if (weight)
+			sa->load_sum += weight * delta_w;
 		if (running)
-			sa->running_avg_sum += delta_w * scale_freq
-				>> SCHED_CAPACITY_SHIFT;
-		sa->avg_period += delta_w;
+			sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
 
 		delta -= delta_w;
 
@@ -2596,334 +2593,156 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
 		periods = delta / 1024;
 		delta %= 1024;
 
-		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
-						  periods + 1);
-		sa->running_avg_sum = decay_load(sa->running_avg_sum,
-						  periods + 1);
-		sa->avg_period = decay_load(sa->avg_period,
-						     periods + 1);
+		sa->load_sum = decay_load(sa->load_sum, periods + 1);
+		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
 
 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		runnable_contrib = __compute_runnable_contrib(periods);
-		if (runnable)
-			sa->runnable_avg_sum += runnable_contrib;
+		contrib = __compute_runnable_contrib(periods);
+		if (weight)
+			sa->load_sum += weight * contrib;
 		if (running)
-			sa->running_avg_sum += runnable_contrib * scale_freq
-				>> SCHED_CAPACITY_SHIFT;
-		sa->avg_period += runnable_contrib;
+			sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
 	}
 
 	/* Remainder of delta accrued against u_0` */
-	if (runnable)
-		sa->runnable_avg_sum += delta;
+	if (weight)
+		sa->load_sum += weight * delta;
 	if (running)
-		sa->running_avg_sum += delta * scale_freq
-			>> SCHED_CAPACITY_SHIFT;
-	sa->avg_period += delta;
-
-	return decayed;
-}
-
-/* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline u64 __synchronize_entity_decay(struct sched_entity *se)
-{
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 decays = atomic64_read(&cfs_rq->decay_counter);
+		sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
 
-	decays -= se->avg.decay_count;
-	se->avg.decay_count = 0;
-	if (!decays)
-		return 0;
+	sa->period_contrib += delta;
 
-	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
-	se->avg.utilization_avg_contrib =
-		decay_load(se->avg.utilization_avg_contrib, decays);
+	if (decayed) {
+		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+		sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
+	}
 
-	return decays;
+	return decayed;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
-						 int force_update)
-{
-	struct task_group *tg = cfs_rq->tg;
-	long tg_contrib;
-
-	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
-	tg_contrib -= cfs_rq->tg_load_contrib;
-
-	if (!tg_contrib)
-		return;
-
-	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
-		atomic_long_add(tg_contrib, &tg->load_avg);
-		cfs_rq->tg_load_contrib += tg_contrib;
-	}
-}
-
 /*
- * Aggregate cfs_rq runnable averages into an equivalent task_group
- * representation for computing load contributions.
+ * Updating tg's load_avg is necessary before update_cfs_share (which is done)
+ * and effective_load (which is not done because it is too costly).
  */
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
-						  struct cfs_rq *cfs_rq)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 {
-	struct task_group *tg = cfs_rq->tg;
-	long contrib;
-
-	/* The fraction of a cpu used by this cfs_rq */
-	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
-			  sa->avg_period + 1);
-	contrib -= cfs_rq->tg_runnable_contrib;
+	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 
-	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
-		atomic_add(contrib, &tg->runnable_avg);
-		cfs_rq->tg_runnable_contrib += contrib;
-	}
-}
-
-static inline void __update_group_entity_contrib(struct sched_entity *se)
-{
-	struct cfs_rq *cfs_rq = group_cfs_rq(se);
-	struct task_group *tg = cfs_rq->tg;
-	int runnable_avg;
-
-	u64 contrib;
-
-	contrib = cfs_rq->tg_load_contrib * tg->shares;
-	se->avg.load_avg_contrib = div_u64(contrib,
-				     atomic_long_read(&tg->load_avg) + 1);
-
-	/*
-	 * For group entities we need to compute a correction term in the case
-	 * that they are consuming <1 cpu so that we would contribute the same
-	 * load as a task of equal weight.
-	 *
-	 * Explicitly co-ordinating this measurement would be expensive, but
-	 * fortunately the sum of each cpus contribution forms a usable
-	 * lower-bound on the true value.
-	 *
-	 * Consider the aggregate of 2 contributions.  Either they are disjoint
-	 * (and the sum represents true value) or they are disjoint and we are
-	 * understating by the aggregate of their overlap.
-	 *
-	 * Extending this to N cpus, for a given overlap, the maximum amount we
-	 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
-	 * cpus that overlap for this interval and w_i is the interval width.
-	 *
-	 * On a small machine; the first term is well-bounded which bounds the
-	 * total error since w_i is a subset of the period.  Whereas on a
-	 * larger machine, while this first term can be larger, if w_i is the
-	 * of consequential size guaranteed to see n_i*w_i quickly converge to
-	 * our upper bound of 1-cpu.
-	 */
-	runnable_avg = atomic_read(&tg->runnable_avg);
-	if (runnable_avg < NICE_0_LOAD) {
-		se->avg.load_avg_contrib *= runnable_avg;
-		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+		atomic_long_add(delta, &cfs_rq->tg->load_avg);
+		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
 	}
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
-						 int force_update) {}
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
-						  struct cfs_rq *cfs_rq) {}
-static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void __update_task_entity_contrib(struct sched_entity *se)
-{
-	u32 contrib;
-
-	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
-	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
-	contrib /= (se->avg.avg_period + 1);
-	se->avg.load_avg_contrib = scale_load(contrib);
-}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 
-/* Compute the current contribution to load_avg by se, return any delta */
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
+/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
-	long old_contrib = se->avg.load_avg_contrib;
+	int decayed;
+	struct sched_avg *sa = &cfs_rq->avg;
 
-	if (entity_is_task(se)) {
-		__update_task_entity_contrib(se);
-	} else {
-		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
-		__update_group_entity_contrib(se);
+	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
+		long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+		sa->load_avg = max_t(long, sa->load_avg - r, 0);
+		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
 	}
 
-	return se->avg.load_avg_contrib - old_contrib;
-}
-
-
-static inline void __update_task_entity_utilization(struct sched_entity *se)
-{
-	u32 contrib;
-
-	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
-	contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
-	contrib /= (se->avg.avg_period + 1);
-	se->avg.utilization_avg_contrib = scale_load(contrib);
-}
+	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
+		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+		sa->util_avg = max_t(long, sa->util_avg - r, 0);
+		sa->util_sum = max_t(s32, sa->util_sum -
+			((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
+	}
 
-static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
-{
-	long old_contrib = se->avg.utilization_avg_contrib;
+	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL);
 
-	if (entity_is_task(se))
-		__update_task_entity_utilization(se);
-	else
-		se->avg.utilization_avg_contrib =
-					group_cfs_rq(se)->utilization_load_avg;
-
-	return se->avg.utilization_avg_contrib - old_contrib;
-}
+#ifndef CONFIG_64BIT
+	smp_wmb();
+	cfs_rq->load_last_update_time_copy = sa->last_update_time;
+#endif
 
-static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
-						 long load_contrib)
-{
-	if (likely(load_contrib < cfs_rq->blocked_load_avg))
-		cfs_rq->blocked_load_avg -= load_contrib;
-	else
-		cfs_rq->blocked_load_avg = 0;
+	return decayed;
 }
 
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
-/* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se,
-					  int update_cfs_rq)
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	long contrib_delta, utilization_delta;
 	int cpu = cpu_of(rq_of(cfs_rq));
-	u64 now;
+	u64 now = cfs_rq_clock_task(cfs_rq);
 
 	/*
-	 * For a group entity we need to use their owned cfs_rq_clock_task() in
-	 * case they are the parent of a throttled hierarchy.
+	 * Track task load average for carrying it to new CPU after migrated, and
+	 * track group sched_entity load average for task_h_load calc in migration
 	 */
-	if (entity_is_task(se))
-		now = cfs_rq_clock_task(cfs_rq);
-	else
-		now = cfs_rq_clock_task(group_cfs_rq(se));
+	__update_load_avg(now, cpu, &se->avg,
+		se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se);
 
-	if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
-					cfs_rq->curr == se))
-		return;
-
-	contrib_delta = __update_entity_load_avg_contrib(se);
-	utilization_delta = __update_entity_utilization_avg_contrib(se);
-
-	if (!update_cfs_rq)
-		return;
-
-	if (se->on_rq) {
-		cfs_rq->runnable_load_avg += contrib_delta;
-		cfs_rq->utilization_load_avg += utilization_delta;
-	} else {
-		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
-	}
+	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+		update_tg_load_avg(cfs_rq, 0);
 }
 
-/*
- * Decay the load contributed by all blocked children and account this so that
- * their contribution may appropriately discounted when they wake up.
- */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+/* Add the load generated by se into cfs_rq's load average */
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
-	u64 decays;
-
-	decays = now - cfs_rq->last_decay;
-	if (!decays && !force_update)
-		return;
+	struct sched_avg *sa = &se->avg;
+	u64 now = cfs_rq_clock_task(cfs_rq);
+	int migrated = 0, decayed;
 
-	if (atomic_long_read(&cfs_rq->removed_load)) {
-		unsigned long removed_load;
-		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
-		subtract_blocked_load_contrib(cfs_rq, removed_load);
+	if (sa->last_update_time == 0) {
+		sa->last_update_time = now;
+		migrated = 1;
 	}
-
-	if (decays) {
-		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
-						      decays);
-		atomic64_add(decays, &cfs_rq->decay_counter);
-		cfs_rq->last_decay = now;
+	else {
+		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+			se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se);
 	}
 
-	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
-}
+	decayed = update_cfs_rq_load_avg(now, cfs_rq);
 
-/* Add the load generated by se into cfs_rq's child load-average */
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se,
-						  int wakeup)
-{
-	/*
-	 * We track migrations using entity decay_count <= 0, on a wake-up
-	 * migration we use a negative decay count to track the remote decays
-	 * accumulated while sleeping.
-	 *
-	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
-	 * are seen by enqueue_entity_load_avg() as a migration with an already
-	 * constructed load_avg_contrib.
-	 */
-	if (unlikely(se->avg.decay_count <= 0)) {
-		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
-		if (se->avg.decay_count) {
-			/*
-			 * In a wake-up migration we have to approximate the
-			 * time sleeping.  This is because we can't synchronize
-			 * clock_task between the two cpus, and it is not
-			 * guaranteed to be read-safe.  Instead, we can
-			 * approximate this using our carried decays, which are
-			 * explicitly atomically readable.
-			 */
-			se->avg.last_runnable_update -= (-se->avg.decay_count)
-							<< 20;
-			update_entity_load_avg(se, 0);
-			/* Indicate that we're now synchronized and on-rq */
-			se->avg.decay_count = 0;
-		}
-		wakeup = 0;
-	} else {
-		__synchronize_entity_decay(se);
+	if (migrated) {
+		cfs_rq->avg.load_avg += sa->load_avg;
+		cfs_rq->avg.load_sum += sa->load_sum;
+		cfs_rq->avg.util_avg += sa->util_avg;
+		cfs_rq->avg.util_sum += sa->util_sum;
 	}
 
-	/* migrated tasks did not contribute to our blocked load */
-	if (wakeup) {
-		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
-		update_entity_load_avg(se, 0);
-	}
-
-	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
-	cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
-	/* we force update consideration on load-balancer moves */
-	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+	if (decayed || migrated)
+		update_tg_load_avg(cfs_rq, 0);
 }
 
 /*
- * Remove se's load from this cfs_rq child load-average, if the entity is
- * transitioning to a blocked state we track its projected decay using
- * blocked_load_avg.
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
  */
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se,
-						  int sleep)
+void remove_entity_load_avg(struct sched_entity *se)
 {
-	update_entity_load_avg(se, 1);
-	/* we force update consideration on load-balancer moves */
-	update_cfs_rq_blocked_load(cfs_rq, !sleep);
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 last_update_time;
+
+#ifndef CONFIG_64BIT
+	u64 last_update_time_copy;
 
-	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
-	cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
-	if (sleep) {
-		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
-		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
-	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
+	do {
+		last_update_time_copy = cfs_rq->load_last_update_time_copy;
+		smp_rmb();
+		last_update_time = cfs_rq->avg.last_update_time;
+	} while (last_update_time != last_update_time_copy);
+#else
+	last_update_time = cfs_rq->avg.last_update_time;
+#endif
+
+	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0);
+	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
 /*
@@ -2948,16 +2767,10 @@ static int idle_balance(struct rq *this_rq);
 
 #else /* CONFIG_SMP */
 
-static inline void update_entity_load_avg(struct sched_entity *se,
-					  int update_cfs_rq) {}
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-					   struct sched_entity *se,
-					   int wakeup) {}
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-					   struct sched_entity *se,
-					   int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
-					      int force_update) {}
+static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline int idle_balance(struct rq *rq)
 {
@@ -3089,7 +2902,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -3164,7 +2977,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
+	update_load_avg(se, 1);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -3254,7 +3067,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
-		update_entity_load_avg(se, 1);
+		update_load_avg(se, 1);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
@@ -3354,7 +3167,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
-		update_entity_load_avg(prev, 1);
+		update_load_avg(prev, 0);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -3370,8 +3183,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
-	update_entity_load_avg(curr, 1);
-	update_cfs_rq_blocked_load(cfs_rq, 1);
+	update_load_avg(curr, 1);
 	update_cfs_shares(cfs_rq);
 
 #ifdef CONFIG_SCHED_HRTICK
@@ -4244,8 +4056,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
+		update_load_avg(se, 1);
 		update_cfs_shares(cfs_rq);
-		update_entity_load_avg(se, 1);
 	}
 
 	if (!se)
@@ -4304,8 +4116,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
+		update_load_avg(se, 1);
 		update_cfs_shares(cfs_rq);
-		update_entity_load_avg(se, 1);
 	}
 
 	if (!se)
@@ -4444,7 +4256,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 static void update_idle_cpu_load(struct rq *this_rq)
 {
 	unsigned long curr_jiffies = READ_ONCE(jiffies);
-	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long load = this_rq->cfs.avg.load_avg;
 	unsigned long pending_updates;
 
 	/*
@@ -4490,7 +4302,7 @@ void update_cpu_load_nohz(void)
  */
 void update_cpu_load_active(struct rq *this_rq)
 {
-	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long load = this_rq->cfs.avg.load_avg;
 	/*
 	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 	 */
@@ -4501,7 +4313,7 @@ void update_cpu_load_active(struct rq *this_rq)
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
-	return cpu_rq(cpu)->cfs.runnable_load_avg;
+	return cpu_rq(cpu)->cfs.avg.load_avg;
 }
 
 /*
@@ -4551,7 +4363,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = rq->cfs.runnable_load_avg;
+	unsigned long load_avg = rq->cfs.avg.load_avg;
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -4670,7 +4482,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 		/*
 		 * w = rw_i + @wl
 		 */
-		w = se->my_q->load.weight + wl;
+		w = se->my_q->avg.load_avg + wl;
 
 		/*
 		 * wl = S * s'_i; see (2)
@@ -4691,7 +4503,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 		/*
 		 * wl = dw_i = S * (s'_i - s_i); see (3)
 		 */
-		wl -= se->load.weight;
+		wl -= se->avg.load_avg;
 
 		/*
 		 * Recursively apply this logic to all parent groups to compute
@@ -4761,14 +4573,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 */
 	if (sync) {
 		tg = task_group(current);
-		weight = current->se.load.weight;
+		weight = current->se.avg.load_avg;
 
 		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
-	weight = p->se.load.weight;
+	weight = p->se.avg.load_avg;
 
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -4961,12 +4773,12 @@ done:
  * tasks. The unit of the return value must be the one of capacity so we can
  * compare the usage with the capacity of the CPU that is available for CFS
  * task (ie cpu_capacity).
- * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * cfs.avg.util_avg is the sum of running time of runnable tasks on a
  * CPU. It represents the amount of utilization of a CPU in the range
  * [0..SCHED_LOAD_SCALE].  The usage of a CPU can't be higher than the full
  * capacity of the CPU because it's about the running time on this CPU.
- * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
- * because of unfortunate rounding in avg_period and running_load_avg or just
+ * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in util_avg or just
  * after migrating tasks until the average stabilizes with the new running
  * time. So we need to check that the usage stays into the range
  * [0..cpu_capacity_orig] and cap if necessary.
@@ -4975,7 +4787,7 @@ done:
  */
 static int get_cpu_usage(int cpu)
 {
-	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+	unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
 
 	if (usage >= SCHED_LOAD_SCALE)
@@ -5084,26 +4896,22 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
  * other assumptions, including the state of rq->lock, should be made.
  */
-static void
-migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 {
-	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
 	/*
-	 * Load tracking: accumulate removed load so that it can be processed
-	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
-	 * to blocked load iff they have a positive decay-count.  It can never
-	 * be negative here since on-rq tasks have decay-count == 0.
+	 * We are supposed to update the task to "current" time, then its up to date
+	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
+	 * what current time is, so simply throw away the out-of-date time. This
+	 * will result in the wakee task is less decayed, but giving the wakee more
+	 * load sounds not bad.
 	 */
-	if (se->avg.decay_count) {
-		se->avg.decay_count = -__synchronize_entity_decay(se);
-		atomic_long_add(se->avg.load_avg_contrib,
-						&cfs_rq->removed_load);
-	}
+	remove_entity_load_avg(&p->se);
+
+	/* Tell new CPU we are migrated */
+	p->se.avg.last_update_time = 0;
 
 	/* We have migrated, no longer consider this task hot */
-	se->exec_start = 0;
+	p->se.exec_start = 0;
 }
 #endif /* CONFIG_SMP */
 
@@ -5966,36 +5774,6 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * update tg->load_weight by folding this cpu's load_avg
- */
-static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
-{
-	struct sched_entity *se = tg->se[cpu];
-	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
-
-	/* throttled entities do not contribute to load */
-	if (throttled_hierarchy(cfs_rq))
-		return;
-
-	update_cfs_rq_blocked_load(cfs_rq, 1);
-
-	if (se) {
-		update_entity_load_avg(se, 1);
-		/*
-		 * We pivot on our runnable average having decayed to zero for
-		 * list removal.  This generally implies that all our children
-		 * have also been removed (modulo rounding error or bandwidth
-		 * control); however, such cases are rare and we can fix these
-		 * at enqueue.
-		 *
-		 * TODO: fix up out-of-order children on enqueue.
-		 */
-		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
-			list_del_leaf_cfs_rq(cfs_rq);
-	}
-}
-
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -6004,19 +5782,19 @@ static void update_blocked_averages(int cpu)
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	update_rq_clock(rq);
+
 	/*
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
 	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		/*
-		 * Note: We may want to consider periodically releasing
-		 * rq->lock about these updates so that creating many task
-		 * groups does not result in continually extending hold time.
-		 */
-		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
-	}
+		/* throttled entities do not contribute to load */
+		if (throttled_hierarchy(cfs_rq))
+			continue;
 
+		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+			update_tg_load_avg(cfs_rq, 0);
+	}
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -6044,14 +5822,13 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 	}
 
 	if (!se) {
-		cfs_rq->h_load = cfs_rq->runnable_load_avg;
+		cfs_rq->h_load = cfs_rq->avg.load_avg;
 		cfs_rq->last_h_load_update = now;
 	}
 
 	while ((se = cfs_rq->h_load_next) != NULL) {
 		load = cfs_rq->h_load;
-		load = div64_ul(load * se->avg.load_avg_contrib,
-				cfs_rq->runnable_load_avg + 1);
+		load = div64_ul(load * se->avg.load_avg, cfs_rq->avg.load_avg + 1);
 		cfs_rq = group_cfs_rq(se);
 		cfs_rq->h_load = load;
 		cfs_rq->last_h_load_update = now;
@@ -6063,8 +5840,8 @@ static unsigned long task_h_load(struct task_struct *p)
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
 	update_cfs_rq_h_load(cfs_rq);
-	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
-			cfs_rq->runnable_load_avg + 1);
+	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+			cfs_rq->avg.load_avg + 1);
 }
 #else
 static inline void update_blocked_averages(int cpu)
@@ -6073,7 +5850,7 @@ static inline void update_blocked_averages(int cpu)
 
 static unsigned long task_h_load(struct task_struct *p)
 {
-	return p->se.avg.load_avg_contrib;
+	return p->se.avg.load_avg;
 }
 #endif
 
@@ -8071,15 +7848,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	}
 
 #ifdef CONFIG_SMP
-	/*
-	* Remove our load from contribution when we leave sched_fair
-	* and ensure we don't carry in an old decay_count if we
-	* switch back.
-	*/
-	if (se->avg.decay_count) {
-		__synchronize_entity_decay(se);
-		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
-	}
+	/* Catch up with the cfs_rq and remove our load when we leave */
+	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
+		se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se);
+
+	cfs_rq->avg.load_avg =
+		max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+	cfs_rq->avg.load_sum =
+		max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+	cfs_rq->avg.util_avg =
+		max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+	cfs_rq->avg.util_sum =
+		max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
 #endif
 }
 
@@ -8136,8 +7916,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
-	atomic64_set(&cfs_rq->decay_counter, 1);
-	atomic_long_set(&cfs_rq->removed_load, 0);
+	atomic_long_set(&cfs_rq->removed_load_avg, 0);
+	atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
 }
 
@@ -8182,14 +7962,14 @@ static void task_move_group_fair(struct task_struct *p, int queued)
 	if (!queued) {
 		cfs_rq = cfs_rq_of(se);
 		se->vruntime += cfs_rq->min_vruntime;
+
 #ifdef CONFIG_SMP
-		/*
-		 * migrate_task_rq_fair() will have removed our previous
-		 * contribution, but we must synchronize for ongoing future
-		 * decay.
-		 */
-		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
-		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+		/* Virtually synchronize task with its new cfs_rq */
+		p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
+		cfs_rq->avg.load_avg += p->se.avg.load_avg;
+		cfs_rq->avg.load_sum += p->se.avg.load_sum;
+		cfs_rq->avg.util_avg += p->se.avg.util_avg;
+		cfs_rq->avg.util_sum += p->se.avg.util_sum;
 #endif
 	}
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e13210cce7e8..dcde941a585b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -245,7 +245,6 @@ struct task_group {
 
 #ifdef	CONFIG_SMP
 	atomic_long_t load_avg;
-	atomic_t runnable_avg;
 #endif
 #endif
 
@@ -366,27 +365,18 @@ struct cfs_rq {
 
 #ifdef CONFIG_SMP
 	/*
-	 * CFS Load tracking
-	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
-	 * This allows for the description of both thread and group usage (in
-	 * the FAIR_GROUP_SCHED case).
-	 * runnable_load_avg is the sum of the load_avg_contrib of the
-	 * sched_entities on the rq.
-	 * blocked_load_avg is similar to runnable_load_avg except that its
-	 * the blocked sched_entities on the rq.
-	 * utilization_load_avg is the sum of the average running time of the
-	 * sched_entities on the rq.
+	 * CFS load tracking
 	 */
-	unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
-	atomic64_t decay_counter;
-	u64 last_decay;
-	atomic_long_t removed_load;
-
+	struct sched_avg avg;
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	/* Required to track per-cpu representation of a task_group */
-	u32 tg_runnable_contrib;
-	unsigned long tg_load_contrib;
+	unsigned long tg_load_avg_contrib;
+#endif
+	atomic_long_t removed_load_avg, removed_util_avg;
+#ifndef CONFIG_64BIT
+	u64 load_last_update_time_copy;
+#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 *   h_load = weight * f(tg)
 	 *
-- 
cgit v1.2.3-70-g09d2


From bff60792f994a87324ab57e89e945b4572b1ef77 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 31 Jul 2015 15:46:16 +0100
Subject: arm64: psci: factor invocation code to drivers

To enable sharing with arm, move the core PSCI framework code to
drivers/firmware. This results in a minor gain in lines of code, but
this will quickly be amortised by the removal of code currently
duplicated in arch/arm.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Tested-by: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/Kconfig            |   1 +
 arch/arm64/include/asm/acpi.h |   4 +-
 arch/arm64/include/asm/psci.h |  28 ----
 arch/arm64/kernel/psci.c      | 361 +----------------------------------------
 arch/arm64/kernel/setup.c     |   2 +-
 drivers/firmware/Kconfig      |   3 +
 drivers/firmware/Makefile     |   1 +
 drivers/firmware/psci.c       | 369 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/psci.h          |  52 ++++++
 9 files changed, 431 insertions(+), 390 deletions(-)
 delete mode 100644 arch/arm64/include/asm/psci.h
 create mode 100644 drivers/firmware/psci.c
 create mode 100644 include/linux/psci.h

(limited to 'include/linux')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 318175f62c24..7c55a632c08b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -20,6 +20,7 @@ config ARM64
 	select ARM_GIC_V2M if PCI_MSI
 	select ARM_GIC_V3
 	select ARM_GIC_V3_ITS if PCI_MSI
+	select ARM_PSCI_FW
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
 	select COMMON_CLK
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index 406485ed110a..208cec08a74f 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -12,11 +12,11 @@
 #ifndef _ASM_ACPI_H
 #define _ASM_ACPI_H
 
-#include <linux/mm.h>
 #include <linux/irqchip/arm-gic-acpi.h>
+#include <linux/mm.h>
+#include <linux/psci.h>
 
 #include <asm/cputype.h>
-#include <asm/psci.h>
 #include <asm/smp_plat.h>
 
 /* Macros for consistency checks of the GICC subtable of MADT */
diff --git a/arch/arm64/include/asm/psci.h b/arch/arm64/include/asm/psci.h
deleted file mode 100644
index 49d7e1aaebdc..000000000000
--- a/arch/arm64/include/asm/psci.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * Copyright (C) 2013 ARM Limited
- */
-
-#ifndef __ASM_PSCI_H
-#define __ASM_PSCI_H
-
-int __init psci_dt_init(void);
-
-#ifdef CONFIG_ACPI
-int __init psci_acpi_init(void);
-bool __init acpi_psci_present(void);
-bool __init acpi_psci_use_hvc(void);
-#else
-static inline int psci_acpi_init(void) { return 0; }
-static inline bool acpi_psci_present(void) { return false; }
-#endif
-
-#endif /* __ASM_PSCI_H */
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index 869f202748e8..51fd15a16461 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -18,23 +18,17 @@
 #include <linux/init.h>
 #include <linux/of.h>
 #include <linux/smp.h>
-#include <linux/reboot.h>
-#include <linux/pm.h>
 #include <linux/delay.h>
+#include <linux/psci.h>
 #include <linux/slab.h>
+
 #include <uapi/linux/psci.h>
 
 #include <asm/compiler.h>
-#include <asm/cputype.h>
 #include <asm/cpu_ops.h>
 #include <asm/errno.h>
-#include <asm/psci.h>
 #include <asm/smp_plat.h>
 #include <asm/suspend.h>
-#include <asm/system_misc.h>
-
-#define PSCI_POWER_STATE_TYPE_STANDBY		0
-#define PSCI_POWER_STATE_TYPE_POWER_DOWN	1
 
 static bool psci_power_state_loses_context(u32 state)
 {
@@ -50,122 +44,8 @@ static bool psci_power_state_is_valid(u32 state)
 	return !(state & ~valid_mask);
 }
 
-/*
- * The CPU any Trusted OS is resident on. The trusted OS may reject CPU_OFF
- * calls to its resident CPU, so we must avoid issuing those. We never migrate
- * a Trusted OS even if it claims to be capable of migration -- doing so will
- * require cooperation with a Trusted OS driver.
- */
-static int resident_cpu = -1;
-
-struct psci_operations {
-	int (*cpu_suspend)(u32 state, unsigned long entry_point);
-	int (*cpu_off)(u32 state);
-	int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
-	int (*migrate)(unsigned long cpuid);
-	int (*affinity_info)(unsigned long target_affinity,
-			unsigned long lowest_affinity_level);
-	int (*migrate_info_type)(void);
-};
-
-static struct psci_operations psci_ops;
-
-typedef unsigned long (psci_fn)(unsigned long, unsigned long,
-				unsigned long, unsigned long);
-asmlinkage psci_fn __invoke_psci_fn_hvc;
-asmlinkage psci_fn __invoke_psci_fn_smc;
-static psci_fn *invoke_psci_fn;
-
-enum psci_function {
-	PSCI_FN_CPU_SUSPEND,
-	PSCI_FN_CPU_ON,
-	PSCI_FN_CPU_OFF,
-	PSCI_FN_MIGRATE,
-	PSCI_FN_MAX,
-};
-
 static DEFINE_PER_CPU_READ_MOSTLY(u32 *, psci_power_state);
 
-static u32 psci_function_id[PSCI_FN_MAX];
-
-static int psci_to_linux_errno(int errno)
-{
-	switch (errno) {
-	case PSCI_RET_SUCCESS:
-		return 0;
-	case PSCI_RET_NOT_SUPPORTED:
-		return -EOPNOTSUPP;
-	case PSCI_RET_INVALID_PARAMS:
-		return -EINVAL;
-	case PSCI_RET_DENIED:
-		return -EPERM;
-	};
-
-	return -EINVAL;
-}
-
-static u32 psci_get_version(void)
-{
-	return invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
-}
-
-static int psci_cpu_suspend(u32 state, unsigned long entry_point)
-{
-	int err;
-	u32 fn;
-
-	fn = psci_function_id[PSCI_FN_CPU_SUSPEND];
-	err = invoke_psci_fn(fn, state, entry_point, 0);
-	return psci_to_linux_errno(err);
-}
-
-static int psci_cpu_off(u32 state)
-{
-	int err;
-	u32 fn;
-
-	fn = psci_function_id[PSCI_FN_CPU_OFF];
-	err = invoke_psci_fn(fn, state, 0, 0);
-	return psci_to_linux_errno(err);
-}
-
-static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
-{
-	int err;
-	u32 fn;
-
-	fn = psci_function_id[PSCI_FN_CPU_ON];
-	err = invoke_psci_fn(fn, cpuid, entry_point, 0);
-	return psci_to_linux_errno(err);
-}
-
-static int psci_migrate(unsigned long cpuid)
-{
-	int err;
-	u32 fn;
-
-	fn = psci_function_id[PSCI_FN_MIGRATE];
-	err = invoke_psci_fn(fn, cpuid, 0, 0);
-	return psci_to_linux_errno(err);
-}
-
-static int psci_affinity_info(unsigned long target_affinity,
-		unsigned long lowest_affinity_level)
-{
-	return invoke_psci_fn(PSCI_0_2_FN64_AFFINITY_INFO, target_affinity,
-			      lowest_affinity_level, 0);
-}
-
-static int psci_migrate_info_type(void)
-{
-	return invoke_psci_fn(PSCI_0_2_FN_MIGRATE_INFO_TYPE, 0, 0, 0);
-}
-
-static unsigned long psci_migrate_info_up_cpu(void)
-{
-	return invoke_psci_fn(PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU, 0, 0, 0);
-}
-
 static int __maybe_unused cpu_psci_cpu_init_idle(unsigned int cpu)
 {
 	int i, ret, count = 0;
@@ -230,238 +110,6 @@ free_mem:
 	return ret;
 }
 
-static int get_set_conduit_method(struct device_node *np)
-{
-	const char *method;
-
-	pr_info("probing for conduit method from DT.\n");
-
-	if (of_property_read_string(np, "method", &method)) {
-		pr_warn("missing \"method\" property\n");
-		return -ENXIO;
-	}
-
-	if (!strcmp("hvc", method)) {
-		invoke_psci_fn = __invoke_psci_fn_hvc;
-	} else if (!strcmp("smc", method)) {
-		invoke_psci_fn = __invoke_psci_fn_smc;
-	} else {
-		pr_warn("invalid \"method\" property: %s\n", method);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd)
-{
-	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
-}
-
-static void psci_sys_poweroff(void)
-{
-	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
-}
-
-/*
- * Detect the presence of a resident Trusted OS which may cause CPU_OFF to
- * return DENIED (which would be fatal).
- */
-static void __init psci_init_migrate(void)
-{
-	unsigned long cpuid;
-	int type, cpu;
-
-	type = psci_ops.migrate_info_type();
-
-	if (type == PSCI_0_2_TOS_MP) {
-		pr_info("Trusted OS migration not required\n");
-		return;
-	}
-
-	if (type == PSCI_RET_NOT_SUPPORTED) {
-		pr_info("MIGRATE_INFO_TYPE not supported.\n");
-		return;
-	}
-
-	if (type != PSCI_0_2_TOS_UP_MIGRATE &&
-	    type != PSCI_0_2_TOS_UP_NO_MIGRATE) {
-		pr_err("MIGRATE_INFO_TYPE returned unknown type (%d)\n", type);
-		return;
-	}
-
-	cpuid = psci_migrate_info_up_cpu();
-	if (cpuid & ~MPIDR_HWID_BITMASK) {
-		pr_warn("MIGRATE_INFO_UP_CPU reported invalid physical ID (0x%lx)\n",
-			cpuid);
-		return;
-	}
-
-	cpu = get_logical_index(cpuid);
-	resident_cpu = cpu >= 0 ? cpu : -1;
-
-	pr_info("Trusted OS resident on physical CPU 0x%lx\n", cpuid);
-}
-
-static void __init psci_0_2_set_functions(void)
-{
-	pr_info("Using standard PSCI v0.2 function IDs\n");
-	psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN64_CPU_SUSPEND;
-	psci_ops.cpu_suspend = psci_cpu_suspend;
-
-	psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
-	psci_ops.cpu_off = psci_cpu_off;
-
-	psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN64_CPU_ON;
-	psci_ops.cpu_on = psci_cpu_on;
-
-	psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN64_MIGRATE;
-	psci_ops.migrate = psci_migrate;
-
-	psci_ops.affinity_info = psci_affinity_info;
-
-	psci_ops.migrate_info_type = psci_migrate_info_type;
-
-	arm_pm_restart = psci_sys_reset;
-
-	pm_power_off = psci_sys_poweroff;
-}
-
-/*
- * Probe function for PSCI firmware versions >= 0.2
- */
-static int __init psci_probe(void)
-{
-	u32 ver = psci_get_version();
-
-	pr_info("PSCIv%d.%d detected in firmware.\n",
-			PSCI_VERSION_MAJOR(ver),
-			PSCI_VERSION_MINOR(ver));
-
-	if (PSCI_VERSION_MAJOR(ver) == 0 && PSCI_VERSION_MINOR(ver) < 2) {
-		pr_err("Conflicting PSCI version detected.\n");
-		return -EINVAL;
-	}
-
-	psci_0_2_set_functions();
-
-	psci_init_migrate();
-
-	return 0;
-}
-
-typedef int (*psci_initcall_t)(const struct device_node *);
-
-/*
- * PSCI init function for PSCI versions >=0.2
- *
- * Probe based on PSCI PSCI_VERSION function
- */
-static int __init psci_0_2_init(struct device_node *np)
-{
-	int err;
-
-	err = get_set_conduit_method(np);
-
-	if (err)
-		goto out_put_node;
-	/*
-	 * Starting with v0.2, the PSCI specification introduced a call
-	 * (PSCI_VERSION) that allows probing the firmware version, so
-	 * that PSCI function IDs and version specific initialization
-	 * can be carried out according to the specific version reported
-	 * by firmware
-	 */
-	err = psci_probe();
-
-out_put_node:
-	of_node_put(np);
-	return err;
-}
-
-/*
- * PSCI < v0.2 get PSCI Function IDs via DT.
- */
-static int __init psci_0_1_init(struct device_node *np)
-{
-	u32 id;
-	int err;
-
-	err = get_set_conduit_method(np);
-
-	if (err)
-		goto out_put_node;
-
-	pr_info("Using PSCI v0.1 Function IDs from DT\n");
-
-	if (!of_property_read_u32(np, "cpu_suspend", &id)) {
-		psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
-		psci_ops.cpu_suspend = psci_cpu_suspend;
-	}
-
-	if (!of_property_read_u32(np, "cpu_off", &id)) {
-		psci_function_id[PSCI_FN_CPU_OFF] = id;
-		psci_ops.cpu_off = psci_cpu_off;
-	}
-
-	if (!of_property_read_u32(np, "cpu_on", &id)) {
-		psci_function_id[PSCI_FN_CPU_ON] = id;
-		psci_ops.cpu_on = psci_cpu_on;
-	}
-
-	if (!of_property_read_u32(np, "migrate", &id)) {
-		psci_function_id[PSCI_FN_MIGRATE] = id;
-		psci_ops.migrate = psci_migrate;
-	}
-
-out_put_node:
-	of_node_put(np);
-	return err;
-}
-
-static const struct of_device_id psci_of_match[] __initconst = {
-	{ .compatible = "arm,psci",	.data = psci_0_1_init},
-	{ .compatible = "arm,psci-0.2",	.data = psci_0_2_init},
-	{},
-};
-
-int __init psci_dt_init(void)
-{
-	struct device_node *np;
-	const struct of_device_id *matched_np;
-	psci_initcall_t init_fn;
-
-	np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
-
-	if (!np)
-		return -ENODEV;
-
-	init_fn = (psci_initcall_t)matched_np->data;
-	return init_fn(np);
-}
-
-#ifdef CONFIG_ACPI
-/*
- * We use PSCI 0.2+ when ACPI is deployed on ARM64 and it's
- * explicitly clarified in SBBR
- */
-int __init psci_acpi_init(void)
-{
-	if (!acpi_psci_present()) {
-		pr_info("is not implemented in ACPI.\n");
-		return -EOPNOTSUPP;
-	}
-
-	pr_info("probing for conduit method from ACPI.\n");
-
-	if (acpi_psci_use_hvc())
-		invoke_psci_fn = __invoke_psci_fn_hvc;
-	else
-		invoke_psci_fn = __invoke_psci_fn_smc;
-
-	return psci_probe();
-}
-#endif
-
 #ifdef CONFIG_SMP
 
 static int __init cpu_psci_cpu_init(unsigned int cpu)
@@ -489,11 +137,6 @@ static int cpu_psci_cpu_boot(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static bool psci_tos_resident_on(int cpu)
-{
-	return cpu == resident_cpu;
-}
-
 static int cpu_psci_cpu_disable(unsigned int cpu)
 {
 	/* Fail early if we don't have CPU_OFF support */
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f3067d4d4e35..96ce26428f82 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -46,6 +46,7 @@
 #include <linux/of_platform.h>
 #include <linux/efi.h>
 #include <linux/personality.h>
+#include <linux/psci.h>
 
 #include <asm/acpi.h>
 #include <asm/fixmap.h>
@@ -61,7 +62,6 @@
 #include <asm/tlbflush.h>
 #include <asm/traps.h>
 #include <asm/memblock.h>
-#include <asm/psci.h>
 #include <asm/efi.h>
 #include <asm/virt.h>
 #include <asm/xen/hypervisor.h>
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 99c69a3205c4..d8de6a8dd4de 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -5,6 +5,9 @@
 
 menu "Firmware Drivers"
 
+config ARM_PSCI_FW
+	bool
+
 config EDD
 	tristate "BIOS Enhanced Disk Drive calls determine boot disk"
 	depends on X86
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 4a4b897f9314..000830fc6707 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -1,6 +1,7 @@
 #
 # Makefile for the linux kernel.
 #
+obj-$(CONFIG_ARM_PSCI_FW)	+= psci.o
 obj-$(CONFIG_DMI)		+= dmi_scan.o
 obj-$(CONFIG_DMI_SYSFS)		+= dmi-sysfs.o
 obj-$(CONFIG_EDD)		+= edd.o
diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c
new file mode 100644
index 000000000000..36e2cea3809b
--- /dev/null
+++ b/drivers/firmware/psci.c
@@ -0,0 +1,369 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 ARM Limited
+ */
+
+#define pr_fmt(fmt) "psci: " fmt
+
+#include <linux/errno.h>
+#include <linux/linkage.h>
+#include <linux/of.h>
+#include <linux/pm.h>
+#include <linux/printk.h>
+#include <linux/psci.h>
+#include <linux/reboot.h>
+
+#include <uapi/linux/psci.h>
+
+#include <asm/cputype.h>
+#include <asm/system_misc.h>
+#include <asm/smp_plat.h>
+
+/*
+ * The CPU any Trusted OS is resident on. The trusted OS may reject CPU_OFF
+ * calls to its resident CPU, so we must avoid issuing those. We never migrate
+ * a Trusted OS even if it claims to be capable of migration -- doing so will
+ * require cooperation with a Trusted OS driver.
+ */
+static int resident_cpu = -1;
+
+bool psci_tos_resident_on(int cpu)
+{
+	return cpu == resident_cpu;
+}
+
+struct psci_operations psci_ops;
+
+typedef unsigned long (psci_fn)(unsigned long, unsigned long,
+				unsigned long, unsigned long);
+asmlinkage psci_fn __invoke_psci_fn_hvc;
+asmlinkage psci_fn __invoke_psci_fn_smc;
+static psci_fn *invoke_psci_fn;
+
+enum psci_function {
+	PSCI_FN_CPU_SUSPEND,
+	PSCI_FN_CPU_ON,
+	PSCI_FN_CPU_OFF,
+	PSCI_FN_MIGRATE,
+	PSCI_FN_MAX,
+};
+
+static u32 psci_function_id[PSCI_FN_MAX];
+
+static int psci_to_linux_errno(int errno)
+{
+	switch (errno) {
+	case PSCI_RET_SUCCESS:
+		return 0;
+	case PSCI_RET_NOT_SUPPORTED:
+		return -EOPNOTSUPP;
+	case PSCI_RET_INVALID_PARAMS:
+		return -EINVAL;
+	case PSCI_RET_DENIED:
+		return -EPERM;
+	};
+
+	return -EINVAL;
+}
+
+static u32 psci_get_version(void)
+{
+	return invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
+}
+
+static int psci_cpu_suspend(u32 state, unsigned long entry_point)
+{
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_CPU_SUSPEND];
+	err = invoke_psci_fn(fn, state, entry_point, 0);
+	return psci_to_linux_errno(err);
+}
+
+static int psci_cpu_off(u32 state)
+{
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_CPU_OFF];
+	err = invoke_psci_fn(fn, state, 0, 0);
+	return psci_to_linux_errno(err);
+}
+
+static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
+{
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_CPU_ON];
+	err = invoke_psci_fn(fn, cpuid, entry_point, 0);
+	return psci_to_linux_errno(err);
+}
+
+static int psci_migrate(unsigned long cpuid)
+{
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_MIGRATE];
+	err = invoke_psci_fn(fn, cpuid, 0, 0);
+	return psci_to_linux_errno(err);
+}
+
+static int psci_affinity_info(unsigned long target_affinity,
+		unsigned long lowest_affinity_level)
+{
+	return invoke_psci_fn(PSCI_0_2_FN64_AFFINITY_INFO, target_affinity,
+			      lowest_affinity_level, 0);
+}
+
+static int psci_migrate_info_type(void)
+{
+	return invoke_psci_fn(PSCI_0_2_FN_MIGRATE_INFO_TYPE, 0, 0, 0);
+}
+
+static unsigned long psci_migrate_info_up_cpu(void)
+{
+	return invoke_psci_fn(PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU, 0, 0, 0);
+}
+
+static int get_set_conduit_method(struct device_node *np)
+{
+	const char *method;
+
+	pr_info("probing for conduit method from DT.\n");
+
+	if (of_property_read_string(np, "method", &method)) {
+		pr_warn("missing \"method\" property\n");
+		return -ENXIO;
+	}
+
+	if (!strcmp("hvc", method)) {
+		invoke_psci_fn = __invoke_psci_fn_hvc;
+	} else if (!strcmp("smc", method)) {
+		invoke_psci_fn = __invoke_psci_fn_smc;
+	} else {
+		pr_warn("invalid \"method\" property: %s\n", method);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd)
+{
+	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
+}
+
+static void psci_sys_poweroff(void)
+{
+	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
+}
+
+/*
+ * Detect the presence of a resident Trusted OS which may cause CPU_OFF to
+ * return DENIED (which would be fatal).
+ */
+static void __init psci_init_migrate(void)
+{
+	unsigned long cpuid;
+	int type, cpu = -1;
+
+	type = psci_ops.migrate_info_type();
+
+	if (type == PSCI_0_2_TOS_MP) {
+		pr_info("Trusted OS migration not required\n");
+		return;
+	}
+
+	if (type == PSCI_RET_NOT_SUPPORTED) {
+		pr_info("MIGRATE_INFO_TYPE not supported.\n");
+		return;
+	}
+
+	if (type != PSCI_0_2_TOS_UP_MIGRATE &&
+	    type != PSCI_0_2_TOS_UP_NO_MIGRATE) {
+		pr_err("MIGRATE_INFO_TYPE returned unknown type (%d)\n", type);
+		return;
+	}
+
+	cpuid = psci_migrate_info_up_cpu();
+	if (cpuid & ~MPIDR_HWID_BITMASK) {
+		pr_warn("MIGRATE_INFO_UP_CPU reported invalid physical ID (0x%lx)\n",
+			cpuid);
+		return;
+	}
+
+	cpu = get_logical_index(cpuid);
+	resident_cpu = cpu >= 0 ? cpu : -1;
+
+	pr_info("Trusted OS resident on physical CPU 0x%lx\n", cpuid);
+}
+
+static void __init psci_0_2_set_functions(void)
+{
+	pr_info("Using standard PSCI v0.2 function IDs\n");
+	psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN64_CPU_SUSPEND;
+	psci_ops.cpu_suspend = psci_cpu_suspend;
+
+	psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
+	psci_ops.cpu_off = psci_cpu_off;
+
+	psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN64_CPU_ON;
+	psci_ops.cpu_on = psci_cpu_on;
+
+	psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN64_MIGRATE;
+	psci_ops.migrate = psci_migrate;
+
+	psci_ops.affinity_info = psci_affinity_info;
+
+	psci_ops.migrate_info_type = psci_migrate_info_type;
+
+	arm_pm_restart = psci_sys_reset;
+
+	pm_power_off = psci_sys_poweroff;
+}
+
+/*
+ * Probe function for PSCI firmware versions >= 0.2
+ */
+static int __init psci_probe(void)
+{
+	u32 ver = psci_get_version();
+
+	pr_info("PSCIv%d.%d detected in firmware.\n",
+			PSCI_VERSION_MAJOR(ver),
+			PSCI_VERSION_MINOR(ver));
+
+	if (PSCI_VERSION_MAJOR(ver) == 0 && PSCI_VERSION_MINOR(ver) < 2) {
+		pr_err("Conflicting PSCI version detected.\n");
+		return -EINVAL;
+	}
+
+	psci_0_2_set_functions();
+
+	psci_init_migrate();
+
+	return 0;
+}
+
+typedef int (*psci_initcall_t)(const struct device_node *);
+
+/*
+ * PSCI init function for PSCI versions >=0.2
+ *
+ * Probe based on PSCI PSCI_VERSION function
+ */
+static int __init psci_0_2_init(struct device_node *np)
+{
+	int err;
+
+	err = get_set_conduit_method(np);
+
+	if (err)
+		goto out_put_node;
+	/*
+	 * Starting with v0.2, the PSCI specification introduced a call
+	 * (PSCI_VERSION) that allows probing the firmware version, so
+	 * that PSCI function IDs and version specific initialization
+	 * can be carried out according to the specific version reported
+	 * by firmware
+	 */
+	err = psci_probe();
+
+out_put_node:
+	of_node_put(np);
+	return err;
+}
+
+/*
+ * PSCI < v0.2 get PSCI Function IDs via DT.
+ */
+static int __init psci_0_1_init(struct device_node *np)
+{
+	u32 id;
+	int err;
+
+	err = get_set_conduit_method(np);
+
+	if (err)
+		goto out_put_node;
+
+	pr_info("Using PSCI v0.1 Function IDs from DT\n");
+
+	if (!of_property_read_u32(np, "cpu_suspend", &id)) {
+		psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
+		psci_ops.cpu_suspend = psci_cpu_suspend;
+	}
+
+	if (!of_property_read_u32(np, "cpu_off", &id)) {
+		psci_function_id[PSCI_FN_CPU_OFF] = id;
+		psci_ops.cpu_off = psci_cpu_off;
+	}
+
+	if (!of_property_read_u32(np, "cpu_on", &id)) {
+		psci_function_id[PSCI_FN_CPU_ON] = id;
+		psci_ops.cpu_on = psci_cpu_on;
+	}
+
+	if (!of_property_read_u32(np, "migrate", &id)) {
+		psci_function_id[PSCI_FN_MIGRATE] = id;
+		psci_ops.migrate = psci_migrate;
+	}
+
+out_put_node:
+	of_node_put(np);
+	return err;
+}
+
+static const struct of_device_id psci_of_match[] __initconst = {
+	{ .compatible = "arm,psci",	.data = psci_0_1_init},
+	{ .compatible = "arm,psci-0.2",	.data = psci_0_2_init},
+	{},
+};
+
+int __init psci_dt_init(void)
+{
+	struct device_node *np;
+	const struct of_device_id *matched_np;
+	psci_initcall_t init_fn;
+
+	np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
+
+	if (!np)
+		return -ENODEV;
+
+	init_fn = (psci_initcall_t)matched_np->data;
+	return init_fn(np);
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * We use PSCI 0.2+ when ACPI is deployed on ARM64 and it's
+ * explicitly clarified in SBBR
+ */
+int __init psci_acpi_init(void)
+{
+	if (!acpi_psci_present()) {
+		pr_info("is not implemented in ACPI.\n");
+		return -EOPNOTSUPP;
+	}
+
+	pr_info("probing for conduit method from ACPI.\n");
+
+	if (acpi_psci_use_hvc())
+		invoke_psci_fn = __invoke_psci_fn_hvc;
+	else
+		invoke_psci_fn = __invoke_psci_fn_smc;
+
+	return psci_probe();
+}
+#endif
diff --git a/include/linux/psci.h b/include/linux/psci.h
new file mode 100644
index 000000000000..a682fcc91c33
--- /dev/null
+++ b/include/linux/psci.h
@@ -0,0 +1,52 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 ARM Limited
+ */
+
+#ifndef __LINUX_PSCI_H
+#define __LINUX_PSCI_H
+
+#include <linux/init.h>
+#include <linux/types.h>
+
+#define PSCI_POWER_STATE_TYPE_STANDBY		0
+#define PSCI_POWER_STATE_TYPE_POWER_DOWN	1
+
+bool psci_tos_resident_on(int cpu);
+
+struct psci_operations {
+	int (*cpu_suspend)(u32 state, unsigned long entry_point);
+	int (*cpu_off)(u32 state);
+	int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
+	int (*migrate)(unsigned long cpuid);
+	int (*affinity_info)(unsigned long target_affinity,
+			unsigned long lowest_affinity_level);
+	int (*migrate_info_type)(void);
+};
+
+extern struct psci_operations psci_ops;
+
+#if defined(CONFIG_ARM_PSCI_FW)
+int __init psci_dt_init(void);
+#else
+static inline int psci_dt_init(void) { return 0; }
+#endif
+
+#if defined(CONFIG_ARM_PSCI_FW) && defined(CONFIG_ACPI)
+int __init psci_acpi_init(void);
+bool __init acpi_psci_present(void);
+bool __init acpi_psci_use_hvc(void);
+#else
+static inline int psci_acpi_init(void) { return 0; }
+static inline bool acpi_psci_present(void) { return false; }
+#endif
+
+#endif /* __LINUX_PSCI_H */
-- 
cgit v1.2.3-70-g09d2


From 17e8351a77397e8a83727eb17e3a3e9b8ab5257a Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Fri, 24 Jul 2015 08:12:54 +0200
Subject: thermal: consistently use int for temperatures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The thermal code uses int, long and unsigned long for temperatures
in different places.

Using an unsigned type limits the thermal framework to positive
temperatures without need. Also several drivers currently will report
temperatures near UINT_MAX for temperatures below 0°C. This will probably
immediately shut the machine down due to overtemperature if started below
0°C.

'long' is 64bit on several architectures. This is not needed since INT_MAX °mC
is above the melting point of all known materials.

Consistently use a plain 'int' for temperatures throughout the thermal code and
the drivers. This only changes the places in the drivers where the temperature
is passed around as pointer, when drivers internally use another type this is
not changed.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Acked-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Jean Delvare <jdelvare@suse.de>
Reviewed-by: Lukasz Majewski <l.majewski@samsung.com>
Reviewed-by: Darren Hart <dvhart@linux.intel.com>
Reviewed-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Peter Feuerer <peter@piie.net>
Cc: Punit Agrawal <punit.agrawal@arm.com>
Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: Jean Delvare <jdelvare@suse.de>
Cc: Peter Feuerer <peter@piie.net>
Cc: Heiko Stuebner <heiko@sntech.de>
Cc: Lukasz Majewski <l.majewski@samsung.com>
Cc: Stephen Warren <swarren@wwwdotorg.org>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: linux-acpi@vger.kernel.org
Cc: platform-driver-x86@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-omap@vger.kernel.org
Cc: linux-samsung-soc@vger.kernel.org
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Maxime Ripard <maxime.ripard@free-electrons.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: lm-sensors@lm-sensors.org
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/acpi/thermal.c                             | 12 +++++-----
 drivers/hwmon/lm75.c                               |  2 +-
 drivers/hwmon/ntc_thermistor.c                     |  2 +-
 drivers/hwmon/tmp102.c                             |  2 +-
 drivers/input/touchscreen/sun4i-ts.c               |  8 +++----
 drivers/platform/x86/acerhdf.c                     |  9 ++++----
 drivers/platform/x86/intel_mid_thermal.c           |  9 ++++----
 drivers/power/charger-manager.c                    |  2 +-
 drivers/power/power_supply_core.c                  |  2 +-
 drivers/thermal/armada_thermal.c                   |  2 +-
 drivers/thermal/db8500_thermal.c                   |  7 +++---
 drivers/thermal/dove_thermal.c                     |  2 +-
 drivers/thermal/fair_share.c                       |  2 +-
 drivers/thermal/gov_bang_bang.c                    |  5 ++--
 drivers/thermal/hisi_thermal.c                     |  4 ++--
 drivers/thermal/imx_thermal.c                      | 27 +++++++++++-----------
 drivers/thermal/int340x_thermal/int3400_thermal.c  |  2 +-
 .../thermal/int340x_thermal/int340x_thermal_zone.c | 10 ++++----
 .../thermal/int340x_thermal/int340x_thermal_zone.h |  8 +++----
 .../int340x_thermal/processor_thermal_device.c     |  4 ++--
 drivers/thermal/intel_quark_dts_thermal.c          | 13 +++++------
 drivers/thermal/intel_soc_dts_iosf.c               |  8 +++----
 drivers/thermal/kirkwood_thermal.c                 |  2 +-
 drivers/thermal/of-thermal.c                       | 14 +++++------
 drivers/thermal/power_allocator.c                  | 16 ++++++-------
 drivers/thermal/qcom-spmi-temp-alarm.c             |  2 +-
 drivers/thermal/rcar_thermal.c                     |  7 +++---
 drivers/thermal/rockchip_thermal.c                 | 10 ++++----
 drivers/thermal/samsung/exynos_tmu.c               | 23 +++++++++---------
 drivers/thermal/spear_thermal.c                    |  2 +-
 drivers/thermal/st/st_thermal.c                    |  5 ++--
 drivers/thermal/step_wise.c                        |  4 ++--
 drivers/thermal/tegra_soctherm.c                   |  4 ++--
 drivers/thermal/thermal_core.c                     | 26 ++++++++++-----------
 drivers/thermal/thermal_hwmon.c                    | 10 ++++----
 drivers/thermal/ti-soc-thermal/ti-thermal-common.c | 10 ++++----
 drivers/thermal/x86_pkg_temp_thermal.c             | 10 ++++----
 include/linux/thermal.h                            | 26 +++++++++------------
 include/trace/events/thermal_power_allocator.h     |  6 ++---
 39 files changed, 152 insertions(+), 167 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 6d4e44ea74ac..e66ad25d112f 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -529,8 +529,7 @@ static void acpi_thermal_check(void *data)
 
 /* sys I/F for generic thermal sysfs support */
 
-static int thermal_get_temp(struct thermal_zone_device *thermal,
-			    unsigned long *temp)
+static int thermal_get_temp(struct thermal_zone_device *thermal, int *temp)
 {
 	struct acpi_thermal *tz = thermal->devdata;
 	int result;
@@ -637,7 +636,7 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal,
 }
 
 static int thermal_get_trip_temp(struct thermal_zone_device *thermal,
-				 int trip, unsigned long *temp)
+				 int trip, int *temp)
 {
 	struct acpi_thermal *tz = thermal->devdata;
 	int i;
@@ -690,7 +689,8 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal,
 }
 
 static int thermal_get_crit_temp(struct thermal_zone_device *thermal,
-				unsigned long *temperature) {
+				int *temperature)
+{
 	struct acpi_thermal *tz = thermal->devdata;
 
 	if (tz->trips.critical.flags.valid) {
@@ -713,8 +713,8 @@ static int thermal_get_trend(struct thermal_zone_device *thermal,
 		return -EINVAL;
 
 	if (type == THERMAL_TRIP_ACTIVE) {
-		unsigned long trip_temp;
-		unsigned long temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(
+		int trip_temp;
+		int temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(
 					tz->temperature, tz->kelvin_offset);
 		if (thermal_get_trip_temp(thermal, trip, &trip_temp))
 			return -EINVAL;
diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c
index fe41d5ae7cb2..e4e57bbafb10 100644
--- a/drivers/hwmon/lm75.c
+++ b/drivers/hwmon/lm75.c
@@ -104,7 +104,7 @@ static inline long lm75_reg_to_mc(s16 temp, u8 resolution)
 
 /* sysfs attributes for hwmon */
 
-static int lm75_read_temp(void *dev, long *temp)
+static int lm75_read_temp(void *dev, int *temp)
 {
 	struct lm75_data *data = lm75_update_device(dev);
 
diff --git a/drivers/hwmon/ntc_thermistor.c b/drivers/hwmon/ntc_thermistor.c
index dc0b76c5e302..feed30646d91 100644
--- a/drivers/hwmon/ntc_thermistor.c
+++ b/drivers/hwmon/ntc_thermistor.c
@@ -477,7 +477,7 @@ static int ntc_thermistor_get_ohm(struct ntc_data *data)
 	return -EINVAL;
 }
 
-static int ntc_read_temp(void *dev, long *temp)
+static int ntc_read_temp(void *dev, int *temp)
 {
 	struct ntc_data *data = dev_get_drvdata(dev);
 	int ohm;
diff --git a/drivers/hwmon/tmp102.c b/drivers/hwmon/tmp102.c
index 9da2735f1424..65482624ea2c 100644
--- a/drivers/hwmon/tmp102.c
+++ b/drivers/hwmon/tmp102.c
@@ -98,7 +98,7 @@ static struct tmp102 *tmp102_update_device(struct device *dev)
 	return tmp102;
 }
 
-static int tmp102_read_temp(void *dev, long *temp)
+static int tmp102_read_temp(void *dev, int *temp)
 {
 	struct tmp102 *tmp102 = tmp102_update_device(dev);
 
diff --git a/drivers/input/touchscreen/sun4i-ts.c b/drivers/input/touchscreen/sun4i-ts.c
index c0116994067d..485794376ee5 100644
--- a/drivers/input/touchscreen/sun4i-ts.c
+++ b/drivers/input/touchscreen/sun4i-ts.c
@@ -191,7 +191,7 @@ static void sun4i_ts_close(struct input_dev *dev)
 	writel(TEMP_IRQ_EN(1), ts->base + TP_INT_FIFOC);
 }
 
-static int sun4i_get_temp(const struct sun4i_ts_data *ts, long *temp)
+static int sun4i_get_temp(const struct sun4i_ts_data *ts, int *temp)
 {
 	/* No temp_data until the first irq */
 	if (ts->temp_data == -1)
@@ -202,7 +202,7 @@ static int sun4i_get_temp(const struct sun4i_ts_data *ts, long *temp)
 	return 0;
 }
 
-static int sun4i_get_tz_temp(void *data, long *temp)
+static int sun4i_get_tz_temp(void *data, int *temp)
 {
 	return sun4i_get_temp(data, temp);
 }
@@ -215,14 +215,14 @@ static ssize_t show_temp(struct device *dev, struct device_attribute *devattr,
 			 char *buf)
 {
 	struct sun4i_ts_data *ts = dev_get_drvdata(dev);
-	long temp;
+	int temp;
 	int error;
 
 	error = sun4i_get_temp(ts, &temp);
 	if (error)
 		return error;
 
-	return sprintf(buf, "%ld\n", temp);
+	return sprintf(buf, "%d\n", temp);
 }
 
 static ssize_t show_temp_label(struct device *dev,
diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c
index 1ef02daddb60..460fa6708bfc 100644
--- a/drivers/platform/x86/acerhdf.c
+++ b/drivers/platform/x86/acerhdf.c
@@ -346,8 +346,7 @@ static void acerhdf_check_param(struct thermal_zone_device *thermal)
  * as late as the polling interval is since we can't do that in the respective
  * accessors of the module parameters.
  */
-static int acerhdf_get_ec_temp(struct thermal_zone_device *thermal,
-			       unsigned long *t)
+static int acerhdf_get_ec_temp(struct thermal_zone_device *thermal, int *t)
 {
 	int temp, err = 0;
 
@@ -453,7 +452,7 @@ static int acerhdf_get_trip_type(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_trip_hyst(struct thermal_zone_device *thermal, int trip,
-				 unsigned long *temp)
+				 int *temp)
 {
 	if (trip != 0)
 		return -EINVAL;
@@ -464,7 +463,7 @@ static int acerhdf_get_trip_hyst(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_trip_temp(struct thermal_zone_device *thermal, int trip,
-				 unsigned long *temp)
+				 int *temp)
 {
 	if (trip == 0)
 		*temp = fanon;
@@ -477,7 +476,7 @@ static int acerhdf_get_trip_temp(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_crit_temp(struct thermal_zone_device *thermal,
-				 unsigned long *temperature)
+				 int *temperature)
 {
 	*temperature = ACERHDF_TEMP_CRIT;
 	return 0;
diff --git a/drivers/platform/x86/intel_mid_thermal.c b/drivers/platform/x86/intel_mid_thermal.c
index 0944e834af8d..9f713b832ba3 100644
--- a/drivers/platform/x86/intel_mid_thermal.c
+++ b/drivers/platform/x86/intel_mid_thermal.c
@@ -132,7 +132,7 @@ static int is_valid_adc(uint16_t adc_val, uint16_t min, uint16_t max)
  * to achieve very close approximate temp value with less than
  * 0.5C error
  */
-static int adc_to_temp(int direct, uint16_t adc_val, unsigned long *tp)
+static int adc_to_temp(int direct, uint16_t adc_val, int *tp)
 {
 	int temp;
 
@@ -174,14 +174,13 @@ static int adc_to_temp(int direct, uint16_t adc_val, unsigned long *tp)
  *
  * Can sleep
  */
-static int mid_read_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int mid_read_temp(struct thermal_zone_device *tzd, int *temp)
 {
 	struct thermal_device_info *td_info = tzd->devdata;
 	uint16_t adc_val, addr;
 	uint8_t data = 0;
 	int ret;
-	unsigned long curr_temp;
-
+	int curr_temp;
 
 	addr = td_info->chnl_addr;
 
@@ -453,7 +452,7 @@ static SIMPLE_DEV_PM_OPS(mid_thermal_pm,
  *
  * Can sleep
  */
-static int read_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int read_curr_temp(struct thermal_zone_device *tzd, int *temp)
 {
 	WARN_ON(tzd == NULL);
 	return mid_read_temp(tzd, temp);
diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c
index 1c202ccbd2a6..907293e6f2a4 100644
--- a/drivers/power/charger-manager.c
+++ b/drivers/power/charger-manager.c
@@ -619,7 +619,7 @@ static int cm_get_battery_temperature(struct charger_manager *cm,
 
 #ifdef CONFIG_THERMAL
 	if (cm->tzd_batt) {
-		ret = thermal_zone_get_temp(cm->tzd_batt, (unsigned long *)temp);
+		ret = thermal_zone_get_temp(cm->tzd_batt, temp);
 		if (!ret)
 			/* Calibrate temperature unit */
 			*temp /= 100;
diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c
index 869284c2e1e8..456987c88baa 100644
--- a/drivers/power/power_supply_core.c
+++ b/drivers/power/power_supply_core.c
@@ -557,7 +557,7 @@ EXPORT_SYMBOL_GPL(power_supply_unreg_notifier);
 
 #ifdef CONFIG_THERMAL
 static int power_supply_read_temp(struct thermal_zone_device *tzd,
-		unsigned long *temp)
+		int *temp)
 {
 	struct power_supply *psy;
 	union power_supply_propval val;
diff --git a/drivers/thermal/armada_thermal.c b/drivers/thermal/armada_thermal.c
index 01255fd65135..26b8d326546a 100644
--- a/drivers/thermal/armada_thermal.c
+++ b/drivers/thermal/armada_thermal.c
@@ -155,7 +155,7 @@ static bool armada_is_valid(struct armada_thermal_priv *priv)
 }
 
 static int armada_get_temp(struct thermal_zone_device *thermal,
-			  unsigned long *temp)
+			  int *temp)
 {
 	struct armada_thermal_priv *priv = thermal->devdata;
 	unsigned long reg;
diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index 2fb273c4baa9..652acd8fbe48 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -107,8 +107,7 @@ static int db8500_cdev_unbind(struct thermal_zone_device *thermal,
 }
 
 /* Callback to get current temperature */
-static int db8500_sys_get_temp(struct thermal_zone_device *thermal,
-		unsigned long *temp)
+static int db8500_sys_get_temp(struct thermal_zone_device *thermal, int *temp)
 {
 	struct db8500_thermal_zone *pzone = thermal->devdata;
 
@@ -180,7 +179,7 @@ static int db8500_sys_get_trip_type(struct thermal_zone_device *thermal,
 
 /* Callback to get trip point temperature */
 static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal,
-		int trip, unsigned long *temp)
+		int trip, int *temp)
 {
 	struct db8500_thermal_zone *pzone = thermal->devdata;
 	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
@@ -195,7 +194,7 @@ static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal,
 
 /* Callback to get critical trip point temperature */
 static int db8500_sys_get_crit_temp(struct thermal_zone_device *thermal,
-		unsigned long *temp)
+		int *temp)
 {
 	struct db8500_thermal_zone *pzone = thermal->devdata;
 	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
diff --git a/drivers/thermal/dove_thermal.c b/drivers/thermal/dove_thermal.c
index 09f6e304c274..a0bc9de42553 100644
--- a/drivers/thermal/dove_thermal.c
+++ b/drivers/thermal/dove_thermal.c
@@ -93,7 +93,7 @@ static int dove_init_sensor(const struct dove_thermal_priv *priv)
 }
 
 static int dove_get_temp(struct thermal_zone_device *thermal,
-			  unsigned long *temp)
+			  int *temp)
 {
 	unsigned long reg;
 	struct dove_thermal_priv *priv = thermal->devdata;
diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c
index c2c10bbe24d6..34fe36504a55 100644
--- a/drivers/thermal/fair_share.c
+++ b/drivers/thermal/fair_share.c
@@ -34,7 +34,7 @@
 static int get_trip_level(struct thermal_zone_device *tz)
 {
 	int count = 0;
-	unsigned long trip_temp;
+	int trip_temp;
 	enum thermal_trip_type trip_type;
 
 	if (tz->trips == 0 || !tz->ops->get_trip_temp)
diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
index c5dd76b2ee74..70836c5b89bc 100644
--- a/drivers/thermal/gov_bang_bang.c
+++ b/drivers/thermal/gov_bang_bang.c
@@ -25,14 +25,13 @@
 
 static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 {
-	long trip_temp;
-	unsigned long trip_hyst;
+	int trip_temp, trip_hyst;
 	struct thermal_instance *instance;
 
 	tz->ops->get_trip_temp(tz, trip, &trip_temp);
 	tz->ops->get_trip_hyst(tz, trip, &trip_hyst);
 
-	dev_dbg(&tz->device, "Trip%d[temp=%ld]:temp=%d:hyst=%ld\n",
+	dev_dbg(&tz->device, "Trip%d[temp=%d]:temp=%d:hyst=%d\n",
 				trip, trip_temp, tz->temperature,
 				trip_hyst);
 
diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c
index d5dd357ba57c..49aa068d1603 100644
--- a/drivers/thermal/hisi_thermal.c
+++ b/drivers/thermal/hisi_thermal.c
@@ -155,7 +155,7 @@ static void hisi_thermal_disable_sensor(struct hisi_thermal_data *data)
 	mutex_unlock(&data->thermal_lock);
 }
 
-static int hisi_thermal_get_temp(void *_sensor, long *temp)
+static int hisi_thermal_get_temp(void *_sensor, int *temp)
 {
 	struct hisi_thermal_sensor *sensor = _sensor;
 	struct hisi_thermal_data *data = sensor->thermal;
@@ -178,7 +178,7 @@ static int hisi_thermal_get_temp(void *_sensor, long *temp)
 	data->irq_bind_sensor = sensor_id;
 	mutex_unlock(&data->thermal_lock);
 
-	dev_dbg(&data->pdev->dev, "id=%d, irq=%d, temp=%ld, thres=%d\n",
+	dev_dbg(&data->pdev->dev, "id=%d, irq=%d, temp=%d, thres=%d\n",
 		sensor->id, data->irq_enabled, *temp, sensor->thres_temp);
 	/*
 	 * Bind irq to sensor for two cases:
diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c
index fde4c2876d14..4bec1d3c3d27 100644
--- a/drivers/thermal/imx_thermal.c
+++ b/drivers/thermal/imx_thermal.c
@@ -98,10 +98,10 @@ struct imx_thermal_data {
 	enum thermal_device_mode mode;
 	struct regmap *tempmon;
 	u32 c1, c2; /* See formula in imx_get_sensor_data() */
-	unsigned long temp_passive;
-	unsigned long temp_critical;
-	unsigned long alarm_temp;
-	unsigned long last_temp;
+	int temp_passive;
+	int temp_critical;
+	int alarm_temp;
+	int last_temp;
 	bool irq_enabled;
 	int irq;
 	struct clk *thermal_clk;
@@ -109,7 +109,7 @@ struct imx_thermal_data {
 };
 
 static void imx_set_panic_temp(struct imx_thermal_data *data,
-			       signed long panic_temp)
+			       int panic_temp)
 {
 	struct regmap *map = data->tempmon;
 	int critical_value;
@@ -121,7 +121,7 @@ static void imx_set_panic_temp(struct imx_thermal_data *data,
 }
 
 static void imx_set_alarm_temp(struct imx_thermal_data *data,
-			       signed long alarm_temp)
+			       int alarm_temp)
 {
 	struct regmap *map = data->tempmon;
 	int alarm_value;
@@ -133,7 +133,7 @@ static void imx_set_alarm_temp(struct imx_thermal_data *data,
 			TEMPSENSE0_ALARM_VALUE_SHIFT);
 }
 
-static int imx_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
+static int imx_get_temp(struct thermal_zone_device *tz, int *temp)
 {
 	struct imx_thermal_data *data = tz->devdata;
 	struct regmap *map = data->tempmon;
@@ -189,13 +189,13 @@ static int imx_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
 		if (data->alarm_temp == data->temp_critical &&
 			*temp < data->temp_passive) {
 			imx_set_alarm_temp(data, data->temp_passive);
-			dev_dbg(&tz->device, "thermal alarm off: T < %lu\n",
+			dev_dbg(&tz->device, "thermal alarm off: T < %d\n",
 				data->alarm_temp / 1000);
 		}
 	}
 
 	if (*temp != data->last_temp) {
-		dev_dbg(&tz->device, "millicelsius: %ld\n", *temp);
+		dev_dbg(&tz->device, "millicelsius: %d\n", *temp);
 		data->last_temp = *temp;
 	}
 
@@ -262,8 +262,7 @@ static int imx_get_trip_type(struct thermal_zone_device *tz, int trip,
 	return 0;
 }
 
-static int imx_get_crit_temp(struct thermal_zone_device *tz,
-			     unsigned long *temp)
+static int imx_get_crit_temp(struct thermal_zone_device *tz, int *temp)
 {
 	struct imx_thermal_data *data = tz->devdata;
 
@@ -272,7 +271,7 @@ static int imx_get_crit_temp(struct thermal_zone_device *tz,
 }
 
 static int imx_get_trip_temp(struct thermal_zone_device *tz, int trip,
-			     unsigned long *temp)
+			     int *temp)
 {
 	struct imx_thermal_data *data = tz->devdata;
 
@@ -282,7 +281,7 @@ static int imx_get_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int imx_set_trip_temp(struct thermal_zone_device *tz, int trip,
-			     unsigned long temp)
+			     int temp)
 {
 	struct imx_thermal_data *data = tz->devdata;
 
@@ -434,7 +433,7 @@ static irqreturn_t imx_thermal_alarm_irq_thread(int irq, void *dev)
 {
 	struct imx_thermal_data *data = dev;
 
-	dev_dbg(&data->tz->device, "THERMAL ALARM: T > %lu\n",
+	dev_dbg(&data->tz->device, "THERMAL ALARM: T > %d\n",
 		data->alarm_temp / 1000);
 
 	thermal_zone_device_update(data->tz);
diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c
index 031018e7a65b..5836e5554433 100644
--- a/drivers/thermal/int340x_thermal/int3400_thermal.c
+++ b/drivers/thermal/int340x_thermal/int3400_thermal.c
@@ -186,7 +186,7 @@ static int int3400_thermal_run_osc(acpi_handle handle,
 }
 
 static int int3400_thermal_get_temp(struct thermal_zone_device *thermal,
-			unsigned long *temp)
+			int *temp)
 {
 	*temp = 20 * 1000; /* faked temp sensor with 20C */
 	return 0;
diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
index 1e25133d35e2..b9b2666aa94c 100644
--- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
+++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
@@ -20,7 +20,7 @@
 #include "int340x_thermal_zone.h"
 
 static int int340x_thermal_get_zone_temp(struct thermal_zone_device *zone,
-					 unsigned long *temp)
+					 int *temp)
 {
 	struct int34x_thermal_zone *d = zone->devdata;
 	unsigned long long tmp;
@@ -49,7 +49,7 @@ static int int340x_thermal_get_zone_temp(struct thermal_zone_device *zone,
 }
 
 static int int340x_thermal_get_trip_temp(struct thermal_zone_device *zone,
-					 int trip, unsigned long *temp)
+					 int trip, int *temp)
 {
 	struct int34x_thermal_zone *d = zone->devdata;
 	int i;
@@ -114,7 +114,7 @@ static int int340x_thermal_get_trip_type(struct thermal_zone_device *zone,
 }
 
 static int int340x_thermal_set_trip_temp(struct thermal_zone_device *zone,
-				      int trip, unsigned long temp)
+				      int trip, int temp)
 {
 	struct int34x_thermal_zone *d = zone->devdata;
 	acpi_status status;
@@ -136,7 +136,7 @@ static int int340x_thermal_set_trip_temp(struct thermal_zone_device *zone,
 
 
 static int int340x_thermal_get_trip_hyst(struct thermal_zone_device *zone,
-		int trip, unsigned long *temp)
+		int trip, int *temp)
 {
 	struct int34x_thermal_zone *d = zone->devdata;
 	acpi_status status;
@@ -163,7 +163,7 @@ static struct thermal_zone_device_ops int340x_thermal_zone_ops = {
 };
 
 static int int340x_thermal_get_trip_config(acpi_handle handle, char *name,
-				      unsigned long *temp)
+				      int *temp)
 {
 	unsigned long long r;
 	acpi_status status;
diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.h b/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
index 9f38ab72c4bf..aaadf724ff2e 100644
--- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
+++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
@@ -21,7 +21,7 @@
 #define INT340X_THERMAL_MAX_ACT_TRIP_COUNT	10
 
 struct active_trip {
-	unsigned long temp;
+	int temp;
 	int id;
 	bool valid;
 };
@@ -31,11 +31,11 @@ struct int34x_thermal_zone {
 	struct active_trip act_trips[INT340X_THERMAL_MAX_ACT_TRIP_COUNT];
 	unsigned long *aux_trips;
 	int aux_trip_nr;
-	unsigned long psv_temp;
+	int psv_temp;
 	int psv_trip_id;
-	unsigned long crt_temp;
+	int crt_temp;
 	int crt_trip_id;
-	unsigned long hot_temp;
+	int hot_temp;
 	int hot_trip_id;
 	struct thermal_zone_device *zone;
 	struct thermal_zone_device_ops *override_ops;
diff --git a/drivers/thermal/int340x_thermal/processor_thermal_device.c b/drivers/thermal/int340x_thermal/processor_thermal_device.c
index 3df3dc34b124..ccc0ad02d066 100644
--- a/drivers/thermal/int340x_thermal/processor_thermal_device.c
+++ b/drivers/thermal/int340x_thermal/processor_thermal_device.c
@@ -145,7 +145,7 @@ static int get_tjmax(void)
 	return -EINVAL;
 }
 
-static int read_temp_msr(unsigned long *temp)
+static int read_temp_msr(int *temp)
 {
 	int cpu;
 	u32 eax, edx;
@@ -177,7 +177,7 @@ err_ret:
 }
 
 static int proc_thermal_get_zone_temp(struct thermal_zone_device *zone,
-					 unsigned long *temp)
+					 int *temp)
 {
 	int ret;
 
diff --git a/drivers/thermal/intel_quark_dts_thermal.c b/drivers/thermal/intel_quark_dts_thermal.c
index 4434ec812cb7..5ed90e6c8a64 100644
--- a/drivers/thermal/intel_quark_dts_thermal.c
+++ b/drivers/thermal/intel_quark_dts_thermal.c
@@ -186,7 +186,7 @@ static int soc_dts_disable(struct thermal_zone_device *tzd)
 	return ret;
 }
 
-static int _get_trip_temp(int trip, unsigned long *temp)
+static int _get_trip_temp(int trip, int *temp)
 {
 	int status;
 	u32 out;
@@ -212,19 +212,18 @@ static int _get_trip_temp(int trip, unsigned long *temp)
 }
 
 static inline int sys_get_trip_temp(struct thermal_zone_device *tzd,
-				int trip, unsigned long *temp)
+				int trip, int *temp)
 {
 	return _get_trip_temp(trip, temp);
 }
 
-static inline int sys_get_crit_temp(struct thermal_zone_device *tzd,
-				unsigned long *temp)
+static inline int sys_get_crit_temp(struct thermal_zone_device *tzd, int *temp)
 {
 	return _get_trip_temp(QRK_DTS_ID_TP_CRITICAL, temp);
 }
 
 static int update_trip_temp(struct soc_sensor_entry *aux_entry,
-				int trip, unsigned long temp)
+				int trip, int temp)
 {
 	u32 out;
 	u32 temp_out;
@@ -272,7 +271,7 @@ failed:
 }
 
 static inline int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-				unsigned long temp)
+				int temp)
 {
 	return update_trip_temp(tzd->devdata, trip, temp);
 }
@@ -289,7 +288,7 @@ static int sys_get_trip_type(struct thermal_zone_device *thermal,
 }
 
 static int sys_get_curr_temp(struct thermal_zone_device *tzd,
-				unsigned long *temp)
+				int *temp)
 {
 	u32 out;
 	int ret;
diff --git a/drivers/thermal/intel_soc_dts_iosf.c b/drivers/thermal/intel_soc_dts_iosf.c
index 42e4b6ac3875..5841d1d72996 100644
--- a/drivers/thermal/intel_soc_dts_iosf.c
+++ b/drivers/thermal/intel_soc_dts_iosf.c
@@ -80,7 +80,7 @@ err_ret:
 }
 
 static int sys_get_trip_temp(struct thermal_zone_device *tzd, int trip,
-			     unsigned long *temp)
+			     int *temp)
 {
 	int status;
 	u32 out;
@@ -106,7 +106,7 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzd, int trip,
 }
 
 static int update_trip_temp(struct intel_soc_dts_sensor_entry *dts,
-			    int thres_index, unsigned long temp,
+			    int thres_index, int temp,
 			    enum thermal_trip_type trip_type)
 {
 	int status;
@@ -196,7 +196,7 @@ err_restore_ptps:
 }
 
 static int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-			     unsigned long temp)
+			     int temp)
 {
 	struct intel_soc_dts_sensor_entry *dts = tzd->devdata;
 	struct intel_soc_dts_sensors *sensors = dts->sensors;
@@ -226,7 +226,7 @@ static int sys_get_trip_type(struct thermal_zone_device *tzd,
 }
 
 static int sys_get_curr_temp(struct thermal_zone_device *tzd,
-			     unsigned long *temp)
+			     int *temp)
 {
 	int status;
 	u32 out;
diff --git a/drivers/thermal/kirkwood_thermal.c b/drivers/thermal/kirkwood_thermal.c
index 11041fe63dc2..892236621767 100644
--- a/drivers/thermal/kirkwood_thermal.c
+++ b/drivers/thermal/kirkwood_thermal.c
@@ -33,7 +33,7 @@ struct kirkwood_thermal_priv {
 };
 
 static int kirkwood_get_temp(struct thermal_zone_device *thermal,
-			  unsigned long *temp)
+			  int *temp)
 {
 	unsigned long reg;
 	struct kirkwood_thermal_priv *priv = thermal->devdata;
diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index b295b2b6c191..42b7d4253b94 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -91,7 +91,7 @@ struct __thermal_zone {
 /***   DT thermal zone device callbacks   ***/
 
 static int of_thermal_get_temp(struct thermal_zone_device *tz,
-			       unsigned long *temp)
+			       int *temp)
 {
 	struct __thermal_zone *data = tz->devdata;
 
@@ -177,7 +177,7 @@ EXPORT_SYMBOL_GPL(of_thermal_get_trip_points);
  * Return: zero on success, error code otherwise
  */
 static int of_thermal_set_emul_temp(struct thermal_zone_device *tz,
-				    unsigned long temp)
+				    int temp)
 {
 	struct __thermal_zone *data = tz->devdata;
 
@@ -311,7 +311,7 @@ static int of_thermal_get_trip_type(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip,
-				    unsigned long *temp)
+				    int *temp)
 {
 	struct __thermal_zone *data = tz->devdata;
 
@@ -324,7 +324,7 @@ static int of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
-				    unsigned long temp)
+				    int temp)
 {
 	struct __thermal_zone *data = tz->devdata;
 
@@ -338,7 +338,7 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip,
-				    unsigned long *hyst)
+				    int *hyst)
 {
 	struct __thermal_zone *data = tz->devdata;
 
@@ -351,7 +351,7 @@ static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip,
-				    unsigned long hyst)
+				    int hyst)
 {
 	struct __thermal_zone *data = tz->devdata;
 
@@ -365,7 +365,7 @@ static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_crit_temp(struct thermal_zone_device *tz,
-				    unsigned long *temp)
+				    int *temp)
 {
 	struct __thermal_zone *data = tz->devdata;
 	int i;
diff --git a/drivers/thermal/power_allocator.c b/drivers/thermal/power_allocator.c
index 4672250b329f..045aea59ce9d 100644
--- a/drivers/thermal/power_allocator.c
+++ b/drivers/thermal/power_allocator.c
@@ -92,8 +92,8 @@ struct power_allocator_params {
  * Return: The power budget for the next period.
  */
 static u32 pid_controller(struct thermal_zone_device *tz,
-			  unsigned long current_temp,
-			  unsigned long control_temp,
+			  int current_temp,
+			  int control_temp,
 			  u32 max_allocatable_power)
 {
 	s64 p, i, d, power_range;
@@ -102,7 +102,7 @@ static u32 pid_controller(struct thermal_zone_device *tz,
 
 	max_power_frac = int_to_frac(max_allocatable_power);
 
-	err = ((s32)control_temp - (s32)current_temp);
+	err = control_temp - current_temp;
 	err = int_to_frac(err);
 
 	/* Calculate the proportional term */
@@ -223,8 +223,8 @@ static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
 }
 
 static int allocate_power(struct thermal_zone_device *tz,
-			  unsigned long current_temp,
-			  unsigned long control_temp)
+			  int current_temp,
+			  int control_temp)
 {
 	struct thermal_instance *instance;
 	struct power_allocator_params *params = tz->governor_data;
@@ -326,7 +326,7 @@ static int allocate_power(struct thermal_zone_device *tz,
 				      granted_power, total_granted_power,
 				      num_actors, power_range,
 				      max_allocatable_power, current_temp,
-				      (s32)control_temp - (s32)current_temp);
+				      control_temp - current_temp);
 
 	devm_kfree(&tz->device, req_power);
 unlock:
@@ -411,7 +411,7 @@ static int power_allocator_bind(struct thermal_zone_device *tz)
 {
 	int ret;
 	struct power_allocator_params *params;
-	unsigned long switch_on_temp, control_temp;
+	int switch_on_temp, control_temp;
 	u32 temperature_threshold;
 
 	if (!tz->tzp || !tz->tzp->sustainable_power) {
@@ -476,7 +476,7 @@ static void power_allocator_unbind(struct thermal_zone_device *tz)
 static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 {
 	int ret;
-	unsigned long switch_on_temp, control_temp, current_temp;
+	int switch_on_temp, control_temp, current_temp;
 	struct power_allocator_params *params = tz->governor_data;
 
 	/*
diff --git a/drivers/thermal/qcom-spmi-temp-alarm.c b/drivers/thermal/qcom-spmi-temp-alarm.c
index c8d27b8fb9ec..b677aada5b52 100644
--- a/drivers/thermal/qcom-spmi-temp-alarm.c
+++ b/drivers/thermal/qcom-spmi-temp-alarm.c
@@ -117,7 +117,7 @@ static int qpnp_tm_update_temp_no_adc(struct qpnp_tm_chip *chip)
 	return 0;
 }
 
-static int qpnp_tm_get_temp(void *data, long *temp)
+static int qpnp_tm_get_temp(void *data, int *temp)
 {
 	struct qpnp_tm_chip *chip = data;
 	int ret, mili_celsius;
diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c
index fe4e767018c4..5d4ae7d705e0 100644
--- a/drivers/thermal/rcar_thermal.c
+++ b/drivers/thermal/rcar_thermal.c
@@ -200,8 +200,7 @@ err_out_unlock:
 	return ret;
 }
 
-static int rcar_thermal_get_temp(struct thermal_zone_device *zone,
-				 unsigned long *temp)
+static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
 {
 	struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
 
@@ -235,7 +234,7 @@ static int rcar_thermal_get_trip_type(struct thermal_zone_device *zone,
 }
 
 static int rcar_thermal_get_trip_temp(struct thermal_zone_device *zone,
-				      int trip, unsigned long *temp)
+				      int trip, int *temp)
 {
 	struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
 	struct device *dev = rcar_priv_to_dev(priv);
@@ -299,7 +298,7 @@ static void _rcar_thermal_irq_ctrl(struct rcar_thermal_priv *priv, int enable)
 static void rcar_thermal_work(struct work_struct *work)
 {
 	struct rcar_thermal_priv *priv;
-	unsigned long cctemp, nctemp;
+	int cctemp, nctemp;
 
 	priv = container_of(work, struct rcar_thermal_priv, work.work);
 
diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c
index cd8f5f93b42c..c89ffb26a354 100644
--- a/drivers/thermal/rockchip_thermal.c
+++ b/drivers/thermal/rockchip_thermal.c
@@ -64,7 +64,7 @@ struct rockchip_tsadc_chip {
 	void (*control)(void __iomem *reg, bool on);
 
 	/* Per-sensor methods */
-	int (*get_temp)(int chn, void __iomem *reg, long *temp);
+	int (*get_temp)(int chn, void __iomem *reg, int *temp);
 	void (*set_tshut_temp)(int chn, void __iomem *reg, long temp);
 	void (*set_tshut_mode)(int chn, void __iomem *reg, enum tshut_mode m);
 };
@@ -191,7 +191,7 @@ static u32 rk_tsadcv2_temp_to_code(long temp)
 	return 0;
 }
 
-static long rk_tsadcv2_code_to_temp(u32 code)
+static int rk_tsadcv2_code_to_temp(u32 code)
 {
 	unsigned int low = 0;
 	unsigned int high = ARRAY_SIZE(v2_code_table) - 1;
@@ -277,7 +277,7 @@ static void rk_tsadcv2_control(void __iomem *regs, bool enable)
 	writel_relaxed(val, regs + TSADCV2_AUTO_CON);
 }
 
-static int rk_tsadcv2_get_temp(int chn, void __iomem *regs, long *temp)
+static int rk_tsadcv2_get_temp(int chn, void __iomem *regs, int *temp)
 {
 	u32 val;
 
@@ -366,7 +366,7 @@ static irqreturn_t rockchip_thermal_alarm_irq_thread(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
-static int rockchip_thermal_get_temp(void *_sensor, long *out_temp)
+static int rockchip_thermal_get_temp(void *_sensor, int *out_temp)
 {
 	struct rockchip_thermal_sensor *sensor = _sensor;
 	struct rockchip_thermal_data *thermal = sensor->thermal;
@@ -374,7 +374,7 @@ static int rockchip_thermal_get_temp(void *_sensor, long *out_temp)
 	int retval;
 
 	retval = tsadc->get_temp(sensor->id, thermal->regs, out_temp);
-	dev_dbg(&thermal->pdev->dev, "sensor %d - temp: %ld, retval: %d\n",
+	dev_dbg(&thermal->pdev->dev, "sensor %d - temp: %d, retval: %d\n",
 		sensor->id, *out_temp, retval);
 
 	return retval;
diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index 531f4b179871..9ec29a33aeea 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -207,8 +207,7 @@ struct exynos_tmu_data {
 	int (*tmu_initialize)(struct platform_device *pdev);
 	void (*tmu_control)(struct platform_device *pdev, bool on);
 	int (*tmu_read)(struct exynos_tmu_data *data);
-	void (*tmu_set_emulation)(struct exynos_tmu_data *data,
-				  unsigned long temp);
+	void (*tmu_set_emulation)(struct exynos_tmu_data *data, int temp);
 	void (*tmu_clear_irqs)(struct exynos_tmu_data *data);
 };
 
@@ -216,7 +215,7 @@ static void exynos_report_trigger(struct exynos_tmu_data *p)
 {
 	char data[10], *envp[] = { data, NULL };
 	struct thermal_zone_device *tz = p->tzd;
-	unsigned long temp;
+	int temp;
 	unsigned int i;
 
 	if (!tz) {
@@ -517,7 +516,7 @@ static int exynos5433_tmu_initialize(struct platform_device *pdev)
 	struct thermal_zone_device *tz = data->tzd;
 	unsigned int status, trim_info;
 	unsigned int rising_threshold = 0, falling_threshold = 0;
-	unsigned long temp, temp_hist;
+	int temp, temp_hist;
 	int ret = 0, threshold_code, i, sensor_id, cal_type;
 
 	status = readb(data->base + EXYNOS_TMU_REG_STATUS);
@@ -610,7 +609,7 @@ static int exynos5440_tmu_initialize(struct platform_device *pdev)
 	struct exynos_tmu_data *data = platform_get_drvdata(pdev);
 	unsigned int trim_info = 0, con, rising_threshold;
 	int ret = 0, threshold_code;
-	unsigned long crit_temp = 0;
+	int crit_temp = 0;
 
 	/*
 	 * For exynos5440 soc triminfo value is swapped between TMU0 and
@@ -663,7 +662,7 @@ static int exynos7_tmu_initialize(struct platform_device *pdev)
 	unsigned int status, trim_info;
 	unsigned int rising_threshold = 0, falling_threshold = 0;
 	int ret = 0, threshold_code, i;
-	unsigned long temp, temp_hist;
+	int temp, temp_hist;
 	unsigned int reg_off, bit_off;
 
 	status = readb(data->base + EXYNOS_TMU_REG_STATUS);
@@ -876,7 +875,7 @@ static void exynos7_tmu_control(struct platform_device *pdev, bool on)
 	writel(con, data->base + EXYNOS_TMU_REG_CONTROL);
 }
 
-static int exynos_get_temp(void *p, long *temp)
+static int exynos_get_temp(void *p, int *temp)
 {
 	struct exynos_tmu_data *data = p;
 
@@ -896,7 +895,7 @@ static int exynos_get_temp(void *p, long *temp)
 
 #ifdef CONFIG_THERMAL_EMULATION
 static u32 get_emul_con_reg(struct exynos_tmu_data *data, unsigned int val,
-			    unsigned long temp)
+			    int temp)
 {
 	if (temp) {
 		temp /= MCELSIUS;
@@ -926,7 +925,7 @@ static u32 get_emul_con_reg(struct exynos_tmu_data *data, unsigned int val,
 }
 
 static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data,
-					 unsigned long temp)
+					 int temp)
 {
 	unsigned int val;
 	u32 emul_con;
@@ -946,7 +945,7 @@ static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data,
 }
 
 static void exynos5440_tmu_set_emulation(struct exynos_tmu_data *data,
-					 unsigned long temp)
+					 int temp)
 {
 	unsigned int val;
 
@@ -955,7 +954,7 @@ static void exynos5440_tmu_set_emulation(struct exynos_tmu_data *data,
 	writel(val, data->base + EXYNOS5440_TMU_S0_7_DEBUG);
 }
 
-static int exynos_tmu_set_emulation(void *drv_data, unsigned long temp)
+static int exynos_tmu_set_emulation(void *drv_data, int temp)
 {
 	struct exynos_tmu_data *data = drv_data;
 	int ret = -EINVAL;
@@ -978,7 +977,7 @@ out:
 #else
 #define exynos4412_tmu_set_emulation NULL
 #define exynos5440_tmu_set_emulation NULL
-static int exynos_tmu_set_emulation(void *drv_data,	unsigned long temp)
+static int exynos_tmu_set_emulation(void *drv_data, int temp)
 	{ return -EINVAL; }
 #endif /* CONFIG_THERMAL_EMULATION */
 
diff --git a/drivers/thermal/spear_thermal.c b/drivers/thermal/spear_thermal.c
index bddb71744a6c..534dd9136662 100644
--- a/drivers/thermal/spear_thermal.c
+++ b/drivers/thermal/spear_thermal.c
@@ -38,7 +38,7 @@ struct spear_thermal_dev {
 };
 
 static inline int thermal_get_temp(struct thermal_zone_device *thermal,
-				unsigned long *temp)
+				int *temp)
 {
 	struct spear_thermal_dev *stdev = thermal->devdata;
 
diff --git a/drivers/thermal/st/st_thermal.c b/drivers/thermal/st/st_thermal.c
index 76c515dd802b..44cbba99716a 100644
--- a/drivers/thermal/st/st_thermal.c
+++ b/drivers/thermal/st/st_thermal.c
@@ -111,8 +111,7 @@ static int st_thermal_calibration(struct st_thermal_sensor *sensor)
 }
 
 /* Callback to get temperature from HW*/
-static int st_thermal_get_temp(struct thermal_zone_device *th,
-		unsigned long *temperature)
+static int st_thermal_get_temp(struct thermal_zone_device *th, int *temperature)
 {
 	struct st_thermal_sensor *sensor = th->devdata;
 	struct device *dev = sensor->dev;
@@ -159,7 +158,7 @@ static int st_thermal_get_trip_type(struct thermal_zone_device *th,
 }
 
 static int st_thermal_get_trip_temp(struct thermal_zone_device *th,
-				    int trip, unsigned long *temp)
+				    int trip, int *temp)
 {
 	struct st_thermal_sensor *sensor = th->devdata;
 	struct device *dev = sensor->dev;
diff --git a/drivers/thermal/step_wise.c b/drivers/thermal/step_wise.c
index 5a0f12d08e8b..2f9f7086ac3d 100644
--- a/drivers/thermal/step_wise.c
+++ b/drivers/thermal/step_wise.c
@@ -113,7 +113,7 @@ static void update_passive_instance(struct thermal_zone_device *tz,
 
 static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 {
-	long trip_temp;
+	int trip_temp;
 	enum thermal_trip_type trip_type;
 	enum thermal_trend trend;
 	struct thermal_instance *instance;
@@ -135,7 +135,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 		trace_thermal_zone_trip(tz, trip, trip_type);
 	}
 
-	dev_dbg(&tz->device, "Trip%d[type=%d,temp=%ld]:trend=%d,throttle=%d\n",
+	dev_dbg(&tz->device, "Trip%d[type=%d,temp=%d]:trend=%d,throttle=%d\n",
 				trip, trip_type, trip_temp, trend, throttle);
 
 	mutex_lock(&tz->lock);
diff --git a/drivers/thermal/tegra_soctherm.c b/drivers/thermal/tegra_soctherm.c
index 9197fc05c5cc..74ea5765938b 100644
--- a/drivers/thermal/tegra_soctherm.c
+++ b/drivers/thermal/tegra_soctherm.c
@@ -293,7 +293,7 @@ static int enable_tsensor(struct tegra_soctherm *tegra,
  * H denotes an addition of 0.5 Celsius and N denotes negation
  * of the final value.
  */
-static long translate_temp(u16 val)
+static int translate_temp(u16 val)
 {
 	long t;
 
@@ -306,7 +306,7 @@ static long translate_temp(u16 val)
 	return t;
 }
 
-static int tegra_thermctl_get_temp(void *data, long *out_temp)
+static int tegra_thermctl_get_temp(void *data, int *out_temp)
 {
 	struct tegra_thermctl_zone *zone = data;
 	u32 val;
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index c4700950e42e..387c4287fc74 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -426,7 +426,7 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz,
 static void handle_critical_trips(struct thermal_zone_device *tz,
 				int trip, enum thermal_trip_type trip_type)
 {
-	long trip_temp;
+	int trip_temp;
 
 	tz->ops->get_trip_temp(tz, trip, &trip_temp);
 
@@ -474,12 +474,12 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
  *
  * Return: On success returns 0, an error code otherwise
  */
-int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
+int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 {
 	int ret = -EINVAL;
 #ifdef CONFIG_THERMAL_EMULATION
 	int count;
-	unsigned long crit_temp = -1UL;
+	int crit_temp = INT_MAX;
 	enum thermal_trip_type type;
 #endif
 
@@ -516,8 +516,7 @@ EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
 static void update_temperature(struct thermal_zone_device *tz)
 {
-	long temp;
-	int ret;
+	int temp, ret;
 
 	ret = thermal_zone_get_temp(tz, &temp);
 	if (ret) {
@@ -577,15 +576,14 @@ static ssize_t
 temp_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct thermal_zone_device *tz = to_thermal_zone(dev);
-	long temperature;
-	int ret;
+	int temperature, ret;
 
 	ret = thermal_zone_get_temp(tz, &temperature);
 
 	if (ret)
 		return ret;
 
-	return sprintf(buf, "%ld\n", temperature);
+	return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -689,7 +687,7 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr,
 {
 	struct thermal_zone_device *tz = to_thermal_zone(dev);
 	int trip, ret;
-	long temperature;
+	int temperature;
 
 	if (!tz->ops->get_trip_temp)
 		return -EPERM;
@@ -702,7 +700,7 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr,
 	if (ret)
 		return ret;
 
-	return sprintf(buf, "%ld\n", temperature);
+	return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -711,7 +709,7 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
 {
 	struct thermal_zone_device *tz = to_thermal_zone(dev);
 	int trip, ret;
-	unsigned long temperature;
+	int temperature;
 
 	if (!tz->ops->set_trip_hyst)
 		return -EPERM;
@@ -719,7 +717,7 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
 	if (!sscanf(attr->attr.name, "trip_point_%d_hyst", &trip))
 		return -EINVAL;
 
-	if (kstrtoul(buf, 10, &temperature))
+	if (kstrtoint(buf, 10, &temperature))
 		return -EINVAL;
 
 	/*
@@ -738,7 +736,7 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
 {
 	struct thermal_zone_device *tz = to_thermal_zone(dev);
 	int trip, ret;
-	unsigned long temperature;
+	int temperature;
 
 	if (!tz->ops->get_trip_hyst)
 		return -EPERM;
@@ -748,7 +746,7 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
 
 	ret = tz->ops->get_trip_hyst(tz, trip, &temperature);
 
-	return ret ? ret : sprintf(buf, "%ld\n", temperature);
+	return ret ? ret : sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
diff --git a/drivers/thermal/thermal_hwmon.c b/drivers/thermal/thermal_hwmon.c
index 1967bee4f076..06fd2ed9ef9d 100644
--- a/drivers/thermal/thermal_hwmon.c
+++ b/drivers/thermal/thermal_hwmon.c
@@ -69,7 +69,7 @@ static DEVICE_ATTR(name, 0444, name_show, NULL);
 static ssize_t
 temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	long temperature;
+	int temperature;
 	int ret;
 	struct thermal_hwmon_attr *hwmon_attr
 			= container_of(attr, struct thermal_hwmon_attr, attr);
@@ -83,7 +83,7 @@ temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
 	if (ret)
 		return ret;
 
-	return sprintf(buf, "%ld\n", temperature);
+	return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -95,14 +95,14 @@ temp_crit_show(struct device *dev, struct device_attribute *attr, char *buf)
 			= container_of(hwmon_attr, struct thermal_hwmon_temp,
 				       temp_crit);
 	struct thermal_zone_device *tz = temp->tz;
-	long temperature;
+	int temperature;
 	int ret;
 
 	ret = tz->ops->get_trip_temp(tz, 0, &temperature);
 	if (ret)
 		return ret;
 
-	return sprintf(buf, "%ld\n", temperature);
+	return sprintf(buf, "%d\n", temperature);
 }
 
 
@@ -142,7 +142,7 @@ thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon,
 
 static bool thermal_zone_crit_temp_valid(struct thermal_zone_device *tz)
 {
-	unsigned long temp;
+	int temp;
 	return tz->ops->get_crit_temp && !tz->ops->get_crit_temp(tz, &temp);
 }
 
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index c7c5b3779dac..b213a1222295 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -76,14 +76,14 @@ static inline int ti_thermal_hotspot_temperature(int t, int s, int c)
 
 /* thermal zone ops */
 /* Get temperature callback function for thermal zone */
-static inline int __ti_thermal_get_temp(void *devdata, long *temp)
+static inline int __ti_thermal_get_temp(void *devdata, int *temp)
 {
 	struct thermal_zone_device *pcb_tz = NULL;
 	struct ti_thermal_data *data = devdata;
 	struct ti_bandgap *bgp;
 	const struct ti_temp_sensor *s;
 	int ret, tmp, slope, constant;
-	unsigned long pcb_temp;
+	int pcb_temp;
 
 	if (!data)
 		return 0;
@@ -119,7 +119,7 @@ static inline int __ti_thermal_get_temp(void *devdata, long *temp)
 }
 
 static inline int ti_thermal_get_temp(struct thermal_zone_device *thermal,
-				      unsigned long *temp)
+				      int *temp)
 {
 	struct ti_thermal_data *data = thermal->devdata;
 
@@ -229,7 +229,7 @@ static int ti_thermal_get_trip_type(struct thermal_zone_device *thermal,
 
 /* Get trip temperature callback functions for thermal zone */
 static int ti_thermal_get_trip_temp(struct thermal_zone_device *thermal,
-				    int trip, unsigned long *temp)
+				    int trip, int *temp)
 {
 	if (!ti_thermal_is_valid_trip(trip))
 		return -EINVAL;
@@ -280,7 +280,7 @@ static int ti_thermal_get_trend(struct thermal_zone_device *thermal,
 
 /* Get critical temperature callback functions for thermal zone */
 static int ti_thermal_get_crit_temp(struct thermal_zone_device *thermal,
-				    unsigned long *temp)
+				    int *temp)
 {
 	/* shutdown zone */
 	return ti_thermal_get_trip_temp(thermal, OMAP_TRIP_NUMBER - 1, temp);
diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
index 50d1d2cb091a..7fc919f7da4d 100644
--- a/drivers/thermal/x86_pkg_temp_thermal.c
+++ b/drivers/thermal/x86_pkg_temp_thermal.c
@@ -164,7 +164,7 @@ err_ret:
 	return err;
 }
 
-static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
 {
 	u32 eax, edx;
 	struct phy_dev_entry *phy_dev_entry;
@@ -175,7 +175,7 @@ static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *tem
 	if (eax & 0x80000000) {
 		*temp = phy_dev_entry->tj_max -
 				((eax >> 16) & 0x7f) * 1000;
-		pr_debug("sys_get_curr_temp %ld\n", *temp);
+		pr_debug("sys_get_curr_temp %d\n", *temp);
 		return 0;
 	}
 
@@ -183,7 +183,7 @@ static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *tem
 }
 
 static int sys_get_trip_temp(struct thermal_zone_device *tzd,
-		int trip, unsigned long *temp)
+		int trip, int *temp)
 {
 	u32 eax, edx;
 	struct phy_dev_entry *phy_dev_entry;
@@ -214,13 +214,13 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzd,
 		*temp = phy_dev_entry->tj_max - thres_reg_value * 1000;
 	else
 		*temp = 0;
-	pr_debug("sys_get_trip_temp %ld\n", *temp);
+	pr_debug("sys_get_trip_temp %d\n", *temp);
 
 	return 0;
 }
 
 static int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-							unsigned long temp)
+							int temp)
 {
 	u32 l, h;
 	struct phy_dev_entry *phy_dev_entry;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 037e9df2f610..17292fee8686 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -92,23 +92,19 @@ struct thermal_zone_device_ops {
 		     struct thermal_cooling_device *);
 	int (*unbind) (struct thermal_zone_device *,
 		       struct thermal_cooling_device *);
-	int (*get_temp) (struct thermal_zone_device *, unsigned long *);
+	int (*get_temp) (struct thermal_zone_device *, int *);
 	int (*get_mode) (struct thermal_zone_device *,
 			 enum thermal_device_mode *);
 	int (*set_mode) (struct thermal_zone_device *,
 		enum thermal_device_mode);
 	int (*get_trip_type) (struct thermal_zone_device *, int,
 		enum thermal_trip_type *);
-	int (*get_trip_temp) (struct thermal_zone_device *, int,
-			      unsigned long *);
-	int (*set_trip_temp) (struct thermal_zone_device *, int,
-			      unsigned long);
-	int (*get_trip_hyst) (struct thermal_zone_device *, int,
-			      unsigned long *);
-	int (*set_trip_hyst) (struct thermal_zone_device *, int,
-			      unsigned long);
-	int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *);
-	int (*set_emul_temp) (struct thermal_zone_device *, unsigned long);
+	int (*get_trip_temp) (struct thermal_zone_device *, int, int *);
+	int (*set_trip_temp) (struct thermal_zone_device *, int, int);
+	int (*get_trip_hyst) (struct thermal_zone_device *, int, int *);
+	int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
+	int (*get_crit_temp) (struct thermal_zone_device *, int *);
+	int (*set_emul_temp) (struct thermal_zone_device *, int);
 	int (*get_trend) (struct thermal_zone_device *, int,
 			  enum thermal_trend *);
 	int (*notify) (struct thermal_zone_device *, int,
@@ -332,9 +328,9 @@ struct thermal_genl_event {
  *		   temperature.
  */
 struct thermal_zone_of_device_ops {
-	int (*get_temp)(void *, long *);
+	int (*get_temp)(void *, int *);
 	int (*get_trend)(void *, long *);
-	int (*set_emul_temp)(void *, unsigned long);
+	int (*set_emul_temp)(void *, int);
 };
 
 /**
@@ -406,7 +402,7 @@ thermal_of_cooling_device_register(struct device_node *np, char *, void *,
 				   const struct thermal_cooling_device_ops *);
 void thermal_cooling_device_unregister(struct thermal_cooling_device *);
 struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name);
-int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp);
+int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
 
 int get_tz_trend(struct thermal_zone_device *, int);
 struct thermal_instance *get_thermal_instance(struct thermal_zone_device *,
@@ -457,7 +453,7 @@ static inline struct thermal_zone_device *thermal_zone_get_zone_by_name(
 		const char *name)
 { return ERR_PTR(-ENODEV); }
 static inline int thermal_zone_get_temp(
-		struct thermal_zone_device *tz, unsigned long *temp)
+		struct thermal_zone_device *tz, int *temp)
 { return -ENODEV; }
 static inline int get_tz_trend(struct thermal_zone_device *tz, int trip)
 { return -ENODEV; }
diff --git a/include/trace/events/thermal_power_allocator.h b/include/trace/events/thermal_power_allocator.h
index 12e1321c4e0c..5afae8fe3795 100644
--- a/include/trace/events/thermal_power_allocator.h
+++ b/include/trace/events/thermal_power_allocator.h
@@ -11,7 +11,7 @@ TRACE_EVENT(thermal_power_allocator,
 		 u32 total_req_power, u32 *granted_power,
 		 u32 total_granted_power, size_t num_actors,
 		 u32 power_range, u32 max_allocatable_power,
-		 unsigned long current_temp, s32 delta_temp),
+		 int current_temp, s32 delta_temp),
 	TP_ARGS(tz, req_power, total_req_power, granted_power,
 		total_granted_power, num_actors, power_range,
 		max_allocatable_power, current_temp, delta_temp),
@@ -24,7 +24,7 @@ TRACE_EVENT(thermal_power_allocator,
 		__field(size_t,        num_actors               )
 		__field(u32,           power_range              )
 		__field(u32,           max_allocatable_power    )
-		__field(unsigned long, current_temp             )
+		__field(int,           current_temp             )
 		__field(s32,           delta_temp               )
 	),
 	TP_fast_assign(
@@ -42,7 +42,7 @@ TRACE_EVENT(thermal_power_allocator,
 		__entry->delta_temp = delta_temp;
 	),
 
-	TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%lu delta_temperature=%d",
+	TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%d delta_temperature=%d",
 		__entry->tz_id,
 		__print_array(__get_dynamic_array(req_power),
                               __entry->num_actors, 4),
-- 
cgit v1.2.3-70-g09d2


From 84ded2f8e7dda336fc2fb3570726ceb3b3b3590f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 3 Aug 2015 11:45:34 -0400
Subject: Revert "libata: Implement support for sense data reporting"

This reverts commit fe7173c206de63fc28475ee6ae42ff95c05692de.

As implemented, ACS-4 sense reporting for ATA devices bypasses error
diagnosis and handling in libata degrading EH behavior significantly.
Revert the related changes for now.

ATA_ID_COMMAND_SET_3/4 constants are not reverted as they're used by
later changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Hannes Reinecke <hare@suse.de>
Cc: stable@vger.kernel.org #v4.1+
---
 drivers/ata/libata-core.c | 20 +----------
 drivers/ata/libata-eh.c   | 86 ++---------------------------------------------
 include/linux/ata.h       | 16 ---------
 3 files changed, 4 insertions(+), 118 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 426bc12459de..19bcb80b2031 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2147,24 +2147,6 @@ static int ata_dev_config_ncq(struct ata_device *dev,
 	return 0;
 }
 
-static void ata_dev_config_sense_reporting(struct ata_device *dev)
-{
-	unsigned int err_mask;
-
-	if (!ata_id_has_sense_reporting(dev->id))
-		return;
-
-	if (ata_id_sense_reporting_enabled(dev->id))
-		return;
-
-	err_mask = ata_dev_set_feature(dev, SETFEATURE_SENSE_DATA, 0x1);
-	if (err_mask) {
-		ata_dev_dbg(dev,
-			    "failed to enable Sense Data Reporting, Emask 0x%x\n",
-			    err_mask);
-	}
-}
-
 /**
  *	ata_dev_configure - Configure the specified ATA/ATAPI device
  *	@dev: Target device to configure
@@ -2387,7 +2369,7 @@ int ata_dev_configure(struct ata_device *dev)
 					dev->devslp_timing[i] = sata_setting[j];
 				}
 		}
-		ata_dev_config_sense_reporting(dev);
+
 		dev->cdb_len = 16;
 	}
 
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index af08d32af4e0..16125be34893 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1629,70 +1629,6 @@ unsigned int atapi_eh_tur(struct ata_device *dev, u8 *r_sense_key)
 	return err_mask;
 }
 
-/**
- *	ata_eh_request_sense - perform REQUEST_SENSE_DATA_EXT
- *	@dev: device to perform REQUEST_SENSE_SENSE_DATA_EXT to
- *	@sense_buf: result sense data buffer (SCSI_SENSE_BUFFERSIZE bytes long)
- *	@dfl_sense_key: default sense key to use
- *
- *	Perform REQUEST_SENSE_DATA_EXT after the device reported CHECK
- *	SENSE.  This function is EH helper.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).
- *
- *	RETURNS:
- *	encoded sense data on success, 0 on failure or if sense data
- *	is not available.
- */
-static u32 ata_eh_request_sense(struct ata_queued_cmd *qc,
-				struct scsi_cmnd *cmd)
-{
-	struct ata_device *dev = qc->dev;
-	struct ata_taskfile tf;
-	unsigned int err_mask;
-
-	if (!cmd)
-		return 0;
-
-	DPRINTK("ATA request sense\n");
-	ata_dev_warn(dev, "request sense\n");
-	if (!ata_id_sense_reporting_enabled(dev->id)) {
-		ata_dev_warn(qc->dev, "sense data reporting disabled\n");
-		return 0;
-	}
-	ata_tf_init(dev, &tf);
-
-	tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
-	tf.flags |= ATA_TFLAG_LBA | ATA_TFLAG_LBA48;
-	tf.command = ATA_CMD_REQ_SENSE_DATA;
-	tf.protocol = ATA_PROT_NODATA;
-
-	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
-	/*
-	 * ACS-4 states:
-	 * The device may set the SENSE DATA AVAILABLE bit to one in the
-	 * STATUS field and clear the ERROR bit to zero in the STATUS field
-	 * to indicate that the command returned completion without an error
-	 * and the sense data described in table 306 is available.
-	 *
-	 * IOW the 'ATA_SENSE' bit might not be set even though valid
-	 * sense data is available.
-	 * So check for both.
-	 */
-	if ((tf.command & ATA_SENSE) ||
-		tf.lbah != 0 || tf.lbam != 0 || tf.lbal != 0) {
-		ata_scsi_set_sense(cmd, tf.lbah, tf.lbam, tf.lbal);
-		qc->flags |= ATA_QCFLAG_SENSE_VALID;
-		ata_dev_warn(dev, "sense data %02x/%02x/%02x\n",
-			     tf.lbah, tf.lbam, tf.lbal);
-	} else {
-		ata_dev_warn(dev, "request sense failed stat %02x emask %x\n",
-			     tf.command, err_mask);
-	}
-	return err_mask;
-}
-
 /**
  *	atapi_eh_request_sense - perform ATAPI REQUEST_SENSE
  *	@dev: device to perform REQUEST_SENSE to
@@ -1896,22 +1832,7 @@ static unsigned int ata_eh_analyze_tf(struct ata_queued_cmd *qc,
 		return ATA_EH_RESET;
 	}
 
-	/*
-	 * Sense data reporting does not work if the
-	 * device fault bit is set.
-	 */
-	if ((stat & ATA_SENSE) && !(stat & ATA_DF) &&
-	    !(qc->flags & ATA_QCFLAG_SENSE_VALID)) {
-		if (!(qc->ap->pflags & ATA_PFLAG_FROZEN)) {
-			tmp = ata_eh_request_sense(qc, qc->scsicmd);
-			if (tmp)
-				qc->err_mask |= tmp;
-		} else {
-			ata_dev_warn(qc->dev, "sense data available but port frozen\n");
-		}
-	}
-
-	/* Set by NCQ autosense or request sense above */
+	/* Set by NCQ autosense */
 	if (qc->flags & ATA_QCFLAG_SENSE_VALID)
 		return 0;
 
@@ -2658,15 +2579,14 @@ static void ata_eh_link_report(struct ata_link *link)
 
 #ifdef CONFIG_ATA_VERBOSE_ERROR
 		if (res->command & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ |
-				    ATA_SENSE | ATA_ERR)) {
+				    ATA_ERR)) {
 			if (res->command & ATA_BUSY)
 				ata_dev_err(qc->dev, "status: { Busy }\n");
 			else
-				ata_dev_err(qc->dev, "status: { %s%s%s%s%s}\n",
+				ata_dev_err(qc->dev, "status: { %s%s%s%s}\n",
 				  res->command & ATA_DRDY ? "DRDY " : "",
 				  res->command & ATA_DF ? "DF " : "",
 				  res->command & ATA_DRQ ? "DRQ " : "",
-				  res->command & ATA_SENSE ? "SENSE " : "",
 				  res->command & ATA_ERR ? "ERR " : "");
 		}
 
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 6c78956aa470..0e6a782575b5 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -385,8 +385,6 @@ enum {
 	SATA_SSP		= 0x06,	/* Software Settings Preservation */
 	SATA_DEVSLP		= 0x09,	/* Device Sleep */
 
-	SETFEATURE_SENSE_DATA = 0xC3, /* Sense Data Reporting feature */
-
 	/* feature values for SET_MAX */
 	ATA_SET_MAX_ADDR	= 0x00,
 	ATA_SET_MAX_PASSWD	= 0x01,
@@ -720,20 +718,6 @@ static inline bool ata_id_has_read_log_dma_ext(const u16 *id)
 	return false;
 }
 
-static inline bool ata_id_has_sense_reporting(const u16 *id)
-{
-	if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15)))
-		return false;
-	return id[ATA_ID_COMMAND_SET_3] & (1 << 6);
-}
-
-static inline bool ata_id_sense_reporting_enabled(const u16 *id)
-{
-	if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15)))
-		return false;
-	return id[ATA_ID_COMMAND_SET_4] & (1 << 6);
-}
-
 /**
  *	ata_id_major_version	-	get ATA level of drive
  *	@id: Identify data
-- 
cgit v1.2.3-70-g09d2


From 74a80d67b8316eb3fbeb73dafc060a5a0a708587 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 3 Aug 2015 11:46:39 -0400
Subject: Revert "libata: Implement NCQ autosense"

This reverts commit 42b966fbf35da9c87f08d98f9b8978edf9e717cf.

As implemented, ACS-4 sense reporting for ATA devices bypasses error
diagnosis and handling in libata degrading EH behavior significantly.
Revert the related changes for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Hannes Reinecke <hare@suse.de>
Cc: stable@vger.kernel.org #v4.1+
---
 drivers/ata/libata-eh.c   | 18 ------------------
 drivers/ata/libata-scsi.c |  9 ++-------
 drivers/ata/libata.h      |  1 -
 include/linux/ata.h       |  2 --
 4 files changed, 2 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 16125be34893..cb0508af1459 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1592,8 +1592,6 @@ static int ata_eh_read_log_10h(struct ata_device *dev,
 	tf->hob_lbah = buf[10];
 	tf->nsect = buf[12];
 	tf->hob_nsect = buf[13];
-	if (ata_id_has_ncq_autosense(dev->id))
-		tf->auxiliary = buf[14] << 16 | buf[15] << 8 | buf[16];
 
 	return 0;
 }
@@ -1791,18 +1789,6 @@ void ata_eh_analyze_ncq_error(struct ata_link *link)
 	memcpy(&qc->result_tf, &tf, sizeof(tf));
 	qc->result_tf.flags = ATA_TFLAG_ISADDR | ATA_TFLAG_LBA | ATA_TFLAG_LBA48;
 	qc->err_mask |= AC_ERR_DEV | AC_ERR_NCQ;
-	if (qc->result_tf.auxiliary) {
-		char sense_key, asc, ascq;
-
-		sense_key = (qc->result_tf.auxiliary >> 16) & 0xff;
-		asc = (qc->result_tf.auxiliary >> 8) & 0xff;
-		ascq = qc->result_tf.auxiliary & 0xff;
-		ata_dev_dbg(dev, "NCQ Autosense %02x/%02x/%02x\n",
-			    sense_key, asc, ascq);
-		ata_scsi_set_sense(qc->scsicmd, sense_key, asc, ascq);
-		qc->flags |= ATA_QCFLAG_SENSE_VALID;
-	}
-
 	ehc->i.err_mask &= ~AC_ERR_DEV;
 }
 
@@ -1832,10 +1818,6 @@ static unsigned int ata_eh_analyze_tf(struct ata_queued_cmd *qc,
 		return ATA_EH_RESET;
 	}
 
-	/* Set by NCQ autosense */
-	if (qc->flags & ATA_QCFLAG_SENSE_VALID)
-		return 0;
-
 	if (stat & (ATA_ERR | ATA_DF))
 		qc->err_mask |= AC_ERR_DEV;
 	else
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index e1ecd2ab3724..0d7f0da3a269 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -270,11 +270,8 @@ DEVICE_ATTR(unload_heads, S_IRUGO | S_IWUSR,
 	    ata_scsi_park_show, ata_scsi_park_store);
 EXPORT_SYMBOL_GPL(dev_attr_unload_heads);
 
-void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq)
+static void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq)
 {
-	if (!cmd)
-		return;
-
 	cmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;
 
 	scsi_build_sense_buffer(0, cmd->sense_buffer, sk, asc, ascq);
@@ -1780,9 +1777,7 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 	    ((cdb[2] & 0x20) || need_sense)) {
 		ata_gen_passthru_sense(qc);
 	} else {
-		if (qc->flags & ATA_QCFLAG_SENSE_VALID) {
-			cmd->result = SAM_STAT_CHECK_CONDITION;
-		} else if (!need_sense) {
+		if (!need_sense) {
 			cmd->result = SAM_STAT_GOOD;
 		} else {
 			/* TODO: decide which descriptor format to use
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 8cfdd9616d16..f840ca18a7c0 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -137,7 +137,6 @@ extern int ata_scsi_add_hosts(struct ata_host *host,
 			      struct scsi_host_template *sht);
 extern void ata_scsi_scan_host(struct ata_port *ap, int sync);
 extern int ata_scsi_offline_dev(struct ata_device *dev);
-extern void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq);
 extern void ata_scsi_media_change_notify(struct ata_device *dev);
 extern void ata_scsi_hotplug(struct work_struct *work);
 extern void ata_schedule_scsi_eh(struct Scsi_Host *shost);
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 0e6a782575b5..d2992bfa1706 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -528,8 +528,6 @@ struct ata_bmdma_prd {
 #define ata_id_cdb_intr(id)	(((id)[ATA_ID_CONFIG] & 0x60) == 0x20)
 #define ata_id_has_da(id)	((id)[ATA_ID_SATA_CAPABILITY_2] & (1 << 4))
 #define ata_id_has_devslp(id)	((id)[ATA_ID_FEATURE_SUPP] & (1 << 8))
-#define ata_id_has_ncq_autosense(id) \
-				((id)[ATA_ID_FEATURE_SUPP] & (1 << 7))
 
 static inline bool ata_id_has_hipm(const u16 *id)
 {
-- 
cgit v1.2.3-70-g09d2


From a3a10ce3429e5dee623ad5c8407ea58e204fcb0a Mon Sep 17 00:00:00 2001
From: Richard Watts <rrw@kynesim.co.uk>
Date: Tue, 19 May 2015 16:06:53 +0100
Subject: Avoid usb reset crashes by making tty_io cdevs truly dynamic

Avoid usb reset crashes by making tty_io cdevs truly dynamic

Signed-off-by: Richard Watts <rrw@kynesim.co.uk>
Reported-by: Duncan Mackintosh <DMackintosh@cbnl.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c       | 24 ++++++++++++++++--------
 include/linux/tty_driver.h |  2 +-
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index c37a215177c0..02785d844354 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3152,9 +3152,12 @@ static int tty_cdev_add(struct tty_driver *driver, dev_t dev,
 		unsigned int index, unsigned int count)
 {
 	/* init here, since reused cdevs cause crashes */
-	cdev_init(&driver->cdevs[index], &tty_fops);
-	driver->cdevs[index].owner = driver->owner;
-	return cdev_add(&driver->cdevs[index], dev, count);
+	driver->cdevs[index] = cdev_alloc();
+	if (!driver->cdevs[index])
+		return -ENOMEM;
+	cdev_init(driver->cdevs[index], &tty_fops);
+	driver->cdevs[index]->owner = driver->owner;
+	return cdev_add(driver->cdevs[index], dev, count);
 }
 
 /**
@@ -3260,8 +3263,10 @@ struct device *tty_register_device_attr(struct tty_driver *driver,
 
 error:
 	put_device(dev);
-	if (cdev)
-		cdev_del(&driver->cdevs[index]);
+	if (cdev) {
+		cdev_del(driver->cdevs[index]);
+		driver->cdevs[index] = NULL;
+	}
 	return ERR_PTR(retval);
 }
 EXPORT_SYMBOL_GPL(tty_register_device_attr);
@@ -3281,8 +3286,10 @@ void tty_unregister_device(struct tty_driver *driver, unsigned index)
 {
 	device_destroy(tty_class,
 		MKDEV(driver->major, driver->minor_start) + index);
-	if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC))
-		cdev_del(&driver->cdevs[index]);
+	if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) {
+		cdev_del(driver->cdevs[index]);
+		driver->cdevs[index] = NULL;
+	}
 }
 EXPORT_SYMBOL(tty_unregister_device);
 
@@ -3347,6 +3354,7 @@ err_free_all:
 	kfree(driver->ports);
 	kfree(driver->ttys);
 	kfree(driver->termios);
+	kfree(driver->cdevs);
 	kfree(driver);
 	return ERR_PTR(err);
 }
@@ -3375,7 +3383,7 @@ static void destruct_tty_driver(struct kref *kref)
 		}
 		proc_tty_unregister_driver(driver);
 		if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)
-			cdev_del(&driver->cdevs[0]);
+			cdev_del(driver->cdevs[0]);
 	}
 	kfree(driver->cdevs);
 	kfree(driver->ports);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 92e337c18839..161052477f77 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -296,7 +296,7 @@ struct tty_operations {
 struct tty_driver {
 	int	magic;		/* magic number for this structure */
 	struct kref kref;	/* Reference management */
-	struct cdev *cdevs;
+	struct cdev **cdevs;
 	struct module	*owner;
 	const char	*driver_name;
 	const char	*name;
-- 
cgit v1.2.3-70-g09d2


From 512f64d9f7467597388ffbd5a21589ee3f375d8b Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 23 Jul 2015 15:08:41 +0300
Subject: mei: bus: add reference to bus device in struct mei_cl_client

Add reference to the bus device (mei_device) for easier access.
To ensures that referencing cldev->bus is valid during cldev life time
we increase the bus ref counter on a client device creation and drop it
on the device release.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c     | 17 +++++++++++++++++
 include/linux/mei_cl_bus.h |  3 +++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 1d9ce9c491cf..963731eb4383 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -590,6 +590,20 @@ static struct bus_type mei_cl_bus_type = {
 	.uevent		= mei_cl_device_uevent,
 };
 
+static struct mei_device *mei_dev_bus_get(struct mei_device *bus)
+{
+	if (bus)
+		get_device(bus->dev);
+
+	return bus;
+}
+
+static void mei_dev_bus_put(struct mei_device *bus)
+{
+	if (bus)
+		put_device(bus->dev);
+}
+
 static void mei_cl_dev_release(struct device *dev)
 {
 	struct mei_cl_device *cldev = to_mei_cl_device(dev);
@@ -598,6 +612,7 @@ static void mei_cl_dev_release(struct device *dev)
 		return;
 
 	mei_me_cl_put(cldev->me_cl);
+	mei_dev_bus_put(cldev->bus);
 	kfree(cldev);
 }
 
@@ -641,6 +656,7 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *bus,
 	cldev->dev.parent = bus->dev;
 	cldev->dev.bus = &mei_cl_bus_type;
 	cldev->dev.type = &mei_cl_device_type;
+	cldev->bus      = mei_dev_bus_get(bus);
 
 	strlcpy(cldev->name, name, sizeof(cldev->name));
 
@@ -650,6 +666,7 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *bus,
 	if (status) {
 		dev_err(bus->dev, "Failed to register MEI device\n");
 		mei_me_cl_put(cldev->me_cl);
+		mei_dev_bus_put(bus);
 		kfree(cldev);
 		return NULL;
 	}
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index a16b1f9c1aca..4c5c25b3222c 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -6,6 +6,7 @@
 #include <linux/mod_devicetable.h>
 
 struct mei_cl_device;
+struct mei_device;
 
 typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
 			       u32 events, void *context);
@@ -17,6 +18,7 @@ typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
  * Drivers for MEI devices will get an mei_cl_device pointer
  * when being probed and shall use it for doing ME bus I/O.
  *
+ * @bus: parent mei device
  * @dev: linux driver model device pointer
  * @me_cl: me client
  * @cl: mei client
@@ -29,6 +31,7 @@ typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
  * @priv_data: client private data
  */
 struct mei_cl_device {
+	struct mei_device *bus;
 	struct device dev;
 
 	struct mei_me_client *me_cl;
-- 
cgit v1.2.3-70-g09d2


From 0ff0a8d853039aa60bba3ca3e04e4fb74584a736 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 23 Jul 2015 15:08:42 +0300
Subject: mei: bus: add me client device list infrastructure

Instead of holding the list of host clients (me_cl)
we want to keep the list me client devices (mei_cl_device)
This way we can create host to me client connection only when needed.
Add list head to mei_cl_device and cl_bus_lock
Add bus_added flag to the me client (mei_me_client) to track if
the appropriate mei_cl_device was already created and is_added
flag to mei_cl_device to track if it was already added to the device
list across the bus rescans

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c     | 1 +
 drivers/misc/mei/init.c    | 1 +
 drivers/misc/mei/mei_dev.h | 6 ++++--
 include/linux/mei_cl_bus.h | 4 ++++
 4 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 963731eb4383..34b14dda050c 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -657,6 +657,7 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *bus,
 	cldev->dev.bus = &mei_cl_bus_type;
 	cldev->dev.type = &mei_cl_device_type;
 	cldev->bus      = mei_dev_bus_get(bus);
+	INIT_LIST_HEAD(&cldev->bus_list);
 
 	strlcpy(cldev->name, name, sizeof(cldev->name));
 
diff --git a/drivers/misc/mei/init.c b/drivers/misc/mei/init.c
index 00c3865ca3b1..15000e9231b1 100644
--- a/drivers/misc/mei/init.c
+++ b/drivers/misc/mei/init.c
@@ -390,6 +390,7 @@ void mei_device_init(struct mei_device *dev,
 	INIT_LIST_HEAD(&dev->me_clients);
 	mutex_init(&dev->device_lock);
 	init_rwsem(&dev->me_clients_rwsem);
+	mutex_init(&dev->cl_bus_lock);
 	init_waitqueue_head(&dev->wait_hw_ready);
 	init_waitqueue_head(&dev->wait_pg);
 	init_waitqueue_head(&dev->wait_hbm_start);
diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index bc65fb42aea9..882e6f77084a 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h
@@ -178,7 +178,7 @@ struct mei_fw_status {
  * @client_id: me client id
  * @mei_flow_ctrl_creds: flow control credits
  * @connect_count: number connections to this client
- * @reserved: reserved
+ * @bus_added: added to bus
  */
 struct mei_me_client {
 	struct list_head list;
@@ -187,7 +187,7 @@ struct mei_me_client {
 	u8 client_id;
 	u8 mei_flow_ctrl_creds;
 	u8 connect_count;
-	u8 reserved;
+	u8 bus_added;
 };
 
 
@@ -447,6 +447,7 @@ const char *mei_pg_state_str(enum mei_pg_state state);
  * @reset_work  : work item for the device reset
  *
  * @device_list : mei client bus list
+ * @cl_bus_lock : client bus list lock
  *
  * @dbgfs_dir   : debugfs mei root directory
  *
@@ -543,6 +544,7 @@ struct mei_device {
 
 	/* List of bus devices */
 	struct list_head device_list;
+	struct mutex cl_bus_lock;
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 	struct dentry *dbgfs_dir;
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index 4c5c25b3222c..85239138251c 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -18,6 +18,7 @@ typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
  * Drivers for MEI devices will get an mei_cl_device pointer
  * when being probed and shall use it for doing ME bus I/O.
  *
+ * @bus_list: device on the bus list
  * @bus: parent mei device
  * @dev: linux driver model device pointer
  * @me_cl: me client
@@ -28,9 +29,11 @@ typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
  *	events (e.g. Rx buffer pending) notifications.
  * @event_context: event callback run context
  * @events: Events bitmask sent to the driver.
+ * @is_added: device is already scanned
  * @priv_data: client private data
  */
 struct mei_cl_device {
+	struct list_head bus_list;
 	struct mei_device *bus;
 	struct device dev;
 
@@ -42,6 +45,7 @@ struct mei_cl_device {
 	mei_cl_event_cb_t event_cb;
 	void *event_context;
 	unsigned long events;
+	unsigned int is_added:1;
 
 	void *priv_data;
 };
-- 
cgit v1.2.3-70-g09d2


From 71ce789115f878a07e4a6c43d6006cea6aee1078 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 23 Jul 2015 15:08:43 +0300
Subject: mei: bus: enable running fixup routines before device registration

Split the device registration into allocation and device struct
initialization, device setup, and the final device registration.
This why it is possible to run fixups and quirks during the setup stage
on an initialized device. Each fixup routine effects do_match flag.
If the flag is set to false at the end the device won't be
registered on the bus.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus-fixup.c | 30 +++++++++++++++
 drivers/misc/mei/bus.c       | 91 ++++++++++++++++++++++++++++++++++++--------
 drivers/misc/mei/mei_dev.h   |  2 +-
 include/linux/mei_cl_bus.h   |  4 ++
 4 files changed, 111 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c
index 47aa1523d9e1..865e33bcd226 100644
--- a/drivers/misc/mei/bus-fixup.c
+++ b/drivers/misc/mei/bus-fixup.c
@@ -20,12 +20,15 @@
 #include <linux/moduleparam.h>
 #include <linux/device.h>
 #include <linux/slab.h>
+#include <linux/uuid.h>
 
 #include <linux/mei_cl_bus.h>
 
 #include "mei_dev.h"
 #include "client.h"
 
+#define MEI_UUID_ANY NULL_UUID_LE
+
 struct mei_nfc_cmd {
 	u8 command;
 	u8 status;
@@ -412,4 +415,31 @@ void mei_nfc_host_exit(struct mei_device *bus)
 	mutex_unlock(&bus->device_lock);
 }
 
+#define MEI_FIXUP(_uuid, _hook) { _uuid, _hook }
+
+static struct mei_fixup {
+
+	const uuid_le uuid;
+	void (*hook)(struct mei_cl_device *cldev);
+} mei_fixups[] = {};
+
+/**
+ * mei_cl_dev_fixup - run fixup handlers
+ *
+ * @cldev: me client device
+ */
+void mei_cl_dev_fixup(struct mei_cl_device *cldev)
+{
+	struct mei_fixup *f;
+	const uuid_le *uuid = mei_me_cl_uuid(cldev->me_cl);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mei_fixups); i++) {
+
+		f = &mei_fixups[i];
+		if (uuid_le_cmp(f->uuid, MEI_UUID_ANY) == 0 ||
+		    uuid_le_cmp(f->uuid, *uuid) == 0)
+			f->hook(cldev);
+	}
+}
 
diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 34b14dda050c..68b7756bf384 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -436,6 +436,9 @@ static int mei_cl_device_match(struct device *dev, struct device_driver *drv)
 	if (!cldev)
 		return 0;
 
+	if (!cldev->do_match)
+		return 0;
+
 	if (!cldrv || !cldrv->id_table)
 		return 0;
 
@@ -634,6 +637,76 @@ struct mei_cl *mei_cl_bus_find_cl_by_uuid(struct mei_device *bus,
 	return NULL;
 }
 
+/**
+ * mei_cl_dev_alloc - initialize and allocate mei client device
+ *
+ * @bus: mei device
+ * @me_cl: me client
+ *
+ * Return: allocated device structur or NULL on allocation failure
+ */
+static struct mei_cl_device *mei_cl_dev_alloc(struct mei_device *bus,
+					      struct mei_me_client *me_cl)
+{
+	struct mei_cl_device *cldev;
+
+	cldev = kzalloc(sizeof(struct mei_cl_device), GFP_KERNEL);
+	if (!cldev)
+		return NULL;
+
+	device_initialize(&cldev->dev);
+	cldev->dev.parent = bus->dev;
+	cldev->dev.bus    = &mei_cl_bus_type;
+	cldev->dev.type   = &mei_cl_device_type;
+	cldev->bus        = mei_dev_bus_get(bus);
+	cldev->me_cl      = mei_me_cl_get(me_cl);
+	cldev->is_added   = 0;
+	INIT_LIST_HEAD(&cldev->bus_list);
+
+	return cldev;
+}
+
+/**
+ * mei_cl_dev_setup - setup me client device
+ *    run fix up routines and set the device name
+ *
+ * @bus: mei device
+ * @cldev: me client device
+ *
+ * Return: true if the device is eligible for enumeration
+ */
+static bool mei_cl_dev_setup(struct mei_device *bus,
+			     struct mei_cl_device *cldev)
+{
+	cldev->do_match = 1;
+	mei_cl_dev_fixup(cldev);
+
+	if (cldev->do_match)
+		dev_set_name(&cldev->dev, "mei:%s:%pUl",
+			     cldev->name, mei_me_cl_uuid(cldev->me_cl));
+
+	return cldev->do_match == 1;
+}
+
+/**
+ * mei_cl_bus_dev_add - add me client devices
+ *
+ * @cldev: me client device
+ *
+ * Return: 0 on success; < 0 on failre
+ */
+static int mei_cl_bus_dev_add(struct mei_cl_device *cldev)
+{
+	int ret;
+
+	dev_dbg(cldev->bus->dev, "adding %pUL\n", mei_me_cl_uuid(cldev->me_cl));
+	ret = device_add(&cldev->dev);
+	if (!ret)
+		cldev->is_added = 1;
+
+	return ret;
+}
+
 struct mei_cl_device *mei_cl_add_device(struct mei_device *bus,
 					struct mei_me_client *me_cl,
 					struct mei_cl *cl,
@@ -642,28 +715,16 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *bus,
 	struct mei_cl_device *cldev;
 	int status;
 
-	cldev = kzalloc(sizeof(struct mei_cl_device), GFP_KERNEL);
+	cldev = mei_cl_dev_alloc(bus, me_cl);
 	if (!cldev)
 		return NULL;
 
-	cldev->me_cl = mei_me_cl_get(me_cl);
-	if (!cldev->me_cl) {
-		kfree(cldev);
-		return NULL;
-	}
-
 	cldev->cl = cl;
-	cldev->dev.parent = bus->dev;
-	cldev->dev.bus = &mei_cl_bus_type;
-	cldev->dev.type = &mei_cl_device_type;
-	cldev->bus      = mei_dev_bus_get(bus);
-	INIT_LIST_HEAD(&cldev->bus_list);
-
 	strlcpy(cldev->name, name, sizeof(cldev->name));
 
-	dev_set_name(&cldev->dev, "mei:%s:%pUl", name, mei_me_cl_uuid(me_cl));
+	mei_cl_dev_setup(bus, cldev);
 
-	status = device_register(&cldev->dev);
+	status = mei_cl_bus_dev_add(cldev);
 	if (status) {
 		dev_err(bus->dev, "Failed to register MEI device\n");
 		mei_me_cl_put(cldev->me_cl);
diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index 882e6f77084a..ad59ab776f2d 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h
@@ -335,7 +335,7 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *bus,
 					struct mei_cl *cl,
 					char *name);
 void mei_cl_remove_device(struct mei_cl_device *cldev);
-
+void mei_cl_dev_fixup(struct mei_cl_device *dev);
 ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length,
 			bool blocking);
 ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length);
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index 85239138251c..81ab56dd0ae0 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -29,6 +29,8 @@ typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
  *	events (e.g. Rx buffer pending) notifications.
  * @event_context: event callback run context
  * @events: Events bitmask sent to the driver.
+ *
+ * @do_match: wheather device can be matched with a driver
  * @is_added: device is already scanned
  * @priv_data: client private data
  */
@@ -45,6 +47,8 @@ struct mei_cl_device {
 	mei_cl_event_cb_t event_cb;
 	void *event_context;
 	unsigned long events;
+
+	unsigned int do_match:1;
 	unsigned int is_added:1;
 
 	void *priv_data;
-- 
cgit v1.2.3-70-g09d2


From bb2ef9c39db2e3c2562b4e439b2b00dc42e2c026 Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin@intel.com>
Date: Sun, 26 Jul 2015 09:54:23 +0300
Subject: mei: bus: add and call callback on notify event

Enable drivers on mei client bus to subscribe
to asynchronous event notifications.
Introduce events_mask to the existing callback infrastructure
so it is possible to handle both RX and event notification.

Signed-off-by: Alexander Usyskin <alexander.usyskin@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c     | 50 ++++++++++++++++++++++++++++++++++++++++++----
 drivers/misc/mei/client.c  |  2 ++
 drivers/misc/mei/mei_dev.h |  1 +
 drivers/nfc/mei_phy.c      |  3 ++-
 include/linux/mei_cl_bus.h |  4 ++++
 5 files changed, 55 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 3ab08e522fb8..eef1c6b46ad8 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -222,7 +222,33 @@ static void mei_bus_event_work(struct work_struct *work)
 	cldev->events = 0;
 
 	/* Prepare for the next read */
-	mei_cl_read_start(cldev->cl, 0, NULL);
+	if (cldev->events_mask & BIT(MEI_CL_EVENT_RX))
+		mei_cl_read_start(cldev->cl, 0, NULL);
+}
+
+/**
+ * mei_cl_bus_notify_event - schedule notify cb on bus client
+ *
+ * @cl: host client
+ */
+void mei_cl_bus_notify_event(struct mei_cl *cl)
+{
+	struct mei_cl_device *cldev = cl->cldev;
+
+	if (!cldev || !cldev->event_cb)
+		return;
+
+	if (!(cldev->events_mask & BIT(MEI_CL_EVENT_NOTIF)))
+		return;
+
+	if (!cl->notify_ev)
+		return;
+
+	set_bit(MEI_CL_EVENT_NOTIF, &cldev->events);
+
+	schedule_work(&cldev->event_work);
+
+	cl->notify_ev = false;
 }
 
 /**
@@ -237,6 +263,9 @@ void mei_cl_bus_rx_event(struct mei_cl *cl)
 	if (!cldev || !cldev->event_cb)
 		return;
 
+	if (!(cldev->events_mask & BIT(MEI_CL_EVENT_RX)))
+		return;
+
 	set_bit(MEI_CL_EVENT_RX, &cldev->events);
 
 	schedule_work(&cldev->event_work);
@@ -247,6 +276,7 @@ void mei_cl_bus_rx_event(struct mei_cl *cl)
  *
  * @cldev: me client devices
  * @event_cb: callback function
+ * @events_mask: requested events bitmask
  * @context: driver context data
  *
  * Return: 0 on success
@@ -254,6 +284,7 @@ void mei_cl_bus_rx_event(struct mei_cl *cl)
  *         <0 on other errors
  */
 int mei_cl_register_event_cb(struct mei_cl_device *cldev,
+			  unsigned long events_mask,
 			  mei_cl_event_cb_t event_cb, void *context)
 {
 	int ret;
@@ -262,13 +293,24 @@ int mei_cl_register_event_cb(struct mei_cl_device *cldev,
 		return -EALREADY;
 
 	cldev->events = 0;
+	cldev->events_mask = events_mask;
 	cldev->event_cb = event_cb;
 	cldev->event_context = context;
 	INIT_WORK(&cldev->event_work, mei_bus_event_work);
 
-	ret = mei_cl_read_start(cldev->cl, 0, NULL);
-	if (ret && ret != -EBUSY)
-		return ret;
+	if (cldev->events_mask & BIT(MEI_CL_EVENT_RX)) {
+		ret = mei_cl_read_start(cldev->cl, 0, NULL);
+		if (ret && ret != -EBUSY)
+			return ret;
+	}
+
+	if (cldev->events_mask & BIT(MEI_CL_EVENT_NOTIF)) {
+		mutex_lock(&cldev->cl->dev->device_lock);
+		ret = mei_cl_notify_request(cldev->cl, NULL, event_cb ? 1 : 0);
+		mutex_unlock(&cldev->cl->dev->device_lock);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c
index db2436aee2dc..5fcd70bcdf96 100644
--- a/drivers/misc/mei/client.c
+++ b/drivers/misc/mei/client.c
@@ -1375,6 +1375,8 @@ void mei_cl_notify(struct mei_cl *cl)
 
 	if (cl->ev_async)
 		kill_fasync(&cl->ev_async, SIGIO, POLL_PRI);
+
+	mei_cl_bus_notify_event(cl);
 }
 
 /**
diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index c960aaa538c0..e25ee16c658e 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h
@@ -345,6 +345,7 @@ ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length,
 			bool blocking);
 ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length);
 void mei_cl_bus_rx_event(struct mei_cl *cl);
+void mei_cl_bus_notify_event(struct mei_cl *cl);
 void mei_cl_bus_remove_devices(struct mei_device *bus);
 int mei_cl_bus_init(void);
 void mei_cl_bus_exit(void);
diff --git a/drivers/nfc/mei_phy.c b/drivers/nfc/mei_phy.c
index 2b77ccf77f81..754a9bb0f58d 100644
--- a/drivers/nfc/mei_phy.c
+++ b/drivers/nfc/mei_phy.c
@@ -355,7 +355,8 @@ static int nfc_mei_phy_enable(void *phy_id)
 		goto err;
 	}
 
-	r = mei_cl_register_event_cb(phy->device, nfc_mei_event_cb, phy);
+	r = mei_cl_register_event_cb(phy->device, BIT(MEI_CL_EVENT_RX),
+				     nfc_mei_event_cb, phy);
 	if (r) {
 		pr_err("Event cb registration failed %d\n", r);
 		goto err;
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index 81ab56dd0ae0..0962b2ca628a 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -28,6 +28,7 @@ typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
  * @event_cb: Drivers register this callback to get asynchronous ME
  *	events (e.g. Rx buffer pending) notifications.
  * @event_context: event callback run context
+ * @events_mask: Events bit mask requested by driver.
  * @events: Events bitmask sent to the driver.
  *
  * @do_match: wheather device can be matched with a driver
@@ -46,6 +47,7 @@ struct mei_cl_device {
 	struct work_struct event_work;
 	mei_cl_event_cb_t event_cb;
 	void *event_context;
+	unsigned long events_mask;
 	unsigned long events;
 
 	unsigned int do_match:1;
@@ -76,10 +78,12 @@ ssize_t mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length);
 ssize_t  mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length);
 
 int mei_cl_register_event_cb(struct mei_cl_device *device,
+			  unsigned long event_mask,
 			  mei_cl_event_cb_t read_cb, void *context);
 
 #define MEI_CL_EVENT_RX 0
 #define MEI_CL_EVENT_TX 1
+#define MEI_CL_EVENT_NOTIF 2
 
 void *mei_cl_get_drvdata(const struct mei_cl_device *device);
 void mei_cl_set_drvdata(struct mei_cl_device *device, void *data);
-- 
cgit v1.2.3-70-g09d2


From e5779e8e12299f77c2421a707855d8d124171d85 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 30 Jul 2015 20:32:40 -0700
Subject: perf/x86/hw_breakpoints: Disallow kernel breakpoints unless
 kprobe-safe

Code on the kprobe blacklist doesn't want unexpected int3
exceptions. It probably doesn't want unexpected debug exceptions
either. Be safe: disallow breakpoints in nokprobes code.

On non-CONFIG_KPROBES kernels, there is no kprobe blacklist.  In
that case, disallow kernel breakpoints entirely.

It will be particularly important to keep hw breakpoints out of the
entry and NMI code once we move debug exceptions off the IST stack.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/e14b152af99640448d895e3c2a8c2d5ee19a1325.1438312874.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/hw_breakpoint.c | 15 +++++++++++++++
 include/linux/kprobes.h         |  2 ++
 kernel/kprobes.c                |  2 +-
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 7114ba220fd4..78f3e90c5659 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,6 +32,7 @@
 #include <linux/irqflags.h>
 #include <linux/notifier.h>
 #include <linux/kallsyms.h>
+#include <linux/kprobes.h>
 #include <linux/percpu.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
@@ -243,6 +244,20 @@ static int arch_build_bp_info(struct perf_event *bp)
 		info->type = X86_BREAKPOINT_RW;
 		break;
 	case HW_BREAKPOINT_X:
+		/*
+		 * We don't allow kernel breakpoints in places that are not
+		 * acceptable for kprobes.  On non-kprobes kernels, we don't
+		 * allow kernel breakpoints at all.
+		 */
+		if (bp->attr.bp_addr >= TASK_SIZE_MAX) {
+#ifdef CONFIG_KPROBES
+			if (within_kprobe_blacklist(bp->attr.bp_addr))
+				return -EINVAL;
+#else
+			return -EINVAL;
+#endif
+		}
+
 		info->type = X86_BREAKPOINT_EXECUTE;
 		/*
 		 * x86 inst breakpoints need to have a specific undefined len.
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 1ab54754a86d..8f6849084248 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -267,6 +267,8 @@ extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
 
+extern bool within_kprobe_blacklist(unsigned long addr);
+
 struct kprobe_insn_cache {
 	struct mutex mutex;
 	void *(*alloc)(void);	/* allocate insn page */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c90e417bb963..d10ab6b9b5e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
 	       addr < (unsigned long)__kprobes_text_end;
 }
 
-static bool within_kprobe_blacklist(unsigned long addr)
+bool within_kprobe_blacklist(unsigned long addr)
 {
 	struct kprobe_blacklist_entry *ent;
 
-- 
cgit v1.2.3-70-g09d2


From 4c2880b31c700b03f3f115b5ca64be615783aa9c Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Fri, 31 Jul 2015 09:44:12 +0100
Subject: irqchip/gic: Ensure gic_cpu_if_up/down() programs correct GIC
 instance

Commit 3228950621d9 ("irqchip: gic: Preserve gic V2 bypass bits in cpu
ctrl register") added a new function, gic_cpu_if_up(), to program the
GIC CPU_CTRL register. This function assumes that there is only one GIC
instance present and hence always uses the chip data for the primary GIC
controller. Although it is not common for there to be a secondary, some
devices do support a secondary. Therefore, fix this by passing
gic_cpu_if_up() a pointer to the appropriate chip data structure.

Similarly, the function gic_cpu_if_down() only assumes that there is a
single GIC instance present. Update this function so that an instance
number is passed for the appropriate GIC and return an error code on
failure. The vexpress TC2 (which has a single GIC) is currently the only
user of this function and so update it accordingly. Note that because the
TC2 only has a single GIC, the call to gic_cpu_if_down() should always
be successful.

Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1438332252-25248-2-git-send-email-jonathanh@nvidia.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arm/mach-vexpress/tc2_pm.c |  2 +-
 drivers/irqchip/irq-gic.c       | 18 ++++++++++++------
 include/linux/irqchip/arm-gic.h |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-vexpress/tc2_pm.c b/arch/arm/mach-vexpress/tc2_pm.c
index b3328cd46c33..1aa4ccece69f 100644
--- a/arch/arm/mach-vexpress/tc2_pm.c
+++ b/arch/arm/mach-vexpress/tc2_pm.c
@@ -80,7 +80,7 @@ static void tc2_pm_cpu_powerdown_prepare(unsigned int cpu, unsigned int cluster)
 	 * to the CPU by disabling the GIC CPU IF to prevent wfi
 	 * from completing execution behind power controller back
 	 */
-	gic_cpu_if_down();
+	gic_cpu_if_down(0);
 }
 
 static void tc2_pm_cluster_powerdown_prepare(unsigned int cluster)
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 84fc622d0309..aa3e7b8a69c4 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -356,9 +356,9 @@ static u8 gic_get_cpumask(struct gic_chip_data *gic)
 	return mask;
 }
 
-static void gic_cpu_if_up(void)
+static void gic_cpu_if_up(struct gic_chip_data *gic)
 {
-	void __iomem *cpu_base = gic_data_cpu_base(&gic_data[0]);
+	void __iomem *cpu_base = gic_data_cpu_base(gic);
 	u32 bypass = 0;
 
 	/*
@@ -426,17 +426,23 @@ static void gic_cpu_init(struct gic_chip_data *gic)
 	gic_cpu_config(dist_base, NULL);
 
 	writel_relaxed(GICC_INT_PRI_THRESHOLD, base + GIC_CPU_PRIMASK);
-	gic_cpu_if_up();
+	gic_cpu_if_up(gic);
 }
 
-void gic_cpu_if_down(void)
+int gic_cpu_if_down(unsigned int gic_nr)
 {
-	void __iomem *cpu_base = gic_data_cpu_base(&gic_data[0]);
+	void __iomem *cpu_base;
 	u32 val = 0;
 
+	if (gic_nr >= MAX_GIC_NR)
+		return -EINVAL;
+
+	cpu_base = gic_data_cpu_base(&gic_data[gic_nr]);
 	val = readl(cpu_base + GIC_CPU_CTRL);
 	val &= ~GICC_ENABLE;
 	writel_relaxed(val, cpu_base + GIC_CPU_CTRL);
+
+	return 0;
 }
 
 #ifdef CONFIG_CPU_PM
@@ -572,7 +578,7 @@ static void gic_cpu_restore(unsigned int gic_nr)
 					dist_base + GIC_DIST_PRI + i * 4);
 
 	writel_relaxed(GICC_INT_PRI_THRESHOLD, cpu_base + GIC_CPU_PRIMASK);
-	gic_cpu_if_up();
+	gic_cpu_if_up(&gic_data[gic_nr]);
 }
 
 static int gic_notifier(struct notifier_block *self, unsigned long cmd,	void *v)
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 61a2007eb49a..65da435d01c1 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -98,7 +98,7 @@ struct device_node;
 void gic_init_bases(unsigned int, int, void __iomem *, void __iomem *,
 		    u32 offset, struct device_node *);
 void gic_cascade_irq(unsigned int gic_nr, unsigned int irq);
-void gic_cpu_if_down(void);
+int gic_cpu_if_down(unsigned int gic_nr);
 
 static inline void gic_init(unsigned int nr, int start,
 			    void __iomem *dist , void __iomem *cpu)
-- 
cgit v1.2.3-70-g09d2


From 12d560f4ea87030667438a169912380be00cea4b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 14 Jul 2015 18:35:23 -0700
Subject: rcu,locking: Privatize smp_mb__after_unlock_lock()

RCU is the only thing that uses smp_mb__after_unlock_lock(), and is
likely the only thing that ever will use it, so this commit makes this
macro private to RCU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: "linux-arch@vger.kernel.org" <linux-arch@vger.kernel.org>
---
 Documentation/memory-barriers.txt   | 71 +++----------------------------------
 arch/powerpc/include/asm/spinlock.h |  2 --
 include/linux/spinlock.h            | 10 ------
 kernel/rcu/tree.h                   | 12 +++++++
 4 files changed, 16 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 318523872db5..eafa6a53f72c 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1854,16 +1854,10 @@ RELEASE are to the same lock variable, but only from the perspective of
 another CPU not holding that lock.  In short, a ACQUIRE followed by an
 RELEASE may -not- be assumed to be a full memory barrier.
 
-Similarly, the reverse case of a RELEASE followed by an ACQUIRE does not
-imply a full memory barrier.  If it is necessary for a RELEASE-ACQUIRE
-pair to produce a full barrier, the ACQUIRE can be followed by an
-smp_mb__after_unlock_lock() invocation.  This will produce a full barrier
-(including transitivity) if either (a) the RELEASE and the ACQUIRE are
-executed by the same CPU or task, or (b) the RELEASE and ACQUIRE act on
-the same variable.  The smp_mb__after_unlock_lock() primitive is free
-on many architectures.  Without smp_mb__after_unlock_lock(), the CPU's
-execution of the critical sections corresponding to the RELEASE and the
-ACQUIRE can cross, so that:
+Similarly, the reverse case of a RELEASE followed by an ACQUIRE does
+not imply a full memory barrier.  Therefore, the CPU's execution of the
+critical sections corresponding to the RELEASE and the ACQUIRE can cross,
+so that:
 
 	*A = a;
 	RELEASE M
@@ -1901,29 +1895,6 @@ the RELEASE would simply complete, thereby avoiding the deadlock.
 	a sleep-unlock race, but the locking primitive needs to resolve
 	such races properly in any case.
 
-With smp_mb__after_unlock_lock(), the two critical sections cannot overlap.
-For example, with the following code, the store to *A will always be
-seen by other CPUs before the store to *B:
-
-	*A = a;
-	RELEASE M
-	ACQUIRE N
-	smp_mb__after_unlock_lock();
-	*B = b;
-
-The operations will always occur in one of the following orders:
-
-	STORE *A, RELEASE, ACQUIRE, smp_mb__after_unlock_lock(), STORE *B
-	STORE *A, ACQUIRE, RELEASE, smp_mb__after_unlock_lock(), STORE *B
-	ACQUIRE, STORE *A, RELEASE, smp_mb__after_unlock_lock(), STORE *B
-
-If the RELEASE and ACQUIRE were instead both operating on the same lock
-variable, only the first of these alternatives can occur.  In addition,
-the more strongly ordered systems may rule out some of the above orders.
-But in any case, as noted earlier, the smp_mb__after_unlock_lock()
-ensures that the store to *A will always be seen as happening before
-the store to *B.
-
 Locks and semaphores may not provide any guarantee of ordering on UP compiled
 systems, and so cannot be counted on in such a situation to actually achieve
 anything at all - especially with respect to I/O accesses - unless combined
@@ -2154,40 +2125,6 @@ But it won't see any of:
 	*E, *F or *G following RELEASE Q
 
 
-However, if the following occurs:
-
-	CPU 1				CPU 2
-	===============================	===============================
-	WRITE_ONCE(*A, a);
-	ACQUIRE M		     [1]
-	WRITE_ONCE(*B, b);
-	WRITE_ONCE(*C, c);
-	RELEASE M	     [1]
-	WRITE_ONCE(*D, d);		WRITE_ONCE(*E, e);
-					ACQUIRE M		     [2]
-					smp_mb__after_unlock_lock();
-					WRITE_ONCE(*F, f);
-					WRITE_ONCE(*G, g);
-					RELEASE M	     [2]
-					WRITE_ONCE(*H, h);
-
-CPU 3 might see:
-
-	*E, ACQUIRE M [1], *C, *B, *A, RELEASE M [1],
-		ACQUIRE M [2], *H, *F, *G, RELEASE M [2], *D
-
-But assuming CPU 1 gets the lock first, CPU 3 won't see any of:
-
-	*B, *C, *D, *F, *G or *H preceding ACQUIRE M [1]
-	*A, *B or *C following RELEASE M [1]
-	*F, *G or *H preceding ACQUIRE M [2]
-	*A, *B, *C, *E, *F or *G following RELEASE M [2]
-
-Note that the smp_mb__after_unlock_lock() is critically important
-here: Without it CPU 3 might see some of the above orderings.
-Without smp_mb__after_unlock_lock(), the accesses are not guaranteed
-to be seen in order unless CPU 3 holds lock M.
-
 
 ACQUIRES VS I/O ACCESSES
 ------------------------
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 4dbe072eecbe..523673d7583c 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -28,8 +28,6 @@
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 
-#define smp_mb__after_unlock_lock()	smp_mb()  /* Full ordering for lock. */
-
 #ifdef CONFIG_PPC64
 /* use 0x800000yy when locked, where yy == CPU number */
 #ifdef __BIG_ENDIAN__
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 0063b24b4f36..16c5ed5a627c 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -130,16 +130,6 @@ do {								\
 #define smp_mb__before_spinlock()	smp_wmb()
 #endif
 
-/*
- * Place this after a lock-acquisition primitive to guarantee that
- * an UNLOCK+LOCK pair act as a full barrier.  This guarantee applies
- * if the UNLOCK and LOCK are executed by the same CPU or if the
- * UNLOCK and LOCK operate on the same lock variable.
- */
-#ifndef smp_mb__after_unlock_lock
-#define smp_mb__after_unlock_lock()	do { } while (0)
-#endif
-
 /**
  * raw_spin_unlock_wait - wait until the spinlock gets unlocked
  * @lock: the spinlock in question.
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0412030ca882..2e991f8361e4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -653,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 }
 #endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Place this after a lock-acquisition primitive to guarantee that
+ * an UNLOCK+LOCK pair act as a full barrier.  This guarantee applies
+ * if the UNLOCK and LOCK are executed by the same CPU or if the
+ * UNLOCK and LOCK operate on the same lock variable.
+ */
+#ifdef CONFIG_PPC
+#define smp_mb__after_unlock_lock()	smp_mb()  /* Full ordering for lock. */
+#else /* #ifdef CONFIG_PPC */
+#define smp_mb__after_unlock_lock()	do { } while (0)
+#endif /* #else #ifdef CONFIG_PPC */
-- 
cgit v1.2.3-70-g09d2


From cc476b42a39d5a66d94f46cade972dcb8ee278df Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Fri, 31 Jul 2015 16:00:13 +0200
Subject: usb: gadget: encapsulate endpoint claiming mechanism

So far it was necessary for usb functions to set ep->driver_data in
endpoint obtained from autoconfig to non-null value, to indicate that
endpoint is claimed by function (in autoconfig it was checked if endpoint
has set this field to non-null value, and if it has, it was assumed that
it is claimed). It could cause bugs because if some function doesn't
set this field autoconfig could return the same endpoint more than one
time.

To help to avoid such bugs this patch adds claimed flag to struct usb_ep,
and  encapsulates endpoint claiming mechanism inside usb_ep_autoconfig_ss()
and usb_ep_autoconfig_reset(), so now usb functions don't need to perform
any additional actions to mark endpoint obtained from autoconfig as claimed.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/epautoconf.c | 11 ++++++-----
 include/linux/usb/gadget.h      |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/epautoconf.c b/drivers/usb/gadget/epautoconf.c
index 919cdfdda78b..8e00ca765549 100644
--- a/drivers/usb/gadget/epautoconf.c
+++ b/drivers/usb/gadget/epautoconf.c
@@ -53,7 +53,7 @@ ep_matches (
 	int		num_req_streams = 0;
 
 	/* endpoint already claimed? */
-	if (NULL != ep->driver_data)
+	if (ep->claimed)
 		return 0;
 
 	/* only support ep0 for portable CONTROL traffic */
@@ -240,7 +240,7 @@ find_ep (struct usb_gadget *gadget, const char *name)
  * updated with the assigned number of streams if it is
  * different from the original value. To prevent the endpoint
  * from being returned by a later autoconfig call, claim it by
- * assigning ep->driver_data to some non-null value.
+ * assigning ep->claimed to true.
  *
  * On failure, this returns a null endpoint descriptor.
  */
@@ -323,6 +323,7 @@ struct usb_ep *usb_ep_autoconfig_ss(
 found_ep:
 	ep->desc = NULL;
 	ep->comp_desc = NULL;
+	ep->claimed = true;
 	return ep;
 }
 EXPORT_SYMBOL_GPL(usb_ep_autoconfig_ss);
@@ -354,7 +355,7 @@ EXPORT_SYMBOL_GPL(usb_ep_autoconfig_ss);
  * descriptor bEndpointAddress.  For bulk endpoints, the wMaxPacket value
  * is initialized as if the endpoint were used at full speed.  To prevent
  * the endpoint from being returned by a later autoconfig call, claim it
- * by assigning ep->driver_data to some non-null value.
+ * by assigning ep->claimed to true.
  *
  * On failure, this returns a null endpoint descriptor.
  */
@@ -373,7 +374,7 @@ EXPORT_SYMBOL_GPL(usb_ep_autoconfig);
  *
  * Use this for devices where one configuration may need to assign
  * endpoint resources very differently from the next one.  It clears
- * state such as ep->driver_data and the record of assigned endpoints
+ * state such as ep->claimed and the record of assigned endpoints
  * used by usb_ep_autoconfig().
  */
 void usb_ep_autoconfig_reset (struct usb_gadget *gadget)
@@ -381,7 +382,7 @@ void usb_ep_autoconfig_reset (struct usb_gadget *gadget)
 	struct usb_ep	*ep;
 
 	list_for_each_entry (ep, &gadget->ep_list, ep_list) {
-		ep->driver_data = NULL;
+		ep->claimed = false;
 	}
 	gadget->in_epnum = 0;
 	gadget->out_epnum = 0;
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 353a72096dda..68fb5e8b18c3 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -173,6 +173,7 @@ struct usb_ep {
 	const char		*name;
 	const struct usb_ep_ops	*ops;
 	struct list_head	ep_list;
+	bool			claimed;
 	unsigned		maxpacket:16;
 	unsigned		maxpacket_limit:16;
 	unsigned		max_streams:16;
-- 
cgit v1.2.3-70-g09d2


From 734b5a2addd333829a6d647ee14a3609c7a87c44 Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Fri, 31 Jul 2015 16:00:14 +0200
Subject: usb: gadget: add endpoint capabilities flags

Introduce struct usb_ep_caps which contains information about capabilities
of usb endpoints - supported transfer types and directions. This structure
should be filled by UDC driver for each of its endpoints, and will be
used in epautoconf in new ep matching mechanism which will replace ugly
guessing of endpoint capabilities basing on its name.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/linux/usb/gadget.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 68fb5e8b18c3..a9a49593c239 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -140,11 +140,30 @@ struct usb_ep_ops {
 	void (*fifo_flush) (struct usb_ep *ep);
 };
 
+/**
+ * struct usb_ep_caps - endpoint capabilities description
+ * @type_control:Endpoint supports control type (reserved for ep0).
+ * @type_iso:Endpoint supports isochronous transfers.
+ * @type_bulk:Endpoint supports bulk transfers.
+ * @type_int:Endpoint supports interrupt transfers.
+ * @dir_in:Endpoint supports IN direction.
+ * @dir_out:Endpoint supports OUT direction.
+ */
+struct usb_ep_caps {
+	unsigned type_control:1;
+	unsigned type_iso:1;
+	unsigned type_bulk:1;
+	unsigned type_int:1;
+	unsigned dir_in:1;
+	unsigned dir_out:1;
+};
+
 /**
  * struct usb_ep - device side representation of USB endpoint
  * @name:identifier for the endpoint, such as "ep-a" or "ep9in-bulk"
  * @ops: Function pointers used to access hardware-specific operations.
  * @ep_list:the gadget's ep_list holds all of its endpoints
+ * @caps:The structure describing types and directions supported by endoint.
  * @maxpacket:The maximum packet size used on this endpoint.  The initial
  *	value can sometimes be reduced (hardware allowing), according to
  *      the endpoint descriptor used to configure the endpoint.
@@ -167,12 +186,14 @@ struct usb_ep_ops {
  * gadget->ep_list.  the control endpoint (gadget->ep0) is not in that list,
  * and is accessed only in response to a driver setup() callback.
  */
+
 struct usb_ep {
 	void			*driver_data;
 
 	const char		*name;
 	const struct usb_ep_ops	*ops;
 	struct list_head	ep_list;
+	struct usb_ep_caps	caps;
 	bool			claimed;
 	unsigned		maxpacket:16;
 	unsigned		maxpacket_limit:16;
-- 
cgit v1.2.3-70-g09d2


From 80e6e3847f851fc05e63265050115e29e2a50d7e Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Fri, 31 Jul 2015 16:00:15 +0200
Subject: usb: gadget: add endpoint capabilities helper macros

Add macros useful while initializing array of endpoint capabilities
structures. These macros makes structure initialization more compact
to decrease number of code lines and increase readability of code.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/linux/usb/gadget.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index a9a49593c239..82b5bcbd2c98 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -158,6 +158,26 @@ struct usb_ep_caps {
 	unsigned dir_out:1;
 };
 
+#define USB_EP_CAPS_TYPE_CONTROL     0x01
+#define USB_EP_CAPS_TYPE_ISO         0x02
+#define USB_EP_CAPS_TYPE_BULK        0x04
+#define USB_EP_CAPS_TYPE_INT         0x08
+#define USB_EP_CAPS_TYPE_ALL \
+	(USB_EP_CAPS_TYPE_ISO | USB_EP_CAPS_TYPE_BULK | USB_EP_CAPS_TYPE_INT)
+#define USB_EP_CAPS_DIR_IN           0x01
+#define USB_EP_CAPS_DIR_OUT          0x02
+#define USB_EP_CAPS_DIR_ALL  (USB_EP_CAPS_DIR_IN | USB_EP_CAPS_DIR_OUT)
+
+#define USB_EP_CAPS(_type, _dir) \
+	{ \
+		.type_control = !!(_type & USB_EP_CAPS_TYPE_CONTROL), \
+		.type_iso = !!(_type & USB_EP_CAPS_TYPE_ISO), \
+		.type_bulk = !!(_type & USB_EP_CAPS_TYPE_BULK), \
+		.type_int = !!(_type & USB_EP_CAPS_TYPE_INT), \
+		.dir_in = !!(_dir & USB_EP_CAPS_DIR_IN), \
+		.dir_out = !!(_dir & USB_EP_CAPS_DIR_OUT), \
+	}
+
 /**
  * struct usb_ep - device side representation of USB endpoint
  * @name:identifier for the endpoint, such as "ep-a" or "ep9in-bulk"
-- 
cgit v1.2.3-70-g09d2


From 6abc8ca19df0078de17dc38340db3002ed489ce7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 4 Aug 2015 15:20:55 -0400
Subject: cgroup: define controller file conventions

Traditionally, each cgroup controller implemented whatever interface
it wanted leading to interfaces which are widely inconsistent.
Examining the requirements of the controllers readily yield that there
are only a few control schemes shared among all.

Two major controllers already had to implement new interface for the
unified hierarchy due to significant structural changes.  Let's take
the chance to establish common conventions throughout all controllers.

This patch defines CGROUP_WEIGHT_MIN/DFL/MAX to be used on all weight
based control knobs and documents the conventions that controllers
should follow on the unified hierarchy.  Except for io.weight knob,
all existing unified hierarchy knobs are already compliant.  A
follow-up patch will update io.weight.

v2: Added descriptions of min, low and high knobs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 Documentation/cgroups/unified-hierarchy.txt | 80 ++++++++++++++++++++++++++---
 include/linux/cgroup.h                      |  9 ++++
 2 files changed, 81 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 86847a7647ab..1ee9caf29e57 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -23,10 +23,13 @@ CONTENTS
 5. Other Changes
   5-1. [Un]populated Notification
   5-2. Other Core Changes
-  5-3. Per-Controller Changes
-    5-3-1. blkio
-    5-3-2. cpuset
-    5-3-3. memory
+  5-3. Controller File Conventions
+    5-3-1. Format
+    5-3-2. Control Knobs
+  5-4. Per-Controller Changes
+    5-4-1. blkio
+    5-4-2. cpuset
+    5-4-3. memory
 6. Planned Changes
   6-1. CAP for resource control
 
@@ -372,14 +375,75 @@ supported and the interface files "release_agent" and
 - The "cgroup.clone_children" file is removed.
 
 
-5-3. Per-Controller Changes
+5-3. Controller File Conventions
 
-5-3-1. blkio
+5-3-1. Format
+
+In general, all controller files should be in one of the following
+formats whenever possible.
+
+- Values only files
+
+  VAL0 VAL1...\n
+
+- Flat keyed files
+
+  KEY0 VAL0\n
+  KEY1 VAL1\n
+  ...
+
+- Nested keyed files
+
+  KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
+  KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
+  ...
+
+For a writeable file, the format for writing should generally match
+reading; however, controllers may allow omitting later fields or
+implement restricted shortcuts for most common use cases.
+
+For both flat and nested keyed files, only the values for a single key
+can be written at a time.  For nested keyed files, the sub key pairs
+may be specified in any order and not all pairs have to be specified.
+
+
+5-3-2. Control Knobs
+
+- Settings for a single feature should generally be implemented in a
+  single file.
+
+- In general, the root cgroup should be exempt from resource control
+  and thus shouldn't have resource control knobs.
+
+- If a controller implements ratio based resource distribution, the
+  control knob should be named "weight" and have the range [1, 10000]
+  and 100 should be the default value.  The values are chosen to allow
+  enough and symmetric bias in both directions while keeping it
+  intuitive (the default is 100%).
+
+- If a controller implements an absolute resource guarantee and/or
+  limit, the control knobs should be named "min" and "max"
+  respectively.  If a controller implements best effort resource
+  gurantee and/or limit, the control knobs should be named "low" and
+  "high" respectively.
+
+  In the above four control files, the special token "max" should be
+  used to represent upward infinity for both reading and writing.
+
+- If a setting has configurable default value and specific overrides,
+  the default settings should be keyed with "default" and appear as
+  the first entry in the file.  Specific entries can use "default" as
+  its value to indicate inheritance of the default value.
+
+
+5-4. Per-Controller Changes
+
+5-4-1. blkio
 
 - blk-throttle becomes properly hierarchical.
 
 
-5-3-2. cpuset
+5-4-2. cpuset
 
 - Tasks are kept in empty cpusets after hotplug and take on the masks
   of the nearest non-empty ancestor, instead of being moved to it.
@@ -388,7 +452,7 @@ supported and the interface files "release_agent" and
   masks of the nearest non-empty ancestor.
 
 
-5-3-3. memory
+5-4-3. memory
 
 - use_hierarchy is on by default and the cgroup file for the flag is
   not created.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a593e299162e..c6bf9d30c270 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,15 @@
 
 #ifdef CONFIG_CGROUPS
 
+/*
+ * All weight knobs on the default hierarhcy should use the following min,
+ * default and max values.  The default value is the logarithmic center of
+ * MIN and MAX and allows 100x to be expressed in both directions.
+ */
+#define CGROUP_WEIGHT_MIN		1
+#define CGROUP_WEIGHT_DFL		100
+#define CGROUP_WEIGHT_MAX		10000
+
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
 	struct cgroup_subsys		*ss;
-- 
cgit v1.2.3-70-g09d2


From 7f3884f7de89c49439fdaa115f6d1caec3256cc3 Mon Sep 17 00:00:00 2001
From: Nick Dyer <nick.dyer@itdev.co.uk>
Date: Tue, 4 Aug 2015 16:36:29 -0700
Subject: Input: atmel_mxt_ts - use deep sleep mode when stopped

The hardcoded 0x83 CTRL setting overrides other settings in that byte,
enabling extra reporting that may not be useful on a particular platform.

Implement improved suspend mechanism via deep sleep. By writing zero to
both the active and idle cycle times the maXTouch device can be put into a
deep sleep mode, using minimal power. It is necessary to issue a calibrate
command after the chip has spent any time in deep sleep, however a soft
reset is unnecessary.

Use the old method on Chromebook Pixel via platform data option.

This patch also deals with the situation where the power configuration is
zero on probe, which would mean that the device never wakes up to execute
commands.

After a config download, the T7 power configuration may have changed so it
is necessary to re-read it.

Signed-off-by: Nick Dyer <nick.dyer@itdev.co.uk>
Acked-by: Benson Leung <bleung@chromium.org>
Acked-by: Yufeng Shen <miletus@chromium.org>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/atmel_mxt_ts.c   | 119 ++++++++++++++++++++++++++---
 drivers/platform/chrome/chromeos_laptop.c  |   4 +-
 include/linux/i2c/atmel_mxt_ts.h           |  25 ------
 include/linux/platform_data/atmel_mxt_ts.h |  31 ++++++++
 4 files changed, 142 insertions(+), 37 deletions(-)
 delete mode 100644 include/linux/i2c/atmel_mxt_ts.h
 create mode 100644 include/linux/platform_data/atmel_mxt_ts.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c
index 8efe7a002f1e..0e743b3a691b 100644
--- a/drivers/input/touchscreen/atmel_mxt_ts.c
+++ b/drivers/input/touchscreen/atmel_mxt_ts.c
@@ -22,7 +22,7 @@
 #include <linux/delay.h>
 #include <linux/firmware.h>
 #include <linux/i2c.h>
-#include <linux/i2c/atmel_mxt_ts.h>
+#include <linux/platform_data/atmel_mxt_ts.h>
 #include <linux/input/mt.h>
 #include <linux/interrupt.h>
 #include <linux/of.h>
@@ -103,9 +103,13 @@
 #define MXT_T6_STATUS_COMSERR	(1 << 2)
 
 /* MXT_GEN_POWER_T7 field */
-#define MXT_POWER_IDLEACQINT	0
-#define MXT_POWER_ACTVACQINT	1
-#define MXT_POWER_ACTV2IDLETO	2
+struct t7_config {
+	u8 idle;
+	u8 active;
+} __packed;
+
+#define MXT_POWER_CFG_RUN		0
+#define MXT_POWER_CFG_DEEPSLEEP		1
 
 /* MXT_GEN_ACQUIRE_T8 field */
 #define MXT_ACQUIRE_CHRGTIME	0
@@ -117,7 +121,7 @@
 #define MXT_ACQUIRE_ATCHCALSTHR	7
 
 /* MXT_TOUCH_MULTI_T9 field */
-#define MXT_TOUCH_CTRL		0
+#define MXT_T9_CTRL		0
 #define MXT_T9_ORIENT		9
 #define MXT_T9_RANGE		18
 
@@ -291,6 +295,7 @@ struct mxt_data {
 	u8 last_message_count;
 	u8 num_touchids;
 	u8 multitouch;
+	struct t7_config t7_cfg;
 
 	/* Cached parameters from object table */
 	u16 T5_address;
@@ -1361,6 +1366,8 @@ static int mxt_upload_cfg_mem(struct mxt_data *data, unsigned int cfg_start,
 	return 0;
 }
 
+static int mxt_init_t7_power_cfg(struct mxt_data *data);
+
 /*
  * mxt_update_cfg - download configuration to chip
  *
@@ -1508,6 +1515,9 @@ static int mxt_update_cfg(struct mxt_data *data, const struct firmware *cfg)
 
 	dev_info(dev, "Config successfully updated\n");
 
+	/* T7 config may have changed */
+	mxt_init_t7_power_cfg(data);
+
 release_mem:
 	kfree(config_mem);
 	return ret;
@@ -2051,6 +2061,60 @@ err_free_object_table:
 	return error;
 }
 
+static int mxt_set_t7_power_cfg(struct mxt_data *data, u8 sleep)
+{
+	struct device *dev = &data->client->dev;
+	int error;
+	struct t7_config *new_config;
+	struct t7_config deepsleep = { .active = 0, .idle = 0 };
+
+	if (sleep == MXT_POWER_CFG_DEEPSLEEP)
+		new_config = &deepsleep;
+	else
+		new_config = &data->t7_cfg;
+
+	error = __mxt_write_reg(data->client, data->T7_address,
+				sizeof(data->t7_cfg), new_config);
+	if (error)
+		return error;
+
+	dev_dbg(dev, "Set T7 ACTV:%d IDLE:%d\n",
+		new_config->active, new_config->idle);
+
+	return 0;
+}
+
+static int mxt_init_t7_power_cfg(struct mxt_data *data)
+{
+	struct device *dev = &data->client->dev;
+	int error;
+	bool retry = false;
+
+recheck:
+	error = __mxt_read_reg(data->client, data->T7_address,
+				sizeof(data->t7_cfg), &data->t7_cfg);
+	if (error)
+		return error;
+
+	if (data->t7_cfg.active == 0 || data->t7_cfg.idle == 0) {
+		if (!retry) {
+			dev_dbg(dev, "T7 cfg zero, resetting\n");
+			mxt_soft_reset(data);
+			retry = true;
+			goto recheck;
+		} else {
+			dev_dbg(dev, "T7 cfg zero after reset, overriding\n");
+			data->t7_cfg.active = 20;
+			data->t7_cfg.idle = 100;
+			return mxt_set_t7_power_cfg(data, MXT_POWER_CFG_RUN);
+		}
+	}
+
+	dev_dbg(dev, "Initialized power cfg: ACTV %d, IDLE %d\n",
+		data->t7_cfg.active, data->t7_cfg.idle);
+	return 0;
+}
+
 static int mxt_configure_objects(struct mxt_data *data,
 				 const struct firmware *cfg)
 {
@@ -2058,6 +2122,12 @@ static int mxt_configure_objects(struct mxt_data *data,
 	struct mxt_info *info = &data->info;
 	int error;
 
+	error = mxt_init_t7_power_cfg(data);
+	if (error) {
+		dev_err(dev, "Failed to initialize power cfg\n");
+		return error;
+	}
+
 	if (cfg) {
 		error = mxt_update_cfg(data, cfg);
 		if (error)
@@ -2346,14 +2416,41 @@ static const struct attribute_group mxt_attr_group = {
 
 static void mxt_start(struct mxt_data *data)
 {
-	/* Touch enable */
-	mxt_write_object(data, data->multitouch, MXT_TOUCH_CTRL, 0x83);
+	switch (data->pdata->suspend_mode) {
+	case MXT_SUSPEND_T9_CTRL:
+		mxt_soft_reset(data);
+
+		/* Touch enable */
+		/* 0x83 = SCANEN | RPTEN | ENABLE */
+		mxt_write_object(data,
+				MXT_TOUCH_MULTI_T9, MXT_T9_CTRL, 0x83);
+		break;
+
+	case MXT_SUSPEND_DEEP_SLEEP:
+	default:
+		mxt_set_t7_power_cfg(data, MXT_POWER_CFG_RUN);
+
+		/* Recalibrate since chip has been in deep sleep */
+		mxt_t6_command(data, MXT_COMMAND_CALIBRATE, 1, false);
+		break;
+	}
+
 }
 
 static void mxt_stop(struct mxt_data *data)
 {
-	/* Touch disable */
-	mxt_write_object(data, data->multitouch, MXT_TOUCH_CTRL, 0);
+	switch (data->pdata->suspend_mode) {
+	case MXT_SUSPEND_T9_CTRL:
+		/* Touch disable */
+		mxt_write_object(data,
+				MXT_TOUCH_MULTI_T9, MXT_T9_CTRL, 0);
+		break;
+
+	case MXT_SUSPEND_DEEP_SLEEP:
+	default:
+		mxt_set_t7_power_cfg(data, MXT_POWER_CFG_DEEPSLEEP);
+		break;
+	}
 }
 
 static int mxt_input_open(struct input_dev *dev)
@@ -2409,6 +2506,8 @@ static const struct mxt_platform_data *mxt_parse_dt(struct i2c_client *client)
 		pdata->t19_keymap = keymap;
 	}
 
+	pdata->suspend_mode = MXT_SUSPEND_DEEP_SLEEP;
+
 	return pdata;
 }
 #else
@@ -2625,8 +2724,6 @@ static int __maybe_unused mxt_resume(struct device *dev)
 	struct mxt_data *data = i2c_get_clientdata(client);
 	struct input_dev *input_dev = data->input_dev;
 
-	mxt_soft_reset(data);
-
 	mutex_lock(&input_dev->mutex);
 
 	if (input_dev->users)
diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c
index a04019ab9feb..02072749fff3 100644
--- a/drivers/platform/chrome/chromeos_laptop.c
+++ b/drivers/platform/chrome/chromeos_laptop.c
@@ -23,7 +23,7 @@
 
 #include <linux/dmi.h>
 #include <linux/i2c.h>
-#include <linux/i2c/atmel_mxt_ts.h>
+#include <linux/platform_data/atmel_mxt_ts.h>
 #include <linux/input.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
@@ -111,6 +111,7 @@ static struct mxt_platform_data atmel_224s_tp_platform_data = {
 	.irqflags		= IRQF_TRIGGER_FALLING,
 	.t19_num_keys		= ARRAY_SIZE(mxt_t19_keys),
 	.t19_keymap		= mxt_t19_keys,
+	.suspend_mode		= MXT_SUSPEND_T9_CTRL,
 };
 
 static struct i2c_board_info atmel_224s_tp_device = {
@@ -121,6 +122,7 @@ static struct i2c_board_info atmel_224s_tp_device = {
 
 static struct mxt_platform_data atmel_1664s_platform_data = {
 	.irqflags		= IRQF_TRIGGER_FALLING,
+	.suspend_mode		= MXT_SUSPEND_T9_CTRL,
 };
 
 static struct i2c_board_info atmel_1664s_device = {
diff --git a/include/linux/i2c/atmel_mxt_ts.h b/include/linux/i2c/atmel_mxt_ts.h
deleted file mode 100644
index 02bf6ea31701..000000000000
--- a/include/linux/i2c/atmel_mxt_ts.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Atmel maXTouch Touchscreen driver
- *
- * Copyright (C) 2010 Samsung Electronics Co.Ltd
- * Author: Joonyoung Shim <jy0922.shim@samsung.com>
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- */
-
-#ifndef __LINUX_ATMEL_MXT_TS_H
-#define __LINUX_ATMEL_MXT_TS_H
-
-#include <linux/types.h>
-
-/* The platform data for the Atmel maXTouch touchscreen driver */
-struct mxt_platform_data {
-	unsigned long irqflags;
-	u8 t19_num_keys;
-	const unsigned int *t19_keymap;
-};
-
-#endif /* __LINUX_ATMEL_MXT_TS_H */
diff --git a/include/linux/platform_data/atmel_mxt_ts.h b/include/linux/platform_data/atmel_mxt_ts.h
new file mode 100644
index 000000000000..695035a8d7fb
--- /dev/null
+++ b/include/linux/platform_data/atmel_mxt_ts.h
@@ -0,0 +1,31 @@
+/*
+ * Atmel maXTouch Touchscreen driver
+ *
+ * Copyright (C) 2010 Samsung Electronics Co.Ltd
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#ifndef __LINUX_PLATFORM_DATA_ATMEL_MXT_TS_H
+#define __LINUX_PLATFORM_DATA_ATMEL_MXT_TS_H
+
+#include <linux/types.h>
+
+enum mxt_suspend_mode {
+	MXT_SUSPEND_DEEP_SLEEP	= 0,
+	MXT_SUSPEND_T9_CTRL	= 1,
+};
+
+/* The platform data for the Atmel maXTouch touchscreen driver */
+struct mxt_platform_data {
+	unsigned long irqflags;
+	u8 t19_num_keys;
+	const unsigned int *t19_keymap;
+	enum mxt_suspend_mode suspend_mode;
+};
+
+#endif /* __LINUX_PLATFORM_DATA_ATMEL_MXT_TS_H */
-- 
cgit v1.2.3-70-g09d2


From 056f6c87028544de934f27caf95aa1545d585767 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 10 Jul 2015 12:07:25 +0200
Subject: dmaengine: shdma: Make dummy shdma_chan_filter() always return false

If CONFIG_SH_DMAE_BASE (which is required for DMA engine support for
legacy SH, SH/R-Mobile, and R-Car Gen1, but not for R-Car Gen2) is not
enabled, but CONFIG_RCAR_DMAC (for R-Car Gen2 DMA engine support) is,
and the DTS doesn't provide a "dmas" property for a device,
dma_request_slave_channel_compat() incorrectly succeeds, and returns a
DMA channel.

However, when trying to use that DMA channel later, it fails with:

    rcar-dmac e6700000.dma-controller: rcar_dmac_prep_slave_sg: bad parameter: len=1, id=-22

(Fortunately most drivers can handle this failure, and fall back to
PIO)

The reason for this is that a NULL legacy filter function is used, which
actually means "all channels are OK", not "do not match".
If CONFIG_SH_DMAE_BASE is enabled (like in shmobile_defconfig, which
supports other SoCs besides R-Car Gen2), shdma_chan_filter() correctly
returns false, as no available channel on R-Car Gen2 matches a
shdma-base channel.
If the DTS does provide a "dmas" property, dma_request_slave_channel()
succeeds, and legacy filter-based matching is not used.

To fix this, change shdma_chan_filter from being NULL to a dummy
function that always returns false, like is done on other platforms.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/shdma-base.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/shdma-base.h b/include/linux/shdma-base.h
index dd0ba502ccb3..d927647e6350 100644
--- a/include/linux/shdma-base.h
+++ b/include/linux/shdma-base.h
@@ -128,7 +128,10 @@ void shdma_cleanup(struct shdma_dev *sdev);
 #if IS_ENABLED(CONFIG_SH_DMAE_BASE)
 bool shdma_chan_filter(struct dma_chan *chan, void *arg);
 #else
-#define shdma_chan_filter NULL
+static inline bool shdma_chan_filter(struct dma_chan *chan, void *arg)
+{
+	return false;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 8cd90e50d1408c65c355084b1c7f8f9085f49c6b Mon Sep 17 00:00:00 2001
From: Jun Nie <jun.nie@linaro.org>
Date: Fri, 31 Jul 2015 15:49:19 +0800
Subject: uart: pl011: Add support to ZTE ZX296702 uart

Support ZTE uart with some registers differing offset.
Probe as platform device for not AMBA IP ID is
available on ZTE uart.

Signed-off-by: Jun Nie <jun.nie@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/Kconfig      |   4 +-
 drivers/tty/serial/amba-pl011.c | 195 +++++++++++++++++++++++++++++++++++++---
 include/linux/amba/serial.h     |  14 +++
 3 files changed, 197 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 687b1ea294b7..ed299b9e6375 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -47,12 +47,12 @@ config SERIAL_AMBA_PL010_CONSOLE
 
 config SERIAL_AMBA_PL011
 	tristate "ARM AMBA PL011 serial port support"
-	depends on ARM_AMBA
+	depends on ARM_AMBA || SOC_ZX296702
 	select SERIAL_CORE
 	help
 	  This selects the ARM(R) AMBA(R) PrimeCell PL011 UART.  If you have
 	  an Integrator/PP2, Integrator/CP or Versatile platform, say Y or M
-	  here.
+	  here. Say Y or M if you have SOC_ZX296702.
 
 	  If unsure, say N.
 
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index 017443d092c1..2af09ab153b6 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -74,6 +74,10 @@
 /* There is by now at least one vendor with differing details, so handle it */
 struct vendor_data {
 	unsigned int		ifls;
+	unsigned int		fr_busy;
+	unsigned int		fr_dsr;
+	unsigned int		fr_cts;
+	unsigned int		fr_ri;
 	unsigned int		lcrh_tx;
 	unsigned int		lcrh_rx;
 	u16			*reg_lut;
@@ -127,6 +131,7 @@ static u16 arm_reg[] = {
 	[REG_DMACR]		= UART011_DMACR,
 };
 
+#ifdef CONFIG_ARM_AMBA
 static unsigned int get_fifosize_arm(struct amba_device *dev)
 {
 	return amba_rev(dev) < 3 ? 16 : 32;
@@ -134,6 +139,10 @@ static unsigned int get_fifosize_arm(struct amba_device *dev)
 
 static struct vendor_data vendor_arm = {
 	.ifls			= UART011_IFLS_RX4_8|UART011_IFLS_TX4_8,
+	.fr_busy		= UART01x_FR_BUSY,
+	.fr_dsr			= UART01x_FR_DSR,
+	.fr_cts			= UART01x_FR_CTS,
+	.fr_ri			= UART011_FR_RI,
 	.lcrh_tx		= REG_LCRH,
 	.lcrh_rx		= REG_LCRH,
 	.reg_lut		= arm_reg,
@@ -144,8 +153,13 @@ static struct vendor_data vendor_arm = {
 	.fixed_options		= false,
 	.get_fifosize		= get_fifosize_arm,
 };
+#endif
 
 static struct vendor_data vendor_sbsa = {
+	.fr_busy		= UART01x_FR_BUSY,
+	.fr_dsr			= UART01x_FR_DSR,
+	.fr_cts			= UART01x_FR_CTS,
+	.fr_ri			= UART011_FR_RI,
 	.reg_lut		= arm_reg,
 	.oversampling		= false,
 	.dma_threshold		= false,
@@ -154,6 +168,7 @@ static struct vendor_data vendor_sbsa = {
 	.fixed_options		= true,
 };
 
+#ifdef CONFIG_ARM_AMBA
 static u16 st_reg[] = {
 	[REG_DR]		= UART01x_DR,
 	[REG_RSR]		= UART01x_RSR,
@@ -180,6 +195,10 @@ static unsigned int get_fifosize_st(struct amba_device *dev)
 
 static struct vendor_data vendor_st = {
 	.ifls			= UART011_IFLS_RX_HALF|UART011_IFLS_TX_HALF,
+	.fr_busy		= UART01x_FR_BUSY,
+	.fr_dsr			= UART01x_FR_DSR,
+	.fr_cts			= UART01x_FR_CTS,
+	.fr_ri			= UART011_FR_RI,
 	.lcrh_tx		= REG_LCRH,
 	.lcrh_rx		= REG_ST_LCRH_RX,
 	.reg_lut		= st_reg,
@@ -190,6 +209,43 @@ static struct vendor_data vendor_st = {
 	.fixed_options		= false,
 	.get_fifosize		= get_fifosize_st,
 };
+#endif
+
+#ifdef CONFIG_SOC_ZX296702
+static u16 zte_reg[] = {
+	[REG_DR]		= ZX_UART01x_DR,
+	[REG_RSR]		= UART01x_RSR,
+	[REG_ST_DMAWM]		= ST_UART011_DMAWM,
+	[REG_FR]		= ZX_UART01x_FR,
+	[REG_ST_LCRH_RX]	= ST_UART011_LCRH_RX,
+	[REG_ILPR]		= UART01x_ILPR,
+	[REG_IBRD]		= UART011_IBRD,
+	[REG_FBRD]		= UART011_FBRD,
+	[REG_LCRH]		= ZX_UART011_LCRH_TX,
+	[REG_CR]		= ZX_UART011_CR,
+	[REG_IFLS]		= ZX_UART011_IFLS,
+	[REG_IMSC]		= ZX_UART011_IMSC,
+	[REG_RIS]		= ZX_UART011_RIS,
+	[REG_MIS]		= ZX_UART011_MIS,
+	[REG_ICR]		= ZX_UART011_ICR,
+	[REG_DMACR]		= ZX_UART011_DMACR,
+};
+
+static struct vendor_data vendor_zte = {
+	.ifls			= UART011_IFLS_RX4_8|UART011_IFLS_TX4_8,
+	.fr_busy		= ZX_UART01x_FR_BUSY,
+	.fr_dsr			= ZX_UART01x_FR_DSR,
+	.fr_cts			= ZX_UART01x_FR_CTS,
+	.fr_ri			= ZX_UART011_FR_RI,
+	.lcrh_tx		= REG_LCRH,
+	.lcrh_rx		= REG_ST_LCRH_RX,
+	.reg_lut		= zte_reg,
+	.oversampling		= false,
+	.dma_threshold		= false,
+	.cts_event_workaround	= false,
+	.fixed_options		= false,
+};
+#endif
 
 /* Deals with DMA transactions */
 
@@ -233,6 +289,10 @@ struct uart_amba_port {
 	unsigned int		im;		/* interrupt mask */
 	unsigned int		old_status;
 	unsigned int		fifosize;	/* vendor-specific */
+	unsigned int		fr_busy;        /* vendor-specific */
+	unsigned int		fr_dsr;		/* vendor-specific */
+	unsigned int		fr_cts;         /* vendor-specific */
+	unsigned int		fr_ri;		/* vendor-specific */
 	unsigned int		lcrh_tx;	/* vendor-specific */
 	unsigned int		lcrh_rx;	/* vendor-specific */
 	unsigned int		old_cr;		/* state during shutdown */
@@ -1163,7 +1223,7 @@ static void pl011_dma_shutdown(struct uart_amba_port *uap)
 		return;
 
 	/* Disable RX and TX DMA */
-	while (pl011_readw(uap, REG_FR) & UART01x_FR_BUSY)
+	while (pl011_readw(uap, REG_FR) & uap->fr_busy)
 		barrier();
 
 	spin_lock_irq(&uap->port.lock);
@@ -1412,11 +1472,11 @@ static void pl011_modem_status(struct uart_amba_port *uap)
 	if (delta & UART01x_FR_DCD)
 		uart_handle_dcd_change(&uap->port, status & UART01x_FR_DCD);
 
-	if (delta & UART01x_FR_DSR)
+	if (delta & uap->fr_dsr)
 		uap->port.icount.dsr++;
 
-	if (delta & UART01x_FR_CTS)
-		uart_handle_cts_change(&uap->port, status & UART01x_FR_CTS);
+	if (delta & uap->fr_cts)
+		uart_handle_cts_change(&uap->port, status & uap->fr_cts);
 
 	wake_up_interruptible(&uap->port.state->port.delta_msr_wait);
 }
@@ -1487,7 +1547,7 @@ static unsigned int pl011_tx_empty(struct uart_port *port)
 	struct uart_amba_port *uap =
 	    container_of(port, struct uart_amba_port, port);
 	unsigned int status = pl011_readw(uap, REG_FR);
-	return status & (UART01x_FR_BUSY|UART01x_FR_TXFF) ? 0 : TIOCSER_TEMT;
+	return status & (uap->fr_busy|UART01x_FR_TXFF) ? 0 : TIOCSER_TEMT;
 }
 
 static unsigned int pl011_get_mctrl(struct uart_port *port)
@@ -1502,9 +1562,9 @@ static unsigned int pl011_get_mctrl(struct uart_port *port)
 		result |= tiocmbit
 
 	TIOCMBIT(UART01x_FR_DCD, TIOCM_CAR);
-	TIOCMBIT(UART01x_FR_DSR, TIOCM_DSR);
-	TIOCMBIT(UART01x_FR_CTS, TIOCM_CTS);
-	TIOCMBIT(UART011_FR_RI, TIOCM_RNG);
+	TIOCMBIT(uap->fr_dsr, TIOCM_DSR);
+	TIOCMBIT(uap->fr_cts, TIOCM_CTS);
+	TIOCMBIT(uap->fr_ri, TIOCM_RNG);
 #undef TIOCMBIT
 	return result;
 }
@@ -1720,8 +1780,7 @@ static int pl011_startup(struct uart_port *port)
 	/*
 	 * initialise the old status of the modem signals
 	 */
-	uap->old_status = pl011_readw(uap, REG_FR) &
-			UART01x_FR_MODEM_ANY;
+	uap->old_status = pl011_readw(uap, REG_FR) & UART01x_FR_MODEM_ANY;
 
 	/* Startup DMA */
 	pl011_dma_startup(uap);
@@ -1800,7 +1859,7 @@ static void pl011_disable_interrupts(struct uart_amba_port *uap)
 	/* mask all interrupts and clear all pending ones */
 	uap->im = 0;
 	pl011_writew(uap, uap->im, REG_IMSC);
-	pl011_writew(0xffff, REG_ICR);
+	pl011_writew(uap, 0xffff, REG_ICR);
 
 	spin_unlock_irq(&uap->port.lock);
 }
@@ -2178,7 +2237,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
 	 */
 	do {
 		status = pl011_readw(uap, REG_FR);
-	} while (status & UART01x_FR_BUSY);
+	} while (status & uap->fr_busy);
 	if (!uap->vendor->always_enabled)
 		pl011_writew(uap, old_cr, REG_CR);
 
@@ -2295,7 +2354,7 @@ static void pl011_putc(struct uart_port *port, int c)
 	while (pl011_readw(uap, REG_FR) & UART01x_FR_TXFF)
 		;
 	pl011_writeb(uap, c, REG_DR);
-	while (pl011_readw(uap, REG_FR) & UART01x_FR_BUSY)
+	while (pl011_readw(uap, REG_FR) & uap->fr_busy)
 		;
 }
 
@@ -2441,6 +2500,7 @@ static int pl011_register_port(struct uart_amba_port *uap)
 	return ret;
 }
 
+#ifdef CONFIG_ARM_AMBA
 static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 {
 	struct uart_amba_port *uap;
@@ -2464,6 +2524,10 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 	uap->reg_lut = vendor->reg_lut;
 	uap->lcrh_rx = vendor->lcrh_rx;
 	uap->lcrh_tx = vendor->lcrh_tx;
+	uap->fr_busy = vendor->fr_busy;
+	uap->fr_dsr = vendor->fr_dsr;
+	uap->fr_cts = vendor->fr_cts;
+	uap->fr_ri = vendor->fr_ri;
 	uap->fifosize = vendor->get_fifosize(dev);
 	uap->port.irq = dev->irq[0];
 	uap->port.ops = &amba_pl011_pops;
@@ -2487,6 +2551,67 @@ static int pl011_remove(struct amba_device *dev)
 	pl011_unregister_port(uap);
 	return 0;
 }
+#endif
+
+#ifdef CONFIG_SOC_ZX296702
+static int zx_uart_probe(struct platform_device *pdev)
+{
+	struct uart_amba_port *uap;
+	struct vendor_data *vendor = &vendor_zte;
+	struct resource *res;
+	int portnr, ret;
+
+	portnr = pl011_find_free_port();
+	if (portnr < 0)
+		return portnr;
+
+	uap = devm_kzalloc(&pdev->dev, sizeof(struct uart_amba_port),
+			GFP_KERNEL);
+	if (!uap) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	uap->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(uap->clk)) {
+		ret = PTR_ERR(uap->clk);
+		goto out;
+	}
+
+	uap->vendor	= vendor;
+	uap->reg_lut	= vendor->reg_lut;
+	uap->lcrh_rx	= vendor->lcrh_rx;
+	uap->lcrh_tx	= vendor->lcrh_tx;
+	uap->fr_busy	= vendor->fr_busy;
+	uap->fr_dsr	= vendor->fr_dsr;
+	uap->fr_cts	= vendor->fr_cts;
+	uap->fr_ri	= vendor->fr_ri;
+	uap->fifosize	= 16;
+	uap->port.irq	= platform_get_irq(pdev, 0);
+	uap->port.ops	= &amba_pl011_pops;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+
+	ret = pl011_setup_port(&pdev->dev, uap, res, portnr);
+	if (ret)
+		return ret;
+
+	platform_set_drvdata(pdev, uap);
+
+	return pl011_register_port(uap);
+out:
+	return ret;
+}
+
+static int zx_uart_remove(struct platform_device *pdev)
+{
+	struct uart_amba_port *uap = platform_get_drvdata(pdev);
+
+	uart_remove_one_port(&amba_reg, &uap->port);
+	pl011_unregister_port(uap);
+	return 0;
+}
+#endif
 
 #ifdef CONFIG_PM_SLEEP
 static int pl011_suspend(struct device *dev)
@@ -2544,6 +2669,10 @@ static int sbsa_uart_probe(struct platform_device *pdev)
 
 	uap->vendor	= &vendor_sbsa;
 	uap->reg_lut	= vendor_sbsa.reg_lut;
+	uap->fr_busy	= vendor_sbsa.fr_busy;
+	uap->fr_dsr	= vendor_sbsa.fr_dsr;
+	uap->fr_cts	= vendor_sbsa.fr_cts;
+	uap->fr_ri	= vendor_sbsa.fr_ri;
 	uap->fifosize	= 32;
 	uap->port.irq	= platform_get_irq(pdev, 0);
 	uap->port.ops	= &sbsa_uart_pops;
@@ -2593,6 +2722,7 @@ static struct platform_driver arm_sbsa_uart_platform_driver = {
 	},
 };
 
+#ifdef CONFIG_ARM_AMBA
 static struct amba_id pl011_ids[] = {
 	{
 		.id	= 0x00041011,
@@ -2618,20 +2748,57 @@ static struct amba_driver pl011_driver = {
 	.probe		= pl011_probe,
 	.remove		= pl011_remove,
 };
+#endif
+
+#ifdef CONFIG_SOC_ZX296702
+static const struct of_device_id zx_uart_dt_ids[] = {
+	{ .compatible = "zte,zx296702-uart", },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, zx_uart_dt_ids);
+
+static struct platform_driver zx_uart_driver = {
+	.driver = {
+		.name	= "zx-uart",
+		.owner	= THIS_MODULE,
+		.pm	= &pl011_dev_pm_ops,
+		.of_match_table = zx_uart_dt_ids,
+	},
+	.probe		= zx_uart_probe,
+	.remove		= zx_uart_remove,
+};
+#endif
+
 
 static int __init pl011_init(void)
 {
+	int ret;
 	printk(KERN_INFO "Serial: AMBA PL011 UART driver\n");
 
 	if (platform_driver_register(&arm_sbsa_uart_platform_driver))
 		pr_warn("could not register SBSA UART platform driver\n");
-	return amba_driver_register(&pl011_driver);
+
+#ifdef CONFIG_SOC_ZX296702
+	ret = platform_driver_register(&zx_uart_driver);
+	if (ret)
+		pr_warn("could not register ZX UART platform driver\n");
+#endif
+
+#ifdef CONFIG_ARM_AMBA
+	ret = amba_driver_register(&pl011_driver);
+#endif
+	return ret;
 }
 
 static void __exit pl011_exit(void)
 {
 	platform_driver_unregister(&arm_sbsa_uart_platform_driver);
+#ifdef CONFIG_SOC_ZX296702
+	platform_driver_unregister(&zx_uart_driver);
+#endif
+#ifdef CONFIG_ARM_AMBA
 	amba_driver_unregister(&pl011_driver);
+#endif
 }
 
 /*
diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 0ddb5c02ad8b..6a0a89ed7f81 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -33,12 +33,14 @@
 #define UART01x_DR		0x00	/* Data read or written from the interface. */
 #define UART01x_RSR		0x04	/* Receive status register (Read). */
 #define UART01x_ECR		0x04	/* Error clear register (Write). */
+#define ZX_UART01x_DR		0x04	/* Data read or written from the interface. */
 #define UART010_LCRH		0x08	/* Line control register, high byte. */
 #define ST_UART011_DMAWM	0x08    /* DMA watermark configure register. */
 #define UART010_LCRM		0x0C	/* Line control register, middle byte. */
 #define ST_UART011_TIMEOUT	0x0C    /* Timeout period register. */
 #define UART010_LCRL		0x10	/* Line control register, low byte. */
 #define UART010_CR		0x14	/* Control register. */
+#define ZX_UART01x_FR		0x14	/* Flag register (Read only). */
 #define UART01x_FR		0x18	/* Flag register (Read only). */
 #define UART010_IIR		0x1C	/* Interrupt identification register (Read). */
 #define UART010_ICR		0x1C	/* Interrupt clear register (Write). */
@@ -49,13 +51,21 @@
 #define UART011_LCRH		0x2c	/* Line control register. */
 #define ST_UART011_LCRH_TX	0x2c    /* Tx Line control register. */
 #define UART011_CR		0x30	/* Control register. */
+#define ZX_UART011_LCRH_TX	0x30    /* Tx Line control register. */
 #define UART011_IFLS		0x34	/* Interrupt fifo level select. */
+#define ZX_UART011_CR		0x34	/* Control register. */
+#define ZX_UART011_IFLS		0x38	/* Interrupt fifo level select. */
 #define UART011_IMSC		0x38	/* Interrupt mask. */
 #define UART011_RIS		0x3c	/* Raw interrupt status. */
 #define UART011_MIS		0x40	/* Masked interrupt status. */
+#define ZX_UART011_IMSC		0x40	/* Interrupt mask. */
 #define UART011_ICR		0x44	/* Interrupt clear register. */
+#define ZX_UART011_RIS		0x44	/* Raw interrupt status. */
 #define UART011_DMACR		0x48	/* DMA control register. */
+#define ZX_UART011_MIS		0x48	/* Masked interrupt status. */
+#define ZX_UART011_ICR		0x4c	/* Interrupt clear register. */
 #define ST_UART011_XFCR		0x50	/* XON/XOFF control register. */
+#define ZX_UART011_DMACR	0x50	/* DMA control register. */
 #define ST_UART011_XON1		0x54	/* XON1 register. */
 #define ST_UART011_XON2		0x58	/* XON2 register. */
 #define ST_UART011_XOFF1	0x5C	/* XON1 register. */
@@ -75,15 +85,19 @@
 #define UART01x_RSR_PE 		0x02
 #define UART01x_RSR_FE 		0x01
 
+#define ZX_UART01x_FR_BUSY	0x300
 #define UART011_FR_RI		0x100
 #define UART011_FR_TXFE		0x080
 #define UART011_FR_RXFF		0x040
 #define UART01x_FR_TXFF		0x020
 #define UART01x_FR_RXFE		0x010
 #define UART01x_FR_BUSY		0x008
+#define ZX_UART01x_FR_DSR       0x008
 #define UART01x_FR_DCD 		0x004
 #define UART01x_FR_DSR 		0x002
+#define ZX_UART01x_FR_CTS	0x002
 #define UART01x_FR_CTS 		0x001
+#define ZX_UART011_FR_RI	0x001
 #define UART01x_FR_TMSK		(UART01x_FR_TXFF + UART01x_FR_BUSY)
 
 #define UART011_CR_CTSEN	0x8000	/* CTS hardware flow control */
-- 
cgit v1.2.3-70-g09d2


From 77a68e56aae141d3e9c740a0ac43362af75d4890 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Mon, 20 Jul 2015 10:41:32 +0200
Subject: dmaengine: Add an enum for the dmaengine alignment constraints

Most drivers need to set constraints on the buffer alignment for async tx
operations. However, even though it is documented, some drivers either use
a defined constant that is not matching what the alignment variable expects
(like DMA_BUSWIDTH_* constants) or fill the alignment in bytes instead of
power of two.

Add a new enum for these alignments that matches what the framework
expects, and convert the drivers to it.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/coh901318.c    |  2 +-
 drivers/dma/dma-jz4780.c   |  2 +-
 drivers/dma/edma.c         |  2 +-
 drivers/dma/imx-dma.c      |  2 +-
 drivers/dma/k3dma.c        |  3 +--
 drivers/dma/mic_x100_dma.h |  2 +-
 drivers/dma/mmp_pdma.c     |  3 +--
 drivers/dma/mmp_tdma.c     |  3 +--
 drivers/dma/ste_dma40.c    |  2 +-
 drivers/dma/sun6i-dma.c    |  2 +-
 drivers/dma/xgene-dma.c    |  5 ++---
 include/linux/dmaengine.h  | 25 ++++++++++++++++++++-----
 12 files changed, 32 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c
index fd22dd36985f..c340ca9bd2b5 100644
--- a/drivers/dma/coh901318.c
+++ b/drivers/dma/coh901318.c
@@ -2730,7 +2730,7 @@ static int __init coh901318_probe(struct platform_device *pdev)
 	 * This controller can only access address at even 32bit boundaries,
 	 * i.e. 2^2
 	 */
-	base->dma_memcpy.copy_align = 2;
+	base->dma_memcpy.copy_align = DMAENGINE_ALIGN_4_BYTES;
 	err = dma_async_device_register(&base->dma_memcpy);
 
 	if (err)
diff --git a/drivers/dma/dma-jz4780.c b/drivers/dma/dma-jz4780.c
index 26d2f0e09ea3..c29569ac9e4f 100644
--- a/drivers/dma/dma-jz4780.c
+++ b/drivers/dma/dma-jz4780.c
@@ -775,7 +775,7 @@ static int jz4780_dma_probe(struct platform_device *pdev)
 	dma_cap_set(DMA_CYCLIC, dd->cap_mask);
 
 	dd->dev = dev;
-	dd->copy_align = 2; /* 2^2 = 4 byte alignment */
+	dd->copy_align = DMAENGINE_ALIGN_4_BYTES;
 	dd->device_alloc_chan_resources = jz4780_dma_alloc_chan_resources;
 	dd->device_free_chan_resources = jz4780_dma_free_chan_resources;
 	dd->device_prep_slave_sg = jz4780_dma_prep_slave_sg;
diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c
index 88853af69489..3e5d4f193005 100644
--- a/drivers/dma/edma.c
+++ b/drivers/dma/edma.c
@@ -1000,7 +1000,7 @@ static void edma_dma_init(struct edma_cc *ecc, struct dma_device *dma,
 	 * code using dma memcpy must make sure alignment of
 	 * length is at dma->copy_align boundary.
 	 */
-	dma->copy_align = DMA_SLAVE_BUSWIDTH_4_BYTES;
+	dma->copy_align = DMAENGINE_ALIGN_4_BYTES;
 
 	INIT_LIST_HEAD(&dma->channels);
 }
diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
index 139c5676cd74..48d85f8b95fe 100644
--- a/drivers/dma/imx-dma.c
+++ b/drivers/dma/imx-dma.c
@@ -1187,7 +1187,7 @@ static int __init imxdma_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, imxdma);
 
-	imxdma->dma_device.copy_align = 2; /* 2^2 = 4 bytes alignment */
+	imxdma->dma_device.copy_align = DMAENGINE_ALIGN_4_BYTES;
 	imxdma->dma_device.dev->dma_parms = &imxdma->dma_parms;
 	dma_set_max_seg_size(imxdma->dma_device.dev, 0xffffff);
 
diff --git a/drivers/dma/k3dma.c b/drivers/dma/k3dma.c
index 647e362f01fd..1ba2fd73852d 100644
--- a/drivers/dma/k3dma.c
+++ b/drivers/dma/k3dma.c
@@ -24,7 +24,6 @@
 #include "virt-dma.h"
 
 #define DRIVER_NAME		"k3-dma"
-#define DMA_ALIGN		3
 #define DMA_MAX_SIZE		0x1ffc
 
 #define INT_STAT		0x00
@@ -732,7 +731,7 @@ static int k3_dma_probe(struct platform_device *op)
 	d->slave.device_pause = k3_dma_transfer_pause;
 	d->slave.device_resume = k3_dma_transfer_resume;
 	d->slave.device_terminate_all = k3_dma_terminate_all;
-	d->slave.copy_align = DMA_ALIGN;
+	d->slave.copy_align = DMAENGINE_ALIGN_8_BYTES;
 
 	/* init virtual channel */
 	d->chans = devm_kzalloc(&op->dev,
diff --git a/drivers/dma/mic_x100_dma.h b/drivers/dma/mic_x100_dma.h
index f663b0bdd11d..d89982034e68 100644
--- a/drivers/dma/mic_x100_dma.h
+++ b/drivers/dma/mic_x100_dma.h
@@ -39,7 +39,7 @@
  */
 #define MIC_DMA_MAX_NUM_CHAN	8
 #define MIC_DMA_NUM_CHAN	4
-#define MIC_DMA_ALIGN_SHIFT	6
+#define MIC_DMA_ALIGN_SHIFT	DMAENGINE_ALIGN_64_BYTES
 #define MIC_DMA_ALIGN_BYTES	(1 << MIC_DMA_ALIGN_SHIFT)
 #define MIC_DMA_DESC_RX_SIZE	(128 * 1024 - 4)
 
diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c
index 462a0229a743..e39457f13d4d 100644
--- a/drivers/dma/mmp_pdma.c
+++ b/drivers/dma/mmp_pdma.c
@@ -72,7 +72,6 @@
 #define DCMD_WIDTH4	(3 << 14)	/* 4 byte width (Word) */
 #define DCMD_LENGTH	0x01fff		/* length mask (max = 8K - 1) */
 
-#define PDMA_ALIGNMENT		3
 #define PDMA_MAX_DESC_BYTES	DCMD_LENGTH
 
 struct mmp_pdma_desc_hw {
@@ -1071,7 +1070,7 @@ static int mmp_pdma_probe(struct platform_device *op)
 	pdev->device.device_issue_pending = mmp_pdma_issue_pending;
 	pdev->device.device_config = mmp_pdma_config;
 	pdev->device.device_terminate_all = mmp_pdma_terminate_all;
-	pdev->device.copy_align = PDMA_ALIGNMENT;
+	pdev->device.copy_align = DMAENGINE_ALIGN_8_BYTES;
 	pdev->device.src_addr_widths = widths;
 	pdev->device.dst_addr_widths = widths;
 	pdev->device.directions = BIT(DMA_MEM_TO_DEV) | BIT(DMA_DEV_TO_MEM);
diff --git a/drivers/dma/mmp_tdma.c b/drivers/dma/mmp_tdma.c
index e683761e0f8f..3df0422607d5 100644
--- a/drivers/dma/mmp_tdma.c
+++ b/drivers/dma/mmp_tdma.c
@@ -100,7 +100,6 @@ enum mmp_tdma_type {
 	PXA910_SQU,
 };
 
-#define TDMA_ALIGNMENT		3
 #define TDMA_MAX_XFER_BYTES    SZ_64K
 
 struct mmp_tdma_chan {
@@ -695,7 +694,7 @@ static int mmp_tdma_probe(struct platform_device *pdev)
 	tdev->device.device_pause = mmp_tdma_pause_chan;
 	tdev->device.device_resume = mmp_tdma_resume_chan;
 	tdev->device.device_terminate_all = mmp_tdma_terminate_all;
-	tdev->device.copy_align = TDMA_ALIGNMENT;
+	tdev->device.copy_align = DMAENGINE_ALIGN_8_BYTES;
 
 	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
 	platform_set_drvdata(pdev, tdev);
diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c
index 3c10f034d4b9..750d1b313684 100644
--- a/drivers/dma/ste_dma40.c
+++ b/drivers/dma/ste_dma40.c
@@ -2853,7 +2853,7 @@ static void d40_ops_init(struct d40_base *base, struct dma_device *dev)
 		 * This controller can only access address at even
 		 * 32bit boundaries, i.e. 2^2
 		 */
-		dev->copy_align = 2;
+		dev->copy_align = DMAENGINE_ALIGN_4_BYTES;
 	}
 
 	if (dma_has_cap(DMA_SG, dev->cap_mask))
diff --git a/drivers/dma/sun6i-dma.c b/drivers/dma/sun6i-dma.c
index 842ff97c2cfb..73e0be6e2100 100644
--- a/drivers/dma/sun6i-dma.c
+++ b/drivers/dma/sun6i-dma.c
@@ -969,7 +969,7 @@ static int sun6i_dma_probe(struct platform_device *pdev)
 	sdc->slave.device_issue_pending		= sun6i_dma_issue_pending;
 	sdc->slave.device_prep_slave_sg		= sun6i_dma_prep_slave_sg;
 	sdc->slave.device_prep_dma_memcpy	= sun6i_dma_prep_dma_memcpy;
-	sdc->slave.copy_align			= 4;
+	sdc->slave.copy_align			= DMAENGINE_ALIGN_4_BYTES;
 	sdc->slave.device_config		= sun6i_dma_config;
 	sdc->slave.device_pause			= sun6i_dma_pause;
 	sdc->slave.device_resume		= sun6i_dma_resume;
diff --git a/drivers/dma/xgene-dma.c b/drivers/dma/xgene-dma.c
index 620fd55ec766..fe87a634b145 100644
--- a/drivers/dma/xgene-dma.c
+++ b/drivers/dma/xgene-dma.c
@@ -150,7 +150,6 @@
 #define XGENE_DMA_PQ_CHANNEL		1
 #define XGENE_DMA_MAX_BYTE_CNT		0x4000	/* 16 KB */
 #define XGENE_DMA_MAX_64B_DESC_BYTE_CNT	0x14000	/* 80 KB */
-#define XGENE_DMA_XOR_ALIGNMENT		6	/* 64 Bytes */
 #define XGENE_DMA_MAX_XOR_SRC		5
 #define XGENE_DMA_16K_BUFFER_LEN_CODE	0x0
 #define XGENE_DMA_INVALID_LEN_CODE	0x7800000000000000ULL
@@ -1740,13 +1739,13 @@ static void xgene_dma_set_caps(struct xgene_dma_chan *chan,
 	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
 		dma_dev->device_prep_dma_xor = xgene_dma_prep_xor;
 		dma_dev->max_xor = XGENE_DMA_MAX_XOR_SRC;
-		dma_dev->xor_align = XGENE_DMA_XOR_ALIGNMENT;
+		dma_dev->xor_align = DMAENGINE_ALIGN_64_BYTES;
 	}
 
 	if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
 		dma_dev->device_prep_dma_pq = xgene_dma_prep_pq;
 		dma_dev->max_pq = XGENE_DMA_MAX_XOR_SRC;
-		dma_dev->pq_align = XGENE_DMA_XOR_ALIGNMENT;
+		dma_dev->pq_align = DMAENGINE_ALIGN_64_BYTES;
 	}
 }
 
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index e2f5eb419976..03ed832adbc2 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -584,6 +584,20 @@ struct dma_tx_state {
 	u32 residue;
 };
 
+/**
+ * enum dmaengine_alignment - defines alignment of the DMA async tx
+ * buffers
+ */
+enum dmaengine_alignment {
+	DMAENGINE_ALIGN_1_BYTE = 0,
+	DMAENGINE_ALIGN_2_BYTES = 1,
+	DMAENGINE_ALIGN_4_BYTES = 2,
+	DMAENGINE_ALIGN_8_BYTES = 3,
+	DMAENGINE_ALIGN_16_BYTES = 4,
+	DMAENGINE_ALIGN_32_BYTES = 5,
+	DMAENGINE_ALIGN_64_BYTES = 6,
+};
+
 /**
  * struct dma_device - info on the entity supplying DMA services
  * @chancnt: how many DMA channels are supported
@@ -645,10 +659,10 @@ struct dma_device {
 	dma_cap_mask_t  cap_mask;
 	unsigned short max_xor;
 	unsigned short max_pq;
-	u8 copy_align;
-	u8 xor_align;
-	u8 pq_align;
-	u8 fill_align;
+	enum dmaengine_alignment copy_align;
+	enum dmaengine_alignment xor_align;
+	enum dmaengine_alignment pq_align;
+	enum dmaengine_alignment fill_align;
 	#define DMA_HAS_PQ_CONTINUE (1 << 15)
 
 	int dev_id;
@@ -833,7 +847,8 @@ static inline dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc
 	return desc->tx_submit(desc);
 }
 
-static inline bool dmaengine_check_align(u8 align, size_t off1, size_t off2, size_t len)
+static inline bool dmaengine_check_align(enum dmaengine_alignment align,
+					 size_t off1, size_t off2, size_t len)
 {
 	size_t mask;
 
-- 
cgit v1.2.3-70-g09d2


From 2b94ed245861a7d378dcde6eef7fa7717e06e349 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Sat, 1 Aug 2015 16:08:06 -0700
Subject: kexec: define kexec_in_progress in !CONFIG_KEXEC case

If some piece of code wants to check kexec_in_progress it has to be put
in #ifdef CONFIG_KEXEC block to not break the build in !CONFIG_KEXEC
case. Overcome this limitation by defining kexec_in_progress to false.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kexec.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index e804306ef5e8..b63218f68c4b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -323,6 +323,7 @@ struct pt_regs;
 struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
+#define kexec_in_progress false
 #endif /* CONFIG_KEXEC */
 
 #endif /* !defined(__ASSEBMLY__) */
-- 
cgit v1.2.3-70-g09d2


From 1a1d48a4a8fde49aedc045d894efe67173d59fe0 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Tue, 4 Aug 2015 16:15:14 +0200
Subject: linux/bitmap: Force inlining of bitmap weight functions

With this config:

  http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

gcc-4.7.2 generates many copies of these tiny functions:

	bitmap_weight (55 copies):
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	e8 3f 3a 8b 00          callq  __bitmap_weight
	5d                      pop    %rbp
	c3                      retq

	hweight_long (23 copies):
	55                      push   %rbp
	e8 b5 65 8e 00          callq  __sw_hweight64
	48 89 e5                mov    %rsp,%rbp
	5d                      pop    %rbp
	c3                      retq

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

This patch fixes this via s/inline/__always_inline/

While at it, replaced two "__inline__" with usual "inline"
(the rest of the source file uses the latter).

	    text     data      bss       dec  filename
	86971357 17195880 36659200 140826437  vmlinux.before
	86971120 17195912 36659200 140826232  vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1438697716-28121-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/bitmap.h | 2 +-
 include/linux/bitops.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index ea17cca9e685..9653fdb76a42 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -295,7 +295,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
 	return find_first_zero_bit(src, nbits) == nbits;
 }
 
-static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
+static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 297f5bda4fdf..e63553386ae7 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -57,7 +57,7 @@ extern unsigned long __sw_hweight64(__u64 w);
 	     (bit) < (size);					\
 	     (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
 
-static __inline__ int get_bitmask_order(unsigned int count)
+static inline int get_bitmask_order(unsigned int count)
 {
 	int order;
 
@@ -65,7 +65,7 @@ static __inline__ int get_bitmask_order(unsigned int count)
 	return order;	/* We could be slightly more clever with -1 here... */
 }
 
-static __inline__ int get_count_order(unsigned int count)
+static inline int get_count_order(unsigned int count)
 {
 	int order;
 
@@ -75,7 +75,7 @@ static __inline__ int get_count_order(unsigned int count)
 	return order;
 }
 
-static inline unsigned long hweight_long(unsigned long w)
+static __always_inline unsigned long hweight_long(unsigned long w)
 {
 	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
 }
-- 
cgit v1.2.3-70-g09d2


From accd0b9ec015d611eb7783dd86f1bb31bf8d62ab Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Tue, 4 Aug 2015 16:15:16 +0200
Subject: jiffies: Force inlining of {m,u}msecs_to_jiffies()

With this config:

  http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

gcc-4.7.2 generates many copies of these tiny functions:

	msecs_to_jiffies (45 copies):
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	e8 59 ec 03 00          callq  __msecs_to_jiffies
	5d                      pop    %rbp
	c3                      retq

	usecs_to_jiffies (10 copies):
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	e8 5d 54 5e ff          callq  __usecs_to_jiffies
	5d                      pop    %rbp
	c3                      retq

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

This patch fixes this via s/inline/__always_inline/

	    text     data      bss       dec  filename
	86970954 17195912 36659200 140826066  vmlinux.before
	86966150 17195912 36659200 140821262  vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1438697716-28121-3-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jiffies.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 535fd3bb1ba8..1ba48a18c1d7 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -351,7 +351,7 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m)
  * directly here and from __msecs_to_jiffies() in the case where
  * constant folding is not possible.
  */
-static inline unsigned long msecs_to_jiffies(const unsigned int m)
+static __always_inline unsigned long msecs_to_jiffies(const unsigned int m)
 {
 	if (__builtin_constant_p(m)) {
 		if ((int)m < 0)
@@ -405,7 +405,7 @@ static inline unsigned long _usecs_to_jiffies(const unsigned int u)
  * directly here and from __msecs_to_jiffies() in the case where
  * constant folding is not possible.
  */
-static inline unsigned long usecs_to_jiffies(const unsigned int u)
+static __always_inline unsigned long usecs_to_jiffies(const unsigned int u)
 {
 	if (__builtin_constant_p(u)) {
 		if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
-- 
cgit v1.2.3-70-g09d2


From 3f3af97d8225a58ecdcde7217c030b17e5198226 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 5 Aug 2015 12:54:46 +0100
Subject: ASN.1: Fix actions on CHOICE elements with IMPLICIT tags

In an ASN.1 description where there is a CHOICE construct that contains
elements with IMPLICIT tags that refer to constructed types, actions to be
taken on those elements should be conditional on the corresponding element
actually being matched.  Currently, however, such actions are performed
unconditionally in the middle of processing the CHOICE.

For example, look at elements 'b' and 'e' here:

	A ::= SEQUENCE {
			CHOICE {
			b [0] IMPLICIT B ({ do_XXXXXXXXXXXX_b }),
			c [1] EXPLICIT C ({ do_XXXXXXXXXXXX_c }),
			d [2] EXPLICIT B ({ do_XXXXXXXXXXXX_d }),
			e [3] IMPLICIT C ({ do_XXXXXXXXXXXX_e }),
			f [4] IMPLICIT INTEGER ({ do_XXXXXXXXXXXX_f })
			}
		} ({ do_XXXXXXXXXXXX_A })

	B ::= SET OF OBJECT IDENTIFIER ({ do_XXXXXXXXXXXX_oid })

	C ::= SET OF INTEGER ({ do_XXXXXXXXXXXX_int })

They each have an action (do_XXXXXXXXXXXX_b and do_XXXXXXXXXXXX_e) that
should only be processed if that element is matched.

The problem is that there's no easy place to hang the action off in the
subclause (type B for element 'b' and type C for element 'e') because
subclause opcode sequences can be shared.

To fix this, introduce a conditional action opcode(ASN1_OP_MAYBE_ACT) that
the decoder only processes if the preceding match was successful.  This can
be seen in an excerpt from the output of the fixed ASN.1 compiler for the
above ASN.1 description:

	[  13] =  ASN1_OP_COND_MATCH_JUMP_OR_SKIP,		// e
	[  14] =  _tagn(CONT, CONS,  3),
	[  15] =  _jump_target(45),		// --> C
	[  16] =  ASN1_OP_MAYBE_ACT,
	[  17] =  _action(ACT_do_XXXXXXXXXXXX_e),

In this, if the op at [13] is matched (ie. element 'e' above) then the
action at [16] will be performed.  However, if the op at [13] doesn't match
or is skipped because it is conditional and some previous op matched, then
the action at [16] will be ignored.

Note that to make this work in the decoder, the ASN1_OP_RETURN op must set
the flag to indicate that a match happened.  This is necessary because the
_jump_target() seen above introduces a subclause (in this case an object of
type 'C') which is likely to alter the flag.  Setting the flag here is okay
because to process a subclause, a match must have happened and caused a
jump.

This cannot be tested with the code as it stands, but rather affects future
code.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/asn1_ber_bytecode.h |  3 ++-
 lib/asn1_decoder.c                | 14 +++++++++++++-
 scripts/asn1_compiler.c           |  3 ++-
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/asn1_ber_bytecode.h b/include/linux/asn1_ber_bytecode.h
index 945d44ae529c..27f35780aecf 100644
--- a/include/linux/asn1_ber_bytecode.h
+++ b/include/linux/asn1_ber_bytecode.h
@@ -61,7 +61,8 @@ enum asn1_opcode {
 	ASN1_OP_COND_FAIL		= 0x1b,
 	ASN1_OP_COMPLETE		= 0x1c,
 	ASN1_OP_ACT			= 0x1d,
-	ASN1_OP_RETURN			= 0x1e,
+	ASN1_OP_MAYBE_ACT		= 0x1e,
+	ASN1_OP_RETURN			= 0x1f,
 
 	/* The following eight have bit 0 -> SET, 1 -> OF, 2 -> ACT */
 	ASN1_OP_END_SEQ			= 0x20,
diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c
index 1a000bb050f9..55980d7e1ac0 100644
--- a/lib/asn1_decoder.c
+++ b/lib/asn1_decoder.c
@@ -33,6 +33,7 @@ static const unsigned char asn1_op_lengths[ASN1_OP__NR] = {
 	[ASN1_OP_COND_FAIL]			= 1,
 	[ASN1_OP_COMPLETE]			= 1,
 	[ASN1_OP_ACT]				= 1         + 1,
+	[ASN1_OP_MAYBE_ACT]			= 1         + 1,
 	[ASN1_OP_RETURN]			= 1,
 	[ASN1_OP_END_SEQ]			= 1,
 	[ASN1_OP_END_SEQ_OF]			= 1     + 1,
@@ -177,6 +178,7 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder,
 	unsigned char flags = 0;
 #define FLAG_INDEFINITE_LENGTH	0x01
 #define FLAG_MATCHED		0x02
+#define FLAG_LAST_MATCHED	0x04 /* Last tag matched */
 #define FLAG_CONS		0x20 /* Corresponds to CONS bit in the opcode tag
 				      * - ie. whether or not we are going to parse
 				      *   a compound type.
@@ -211,6 +213,7 @@ next_op:
 		if ((op & ASN1_OP_MATCH__COND &&
 		     flags & FLAG_MATCHED) ||
 		    dp == datalen) {
+			flags &= ~FLAG_LAST_MATCHED;
 			pc += asn1_op_lengths[op];
 			goto next_op;
 		}
@@ -422,8 +425,15 @@ next_op:
 		pc += asn1_op_lengths[op];
 		goto next_op;
 
+	case ASN1_OP_MAYBE_ACT:
+		if (!(flags & FLAG_LAST_MATCHED)) {
+			pc += asn1_op_lengths[op];
+			goto next_op;
+		}
 	case ASN1_OP_ACT:
 		ret = actions[machine[pc + 1]](context, hdr, tag, data + tdp, len);
+		if (ret < 0)
+			return ret;
 		pc += asn1_op_lengths[op];
 		goto next_op;
 
@@ -431,6 +441,7 @@ next_op:
 		if (unlikely(jsp <= 0))
 			goto jump_stack_underflow;
 		pc = jump_stack[--jsp];
+		flags |= FLAG_MATCHED | FLAG_LAST_MATCHED;
 		goto next_op;
 
 	default:
@@ -438,7 +449,8 @@ next_op:
 	}
 
 	/* Shouldn't reach here */
-	pr_err("ASN.1 decoder error: Found reserved opcode (%u)\n", op);
+	pr_err("ASN.1 decoder error: Found reserved opcode (%u) pc=%zu\n",
+	       op, pc);
 	return -EBADMSG;
 
 data_overrun_error:
diff --git a/scripts/asn1_compiler.c b/scripts/asn1_compiler.c
index e87359cd23c0..0515bced929a 100644
--- a/scripts/asn1_compiler.c
+++ b/scripts/asn1_compiler.c
@@ -1468,7 +1468,8 @@ dont_render_tag:
 	case TYPE_REF:
 		render_element(out, e->type->type->element, tag);
 		if (e->action)
-			render_opcode(out, "ASN1_OP_ACT,\n");
+			render_opcode(out, "ASN1_OP_%sACT,\n",
+				      skippable ? "MAYBE_" : "");
 		break;
 
 	case SEQUENCE:
-- 
cgit v1.2.3-70-g09d2


From 233ce79db4b23a174bcf30bde5d6ad913d5f46d3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 5 Aug 2015 12:54:46 +0100
Subject: ASN.1: Handle 'ANY OPTIONAL' in grammar

An ANY object in an ASN.1 grammar that is marked OPTIONAL should be skipped
if there is no more data to be had.

This can be tested by editing X.509 certificates or PKCS#7 messages to
remove the NULL from subobjects that look like the following:

	SEQUENCE {
	  OBJECT(2a864886f70d01010b);
	  NULL();
	}

This is an algorithm identifier plus an optional parameter.

The modified DER can be passed to one of:

	keyctl padd asymmetric "" @s </tmp/modified.x509
	keyctl padd pkcs7_test foo @s </tmp/modified.pkcs7

It should work okay with the patch and produce EBADMSG without.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Marcel Holtmann <marcel@holtmann.org>
Reviewed-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/asn1_ber_bytecode.h | 17 +++++++++++------
 lib/asn1_decoder.c                |  8 ++++++++
 scripts/asn1_compiler.c           |  3 ++-
 3 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/asn1_ber_bytecode.h b/include/linux/asn1_ber_bytecode.h
index 27f35780aecf..ab3a6c002f7b 100644
--- a/include/linux/asn1_ber_bytecode.h
+++ b/include/linux/asn1_ber_bytecode.h
@@ -45,24 +45,27 @@ enum asn1_opcode {
 	ASN1_OP_MATCH_JUMP		= 0x04,
 	ASN1_OP_MATCH_JUMP_OR_SKIP	= 0x05,
 	ASN1_OP_MATCH_ANY		= 0x08,
+	ASN1_OP_MATCH_ANY_OR_SKIP	= 0x09,
 	ASN1_OP_MATCH_ANY_ACT		= 0x0a,
+	ASN1_OP_MATCH_ANY_ACT_OR_SKIP	= 0x0b,
 	/* Everything before here matches unconditionally */
 
 	ASN1_OP_COND_MATCH_OR_SKIP	= 0x11,
 	ASN1_OP_COND_MATCH_ACT_OR_SKIP	= 0x13,
 	ASN1_OP_COND_MATCH_JUMP_OR_SKIP	= 0x15,
 	ASN1_OP_COND_MATCH_ANY		= 0x18,
+	ASN1_OP_COND_MATCH_ANY_OR_SKIP	= 0x19,
 	ASN1_OP_COND_MATCH_ANY_ACT	= 0x1a,
+	ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP = 0x1b,
 
 	/* Everything before here will want a tag from the data */
-#define ASN1_OP__MATCHES_TAG ASN1_OP_COND_MATCH_ANY_ACT
+#define ASN1_OP__MATCHES_TAG ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP
 
 	/* These are here to help fill up space */
-	ASN1_OP_COND_FAIL		= 0x1b,
-	ASN1_OP_COMPLETE		= 0x1c,
-	ASN1_OP_ACT			= 0x1d,
-	ASN1_OP_MAYBE_ACT		= 0x1e,
-	ASN1_OP_RETURN			= 0x1f,
+	ASN1_OP_COND_FAIL		= 0x1c,
+	ASN1_OP_COMPLETE		= 0x1d,
+	ASN1_OP_ACT			= 0x1e,
+	ASN1_OP_MAYBE_ACT		= 0x1f,
 
 	/* The following eight have bit 0 -> SET, 1 -> OF, 2 -> ACT */
 	ASN1_OP_END_SEQ			= 0x20,
@@ -77,6 +80,8 @@ enum asn1_opcode {
 #define ASN1_OP_END__OF			  0x02
 #define ASN1_OP_END__ACT		  0x04
 
+	ASN1_OP_RETURN			= 0x28,
+
 	ASN1_OP__NR
 };
 
diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c
index 3f74dd3e2910..2b3f46c049d4 100644
--- a/lib/asn1_decoder.c
+++ b/lib/asn1_decoder.c
@@ -24,12 +24,16 @@ static const unsigned char asn1_op_lengths[ASN1_OP__NR] = {
 	[ASN1_OP_MATCH_JUMP]			= 1 + 1 + 1,
 	[ASN1_OP_MATCH_JUMP_OR_SKIP]		= 1 + 1 + 1,
 	[ASN1_OP_MATCH_ANY]			= 1,
+	[ASN1_OP_MATCH_ANY_OR_SKIP]		= 1,
 	[ASN1_OP_MATCH_ANY_ACT]			= 1         + 1,
+	[ASN1_OP_MATCH_ANY_ACT_OR_SKIP]		= 1         + 1,
 	[ASN1_OP_COND_MATCH_OR_SKIP]		= 1 + 1,
 	[ASN1_OP_COND_MATCH_ACT_OR_SKIP]	= 1 + 1     + 1,
 	[ASN1_OP_COND_MATCH_JUMP_OR_SKIP]	= 1 + 1 + 1,
 	[ASN1_OP_COND_MATCH_ANY]		= 1,
+	[ASN1_OP_COND_MATCH_ANY_OR_SKIP]	= 1,
 	[ASN1_OP_COND_MATCH_ANY_ACT]		= 1         + 1,
+	[ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP]	= 1         + 1,
 	[ASN1_OP_COND_FAIL]			= 1,
 	[ASN1_OP_COMPLETE]			= 1,
 	[ASN1_OP_ACT]				= 1         + 1,
@@ -304,7 +308,9 @@ next_op:
 	/* Decide how to handle the operation */
 	switch (op) {
 	case ASN1_OP_MATCH_ANY_ACT:
+	case ASN1_OP_MATCH_ANY_ACT_OR_SKIP:
 	case ASN1_OP_COND_MATCH_ANY_ACT:
+	case ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP:
 		ret = actions[machine[pc + 1]](context, hdr, tag, data + dp, len);
 		if (ret < 0)
 			return ret;
@@ -321,8 +327,10 @@ next_op:
 	case ASN1_OP_MATCH:
 	case ASN1_OP_MATCH_OR_SKIP:
 	case ASN1_OP_MATCH_ANY:
+	case ASN1_OP_MATCH_ANY_OR_SKIP:
 	case ASN1_OP_COND_MATCH_OR_SKIP:
 	case ASN1_OP_COND_MATCH_ANY:
+	case ASN1_OP_COND_MATCH_ANY_OR_SKIP:
 	skip_data:
 		if (!(flags & FLAG_CONS)) {
 			if (flags & FLAG_INDEFINITE_LENGTH) {
diff --git a/scripts/asn1_compiler.c b/scripts/asn1_compiler.c
index 0515bced929a..1c75e22b6385 100644
--- a/scripts/asn1_compiler.c
+++ b/scripts/asn1_compiler.c
@@ -1401,7 +1401,8 @@ static void render_element(FILE *out, struct element *e, struct element *tag)
 	act = e->action ? "_ACT" : "";
 	switch (e->compound) {
 	case ANY:
-		render_opcode(out, "ASN1_OP_%sMATCH_ANY%s,", cond, act);
+		render_opcode(out, "ASN1_OP_%sMATCH_ANY%s%s,",
+			      cond, act, skippable ? "_OR_SKIP" : "");
 		if (e->name)
 			render_more(out, "\t\t// %*.*s",
 				    (int)e->name->size, (int)e->name->size,
-- 
cgit v1.2.3-70-g09d2


From 9f55eb92441883a1afca48dc8d32bf62c4d8e833 Mon Sep 17 00:00:00 2001
From: Fugang Duan <b38611@freescale.com>
Date: Tue, 28 Jul 2015 15:30:39 +0800
Subject: ARM: imx6ul: add fec bits to GPR syscon definition

FEC requires additional bits to select refrence clock.

Signed-off-by: Fugang Duan <B38611@freescale.com>
Signed-off-by: Shawn Guo <shawnguo@kernel.org>
---
 include/linux/mfd/syscon/imx6q-iomuxc-gpr.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
index d16f4c82c568..558a485d03ab 100644
--- a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
+++ b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
@@ -435,4 +435,12 @@
 #define IMX6SX_GPR5_DISP_MUX_DCIC1_LVDS			(0x1 << 1)
 #define IMX6SX_GPR5_DISP_MUX_DCIC1_MASK			(0x1 << 1)
 
+/* For imx6ul iomux gpr register field define */
+#define IMX6UL_GPR1_ENET1_CLK_DIR		(0x1 << 17)
+#define IMX6UL_GPR1_ENET2_CLK_DIR		(0x1 << 18)
+#define IMX6UL_GPR1_ENET1_CLK_OUTPUT		(0x1 << 17)
+#define IMX6UL_GPR1_ENET2_CLK_OUTPUT		(0x1 << 18)
+#define IMX6UL_GPR1_ENET_CLK_DIR		(0x3 << 17)
+#define IMX6UL_GPR1_ENET_CLK_OUTPUT		(0x3 << 17)
+
 #endif /* __LINUX_IMX6Q_IOMUXC_GPR_H */
-- 
cgit v1.2.3-70-g09d2


From 44e259ac909f3b41786cf732a44b5cf8444e098a Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Wed, 15 Jul 2015 19:59:36 +0100
Subject: ARM: dove: create a proper PMU driver for power domains, PMU IRQs and
 resets

The PMU device contains an interrupt controller, power control and
resets.  The interrupt controller is a little sub-standard in that
there is no race free way to clear down pending interrupts, so we try
to avoid problems by reducing the window as much as possible, and
clearing as infrequently as possible.

The interrupt support is implemented using an IRQ domain, and the
parent interrupt referenced in the standard DT way.

The power domains and reset support is closely related - there is a
defined sequence for powering down a domain which is tightly coupled
with asserting the reset.  Hence, it makes sense to group these two
together, and in order to avoid any locking contention disrupting this
sequence, we avoid the use of syscon or regmap.

This patch adds the core PMU driver: power domains must be defined in
the DT file in order to make use of them.  The reset controller can
be referenced in the standard way for reset controllers.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
---
 arch/arm/mach-mvebu/Kconfig  |   1 +
 arch/arm/mach-mvebu/dove.c   |   2 +
 drivers/soc/Makefile         |   1 +
 drivers/soc/dove/Makefile    |   1 +
 drivers/soc/dove/pmu.c       | 412 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/soc/dove/pmu.h |   6 +
 6 files changed, 423 insertions(+)
 create mode 100644 drivers/soc/dove/Makefile
 create mode 100644 drivers/soc/dove/pmu.c
 create mode 100644 include/linux/soc/dove/pmu.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig
index 97473168d6b6..c86a5a0aefac 100644
--- a/arch/arm/mach-mvebu/Kconfig
+++ b/arch/arm/mach-mvebu/Kconfig
@@ -96,6 +96,7 @@ config MACH_DOVE
 	select MACH_MVEBU_ANY
 	select ORION_IRQCHIP
 	select ORION_TIMER
+	select PM_GENERIC_DOMAINS if PM
 	select PINCTRL_DOVE
 	help
 	  Say 'Y' here if you want your kernel to support the
diff --git a/arch/arm/mach-mvebu/dove.c b/arch/arm/mach-mvebu/dove.c
index 5a1741500a30..1aebb82e3d7b 100644
--- a/arch/arm/mach-mvebu/dove.c
+++ b/arch/arm/mach-mvebu/dove.c
@@ -12,6 +12,7 @@
 #include <linux/mbus.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
+#include <linux/soc/dove/pmu.h>
 #include <asm/hardware/cache-tauros2.h>
 #include <asm/mach/arch.h>
 #include "common.h"
@@ -24,6 +25,7 @@ static void __init dove_init(void)
 	tauros2_init(0);
 #endif
 	BUG_ON(mvebu_mbus_dt_init(false));
+	dove_init_pmu();
 	of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL);
 }
 
diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile
index 7dc7c0d8a2c1..0b12d777d3c4 100644
--- a/drivers/soc/Makefile
+++ b/drivers/soc/Makefile
@@ -2,6 +2,7 @@
 # Makefile for the Linux Kernel SOC specific device drivers.
 #
 
+obj-$(CONFIG_MACH_DOVE)		+= dove/
 obj-$(CONFIG_ARCH_MEDIATEK)	+= mediatek/
 obj-$(CONFIG_ARCH_QCOM)		+= qcom/
 obj-$(CONFIG_ARCH_SUNXI)	+= sunxi/
diff --git a/drivers/soc/dove/Makefile b/drivers/soc/dove/Makefile
new file mode 100644
index 000000000000..2db8e65513a3
--- /dev/null
+++ b/drivers/soc/dove/Makefile
@@ -0,0 +1 @@
+obj-y		+= pmu.o
diff --git a/drivers/soc/dove/pmu.c b/drivers/soc/dove/pmu.c
new file mode 100644
index 000000000000..6792aae9e2e5
--- /dev/null
+++ b/drivers/soc/dove/pmu.c
@@ -0,0 +1,412 @@
+/*
+ * Marvell Dove PMU support
+ */
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/reset.h>
+#include <linux/reset-controller.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/soc/dove/pmu.h>
+#include <linux/spinlock.h>
+
+#define NR_PMU_IRQS		7
+
+#define PMC_SW_RST		0x30
+#define PMC_IRQ_CAUSE		0x50
+#define PMC_IRQ_MASK		0x54
+
+#define PMU_PWR			0x10
+#define PMU_ISO			0x58
+
+struct pmu_data {
+	spinlock_t lock;
+	struct device_node *of_node;
+	void __iomem *pmc_base;
+	void __iomem *pmu_base;
+	struct irq_chip_generic *irq_gc;
+	struct irq_domain *irq_domain;
+#ifdef CONFIG_RESET_CONTROLLER
+	struct reset_controller_dev reset;
+#endif
+};
+
+/*
+ * The PMU contains a register to reset various subsystems within the
+ * SoC.  Export this as a reset controller.
+ */
+#ifdef CONFIG_RESET_CONTROLLER
+#define rcdev_to_pmu(rcdev) container_of(rcdev, struct pmu_data, reset)
+
+static int pmu_reset_reset(struct reset_controller_dev *rc, unsigned long id)
+{
+	struct pmu_data *pmu = rcdev_to_pmu(rc);
+	unsigned long flags;
+	u32 val;
+
+	spin_lock_irqsave(&pmu->lock, flags);
+	val = readl_relaxed(pmu->pmc_base + PMC_SW_RST);
+	writel_relaxed(val & ~BIT(id), pmu->pmc_base + PMC_SW_RST);
+	writel_relaxed(val | BIT(id), pmu->pmc_base + PMC_SW_RST);
+	spin_unlock_irqrestore(&pmu->lock, flags);
+
+	return 0;
+}
+
+static int pmu_reset_assert(struct reset_controller_dev *rc, unsigned long id)
+{
+	struct pmu_data *pmu = rcdev_to_pmu(rc);
+	unsigned long flags;
+	u32 val = ~BIT(id);
+
+	spin_lock_irqsave(&pmu->lock, flags);
+	val &= readl_relaxed(pmu->pmc_base + PMC_SW_RST);
+	writel_relaxed(val, pmu->pmc_base + PMC_SW_RST);
+	spin_unlock_irqrestore(&pmu->lock, flags);
+
+	return 0;
+}
+
+static int pmu_reset_deassert(struct reset_controller_dev *rc, unsigned long id)
+{
+	struct pmu_data *pmu = rcdev_to_pmu(rc);
+	unsigned long flags;
+	u32 val = BIT(id);
+
+	spin_lock_irqsave(&pmu->lock, flags);
+	val |= readl_relaxed(pmu->pmc_base + PMC_SW_RST);
+	writel_relaxed(val, pmu->pmc_base + PMC_SW_RST);
+	spin_unlock_irqrestore(&pmu->lock, flags);
+
+	return 0;
+}
+
+static struct reset_control_ops pmu_reset_ops = {
+	.reset = pmu_reset_reset,
+	.assert = pmu_reset_assert,
+	.deassert = pmu_reset_deassert,
+};
+
+static struct reset_controller_dev pmu_reset __initdata = {
+	.ops = &pmu_reset_ops,
+	.owner = THIS_MODULE,
+	.nr_resets = 32,
+};
+
+static void __init pmu_reset_init(struct pmu_data *pmu)
+{
+	int ret;
+
+	pmu->reset = pmu_reset;
+	pmu->reset.of_node = pmu->of_node;
+
+	ret = reset_controller_register(&pmu->reset);
+	if (ret)
+		pr_err("pmu: %s failed: %d\n", "reset_controller_register", ret);
+}
+#else
+static void __init pmu_reset_init(struct pmu_data *pmu)
+{
+}
+#endif
+
+struct pmu_domain {
+	struct pmu_data *pmu;
+	u32 pwr_mask;
+	u32 rst_mask;
+	u32 iso_mask;
+	struct generic_pm_domain base;
+};
+
+#define to_pmu_domain(dom) container_of(dom, struct pmu_domain, base)
+
+/*
+ * This deals with the "old" Marvell sequence of bringing a power domain
+ * down/up, which is: apply power, release reset, disable isolators.
+ *
+ * Later devices apparantly use a different sequence: power up, disable
+ * isolators, assert repair signal, enable SRMA clock, enable AXI clock,
+ * enable module clock, deassert reset.
+ *
+ * Note: reading the assembly, it seems that the IO accessors have an
+ * unfortunate side-effect - they cause memory already read into registers
+ * for the if () to be re-read for the bit-set or bit-clear operation.
+ * The code is written to avoid this.
+ */
+static int pmu_domain_power_off(struct generic_pm_domain *domain)
+{
+	struct pmu_domain *pmu_dom = to_pmu_domain(domain);
+	struct pmu_data *pmu = pmu_dom->pmu;
+	unsigned long flags;
+	unsigned int val;
+	void __iomem *pmu_base = pmu->pmu_base;
+	void __iomem *pmc_base = pmu->pmc_base;
+
+	spin_lock_irqsave(&pmu->lock, flags);
+
+	/* Enable isolators */
+	if (pmu_dom->iso_mask) {
+		val = ~pmu_dom->iso_mask;
+		val &= readl_relaxed(pmu_base + PMU_ISO);
+		writel_relaxed(val, pmu_base + PMU_ISO);
+	}
+
+	/* Reset unit */
+	if (pmu_dom->rst_mask) {
+		val = ~pmu_dom->rst_mask;
+		val &= readl_relaxed(pmc_base + PMC_SW_RST);
+		writel_relaxed(val, pmc_base + PMC_SW_RST);
+	}
+
+	/* Power down */
+	val = readl_relaxed(pmu_base + PMU_PWR) | pmu_dom->pwr_mask;
+	writel_relaxed(val, pmu_base + PMU_PWR);
+
+	spin_unlock_irqrestore(&pmu->lock, flags);
+
+	return 0;
+}
+
+static int pmu_domain_power_on(struct generic_pm_domain *domain)
+{
+	struct pmu_domain *pmu_dom = to_pmu_domain(domain);
+	struct pmu_data *pmu = pmu_dom->pmu;
+	unsigned long flags;
+	unsigned int val;
+	void __iomem *pmu_base = pmu->pmu_base;
+	void __iomem *pmc_base = pmu->pmc_base;
+
+	spin_lock_irqsave(&pmu->lock, flags);
+
+	/* Power on */
+	val = ~pmu_dom->pwr_mask & readl_relaxed(pmu_base + PMU_PWR);
+	writel_relaxed(val, pmu_base + PMU_PWR);
+
+	/* Release reset */
+	if (pmu_dom->rst_mask) {
+		val = pmu_dom->rst_mask;
+		val |= readl_relaxed(pmc_base + PMC_SW_RST);
+		writel_relaxed(val, pmc_base + PMC_SW_RST);
+	}
+
+	/* Disable isolators */
+	if (pmu_dom->iso_mask) {
+		val = pmu_dom->iso_mask;
+		val |= readl_relaxed(pmu_base + PMU_ISO);
+		writel_relaxed(val, pmu_base + PMU_ISO);
+	}
+
+	spin_unlock_irqrestore(&pmu->lock, flags);
+
+	return 0;
+}
+
+static void __pmu_domain_register(struct pmu_domain *domain,
+	struct device_node *np)
+{
+	unsigned int val = readl_relaxed(domain->pmu->pmu_base + PMU_PWR);
+
+	domain->base.power_off = pmu_domain_power_off;
+	domain->base.power_on = pmu_domain_power_on;
+
+	pm_genpd_init(&domain->base, NULL, !(val & domain->pwr_mask));
+
+	if (np)
+		of_genpd_add_provider_simple(np, &domain->base);
+}
+
+/* PMU IRQ controller */
+static void pmu_irq_handler(unsigned int irq, struct irq_desc *desc)
+{
+	struct pmu_data *pmu = irq_get_handler_data(irq);
+	struct irq_chip_generic *gc = pmu->irq_gc;
+	struct irq_domain *domain = pmu->irq_domain;
+	void __iomem *base = gc->reg_base;
+	u32 stat = readl_relaxed(base + PMC_IRQ_CAUSE) & gc->mask_cache;
+	u32 done = ~0;
+
+	if (stat == 0) {
+		handle_bad_irq(irq, desc);
+		return;
+	}
+
+	while (stat) {
+		u32 hwirq = fls(stat) - 1;
+
+		stat &= ~(1 << hwirq);
+		done &= ~(1 << hwirq);
+
+		generic_handle_irq(irq_find_mapping(domain, hwirq));
+	}
+
+	/*
+	 * The PMU mask register is not RW0C: it is RW.  This means that
+	 * the bits take whatever value is written to them; if you write
+	 * a '1', you will set the interrupt.
+	 *
+	 * Unfortunately this means there is NO race free way to clear
+	 * these interrupts.
+	 *
+	 * So, let's structure the code so that the window is as small as
+	 * possible.
+	 */
+	irq_gc_lock(gc);
+	done &= readl_relaxed(base + PMC_IRQ_CAUSE);
+	writel_relaxed(done, base + PMC_IRQ_CAUSE);
+	irq_gc_unlock(gc);
+}
+
+static int __init dove_init_pmu_irq(struct pmu_data *pmu, int irq)
+{
+	const char *name = "pmu_irq";
+	struct irq_chip_generic *gc;
+	struct irq_domain *domain;
+	int ret;
+
+	/* mask and clear all interrupts */
+	writel(0, pmu->pmc_base + PMC_IRQ_MASK);
+	writel(0, pmu->pmc_base + PMC_IRQ_CAUSE);
+
+	domain = irq_domain_add_linear(pmu->of_node, NR_PMU_IRQS,
+				       &irq_generic_chip_ops, NULL);
+	if (!domain) {
+		pr_err("%s: unable to add irq domain\n", name);
+		return -ENOMEM;
+	}
+
+	ret = irq_alloc_domain_generic_chips(domain, NR_PMU_IRQS, 1, name,
+					     handle_level_irq,
+					     IRQ_NOREQUEST | IRQ_NOPROBE, 0,
+					     IRQ_GC_INIT_MASK_CACHE);
+	if (ret) {
+		pr_err("%s: unable to alloc irq domain gc: %d\n", name, ret);
+		irq_domain_remove(domain);
+		return ret;
+	}
+
+	gc = irq_get_domain_generic_chip(domain, 0);
+	gc->reg_base = pmu->pmc_base;
+	gc->chip_types[0].regs.mask = PMC_IRQ_MASK;
+	gc->chip_types[0].chip.irq_mask = irq_gc_mask_clr_bit;
+	gc->chip_types[0].chip.irq_unmask = irq_gc_mask_set_bit;
+
+	pmu->irq_domain = domain;
+	pmu->irq_gc = gc;
+
+	irq_set_handler_data(irq, pmu);
+	irq_set_chained_handler(irq, pmu_irq_handler);
+
+	return 0;
+}
+
+/*
+ * pmu: power-manager@d0000 {
+ *	compatible = "marvell,dove-pmu";
+ *	reg = <0xd0000 0x8000> <0xd8000 0x8000>;
+ *	interrupts = <33>;
+ *	interrupt-controller;
+ *	#reset-cells = 1;
+ *	vpu_domain: vpu-domain {
+ *		#power-domain-cells = <0>;
+ *		marvell,pmu_pwr_mask = <0x00000008>;
+ *		marvell,pmu_iso_mask = <0x00000001>;
+ *		resets = <&pmu 16>;
+ *	};
+ *	gpu_domain: gpu-domain {
+ *		#power-domain-cells = <0>;
+ *		marvell,pmu_pwr_mask = <0x00000004>;
+ *		marvell,pmu_iso_mask = <0x00000002>;
+ *		resets = <&pmu 18>;
+ *	};
+ * };
+ */
+int __init dove_init_pmu(void)
+{
+	struct device_node *np_pmu, *domains_node, *np;
+	struct pmu_data *pmu;
+	int ret, parent_irq;
+
+	/* Lookup the PMU node */
+	np_pmu = of_find_compatible_node(NULL, NULL, "marvell,dove-pmu");
+	if (!np_pmu)
+		return 0;
+
+	domains_node = of_get_child_by_name(np_pmu, "domains");
+	if (!domains_node) {
+		pr_err("%s: failed to find domains sub-node\n", np_pmu->name);
+		return 0;
+	}
+
+	pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
+	if (!pmu)
+		return -ENOMEM;
+
+	spin_lock_init(&pmu->lock);
+	pmu->of_node = np_pmu;
+	pmu->pmc_base = of_iomap(pmu->of_node, 0);
+	pmu->pmu_base = of_iomap(pmu->of_node, 1);
+	if (!pmu->pmc_base || !pmu->pmu_base) {
+		pr_err("%s: failed to map PMU\n", np_pmu->name);
+		iounmap(pmu->pmu_base);
+		iounmap(pmu->pmc_base);
+		kfree(pmu);
+		return -ENOMEM;
+	}
+
+	pmu_reset_init(pmu);
+
+	for_each_available_child_of_node(domains_node, np) {
+		struct of_phandle_args args;
+		struct pmu_domain *domain;
+
+		domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+		if (!domain)
+			break;
+
+		domain->pmu = pmu;
+		domain->base.name = kstrdup(np->name, GFP_KERNEL);
+		if (!domain->base.name) {
+			kfree(domain);
+			break;
+		}
+
+		of_property_read_u32(np, "marvell,pmu_pwr_mask",
+				     &domain->pwr_mask);
+		of_property_read_u32(np, "marvell,pmu_iso_mask",
+				     &domain->iso_mask);
+
+		/*
+		 * We parse the reset controller property directly here
+		 * to ensure that we can operate when the reset controller
+		 * support is not configured into the kernel.
+		 */
+		ret = of_parse_phandle_with_args(np, "resets", "#reset-cells",
+						 0, &args);
+		if (ret == 0) {
+			if (args.np == pmu->of_node)
+				domain->rst_mask = BIT(args.args[0]);
+			of_node_put(args.np);
+		}
+
+		__pmu_domain_register(domain, np);
+	}
+	pm_genpd_poweroff_unused();
+
+	/* Loss of the interrupt controller is not a fatal error. */
+	parent_irq = irq_of_parse_and_map(pmu->of_node, 0);
+	if (!parent_irq) {
+		pr_err("%s: no interrupt specified\n", np_pmu->name);
+	} else {
+		ret = dove_init_pmu_irq(pmu, parent_irq);
+		if (ret)
+			pr_err("dove_init_pmu_irq() failed: %d\n", ret);
+	}
+
+	return 0;
+}
diff --git a/include/linux/soc/dove/pmu.h b/include/linux/soc/dove/pmu.h
new file mode 100644
index 000000000000..9c99f84bcc0e
--- /dev/null
+++ b/include/linux/soc/dove/pmu.h
@@ -0,0 +1,6 @@
+#ifndef LINUX_SOC_DOVE_PMU_H
+#define LINUX_SOC_DOVE_PMU_H
+
+int dove_init_pmu(void);
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From f368ed6088ae9c1fbe1c897bb5f215ce5e63fa1e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 30 Jul 2015 15:59:57 -0700
Subject: char: make misc_deregister a void function

With well over 200+ users of this api, there are a mere 12 users that
actually checked the return value of this function.  And all of them
really didn't do anything with that information as the system or module
was shutting down no matter what.

So stop pretending like it matters, and just return void from
misc_deregister().  If something goes wrong in the call, you will get a
WARNING splat in the syslog so you know how to fix up your driver.
Other than that, there's nothing that can go wrong.

Cc: Alasdair Kergon <agk@redhat.com>
Cc: Neil Brown <neilb@suse.com>
Cc: Oleg Drokin <oleg.drokin@intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Wim Van Sebroeck <wim@iguana.be>
Cc: Christine Caulfield <ccaulfie@redhat.com>
Cc: David Teigland <teigland@redhat.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Acked-by: Alessandro Zummo <a.zummo@towertech.it>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/misc.c                           | 9 +++------
 drivers/md/dm-ioctl.c                         | 4 +---
 drivers/misc/vmw_vmci/vmci_host.c             | 7 +------
 drivers/rtc/rtc-ds1374.c                      | 5 ++---
 drivers/staging/android/ashmem.c              | 7 +------
 drivers/staging/android/ion/ion_test.c        | 3 ++-
 drivers/staging/lustre/lustre/libcfs/module.c | 4 +---
 drivers/vhost/scsi.c                          | 4 ++--
 drivers/watchdog/at91rm9200_wdt.c             | 5 ++---
 drivers/watchdog/ks8695_wdt.c                 | 9 +++------
 drivers/watchdog/ts72xx_wdt.c                 | 3 ++-
 fs/btrfs/super.c                              | 3 +--
 fs/dlm/plock.c                                | 3 +--
 fs/dlm/user.c                                 | 9 +++------
 fs/ocfs2/stack_user.c                         | 9 +--------
 include/linux/miscdevice.h                    | 2 +-
 16 files changed, 27 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index c83ef9652bc9..8069b361b8dd 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -243,17 +243,15 @@ int misc_register(struct miscdevice * misc)
  *	@misc: device to unregister
  *
  *	Unregister a miscellaneous device that was previously
- *	successfully registered with misc_register(). Success
- *	is indicated by a zero return, a negative errno code
- *	indicates an error.
+ *	successfully registered with misc_register().
  */
 
-int misc_deregister(struct miscdevice *misc)
+void misc_deregister(struct miscdevice *misc)
 {
 	int i = DYNAMIC_MINORS - misc->minor - 1;
 
 	if (WARN_ON(list_empty(&misc->list)))
-		return -EINVAL;
+		return;
 
 	mutex_lock(&misc_mtx);
 	list_del(&misc->list);
@@ -261,7 +259,6 @@ int misc_deregister(struct miscdevice *misc)
 	if (i < DYNAMIC_MINORS && i >= 0)
 		clear_bit(i, misc_minors);
 	mutex_unlock(&misc_mtx);
-	return 0;
 }
 
 EXPORT_SYMBOL(misc_register);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 720ceeb7fa9b..80a439543259 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1919,9 +1919,7 @@ int __init dm_interface_init(void)
 
 void dm_interface_exit(void)
 {
-	if (misc_deregister(&_dm_misc) < 0)
-		DMERR("misc_deregister failed for control device");
-
+	misc_deregister(&_dm_misc);
 	dm_hash_exit();
 }
 
diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c
index a721b5d8a9da..9ec262a52656 100644
--- a/drivers/misc/vmw_vmci/vmci_host.c
+++ b/drivers/misc/vmw_vmci/vmci_host.c
@@ -1031,14 +1031,9 @@ int __init vmci_host_init(void)
 
 void __exit vmci_host_exit(void)
 {
-	int error;
-
 	vmci_host_device_initialized = false;
 
-	error = misc_deregister(&vmci_host_miscdev);
-	if (error)
-		pr_warn("Error unregistering character device: %d\n", error);
-
+	misc_deregister(&vmci_host_miscdev);
 	vmci_ctx_destroy(host_context);
 	vmci_qp_broker_exit();
 
diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c
index 167783fa7ac1..72c933375233 100644
--- a/drivers/rtc/rtc-ds1374.c
+++ b/drivers/rtc/rtc-ds1374.c
@@ -666,9 +666,8 @@ static int ds1374_remove(struct i2c_client *client)
 #ifdef CONFIG_RTC_DRV_DS1374_WDT
 	int res;
 
-	res = misc_deregister(&ds1374_miscdev);
-	if (!res)
-		ds1374_miscdev.parent = NULL;
+	misc_deregister(&ds1374_miscdev);
+	ds1374_miscdev.parent = NULL;
 	unregister_reboot_notifier(&ds1374_wdt_notifier);
 #endif
 
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index c5c037ccf32c..2c75d90b26c8 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -863,14 +863,9 @@ static int __init ashmem_init(void)
 
 static void __exit ashmem_exit(void)
 {
-	int ret;
-
 	unregister_shrinker(&ashmem_shrinker);
 
-	ret = misc_deregister(&ashmem_misc);
-	if (unlikely(ret))
-		pr_err("failed to unregister misc device!\n");
-
+	misc_deregister(&ashmem_misc);
 	kmem_cache_destroy(ashmem_range_cachep);
 	kmem_cache_destroy(ashmem_area_cachep);
 
diff --git a/drivers/staging/android/ion/ion_test.c b/drivers/staging/android/ion/ion_test.c
index 7d6e6b6bc894..b8dcf5a26cc4 100644
--- a/drivers/staging/android/ion/ion_test.c
+++ b/drivers/staging/android/ion/ion_test.c
@@ -269,7 +269,8 @@ static int ion_test_remove(struct platform_device *pdev)
 	if (!testdev)
 		return -ENODATA;
 
-	return misc_deregister(&testdev->misc);
+	misc_deregister(&testdev->misc);
+	return 0;
 }
 
 static struct platform_device *ion_test_pdev;
diff --git a/drivers/staging/lustre/lustre/libcfs/module.c b/drivers/staging/lustre/lustre/libcfs/module.c
index e60b2e9b9194..e7074006e41b 100644
--- a/drivers/staging/lustre/lustre/libcfs/module.c
+++ b/drivers/staging/lustre/lustre/libcfs/module.c
@@ -467,9 +467,7 @@ static void exit_libcfs_module(void)
 	cfs_crypto_unregister();
 	cfs_wi_shutdown();
 
-	rc = misc_deregister(&libcfs_dev);
-	if (rc)
-		CERROR("misc_deregister error %d\n", rc);
+	misc_deregister(&libcfs_dev);
 
 	cfs_cpu_fini();
 
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index dfcc02c93648..f114a9dbb48f 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1573,9 +1573,9 @@ static int __init vhost_scsi_register(void)
 	return misc_register(&vhost_scsi_misc);
 }
 
-static int vhost_scsi_deregister(void)
+static void vhost_scsi_deregister(void)
 {
-	return misc_deregister(&vhost_scsi_misc);
+	misc_deregister(&vhost_scsi_misc);
 }
 
 static char *vhost_scsi_dump_proto_id(struct vhost_scsi_tport *tport)
diff --git a/drivers/watchdog/at91rm9200_wdt.c b/drivers/watchdog/at91rm9200_wdt.c
index 41cecb55766c..9ba1153465ae 100644
--- a/drivers/watchdog/at91rm9200_wdt.c
+++ b/drivers/watchdog/at91rm9200_wdt.c
@@ -269,9 +269,8 @@ static int at91wdt_remove(struct platform_device *pdev)
 	if (res)
 		dev_warn(dev, "failed to unregister restart handler\n");
 
-	res = misc_deregister(&at91wdt_miscdev);
-	if (!res)
-		at91wdt_miscdev.parent = NULL;
+	misc_deregister(&at91wdt_miscdev);
+	at91wdt_miscdev.parent = NULL;
 
 	return res;
 }
diff --git a/drivers/watchdog/ks8695_wdt.c b/drivers/watchdog/ks8695_wdt.c
index b7ea39b455c8..1e41818a44bc 100644
--- a/drivers/watchdog/ks8695_wdt.c
+++ b/drivers/watchdog/ks8695_wdt.c
@@ -254,13 +254,10 @@ static int ks8695wdt_probe(struct platform_device *pdev)
 
 static int ks8695wdt_remove(struct platform_device *pdev)
 {
-	int res;
-
-	res = misc_deregister(&ks8695wdt_miscdev);
-	if (!res)
-		ks8695wdt_miscdev.parent = NULL;
+	misc_deregister(&ks8695wdt_miscdev);
+	ks8695wdt_miscdev.parent = NULL;
 
-	return res;
+	return 0;
 }
 
 static void ks8695wdt_shutdown(struct platform_device *pdev)
diff --git a/drivers/watchdog/ts72xx_wdt.c b/drivers/watchdog/ts72xx_wdt.c
index 119beb7f6017..4b541934b6c5 100644
--- a/drivers/watchdog/ts72xx_wdt.c
+++ b/drivers/watchdog/ts72xx_wdt.c
@@ -428,7 +428,8 @@ static int ts72xx_wdt_probe(struct platform_device *pdev)
 
 static int ts72xx_wdt_remove(struct platform_device *pdev)
 {
-	return misc_deregister(&ts72xx_wdt_miscdev);
+	misc_deregister(&ts72xx_wdt_miscdev);
+	return 0;
 }
 
 static struct platform_driver ts72xx_wdt_driver = {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cd7ef34d2dce..6bad63379a4c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2163,8 +2163,7 @@ static int btrfs_interface_init(void)
 
 static void btrfs_interface_exit(void)
 {
-	if (misc_deregister(&btrfs_misc) < 0)
-		printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
+	misc_deregister(&btrfs_misc);
 }
 
 static void btrfs_print_info(void)
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index e0ab3a93eeff..5532f097f6da 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -509,7 +509,6 @@ int dlm_plock_init(void)
 
 void dlm_plock_exit(void)
 {
-	if (misc_deregister(&plock_dev_misc) < 0)
-		log_print("dlm_plock_exit: misc_deregister failed");
+	misc_deregister(&plock_dev_misc);
 }
 
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index fb85f32e9eca..75ecc0d3bc85 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -362,18 +362,15 @@ fail:
 
 int dlm_device_deregister(struct dlm_ls *ls)
 {
-	int error;
-
 	/* The device is not registered.  This happens when the lockspace
 	   was never used from userspace, or when device_create_lockspace()
 	   calls dlm_release_lockspace() after the register fails. */
 	if (!ls->ls_device.name)
 		return 0;
 
-	error = misc_deregister(&ls->ls_device);
-	if (!error)
-		kfree(ls->ls_device.name);
-	return error;
+	misc_deregister(&ls->ls_device);
+	kfree(ls->ls_device.name);
+	return 0;
 }
 
 static int device_user_purge(struct dlm_user_proc *proc,
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2768eb1da2b8..ced70c8139f7 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -655,14 +655,7 @@ static int ocfs2_control_init(void)
 
 static void ocfs2_control_exit(void)
 {
-	int rc;
-
-	rc = misc_deregister(&ocfs2_control_device);
-	if (rc)
-		printk(KERN_ERR
-		       "ocfs2: Unable to deregister ocfs2_control device "
-		       "(errno %d)\n",
-		       -rc);
+	misc_deregister(&ocfs2_control_device);
 }
 
 static void fsdlm_lock_ast_wrapper(void *astarg)
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 819077c32690..81f6e427ba6b 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -67,7 +67,7 @@ struct miscdevice  {
 };
 
 extern int misc_register(struct miscdevice *misc);
-extern int misc_deregister(struct miscdevice *misc);
+extern void misc_deregister(struct miscdevice *misc);
 
 #define MODULE_ALIAS_MISCDEV(minor)				\
 	MODULE_ALIAS("char-major-" __stringify(MISC_MAJOR)	\
-- 
cgit v1.2.3-70-g09d2


From 7f163a6fd957a85f7f66a129db1ad243a44399ee Mon Sep 17 00:00:00 2001
From: Jake Oshins <jakeo@microsoft.com>
Date: Wed, 5 Aug 2015 00:52:36 -0700
Subject: drivers:hv: Modify hv_vmbus to search for all MMIO ranges available.

This patch changes the logic in hv_vmbus to record all of the ranges in the
VM's firmware (BIOS or UEFI) that offer regions of memory-mapped I/O space for
use by paravirtual front-end drivers.  The old logic just found one range
above 4GB and called it good.  This logic will find any ranges above 1MB.

It would have been possible with this patch to just use existing resource
allocation functions, rather than keep track of the entire set of Hyper-V
related MMIO regions in VMBus.  This strategy, however, is not sufficient
when the resource allocator needs to be aware of the constraints of a
Hyper-V virtual machine, which is what happens in the next patch in the series.
So this first patch exists to show the first steps in reworking the MMIO
allocation paths for Hyper-V front-end drivers.

Signed-off-by: Jake Oshins <jakeo@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/vmbus_drv.c          | 116 +++++++++++++++++++++++++++++++---------
 drivers/video/fbdev/hyperv_fb.c |   2 +-
 include/linux/hyperv.h          |   2 +-
 3 files changed, 92 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index e7b0bcd453e7..ee59e06c2194 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -102,10 +102,7 @@ static struct notifier_block hyperv_panic_block = {
 	.notifier_call = hyperv_panic_event,
 };
 
-struct resource hyperv_mmio = {
-	.name  = "hyperv mmio",
-	.flags = IORESOURCE_MEM,
-};
+struct resource *hyperv_mmio;
 EXPORT_SYMBOL_GPL(hyperv_mmio);
 
 static int vmbus_exists(void)
@@ -1013,30 +1010,105 @@ void vmbus_device_unregister(struct hv_device *device_obj)
 
 
 /*
- * VMBUS is an acpi enumerated device. Get the the information we
+ * VMBUS is an acpi enumerated device. Get the information we
  * need from DSDT.
  */
-
+#define VTPM_BASE_ADDRESS 0xfed40000
 static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
 {
+	resource_size_t start = 0;
+	resource_size_t end = 0;
+	struct resource *new_res;
+	struct resource **old_res = &hyperv_mmio;
+	struct resource **prev_res = NULL;
+
 	switch (res->type) {
 	case ACPI_RESOURCE_TYPE_IRQ:
 		irq = res->data.irq.interrupts[0];
+		return AE_OK;
+
+	/*
+	 * "Address" descriptors are for bus windows. Ignore
+	 * "memory" descriptors, which are for registers on
+	 * devices.
+	 */
+	case ACPI_RESOURCE_TYPE_ADDRESS32:
+		start = res->data.address32.address.minimum;
+		end = res->data.address32.address.maximum;
 		break;
 
 	case ACPI_RESOURCE_TYPE_ADDRESS64:
-		hyperv_mmio.start = res->data.address64.address.minimum;
-		hyperv_mmio.end = res->data.address64.address.maximum;
+		start = res->data.address64.address.minimum;
+		end = res->data.address64.address.maximum;
 		break;
+
+	default:
+		/* Unused resource type */
+		return AE_OK;
+
 	}
+	/*
+	 * Ignore ranges that are below 1MB, as they're not
+	 * necessary or useful here.
+	 */
+	if (end < 0x100000)
+		return AE_OK;
+
+	new_res = kzalloc(sizeof(*new_res), GFP_ATOMIC);
+	if (!new_res)
+		return AE_NO_MEMORY;
+
+	/* If this range overlaps the virtual TPM, truncate it. */
+	if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
+		end = VTPM_BASE_ADDRESS;
+
+	new_res->name = "hyperv mmio";
+	new_res->flags = IORESOURCE_MEM;
+	new_res->start = start;
+	new_res->end = end;
+
+	do {
+		if (!*old_res) {
+			*old_res = new_res;
+			break;
+		}
+
+		if ((*old_res)->end < new_res->start) {
+			new_res->sibling = *old_res;
+			if (prev_res)
+				(*prev_res)->sibling = new_res;
+			*old_res = new_res;
+			break;
+		}
+
+		prev_res = old_res;
+		old_res = &(*old_res)->sibling;
+
+	} while (1);
 
 	return AE_OK;
 }
 
+static int vmbus_acpi_remove(struct acpi_device *device)
+{
+	struct resource *cur_res;
+	struct resource *next_res;
+
+	if (hyperv_mmio) {
+		for (cur_res = hyperv_mmio; cur_res; cur_res = next_res) {
+			next_res = cur_res->sibling;
+			kfree(cur_res);
+		}
+	}
+
+	return 0;
+}
+
 static int vmbus_acpi_add(struct acpi_device *device)
 {
 	acpi_status result;
 	int ret_val = -ENODEV;
+	struct acpi_device *ancestor;
 
 	hv_acpi_dev = device;
 
@@ -1046,35 +1118,27 @@ static int vmbus_acpi_add(struct acpi_device *device)
 	if (ACPI_FAILURE(result))
 		goto acpi_walk_err;
 	/*
-	 * The parent of the vmbus acpi device (Gen2 firmware) is the VMOD that
-	 * has the mmio ranges. Get that.
+	 * Some ancestor of the vmbus acpi device (Gen1 or Gen2
+	 * firmware) is the VMOD that has the mmio ranges. Get that.
 	 */
-	if (device->parent) {
-		result = acpi_walk_resources(device->parent->handle,
-					METHOD_NAME__CRS,
-					vmbus_walk_resources, NULL);
+	for (ancestor = device->parent; ancestor; ancestor = ancestor->parent) {
+		result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS,
+					     vmbus_walk_resources, NULL);
 
 		if (ACPI_FAILURE(result))
-			goto acpi_walk_err;
-		if (hyperv_mmio.start && hyperv_mmio.end)
-			request_resource(&iomem_resource, &hyperv_mmio);
+			continue;
+		if (hyperv_mmio)
+			break;
 	}
 	ret_val = 0;
 
 acpi_walk_err:
 	complete(&probe_event);
+	if (ret_val)
+		vmbus_acpi_remove(device);
 	return ret_val;
 }
 
-static int vmbus_acpi_remove(struct acpi_device *device)
-{
-	int ret = 0;
-
-	if (hyperv_mmio.start && hyperv_mmio.end)
-		ret = release_resource(&hyperv_mmio);
-	return ret;
-}
-
 static const struct acpi_device_id vmbus_acpi_device_ids[] = {
 	{"VMBUS", 0},
 	{"VMBus", 0},
diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index 807ee22ef229..b54ee1c05a5f 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -688,7 +688,7 @@ static int hvfb_getmem(struct fb_info *info)
 	par->mem.name = KBUILD_MODNAME;
 	par->mem.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	if (gen2vm) {
-		ret = allocate_resource(&hyperv_mmio, &par->mem,
+		ret = allocate_resource(hyperv_mmio, &par->mem,
 					screen_fb_size,
 					0, -1,
 					screen_fb_size,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 30d3a1f79450..217e14be77b9 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1233,7 +1233,7 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *,
 
 void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid);
 
-extern struct resource hyperv_mmio;
+extern struct resource *hyperv_mmio;
 
 /*
  * Negotiated version with the Host.
-- 
cgit v1.2.3-70-g09d2


From 3546448338e76a52d4f86eb3680cb2934e22d89b Mon Sep 17 00:00:00 2001
From: Jake Oshins <jakeo@microsoft.com>
Date: Wed, 5 Aug 2015 00:52:37 -0700
Subject: drivers:hv: Move MMIO range picking from hyper_fb to hv_vmbus

This patch deletes the logic from hyperv_fb which picked a range of MMIO space
for the frame buffer and adds new logic to hv_vmbus which picks ranges for
child drivers.  The new logic isn't quite the same as the old, as it considers
more possible ranges.

Signed-off-by: Jake Oshins <jakeo@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/vmbus_drv.c          | 88 +++++++++++++++++++++++++++++++++++++++--
 drivers/video/fbdev/hyperv_fb.c | 46 ++++++++++-----------
 include/linux/hyperv.h          |  7 +++-
 3 files changed, 110 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index ee59e06c2194..8c3eaee8c54c 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -39,6 +39,7 @@
 #include <asm/mshyperv.h>
 #include <linux/notifier.h>
 #include <linux/ptrace.h>
+#include <linux/screen_info.h>
 #include <linux/kdebug.h>
 #include "hyperv_vmbus.h"
 
@@ -103,7 +104,6 @@ static struct notifier_block hyperv_panic_block = {
 };
 
 struct resource *hyperv_mmio;
-EXPORT_SYMBOL_GPL(hyperv_mmio);
 
 static int vmbus_exists(void)
 {
@@ -891,8 +891,8 @@ err_cleanup:
 }
 
 /**
- * __vmbus_child_driver_register - Register a vmbus's driver
- * @drv: Pointer to driver structure you want to register
+ * __vmbus_child_driver_register() - Register a vmbus's driver
+ * @hv_driver: Pointer to driver structure you want to register
  * @owner: owner module of the drv
  * @mod_name: module name string
  *
@@ -924,7 +924,8 @@ EXPORT_SYMBOL_GPL(__vmbus_driver_register);
 
 /**
  * vmbus_driver_unregister() - Unregister a vmbus's driver
- * @drv: Pointer to driver structure you want to un-register
+ * @hv_driver: Pointer to driver structure you want to
+ *             un-register
  *
  * Un-register the given driver that was previous registered with a call to
  * vmbus_driver_register()
@@ -1104,6 +1105,85 @@ static int vmbus_acpi_remove(struct acpi_device *device)
 	return 0;
 }
 
+/**
+ * vmbus_allocate_mmio() - Pick a memory-mapped I/O range.
+ * @new:		If successful, supplied a pointer to the
+ *			allocated MMIO space.
+ * @device_obj:		Identifies the caller
+ * @min:		Minimum guest physical address of the
+ *			allocation
+ * @max:		Maximum guest physical address
+ * @size:		Size of the range to be allocated
+ * @align:		Alignment of the range to be allocated
+ * @fb_overlap_ok:	Whether this allocation can be allowed
+ *			to overlap the video frame buffer.
+ *
+ * This function walks the resources granted to VMBus by the
+ * _CRS object in the ACPI namespace underneath the parent
+ * "bridge" whether that's a root PCI bus in the Generation 1
+ * case or a Module Device in the Generation 2 case.  It then
+ * attempts to allocate from the global MMIO pool in a way that
+ * matches the constraints supplied in these parameters and by
+ * that _CRS.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
+			resource_size_t min, resource_size_t max,
+			resource_size_t size, resource_size_t align,
+			bool fb_overlap_ok)
+{
+	struct resource *iter;
+	resource_size_t range_min, range_max, start, local_min, local_max;
+	const char *dev_n = dev_name(&device_obj->device);
+	u32 fb_end = screen_info.lfb_base + (screen_info.lfb_size << 1);
+	int i;
+
+	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
+		if ((iter->start >= max) || (iter->end <= min))
+			continue;
+
+		range_min = iter->start;
+		range_max = iter->end;
+
+		/* If this range overlaps the frame buffer, split it into
+		   two tries. */
+		for (i = 0; i < 2; i++) {
+			local_min = range_min;
+			local_max = range_max;
+			if (fb_overlap_ok || (range_min >= fb_end) ||
+			    (range_max <= screen_info.lfb_base)) {
+				i++;
+			} else {
+				if ((range_min <= screen_info.lfb_base) &&
+				    (range_max >= screen_info.lfb_base)) {
+					/*
+					 * The frame buffer is in this window,
+					 * so trim this into the part that
+					 * preceeds the frame buffer.
+					 */
+					local_max = screen_info.lfb_base - 1;
+					range_min = fb_end;
+				} else {
+					range_min = fb_end;
+					continue;
+				}
+			}
+
+			start = (local_min + align - 1) & ~(align - 1);
+			for (; start + size - 1 <= local_max; start += align) {
+				*new = request_mem_region_exclusive(start, size,
+								    dev_n);
+				if (*new)
+					return 0;
+			}
+		}
+	}
+
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(vmbus_allocate_mmio);
+
 static int vmbus_acpi_add(struct acpi_device *device)
 {
 	acpi_status result;
diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index b54ee1c05a5f..e2451bdb4525 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -213,7 +213,7 @@ struct synthvid_msg {
 
 struct hvfb_par {
 	struct fb_info *info;
-	struct resource mem;
+	struct resource *mem;
 	bool fb_ready; /* fb device is ready */
 	struct completion wait;
 	u32 synthvid_version;
@@ -677,26 +677,18 @@ static void hvfb_get_option(struct fb_info *info)
 
 
 /* Get framebuffer memory from Hyper-V video pci space */
-static int hvfb_getmem(struct fb_info *info)
+static int hvfb_getmem(struct hv_device *hdev, struct fb_info *info)
 {
 	struct hvfb_par *par = info->par;
 	struct pci_dev *pdev  = NULL;
 	void __iomem *fb_virt;
 	int gen2vm = efi_enabled(EFI_BOOT);
+	resource_size_t pot_start, pot_end;
 	int ret;
 
-	par->mem.name = KBUILD_MODNAME;
-	par->mem.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	if (gen2vm) {
-		ret = allocate_resource(hyperv_mmio, &par->mem,
-					screen_fb_size,
-					0, -1,
-					screen_fb_size,
-					NULL, NULL);
-		if (ret != 0) {
-			pr_err("Unable to allocate framebuffer memory\n");
-			return -ENODEV;
-		}
+		pot_start = 0;
+		pot_end = -1;
 	} else {
 		pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT,
 			      PCI_DEVICE_ID_HYPERV_VIDEO, NULL);
@@ -709,16 +701,18 @@ static int hvfb_getmem(struct fb_info *info)
 		    pci_resource_len(pdev, 0) < screen_fb_size)
 			goto err1;
 
-		par->mem.end = pci_resource_end(pdev, 0);
-		par->mem.start = par->mem.end - screen_fb_size + 1;
-		ret = request_resource(&pdev->resource[0], &par->mem);
-		if (ret != 0) {
-			pr_err("Unable to request framebuffer memory\n");
-			goto err1;
-		}
+		pot_end = pci_resource_end(pdev, 0);
+		pot_start = pot_end - screen_fb_size + 1;
+	}
+
+	ret = vmbus_allocate_mmio(&par->mem, hdev, pot_start, pot_end,
+				  screen_fb_size, 0x100000, true);
+	if (ret != 0) {
+		pr_err("Unable to allocate framebuffer memory\n");
+		goto err1;
 	}
 
-	fb_virt = ioremap(par->mem.start, screen_fb_size);
+	fb_virt = ioremap(par->mem->start, screen_fb_size);
 	if (!fb_virt)
 		goto err2;
 
@@ -736,7 +730,7 @@ static int hvfb_getmem(struct fb_info *info)
 		info->apertures->ranges[0].size = pci_resource_len(pdev, 0);
 	}
 
-	info->fix.smem_start = par->mem.start;
+	info->fix.smem_start = par->mem->start;
 	info->fix.smem_len = screen_fb_size;
 	info->screen_base = fb_virt;
 	info->screen_size = screen_fb_size;
@@ -749,7 +743,8 @@ static int hvfb_getmem(struct fb_info *info)
 err3:
 	iounmap(fb_virt);
 err2:
-	release_resource(&par->mem);
+	release_mem_region(par->mem->start, screen_fb_size);
+	par->mem = NULL;
 err1:
 	if (!gen2vm)
 		pci_dev_put(pdev);
@@ -763,7 +758,8 @@ static void hvfb_putmem(struct fb_info *info)
 	struct hvfb_par *par = info->par;
 
 	iounmap(info->screen_base);
-	release_resource(&par->mem);
+	release_mem_region(par->mem->start, screen_fb_size);
+	par->mem = NULL;
 }
 
 
@@ -794,7 +790,7 @@ static int hvfb_probe(struct hv_device *hdev,
 		goto error1;
 	}
 
-	ret = hvfb_getmem(info);
+	ret = hvfb_getmem(hdev, info);
 	if (ret) {
 		pr_err("No memory for framebuffer\n");
 		goto error2;
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 217e14be77b9..54733d5b503e 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -977,6 +977,11 @@ int __must_check __vmbus_driver_register(struct hv_driver *hv_driver,
 					 const char *mod_name);
 void vmbus_driver_unregister(struct hv_driver *hv_driver);
 
+int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
+			resource_size_t min, resource_size_t max,
+			resource_size_t size, resource_size_t align,
+			bool fb_overlap_ok);
+
 /**
  * VMBUS_DEVICE - macro used to describe a specific hyperv vmbus device
  *
@@ -1233,8 +1238,6 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *,
 
 void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid);
 
-extern struct resource *hyperv_mmio;
-
 /*
  * Negotiated version with the Host.
  */
-- 
cgit v1.2.3-70-g09d2


From 9f01ec53458d9e9b68f1c555e773b5d1a1f66e94 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Wed, 5 Aug 2015 00:52:38 -0700
Subject: Drivers: hv: vmbus: Improve the CPU affiliation for channels

The current code tracks the assigned CPUs within a NUMA node in the context of
the primary channel. So, if we have a VM with a single NUMA node with 8 VCPUs, we may
end up unevenly distributing the channel load. Fix the issue by tracking affiliations
globally.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c | 11 ++++++-----
 drivers/hv/hv.c           |  9 +++++++++
 drivers/hv/hyperv_vmbus.h |  5 +++++
 include/linux/hyperv.h    |  1 -
 4 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 30613dfa38b3..39c5afc7970c 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -392,6 +392,7 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
 	struct vmbus_channel *primary = channel->primary_channel;
 	int next_node;
 	struct cpumask available_mask;
+	struct cpumask *alloced_mask;
 
 	for (i = IDE; i < MAX_PERF_CHN; i++) {
 		if (!memcmp(type_guid->b, hp_devs[i].guid,
@@ -409,7 +410,6 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
 		 * channel, bind it to cpu 0.
 		 */
 		channel->numa_node = 0;
-		cpumask_set_cpu(0, &channel->alloced_cpus_in_node);
 		channel->target_cpu = 0;
 		channel->target_vp = hv_context.vp_index[0];
 		return;
@@ -434,21 +434,22 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
 		channel->numa_node = next_node;
 		primary = channel;
 	}
+	alloced_mask = &hv_context.hv_numa_map[primary->numa_node];
 
-	if (cpumask_weight(&primary->alloced_cpus_in_node) ==
+	if (cpumask_weight(alloced_mask) ==
 	    cpumask_weight(cpumask_of_node(primary->numa_node))) {
 		/*
 		 * We have cycled through all the CPUs in the node;
 		 * reset the alloced map.
 		 */
-		cpumask_clear(&primary->alloced_cpus_in_node);
+		cpumask_clear(alloced_mask);
 	}
 
-	cpumask_xor(&available_mask, &primary->alloced_cpus_in_node,
+	cpumask_xor(&available_mask, alloced_mask,
 		    cpumask_of_node(primary->numa_node));
 
 	cur_cpu = cpumask_next(-1, &available_mask);
-	cpumask_set_cpu(cur_cpu, &primary->alloced_cpus_in_node);
+	cpumask_set_cpu(cur_cpu, alloced_mask);
 
 	channel->target_cpu = cur_cpu;
 	channel->target_vp = hv_context.vp_index[cur_cpu];
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 41d8072d61d9..fd93cfde96d0 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -332,6 +332,13 @@ int hv_synic_alloc(void)
 	size_t ced_size = sizeof(struct clock_event_device);
 	int cpu;
 
+	hv_context.hv_numa_map = kzalloc(sizeof(struct cpumask) * nr_node_ids,
+					 GFP_ATOMIC);
+	if (hv_context.hv_numa_map == NULL) {
+		pr_err("Unable to allocate NUMA map\n");
+		goto err;
+	}
+
 	for_each_online_cpu(cpu) {
 		hv_context.event_dpc[cpu] = kmalloc(size, GFP_ATOMIC);
 		if (hv_context.event_dpc[cpu] == NULL) {
@@ -345,6 +352,7 @@ int hv_synic_alloc(void)
 			pr_err("Unable to allocate clock event device\n");
 			goto err;
 		}
+
 		hv_init_clockevent_device(hv_context.clk_evt[cpu], cpu);
 
 		hv_context.synic_message_page[cpu] =
@@ -393,6 +401,7 @@ void hv_synic_free(void)
 {
 	int cpu;
 
+	kfree(hv_context.hv_numa_map);
 	for_each_online_cpu(cpu)
 		hv_synic_free_cpu(cpu);
 }
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 638370701657..6f258255ac94 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -551,6 +551,11 @@ struct hv_context {
 	 * Support PV clockevent device.
 	 */
 	struct clock_event_device *clk_evt[NR_CPUS];
+	/*
+	 * To manage allocations in a NUMA node.
+	 * Array indexed by numa node ID.
+	 */
+	struct cpumask *hv_numa_map;
 };
 
 extern struct hv_context hv_context;
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 54733d5b503e..5a3df5a47c8f 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -699,7 +699,6 @@ struct vmbus_channel {
 	/*
 	 * State to manage the CPU affiliation of channels.
 	 */
-	struct cpumask alloced_cpus_in_node;
 	int numa_node;
 	/*
 	 * Support for sub-channels. For high performance devices,
-- 
cgit v1.2.3-70-g09d2


From 3b71107d73b16074afa7658f3f0fcf837aabfe24 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Wed, 5 Aug 2015 00:52:39 -0700
Subject: Drivers: hv: vmbus: Further improve CPU affiliation logic

Keep track of CPU affiliations of sub-channels within the scope of the primary
channel. This will allow us to better distribute the load amongst available
CPUs.

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c | 20 ++++++++++++++++++--
 include/linux/hyperv.h    |  1 +
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 39c5afc7970c..2f9aead4ecfc 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -448,8 +448,24 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
 	cpumask_xor(&available_mask, alloced_mask,
 		    cpumask_of_node(primary->numa_node));
 
-	cur_cpu = cpumask_next(-1, &available_mask);
-	cpumask_set_cpu(cur_cpu, alloced_mask);
+	cur_cpu = -1;
+	while (true) {
+		cur_cpu = cpumask_next(cur_cpu, &available_mask);
+		if (cur_cpu >= nr_cpu_ids) {
+			cur_cpu = -1;
+			cpumask_copy(&available_mask,
+				     cpumask_of_node(primary->numa_node));
+			continue;
+		}
+
+		if (!cpumask_test_cpu(cur_cpu,
+				&primary->alloced_cpus_in_node)) {
+			cpumask_set_cpu(cur_cpu,
+					&primary->alloced_cpus_in_node);
+			cpumask_set_cpu(cur_cpu, alloced_mask);
+			break;
+		}
+	}
 
 	channel->target_cpu = cur_cpu;
 	channel->target_vp = hv_context.vp_index[cur_cpu];
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 5a3df5a47c8f..54733d5b503e 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -699,6 +699,7 @@ struct vmbus_channel {
 	/*
 	 * State to manage the CPU affiliation of channels.
 	 */
+	struct cpumask alloced_cpus_in_node;
 	int numa_node;
 	/*
 	 * Support for sub-channels. For high performance devices,
-- 
cgit v1.2.3-70-g09d2


From 061dff29f8f62c21c9222897e4d121b4a5fa50da Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 3 Aug 2015 13:02:59 -0400
Subject: xprtrdma: Increase default credit limit

In preparation for similar increases on NFS/RDMA servers, bump the
advertised credit limit for RPC/RDMA to 128. This allocates some
extra resources, but the client will continue to allow only the
number of RPCs in flight that the server requests via its advertised
credit limit.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-By: Sagi Grimberg <sagig@mellanox.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprtrdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index b17613052cc3..b7b279b54504 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -49,7 +49,7 @@
  * a single chunk type per message is supported currently.
  */
 #define RPCRDMA_MIN_SLOT_TABLE	(2U)
-#define RPCRDMA_DEF_SLOT_TABLE	(32U)
+#define RPCRDMA_DEF_SLOT_TABLE	(128U)
 #define RPCRDMA_MAX_SLOT_TABLE	(256U)
 
 #define RPCRDMA_DEF_INLINE  (1024)	/* default inline max */
-- 
cgit v1.2.3-70-g09d2


From c0bd1b9e58959c51a4c939505f89721dfbc73c44 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 22 Jul 2015 13:17:15 -0500
Subject: Revert "ti-st: add device tree support"

This reverts commit 46d0d33350e9b32642d745a8b46a954910196b4d.

This binding is horrible and never should have been merged. It is not
documented nor are there any in tree users, so reverting it will not
break anything we care about. Lets revert it before we do have users.

The problems with it are:

- It is not documented.

- The GPIO connection is described with a custom property and uses Linux
GPIO numbering.

- The UART connection is described using the Linux tty device name.

Cc: Gigi Joseph <gigi.joseph@ti.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/ti-st/st_kim.c  | 94 ++++----------------------------------------
 drivers/misc/ti-st/st_ll.c   | 17 +-------
 include/linux/ti_wilink_st.h |  1 -
 3 files changed, 9 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c
index c84093e639e0..c828282af38a 100644
--- a/drivers/misc/ti-st/st_kim.c
+++ b/drivers/misc/ti-st/st_kim.c
@@ -36,8 +36,6 @@
 #include <linux/skbuff.h>
 #include <linux/ti_wilink_st.h>
 #include <linux/module.h>
-#include <linux/of.h>
-#include <linux/of_device.h>
 
 #define MAX_ST_DEVICES	3	/* Imagine 1 on each UART for now */
 static struct platform_device *st_kim_devices[MAX_ST_DEVICES];
@@ -45,9 +43,6 @@ static struct platform_device *st_kim_devices[MAX_ST_DEVICES];
 /**********************************************************************/
 /* internal functions */
 
-struct ti_st_plat_data	*dt_pdata;
-static struct ti_st_plat_data *get_platform_data(struct device *dev);
-
 /**
  * st_get_plat_device -
  *	function which returns the reference to the platform device
@@ -469,12 +464,7 @@ long st_kim_start(void *kim_data)
 	struct kim_data_s	*kim_gdata = (struct kim_data_s *)kim_data;
 
 	pr_info(" %s", __func__);
-	if (kim_gdata->kim_pdev->dev.of_node) {
-		pr_debug("use device tree data");
-		pdata = dt_pdata;
-	} else {
-		pdata = kim_gdata->kim_pdev->dev.platform_data;
-	}
+	pdata = kim_gdata->kim_pdev->dev.platform_data;
 
 	do {
 		/* platform specific enabling code here */
@@ -534,18 +524,12 @@ long st_kim_stop(void *kim_data)
 {
 	long err = 0;
 	struct kim_data_s	*kim_gdata = (struct kim_data_s *)kim_data;
-	struct ti_st_plat_data	*pdata;
+	struct ti_st_plat_data	*pdata =
+		kim_gdata->kim_pdev->dev.platform_data;
 	struct tty_struct	*tty = kim_gdata->core_data->tty;
 
 	reinit_completion(&kim_gdata->ldisc_installed);
 
-	if (kim_gdata->kim_pdev->dev.of_node) {
-		pr_debug("use device tree data");
-		pdata = dt_pdata;
-	} else
-		pdata = kim_gdata->kim_pdev->dev.platform_data;
-
-
 	if (tty) {	/* can be called before ldisc is installed */
 		/* Flush any pending characters in the driver and discipline. */
 		tty_ldisc_flush(tty);
@@ -737,52 +721,13 @@ static const struct file_operations list_debugfs_fops = {
  * board-*.c file
  */
 
-static const struct of_device_id kim_of_match[] = {
-{
-	.compatible = "kim",
-	},
-	{}
-};
-MODULE_DEVICE_TABLE(of, kim_of_match);
-
-static struct ti_st_plat_data *get_platform_data(struct device *dev)
-{
-	struct device_node *np = dev->of_node;
-	const u32 *dt_property;
-	int len;
-
-	dt_pdata = kzalloc(sizeof(*dt_pdata), GFP_KERNEL);
-	if (!dt_pdata)
-		return NULL;
-
-	dt_property = of_get_property(np, "dev_name", &len);
-	if (dt_property)
-		memcpy(&dt_pdata->dev_name, dt_property, len);
-	of_property_read_u32(np, "nshutdown_gpio",
-			     &dt_pdata->nshutdown_gpio);
-	of_property_read_u32(np, "flow_cntrl", &dt_pdata->flow_cntrl);
-	of_property_read_u32(np, "baud_rate", &dt_pdata->baud_rate);
-
-	return dt_pdata;
-}
-
 static struct dentry *kim_debugfs_dir;
 static int kim_probe(struct platform_device *pdev)
 {
 	struct kim_data_s	*kim_gdata;
-	struct ti_st_plat_data	*pdata;
+	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
 	int err;
 
-	if (pdev->dev.of_node)
-		pdata = get_platform_data(&pdev->dev);
-	else
-		pdata = pdev->dev.platform_data;
-
-	if (pdata == NULL) {
-		dev_err(&pdev->dev, "Platform Data is missing\n");
-		return -ENXIO;
-	}
-
 	if ((pdev->id != -1) && (pdev->id < MAX_ST_DEVICES)) {
 		/* multiple devices could exist */
 		st_kim_devices[pdev->id] = pdev;
@@ -863,16 +808,9 @@ err_core_init:
 static int kim_remove(struct platform_device *pdev)
 {
 	/* free the GPIOs requested */
-	struct ti_st_plat_data	*pdata;
+	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
 	struct kim_data_s	*kim_gdata;
 
-	if (pdev->dev.of_node) {
-		pr_debug("use device tree data");
-		pdata = dt_pdata;
-	} else {
-		pdata = pdev->dev.platform_data;
-	}
-
 	kim_gdata = platform_get_drvdata(pdev);
 
 	/* Free the Bluetooth/FM/GPIO
@@ -890,22 +828,12 @@ static int kim_remove(struct platform_device *pdev)
 
 	kfree(kim_gdata);
 	kim_gdata = NULL;
-	kfree(dt_pdata);
-	dt_pdata = NULL;
-
 	return 0;
 }
 
 static int kim_suspend(struct platform_device *pdev, pm_message_t state)
 {
-	struct ti_st_plat_data	*pdata;
-
-	if (pdev->dev.of_node) {
-		pr_debug("use device tree data");
-		pdata = dt_pdata;
-	} else {
-		pdata = pdev->dev.platform_data;
-	}
+	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
 
 	if (pdata->suspend)
 		return pdata->suspend(pdev, state);
@@ -915,14 +843,7 @@ static int kim_suspend(struct platform_device *pdev, pm_message_t state)
 
 static int kim_resume(struct platform_device *pdev)
 {
-	struct ti_st_plat_data	*pdata;
-
-	if (pdev->dev.of_node) {
-		pr_debug("use device tree data");
-		pdata = dt_pdata;
-	} else {
-		pdata = pdev->dev.platform_data;
-	}
+	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
 
 	if (pdata->resume)
 		return pdata->resume(pdev);
@@ -939,7 +860,6 @@ static struct platform_driver kim_platform_driver = {
 	.resume = kim_resume,
 	.driver = {
 		.name = "kim",
-		.of_match_table = of_match_ptr(kim_of_match),
 	},
 };
 
diff --git a/drivers/misc/ti-st/st_ll.c b/drivers/misc/ti-st/st_ll.c
index 518e1b7f2f95..93b4d67cc4a3 100644
--- a/drivers/misc/ti-st/st_ll.c
+++ b/drivers/misc/ti-st/st_ll.c
@@ -26,7 +26,6 @@
 #include <linux/ti_wilink_st.h>
 
 /**********************************************************************/
-
 /* internal functions */
 static void send_ll_cmd(struct st_data_s *st_data,
 	unsigned char cmd)
@@ -54,13 +53,7 @@ static void ll_device_want_to_sleep(struct st_data_s *st_data)
 
 	/* communicate to platform about chip asleep */
 	kim_data = st_data->kim_data;
-	if (kim_data->kim_pdev->dev.of_node) {
-		pr_debug("use device tree data");
-		pdata = dt_pdata;
-	} else {
-		pdata = kim_data->kim_pdev->dev.platform_data;
-	}
-
+	pdata = kim_data->kim_pdev->dev.platform_data;
 	if (pdata->chip_asleep)
 		pdata->chip_asleep(NULL);
 }
@@ -93,13 +86,7 @@ static void ll_device_want_to_wakeup(struct st_data_s *st_data)
 
 	/* communicate to platform about chip wakeup */
 	kim_data = st_data->kim_data;
-	if (kim_data->kim_pdev->dev.of_node) {
-		pr_debug("use device tree data");
-		pdata = dt_pdata;
-	} else {
-		pdata = kim_data->kim_pdev->dev.platform_data;
-	}
-
+	pdata = kim_data->kim_pdev->dev.platform_data;
 	if (pdata->chip_awake)
 		pdata->chip_awake(NULL);
 }
diff --git a/include/linux/ti_wilink_st.h b/include/linux/ti_wilink_st.h
index c78dcfeaf25f..d4217eff489f 100644
--- a/include/linux/ti_wilink_st.h
+++ b/include/linux/ti_wilink_st.h
@@ -86,7 +86,6 @@ struct st_proto_s {
 extern long st_register(struct st_proto_s *);
 extern long st_unregister(struct st_proto_s *);
 
-extern struct ti_st_plat_data   *dt_pdata;
 
 /*
  * header information used by st_core.c
-- 
cgit v1.2.3-70-g09d2


From 722102274ab1206634d31abe4f438a911a0945d2 Mon Sep 17 00:00:00 2001
From: Chunyan Zhang <zhang.chunyan@linaro.org>
Date: Fri, 31 Jul 2015 09:37:26 -0600
Subject: Coresight: Add an interface for supporting ETM3/4 Context ID tracing

If PID namespace is enabled, everytime users configure the Context ID
register to trace the specific process, there needs to be a translation
between the real PID seen from the kernel and VPID seen from the
namespace in which the user's process resides .

This patch just adds the translation interface for ETMs.

Signed-off-by: Chunyan Zhang <zhang.chunyan@linaro.org>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/coresight.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 3486b9082adb..626da6948ca2 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -248,4 +248,24 @@ static inline struct coresight_platform_data *of_get_coresight_platform_data(
 	struct device *dev, struct device_node *node) { return NULL; }
 #endif
 
+#ifdef CONFIG_PID_NS
+static inline unsigned long
+coresight_vpid_to_pid(unsigned long vpid)
+{
+	struct task_struct *task = NULL;
+	unsigned long pid = 0;
+
+	rcu_read_lock();
+	task = find_task_by_vpid(vpid);
+	if (task)
+		pid = task_pid_nr(task);
+	rcu_read_unlock();
+
+	return pid;
+}
+#else
+static inline unsigned long
+coresight_vpid_to_pid(unsigned long vpid) { return vpid; }
+#endif
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From ff63ec1312dabd28876c9c03b5ed172a879bfb60 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 31 Jul 2015 09:37:30 -0600
Subject: coresight: Fix implicit inclusion of linux/sched.h

The patch "Coresight: Add an interface for supporting ETM3/4 Context ID
tracing" adds uses of find_task_by_vpid() and task_pid_nr() from
linux/sched.h but does not include that header causing build errors in
at least an ARM allmodconfig where it is not implicitly included. Add an
explicit include to fix that.

Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/coresight.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 626da6948ca2..c69e1b932809 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -14,6 +14,7 @@
 #define _LINUX_CORESIGHT_H
 
 #include <linux/device.h>
+#include <linux/sched.h>
 
 /* Peripheral id registers (0xFD0-0xFEC) */
 #define CORESIGHT_PERIPHIDR4	0xfd0
-- 
cgit v1.2.3-70-g09d2


From eace75cfdcf7d9937d8c1fb226780123c64d72c4 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Mon, 27 Jul 2015 12:13:19 +0100
Subject: nvmem: Add a simple NVMEM framework for nvmem providers

This patch adds just providers part of the framework just to enable easy
review.

Up until now, NVMEM drivers like eeprom were stored in drivers/misc,
where they all had to duplicate pretty much the same code to register
a sysfs file, allow in-kernel users to access the content of the devices
they were driving, etc.

This was also a problem as far as other in-kernel users were involved,
since the solutions used were pretty much different from on driver to
another, there was a rather big abstraction leak.

This introduction of this framework aims at solving this. It also
introduces DT representation for consumer devices to go get the data
they require (MAC Addresses, SoC/Revision ID, part numbers, and so on)
from the nvmems.

Having regmap interface to this framework would give much better
abstraction for nvmems on different buses.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
[Maxime Ripard: intial version of eeprom framework]
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Tested-by: Stefan Wahren <stefan.wahren@i2se.com>
Tested-by: Philipp Zabel <p.zabel@pengutronix.de>
Tested-by: Rajendra Nayak <rnayak@codeaurora.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/Kconfig                |   2 +
 drivers/Makefile               |   1 +
 drivers/nvmem/Kconfig          |  13 ++
 drivers/nvmem/Makefile         |   6 +
 drivers/nvmem/core.c           | 406 +++++++++++++++++++++++++++++++++++++++++
 include/linux/nvmem-consumer.h |  23 +++
 include/linux/nvmem-provider.h |  47 +++++
 7 files changed, 498 insertions(+)
 create mode 100644 drivers/nvmem/Kconfig
 create mode 100644 drivers/nvmem/Makefile
 create mode 100644 drivers/nvmem/core.c
 create mode 100644 include/linux/nvmem-consumer.h
 create mode 100644 include/linux/nvmem-provider.h

(limited to 'include/linux')

diff --git a/drivers/Kconfig b/drivers/Kconfig
index 6e973b8e3a3b..4e2e6aaf0b88 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -184,4 +184,6 @@ source "drivers/android/Kconfig"
 
 source "drivers/nvdimm/Kconfig"
 
+source "drivers/nvmem/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index b64b49f6e01b..4c270f5414f0 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -165,3 +165,4 @@ obj-$(CONFIG_RAS)		+= ras/
 obj-$(CONFIG_THUNDERBOLT)	+= thunderbolt/
 obj-$(CONFIG_CORESIGHT)		+= hwtracing/coresight/
 obj-$(CONFIG_ANDROID)		+= android/
+obj-$(CONFIG_NVMEM)		+= nvmem/
diff --git a/drivers/nvmem/Kconfig b/drivers/nvmem/Kconfig
new file mode 100644
index 000000000000..de90c82d891b
--- /dev/null
+++ b/drivers/nvmem/Kconfig
@@ -0,0 +1,13 @@
+menuconfig NVMEM
+	tristate "NVMEM Support"
+	select REGMAP
+	help
+	  Support for NVMEM(Non Volatile Memory) devices like EEPROM, EFUSES...
+
+	  This framework is designed to provide a generic interface to NVMEM
+	  from both the Linux Kernel and the userspace.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called nvmem_core.
+
+	  If unsure, say no.
diff --git a/drivers/nvmem/Makefile b/drivers/nvmem/Makefile
new file mode 100644
index 000000000000..6df2c6952ad5
--- /dev/null
+++ b/drivers/nvmem/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for nvmem drivers.
+#
+
+obj-$(CONFIG_NVMEM)		+= nvmem_core.o
+nvmem_core-y			:= core.o
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
new file mode 100644
index 000000000000..2b024915e224
--- /dev/null
+++ b/drivers/nvmem/core.c
@@ -0,0 +1,406 @@
+/*
+ * nvmem framework core.
+ *
+ * Copyright (C) 2015 Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
+ * Copyright (C) 2013 Maxime Ripard <maxime.ripard@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/nvmem-consumer.h>
+#include <linux/nvmem-provider.h>
+#include <linux/of.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+struct nvmem_device {
+	const char		*name;
+	struct regmap		*regmap;
+	struct module		*owner;
+	struct device		dev;
+	int			stride;
+	int			word_size;
+	int			ncells;
+	int			id;
+	int			users;
+	size_t			size;
+	bool			read_only;
+};
+
+struct nvmem_cell {
+	const char		*name;
+	int			offset;
+	int			bytes;
+	int			bit_offset;
+	int			nbits;
+	struct nvmem_device	*nvmem;
+	struct list_head	node;
+};
+
+static DEFINE_MUTEX(nvmem_mutex);
+static DEFINE_IDA(nvmem_ida);
+
+static LIST_HEAD(nvmem_cells);
+static DEFINE_MUTEX(nvmem_cells_mutex);
+
+#define to_nvmem_device(d) container_of(d, struct nvmem_device, dev)
+
+static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj,
+				    struct bin_attribute *attr,
+				    char *buf, loff_t pos, size_t count)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct nvmem_device *nvmem = to_nvmem_device(dev);
+	int rc;
+
+	/* Stop the user from reading */
+	if (pos > nvmem->size)
+		return 0;
+
+	if (pos + count > nvmem->size)
+		count = nvmem->size - pos;
+
+	count = round_down(count, nvmem->word_size);
+
+	rc = regmap_raw_read(nvmem->regmap, pos, buf, count);
+
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+	return count;
+}
+
+static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj,
+				     struct bin_attribute *attr,
+				     char *buf, loff_t pos, size_t count)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct nvmem_device *nvmem = to_nvmem_device(dev);
+	int rc;
+
+	/* Stop the user from writing */
+	if (pos > nvmem->size)
+		return 0;
+
+	if (pos + count > nvmem->size)
+		count = nvmem->size - pos;
+
+	count = round_down(count, nvmem->word_size);
+
+	rc = regmap_raw_write(nvmem->regmap, pos, buf, count);
+
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+	return count;
+}
+
+/* default read/write permissions */
+static struct bin_attribute bin_attr_rw_nvmem = {
+	.attr	= {
+		.name	= "nvmem",
+		.mode	= S_IWUSR | S_IRUGO,
+	},
+	.read	= bin_attr_nvmem_read,
+	.write	= bin_attr_nvmem_write,
+};
+
+static struct bin_attribute *nvmem_bin_rw_attributes[] = {
+	&bin_attr_rw_nvmem,
+	NULL,
+};
+
+static const struct attribute_group nvmem_bin_rw_group = {
+	.bin_attrs	= nvmem_bin_rw_attributes,
+};
+
+static const struct attribute_group *nvmem_rw_dev_groups[] = {
+	&nvmem_bin_rw_group,
+	NULL,
+};
+
+/* read only permission */
+static struct bin_attribute bin_attr_ro_nvmem = {
+	.attr	= {
+		.name	= "nvmem",
+		.mode	= S_IRUGO,
+	},
+	.read	= bin_attr_nvmem_read,
+};
+
+static struct bin_attribute *nvmem_bin_ro_attributes[] = {
+	&bin_attr_ro_nvmem,
+	NULL,
+};
+
+static const struct attribute_group nvmem_bin_ro_group = {
+	.bin_attrs	= nvmem_bin_ro_attributes,
+};
+
+static const struct attribute_group *nvmem_ro_dev_groups[] = {
+	&nvmem_bin_ro_group,
+	NULL,
+};
+
+static void nvmem_release(struct device *dev)
+{
+	struct nvmem_device *nvmem = to_nvmem_device(dev);
+
+	ida_simple_remove(&nvmem_ida, nvmem->id);
+	kfree(nvmem);
+}
+
+static const struct device_type nvmem_provider_type = {
+	.release	= nvmem_release,
+};
+
+static struct bus_type nvmem_bus_type = {
+	.name		= "nvmem",
+};
+
+static int of_nvmem_match(struct device *dev, void *nvmem_np)
+{
+	return dev->of_node == nvmem_np;
+}
+
+static struct nvmem_device *of_nvmem_find(struct device_node *nvmem_np)
+{
+	struct device *d;
+
+	if (!nvmem_np)
+		return NULL;
+
+	d = bus_find_device(&nvmem_bus_type, NULL, nvmem_np, of_nvmem_match);
+
+	if (!d)
+		return NULL;
+
+	return to_nvmem_device(d);
+}
+
+static struct nvmem_cell *nvmem_find_cell(const char *cell_id)
+{
+	struct nvmem_cell *p;
+
+	list_for_each_entry(p, &nvmem_cells, node)
+		if (p && !strcmp(p->name, cell_id))
+			return p;
+
+	return NULL;
+}
+
+static void nvmem_cell_drop(struct nvmem_cell *cell)
+{
+	mutex_lock(&nvmem_cells_mutex);
+	list_del(&cell->node);
+	mutex_unlock(&nvmem_cells_mutex);
+	kfree(cell);
+}
+
+static void nvmem_device_remove_all_cells(const struct nvmem_device *nvmem)
+{
+	struct nvmem_cell *cell;
+	struct list_head *p, *n;
+
+	list_for_each_safe(p, n, &nvmem_cells) {
+		cell = list_entry(p, struct nvmem_cell, node);
+		if (cell->nvmem == nvmem)
+			nvmem_cell_drop(cell);
+	}
+}
+
+static void nvmem_cell_add(struct nvmem_cell *cell)
+{
+	mutex_lock(&nvmem_cells_mutex);
+	list_add_tail(&cell->node, &nvmem_cells);
+	mutex_unlock(&nvmem_cells_mutex);
+}
+
+static int nvmem_cell_info_to_nvmem_cell(struct nvmem_device *nvmem,
+				   const struct nvmem_cell_info *info,
+				   struct nvmem_cell *cell)
+{
+	cell->nvmem = nvmem;
+	cell->offset = info->offset;
+	cell->bytes = info->bytes;
+	cell->name = info->name;
+
+	cell->bit_offset = info->bit_offset;
+	cell->nbits = info->nbits;
+
+	if (cell->nbits)
+		cell->bytes = DIV_ROUND_UP(cell->nbits + cell->bit_offset,
+					   BITS_PER_BYTE);
+
+	if (!IS_ALIGNED(cell->offset, nvmem->stride)) {
+		dev_err(&nvmem->dev,
+			"cell %s unaligned to nvmem stride %d\n",
+			cell->name, nvmem->stride);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nvmem_add_cells(struct nvmem_device *nvmem,
+			   const struct nvmem_config *cfg)
+{
+	struct nvmem_cell **cells;
+	const struct nvmem_cell_info *info = cfg->cells;
+	int i, rval;
+
+	cells = kcalloc(cfg->ncells, sizeof(*cells), GFP_KERNEL);
+	if (!cells)
+		return -ENOMEM;
+
+	for (i = 0; i < cfg->ncells; i++) {
+		cells[i] = kzalloc(sizeof(**cells), GFP_KERNEL);
+		if (!cells[i]) {
+			rval = -ENOMEM;
+			goto err;
+		}
+
+		rval = nvmem_cell_info_to_nvmem_cell(nvmem, &info[i], cells[i]);
+		if (IS_ERR_VALUE(rval)) {
+			kfree(cells[i]);
+			goto err;
+		}
+
+		nvmem_cell_add(cells[i]);
+	}
+
+	nvmem->ncells = cfg->ncells;
+	/* remove tmp array */
+	kfree(cells);
+
+	return 0;
+err:
+	while (--i)
+		nvmem_cell_drop(cells[i]);
+
+	return rval;
+}
+
+/**
+ * nvmem_register() - Register a nvmem device for given nvmem_config.
+ * Also creates an binary entry in /sys/bus/nvmem/devices/dev-name/nvmem
+ *
+ * @config: nvmem device configuration with which nvmem device is created.
+ *
+ * Return: Will be an ERR_PTR() on error or a valid pointer to nvmem_device
+ * on success.
+ */
+
+struct nvmem_device *nvmem_register(const struct nvmem_config *config)
+{
+	struct nvmem_device *nvmem;
+	struct device_node *np;
+	struct regmap *rm;
+	int rval;
+
+	if (!config->dev)
+		return ERR_PTR(-EINVAL);
+
+	rm = dev_get_regmap(config->dev, NULL);
+	if (!rm) {
+		dev_err(config->dev, "Regmap not found\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	nvmem = kzalloc(sizeof(*nvmem), GFP_KERNEL);
+	if (!nvmem)
+		return ERR_PTR(-ENOMEM);
+
+	rval  = ida_simple_get(&nvmem_ida, 0, 0, GFP_KERNEL);
+	if (rval < 0) {
+		kfree(nvmem);
+		return ERR_PTR(rval);
+	}
+
+	nvmem->id = rval;
+	nvmem->regmap = rm;
+	nvmem->owner = config->owner;
+	nvmem->stride = regmap_get_reg_stride(rm);
+	nvmem->word_size = regmap_get_val_bytes(rm);
+	nvmem->size = regmap_get_max_register(rm) + nvmem->stride;
+	nvmem->dev.type = &nvmem_provider_type;
+	nvmem->dev.bus = &nvmem_bus_type;
+	nvmem->dev.parent = config->dev;
+	np = config->dev->of_node;
+	nvmem->dev.of_node = np;
+	dev_set_name(&nvmem->dev, "%s%d",
+		     config->name ? : "nvmem", config->id);
+
+	nvmem->read_only = of_property_read_bool(np, "read-only") |
+			   config->read_only;
+
+	nvmem->dev.groups = nvmem->read_only ? nvmem_ro_dev_groups :
+					       nvmem_rw_dev_groups;
+
+	device_initialize(&nvmem->dev);
+
+	dev_dbg(&nvmem->dev, "Registering nvmem device %s\n", config->name);
+
+	rval = device_add(&nvmem->dev);
+	if (rval) {
+		ida_simple_remove(&nvmem_ida, nvmem->id);
+		kfree(nvmem);
+		return ERR_PTR(rval);
+	}
+
+	if (config->cells)
+		nvmem_add_cells(nvmem, config);
+
+	return nvmem;
+}
+EXPORT_SYMBOL_GPL(nvmem_register);
+
+/**
+ * nvmem_unregister() - Unregister previously registered nvmem device
+ *
+ * @nvmem: Pointer to previously registered nvmem device.
+ *
+ * Return: Will be an negative on error or a zero on success.
+ */
+int nvmem_unregister(struct nvmem_device *nvmem)
+{
+	if (nvmem->users)
+		return -EBUSY;
+
+	nvmem_device_remove_all_cells(nvmem);
+	device_del(&nvmem->dev);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvmem_unregister);
+
+static int __init nvmem_init(void)
+{
+	return bus_register(&nvmem_bus_type);
+}
+
+static void __exit nvmem_exit(void)
+{
+	bus_unregister(&nvmem_bus_type);
+}
+
+subsys_initcall(nvmem_init);
+module_exit(nvmem_exit);
+
+MODULE_AUTHOR("Srinivas Kandagatla <srinivas.kandagatla@linaro.org");
+MODULE_AUTHOR("Maxime Ripard <maxime.ripard@free-electrons.com");
+MODULE_DESCRIPTION("nvmem Driver Core");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
new file mode 100644
index 000000000000..1e9e7678a501
--- /dev/null
+++ b/include/linux/nvmem-consumer.h
@@ -0,0 +1,23 @@
+/*
+ * nvmem framework consumer.
+ *
+ * Copyright (C) 2015 Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
+ * Copyright (C) 2013 Maxime Ripard <maxime.ripard@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _LINUX_NVMEM_CONSUMER_H
+#define _LINUX_NVMEM_CONSUMER_H
+
+struct nvmem_cell_info {
+	const char		*name;
+	unsigned int		offset;
+	unsigned int		bytes;
+	unsigned int		bit_offset;
+	unsigned int		nbits;
+};
+
+#endif  /* ifndef _LINUX_NVMEM_CONSUMER_H */
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
new file mode 100644
index 000000000000..0b68caff1b3c
--- /dev/null
+++ b/include/linux/nvmem-provider.h
@@ -0,0 +1,47 @@
+/*
+ * nvmem framework provider.
+ *
+ * Copyright (C) 2015 Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
+ * Copyright (C) 2013 Maxime Ripard <maxime.ripard@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _LINUX_NVMEM_PROVIDER_H
+#define _LINUX_NVMEM_PROVIDER_H
+
+struct nvmem_device;
+struct nvmem_cell_info;
+
+struct nvmem_config {
+	struct device		*dev;
+	const char		*name;
+	int			id;
+	struct module		*owner;
+	const struct nvmem_cell_info	*cells;
+	int			ncells;
+	bool			read_only;
+};
+
+#if IS_ENABLED(CONFIG_NVMEM)
+
+struct nvmem_device *nvmem_register(const struct nvmem_config *cfg);
+int nvmem_unregister(struct nvmem_device *nvmem);
+
+#else
+
+static inline struct nvmem_device *nvmem_register(const struct nvmem_config *c)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline int nvmem_unregister(struct nvmem_device *nvmem)
+{
+	return -ENOSYS;
+}
+
+#endif /* CONFIG_NVMEM */
+
+#endif  /* ifndef _LINUX_NVMEM_PROVIDER_H */
-- 
cgit v1.2.3-70-g09d2


From 69aba7948cbe53f2f1827e84e9dd0ae470a5072e Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Mon, 27 Jul 2015 12:13:34 +0100
Subject: nvmem: Add a simple NVMEM framework for consumers

This patch adds just consumers part of the framework just to enable easy
review.

Up until now, nvmem drivers were stored in drivers/misc, where they all
had to duplicate pretty much the same code to register a sysfs file,
allow in-kernel users to access the content of the devices they were
driving, etc.

This was also a problem as far as other in-kernel users were involved,
since the solutions used were pretty much different from on driver to
another, there was a rather big abstraction leak.

This introduction of this framework aims at solving this. It also
introduces DT representation for consumer devices to go get the data they
require (MAC Addresses, SoC/Revision ID, part numbers, and so on) from
the nvmems.

Having regmap interface to this framework would give much better
abstraction for nvmems on different buses.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
[Maxime Ripard: intial version of the framework]
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Tested-by: Stefan Wahren <stefan.wahren@i2se.com>
Tested-by: Philipp Zabel <p.zabel@pengutronix.de>
Tested-by: Rajendra Nayak <rnayak@codeaurora.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 421 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/nvmem-consumer.h |  61 ++++++
 2 files changed, 481 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 2b024915e224..8c16ae2e1308 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -377,8 +377,12 @@ EXPORT_SYMBOL_GPL(nvmem_register);
  */
 int nvmem_unregister(struct nvmem_device *nvmem)
 {
-	if (nvmem->users)
+	mutex_lock(&nvmem_mutex);
+	if (nvmem->users) {
+		mutex_unlock(&nvmem_mutex);
 		return -EBUSY;
+	}
+	mutex_unlock(&nvmem_mutex);
 
 	nvmem_device_remove_all_cells(nvmem);
 	device_del(&nvmem->dev);
@@ -387,6 +391,421 @@ int nvmem_unregister(struct nvmem_device *nvmem)
 }
 EXPORT_SYMBOL_GPL(nvmem_unregister);
 
+static struct nvmem_device *__nvmem_device_get(struct device_node *np,
+					       struct nvmem_cell **cellp,
+					       const char *cell_id)
+{
+	struct nvmem_device *nvmem = NULL;
+
+	mutex_lock(&nvmem_mutex);
+
+	if (np) {
+		nvmem = of_nvmem_find(np);
+		if (!nvmem) {
+			mutex_unlock(&nvmem_mutex);
+			return ERR_PTR(-EPROBE_DEFER);
+		}
+	} else {
+		struct nvmem_cell *cell = nvmem_find_cell(cell_id);
+
+		if (cell) {
+			nvmem = cell->nvmem;
+			*cellp = cell;
+		}
+
+		if (!nvmem) {
+			mutex_unlock(&nvmem_mutex);
+			return ERR_PTR(-ENOENT);
+		}
+	}
+
+	nvmem->users++;
+	mutex_unlock(&nvmem_mutex);
+
+	if (!try_module_get(nvmem->owner)) {
+		dev_err(&nvmem->dev,
+			"could not increase module refcount for cell %s\n",
+			nvmem->name);
+
+		mutex_lock(&nvmem_mutex);
+		nvmem->users--;
+		mutex_unlock(&nvmem_mutex);
+
+		return ERR_PTR(-EINVAL);
+	}
+
+	return nvmem;
+}
+
+static void __nvmem_device_put(struct nvmem_device *nvmem)
+{
+	module_put(nvmem->owner);
+	mutex_lock(&nvmem_mutex);
+	nvmem->users--;
+	mutex_unlock(&nvmem_mutex);
+}
+
+static struct nvmem_cell *nvmem_cell_get_from_list(const char *cell_id)
+{
+	struct nvmem_cell *cell = NULL;
+	struct nvmem_device *nvmem;
+
+	nvmem = __nvmem_device_get(NULL, &cell, cell_id);
+	if (IS_ERR(nvmem))
+		return ERR_CAST(nvmem);
+
+	return cell;
+}
+
+#if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF)
+/**
+ * of_nvmem_cell_get() - Get a nvmem cell from given device node and cell id
+ *
+ * @dev node: Device tree node that uses the nvmem cell
+ * @id: nvmem cell name from nvmem-cell-names property.
+ *
+ * Return: Will be an ERR_PTR() on error or a valid pointer
+ * to a struct nvmem_cell.  The nvmem_cell will be freed by the
+ * nvmem_cell_put().
+ */
+struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
+					    const char *name)
+{
+	struct device_node *cell_np, *nvmem_np;
+	struct nvmem_cell *cell;
+	struct nvmem_device *nvmem;
+	const __be32 *addr;
+	int rval, len, index;
+
+	index = of_property_match_string(np, "nvmem-cell-names", name);
+
+	cell_np = of_parse_phandle(np, "nvmem-cells", index);
+	if (!cell_np)
+		return ERR_PTR(-EINVAL);
+
+	nvmem_np = of_get_next_parent(cell_np);
+	if (!nvmem_np)
+		return ERR_PTR(-EINVAL);
+
+	nvmem = __nvmem_device_get(nvmem_np, NULL, NULL);
+	if (IS_ERR(nvmem))
+		return ERR_CAST(nvmem);
+
+	addr = of_get_property(cell_np, "reg", &len);
+	if (!addr || (len < 2 * sizeof(u32))) {
+		dev_err(&nvmem->dev, "nvmem: invalid reg on %s\n",
+			cell_np->full_name);
+		rval  = -EINVAL;
+		goto err_mem;
+	}
+
+	cell = kzalloc(sizeof(*cell), GFP_KERNEL);
+	if (!cell) {
+		rval = -ENOMEM;
+		goto err_mem;
+	}
+
+	cell->nvmem = nvmem;
+	cell->offset = be32_to_cpup(addr++);
+	cell->bytes = be32_to_cpup(addr);
+	cell->name = cell_np->name;
+
+	addr = of_get_property(cell_np, "bits", &len);
+	if (addr && len == (2 * sizeof(u32))) {
+		cell->bit_offset = be32_to_cpup(addr++);
+		cell->nbits = be32_to_cpup(addr);
+	}
+
+	if (cell->nbits)
+		cell->bytes = DIV_ROUND_UP(cell->nbits + cell->bit_offset,
+					   BITS_PER_BYTE);
+
+	if (!IS_ALIGNED(cell->offset, nvmem->stride)) {
+			dev_err(&nvmem->dev,
+				"cell %s unaligned to nvmem stride %d\n",
+				cell->name, nvmem->stride);
+		rval  = -EINVAL;
+		goto err_sanity;
+	}
+
+	nvmem_cell_add(cell);
+
+	return cell;
+
+err_sanity:
+	kfree(cell);
+
+err_mem:
+	__nvmem_device_put(nvmem);
+
+	return ERR_PTR(rval);
+}
+EXPORT_SYMBOL_GPL(of_nvmem_cell_get);
+#endif
+
+/**
+ * nvmem_cell_get() - Get nvmem cell of device form a given cell name
+ *
+ * @dev node: Device tree node that uses the nvmem cell
+ * @id: nvmem cell name to get.
+ *
+ * Return: Will be an ERR_PTR() on error or a valid pointer
+ * to a struct nvmem_cell.  The nvmem_cell will be freed by the
+ * nvmem_cell_put().
+ */
+struct nvmem_cell *nvmem_cell_get(struct device *dev, const char *cell_id)
+{
+	struct nvmem_cell *cell;
+
+	if (dev->of_node) { /* try dt first */
+		cell = of_nvmem_cell_get(dev->of_node, cell_id);
+		if (!IS_ERR(cell) || PTR_ERR(cell) == -EPROBE_DEFER)
+			return cell;
+	}
+
+	return nvmem_cell_get_from_list(cell_id);
+}
+EXPORT_SYMBOL_GPL(nvmem_cell_get);
+
+static void devm_nvmem_cell_release(struct device *dev, void *res)
+{
+	nvmem_cell_put(*(struct nvmem_cell **)res);
+}
+
+/**
+ * devm_nvmem_cell_get() - Get nvmem cell of device form a given id
+ *
+ * @dev node: Device tree node that uses the nvmem cell
+ * @id: nvmem id in nvmem-names property.
+ *
+ * Return: Will be an ERR_PTR() on error or a valid pointer
+ * to a struct nvmem_cell.  The nvmem_cell will be freed by the
+ * automatically once the device is freed.
+ */
+struct nvmem_cell *devm_nvmem_cell_get(struct device *dev, const char *id)
+{
+	struct nvmem_cell **ptr, *cell;
+
+	ptr = devres_alloc(devm_nvmem_cell_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	cell = nvmem_cell_get(dev, id);
+	if (!IS_ERR(cell)) {
+		*ptr = cell;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return cell;
+}
+EXPORT_SYMBOL_GPL(devm_nvmem_cell_get);
+
+static int devm_nvmem_cell_match(struct device *dev, void *res, void *data)
+{
+	struct nvmem_cell **c = res;
+
+	if (WARN_ON(!c || !*c))
+		return 0;
+
+	return *c == data;
+}
+
+/**
+ * devm_nvmem_cell_put() - Release previously allocated nvmem cell
+ * from devm_nvmem_cell_get.
+ *
+ * @cell: Previously allocated nvmem cell by devm_nvmem_cell_get()
+ */
+void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell)
+{
+	int ret;
+
+	ret = devres_release(dev, devm_nvmem_cell_release,
+				devm_nvmem_cell_match, cell);
+
+	WARN_ON(ret);
+}
+EXPORT_SYMBOL(devm_nvmem_cell_put);
+
+/**
+ * nvmem_cell_put() - Release previously allocated nvmem cell.
+ *
+ * @cell: Previously allocated nvmem cell by nvmem_cell_get()
+ */
+void nvmem_cell_put(struct nvmem_cell *cell)
+{
+	struct nvmem_device *nvmem = cell->nvmem;
+
+	__nvmem_device_put(nvmem);
+	nvmem_cell_drop(cell);
+}
+EXPORT_SYMBOL_GPL(nvmem_cell_put);
+
+static inline void nvmem_shift_read_buffer_in_place(struct nvmem_cell *cell,
+						    void *buf)
+{
+	u8 *p, *b;
+	int i, bit_offset = cell->bit_offset;
+
+	p = b = buf;
+	if (bit_offset) {
+		/* First shift */
+		*b++ >>= bit_offset;
+
+		/* setup rest of the bytes if any */
+		for (i = 1; i < cell->bytes; i++) {
+			/* Get bits from next byte and shift them towards msb */
+			*p |= *b << (BITS_PER_BYTE - bit_offset);
+
+			p = b;
+			*b++ >>= bit_offset;
+		}
+
+		/* result fits in less bytes */
+		if (cell->bytes != DIV_ROUND_UP(cell->nbits, BITS_PER_BYTE))
+			*p-- = 0;
+	}
+	/* clear msb bits if any leftover in the last byte */
+	*p &= GENMASK((cell->nbits%BITS_PER_BYTE) - 1, 0);
+}
+
+static int __nvmem_cell_read(struct nvmem_device *nvmem,
+		      struct nvmem_cell *cell,
+		      void *buf, size_t *len)
+{
+	int rc;
+
+	rc = regmap_raw_read(nvmem->regmap, cell->offset, buf, cell->bytes);
+
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+	/* shift bits in-place */
+	if (cell->bit_offset || cell->bit_offset)
+		nvmem_shift_read_buffer_in_place(cell, buf);
+
+	*len = cell->bytes;
+
+	return 0;
+}
+
+/**
+ * nvmem_cell_read() - Read a given nvmem cell
+ *
+ * @cell: nvmem cell to be read.
+ * @len: pointer to length of cell which will be populated on successful read.
+ *
+ * Return: ERR_PTR() on error or a valid pointer to a char * buffer on success.
+ * The buffer should be freed by the consumer with a kfree().
+ */
+void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len)
+{
+	struct nvmem_device *nvmem = cell->nvmem;
+	u8 *buf;
+	int rc;
+
+	if (!nvmem || !nvmem->regmap)
+		return ERR_PTR(-EINVAL);
+
+	buf = kzalloc(cell->bytes, GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	rc = __nvmem_cell_read(nvmem, cell, buf, len);
+	if (IS_ERR_VALUE(rc)) {
+		kfree(buf);
+		return ERR_PTR(rc);
+	}
+
+	return buf;
+}
+EXPORT_SYMBOL_GPL(nvmem_cell_read);
+
+static inline void *nvmem_cell_prepare_write_buffer(struct nvmem_cell *cell,
+						    u8 *_buf, int len)
+{
+	struct nvmem_device *nvmem = cell->nvmem;
+	int i, rc, nbits, bit_offset = cell->bit_offset;
+	u8 v, *p, *buf, *b, pbyte, pbits;
+
+	nbits = cell->nbits;
+	buf = kzalloc(cell->bytes, GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(buf, _buf, len);
+	p = b = buf;
+
+	if (bit_offset) {
+		pbyte = *b;
+		*b <<= bit_offset;
+
+		/* setup the first byte with lsb bits from nvmem */
+		rc = regmap_raw_read(nvmem->regmap, cell->offset, &v, 1);
+		*b++ |= GENMASK(bit_offset - 1, 0) & v;
+
+		/* setup rest of the byte if any */
+		for (i = 1; i < cell->bytes; i++) {
+			/* Get last byte bits and shift them towards lsb */
+			pbits = pbyte >> (BITS_PER_BYTE - 1 - bit_offset);
+			pbyte = *b;
+			p = b;
+			*b <<= bit_offset;
+			*b++ |= pbits;
+		}
+	}
+
+	/* if it's not end on byte boundary */
+	if ((nbits + bit_offset) % BITS_PER_BYTE) {
+		/* setup the last byte with msb bits from nvmem */
+		rc = regmap_raw_read(nvmem->regmap,
+				    cell->offset + cell->bytes - 1, &v, 1);
+		*p |= GENMASK(7, (nbits + bit_offset) % BITS_PER_BYTE) & v;
+
+	}
+
+	return buf;
+}
+
+/**
+ * nvmem_cell_write() - Write to a given nvmem cell
+ *
+ * @cell: nvmem cell to be written.
+ * @buf: Buffer to be written.
+ * @len: length of buffer to be written to nvmem cell.
+ *
+ * Return: length of bytes written or negative on failure.
+ */
+int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len)
+{
+	struct nvmem_device *nvmem = cell->nvmem;
+	int rc;
+
+	if (!nvmem || !nvmem->regmap || nvmem->read_only ||
+	    (cell->bit_offset == 0 && len != cell->bytes))
+		return -EINVAL;
+
+	if (cell->bit_offset || cell->nbits) {
+		buf = nvmem_cell_prepare_write_buffer(cell, buf, len);
+		if (IS_ERR(buf))
+			return PTR_ERR(buf);
+	}
+
+	rc = regmap_raw_write(nvmem->regmap, cell->offset, buf, cell->bytes);
+
+	/* free the tmp buffer */
+	if (cell->bit_offset)
+		kfree(buf);
+
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(nvmem_cell_write);
+
 static int __init nvmem_init(void)
 {
 	return bus_register(&nvmem_bus_type);
diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 1e9e7678a501..297cc67b7211 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -12,6 +12,11 @@
 #ifndef _LINUX_NVMEM_CONSUMER_H
 #define _LINUX_NVMEM_CONSUMER_H
 
+struct device;
+struct device_node;
+/* consumer cookie */
+struct nvmem_cell;
+
 struct nvmem_cell_info {
 	const char		*name;
 	unsigned int		offset;
@@ -20,4 +25,60 @@ struct nvmem_cell_info {
 	unsigned int		nbits;
 };
 
+#if IS_ENABLED(CONFIG_NVMEM)
+
+/* Cell based interface */
+struct nvmem_cell *nvmem_cell_get(struct device *dev, const char *name);
+struct nvmem_cell *devm_nvmem_cell_get(struct device *dev, const char *name);
+void nvmem_cell_put(struct nvmem_cell *cell);
+void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
+void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len);
+int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len);
+
+#else
+
+static inline struct nvmem_cell *nvmem_cell_get(struct device *dev,
+						const char *name)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct nvmem_cell *devm_nvmem_cell_get(struct device *dev,
+				       const char *name)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline void devm_nvmem_cell_put(struct device *dev,
+				       struct nvmem_cell *cell)
+{
+
+}
+static inline void nvmem_cell_put(struct nvmem_cell *cell)
+{
+}
+
+static inline char *nvmem_cell_read(struct nvmem_cell *cell, size_t *len)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline int nvmem_cell_write(struct nvmem_cell *cell,
+				    const char *buf, size_t len)
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_NVMEM */
+
+#if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF)
+struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
+				     const char *name);
+#else
+static inline struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
+				     const char *name)
+{
+	return ERR_PTR(-ENOSYS);
+}
+#endif /* CONFIG_NVMEM && CONFIG_OF */
+
 #endif  /* ifndef _LINUX_NVMEM_CONSUMER_H */
-- 
cgit v1.2.3-70-g09d2


From e2a5402ec7c6d0442cca370a0097e75750f81398 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Mon, 27 Jul 2015 12:13:45 +0100
Subject: nvmem: Add nvmem_device based consumer apis.

This patch adds read/write apis which are based on nvmem_device. It is
common that the drivers like omap cape manager or qcom cpr driver to
access bytes directly at particular offset in the eeprom and not from
nvmem cell info in DT. These driver would need to get access to the nvmem
directly, which is what these new APIS provide.

These wrapper apis would help such users to avoid code duplication in
there drivers and also avoid them reading a big eeprom blob and parsing
it internally in there driver.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Tested-by: Stefan Wahren <stefan.wahren@i2se.com>
Tested-by: Philipp Zabel <p.zabel@pengutronix.de>
Tested-by: Rajendra Nayak <rnayak@codeaurora.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 258 +++++++++++++++++++++++++++++++++++++++++
 include/linux/nvmem-consumer.h |  73 ++++++++++++
 2 files changed, 331 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 8c16ae2e1308..d3c6676b3c0c 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -445,6 +445,148 @@ static void __nvmem_device_put(struct nvmem_device *nvmem)
 	mutex_unlock(&nvmem_mutex);
 }
 
+static int nvmem_match(struct device *dev, void *data)
+{
+	return !strcmp(dev_name(dev), data);
+}
+
+static struct nvmem_device *nvmem_find(const char *name)
+{
+	struct device *d;
+
+	d = bus_find_device(&nvmem_bus_type, NULL, (void *)name, nvmem_match);
+
+	if (!d)
+		return NULL;
+
+	return to_nvmem_device(d);
+}
+
+#if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF)
+/**
+ * of_nvmem_device_get() - Get nvmem device from a given id
+ *
+ * @dev node: Device tree node that uses the nvmem device
+ * @id: nvmem name from nvmem-names property.
+ *
+ * Return: ERR_PTR() on error or a valid pointer to a struct nvmem_device
+ * on success.
+ */
+struct nvmem_device *of_nvmem_device_get(struct device_node *np, const char *id)
+{
+
+	struct device_node *nvmem_np;
+	int index;
+
+	index = of_property_match_string(np, "nvmem-names", id);
+
+	nvmem_np = of_parse_phandle(np, "nvmem", index);
+	if (!nvmem_np)
+		return ERR_PTR(-EINVAL);
+
+	return __nvmem_device_get(nvmem_np, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(of_nvmem_device_get);
+#endif
+
+/**
+ * nvmem_device_get() - Get nvmem device from a given id
+ *
+ * @dev : Device that uses the nvmem device
+ * @id: nvmem name from nvmem-names property.
+ *
+ * Return: ERR_PTR() on error or a valid pointer to a struct nvmem_device
+ * on success.
+ */
+struct nvmem_device *nvmem_device_get(struct device *dev, const char *dev_name)
+{
+	if (dev->of_node) { /* try dt first */
+		struct nvmem_device *nvmem;
+
+		nvmem = of_nvmem_device_get(dev->of_node, dev_name);
+
+		if (!IS_ERR(nvmem) || PTR_ERR(nvmem) == -EPROBE_DEFER)
+			return nvmem;
+
+	}
+
+	return nvmem_find(dev_name);
+}
+EXPORT_SYMBOL_GPL(nvmem_device_get);
+
+static int devm_nvmem_device_match(struct device *dev, void *res, void *data)
+{
+	struct nvmem_device **nvmem = res;
+
+	if (WARN_ON(!nvmem || !*nvmem))
+		return 0;
+
+	return *nvmem == data;
+}
+
+static void devm_nvmem_device_release(struct device *dev, void *res)
+{
+	nvmem_device_put(*(struct nvmem_device **)res);
+}
+
+/**
+ * devm_nvmem_device_put() - put alredy got nvmem device
+ *
+ * @nvmem: pointer to nvmem device allocated by devm_nvmem_cell_get(),
+ * that needs to be released.
+ */
+void devm_nvmem_device_put(struct device *dev, struct nvmem_device *nvmem)
+{
+	int ret;
+
+	ret = devres_release(dev, devm_nvmem_device_release,
+			     devm_nvmem_device_match, nvmem);
+
+	WARN_ON(ret);
+}
+EXPORT_SYMBOL_GPL(devm_nvmem_device_put);
+
+/**
+ * nvmem_device_put() - put alredy got nvmem device
+ *
+ * @nvmem: pointer to nvmem device that needs to be released.
+ */
+void nvmem_device_put(struct nvmem_device *nvmem)
+{
+	__nvmem_device_put(nvmem);
+}
+EXPORT_SYMBOL_GPL(nvmem_device_put);
+
+/**
+ * devm_nvmem_device_get() - Get nvmem cell of device form a given id
+ *
+ * @dev node: Device tree node that uses the nvmem cell
+ * @id: nvmem name in nvmems property.
+ *
+ * Return: ERR_PTR() on error or a valid pointer to a struct nvmem_cell
+ * on success.  The nvmem_cell will be freed by the automatically once the
+ * device is freed.
+ */
+struct nvmem_device *devm_nvmem_device_get(struct device *dev, const char *id)
+{
+	struct nvmem_device **ptr, *nvmem;
+
+	ptr = devres_alloc(devm_nvmem_device_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	nvmem = nvmem_device_get(dev, id);
+	if (!IS_ERR(nvmem)) {
+		*ptr = nvmem;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return nvmem;
+}
+EXPORT_SYMBOL_GPL(devm_nvmem_device_get);
+
 static struct nvmem_cell *nvmem_cell_get_from_list(const char *cell_id)
 {
 	struct nvmem_cell *cell = NULL;
@@ -806,6 +948,122 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len)
 }
 EXPORT_SYMBOL_GPL(nvmem_cell_write);
 
+/**
+ * nvmem_device_cell_read() - Read a given nvmem device and cell
+ *
+ * @nvmem: nvmem device to read from.
+ * @info: nvmem cell info to be read.
+ * @buf: buffer pointer which will be populated on successful read.
+ *
+ * Return: length of successful bytes read on success and negative
+ * error code on error.
+ */
+ssize_t nvmem_device_cell_read(struct nvmem_device *nvmem,
+			   struct nvmem_cell_info *info, void *buf)
+{
+	struct nvmem_cell cell;
+	int rc;
+	ssize_t len;
+
+	if (!nvmem || !nvmem->regmap)
+		return -EINVAL;
+
+	rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell);
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+	rc = __nvmem_cell_read(nvmem, &cell, buf, &len);
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(nvmem_device_cell_read);
+
+/**
+ * nvmem_device_cell_write() - Write cell to a given nvmem device
+ *
+ * @nvmem: nvmem device to be written to.
+ * @info: nvmem cell info to be written
+ * @buf: buffer to be written to cell.
+ *
+ * Return: length of bytes written or negative error code on failure.
+ * */
+int nvmem_device_cell_write(struct nvmem_device *nvmem,
+			    struct nvmem_cell_info *info, void *buf)
+{
+	struct nvmem_cell cell;
+	int rc;
+
+	if (!nvmem || !nvmem->regmap)
+		return -EINVAL;
+
+	rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell);
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+	return nvmem_cell_write(&cell, buf, cell.bytes);
+}
+EXPORT_SYMBOL_GPL(nvmem_device_cell_write);
+
+/**
+ * nvmem_device_read() - Read from a given nvmem device
+ *
+ * @nvmem: nvmem device to read from.
+ * @offset: offset in nvmem device.
+ * @bytes: number of bytes to read.
+ * @buf: buffer pointer which will be populated on successful read.
+ *
+ * Return: length of successful bytes read on success and negative
+ * error code on error.
+ */
+int nvmem_device_read(struct nvmem_device *nvmem,
+		      unsigned int offset,
+		      size_t bytes, void *buf)
+{
+	int rc;
+
+	if (!nvmem || !nvmem->regmap)
+		return -EINVAL;
+
+	rc = regmap_raw_read(nvmem->regmap, offset, buf, bytes);
+
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(nvmem_device_read);
+
+/**
+ * nvmem_device_write() - Write cell to a given nvmem device
+ *
+ * @nvmem: nvmem device to be written to.
+ * @offset: offset in nvmem device.
+ * @bytes: number of bytes to write.
+ * @buf: buffer to be written.
+ *
+ * Return: length of bytes written or negative error code on failure.
+ * */
+int nvmem_device_write(struct nvmem_device *nvmem,
+		       unsigned int offset,
+		       size_t bytes, void *buf)
+{
+	int rc;
+
+	if (!nvmem || !nvmem->regmap)
+		return -EINVAL;
+
+	rc = regmap_raw_write(nvmem->regmap, offset, buf, bytes);
+
+	if (IS_ERR_VALUE(rc))
+		return rc;
+
+
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(nvmem_device_write);
+
 static int __init nvmem_init(void)
 {
 	return bus_register(&nvmem_bus_type);
diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 297cc67b7211..9bb77d3ed6e0 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -16,6 +16,7 @@ struct device;
 struct device_node;
 /* consumer cookie */
 struct nvmem_cell;
+struct nvmem_device;
 
 struct nvmem_cell_info {
 	const char		*name;
@@ -35,6 +36,21 @@ void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
 void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len);
 int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len);
 
+/* direct nvmem device read/write interface */
+struct nvmem_device *nvmem_device_get(struct device *dev, const char *name);
+struct nvmem_device *devm_nvmem_device_get(struct device *dev,
+					   const char *name);
+void nvmem_device_put(struct nvmem_device *nvmem);
+void devm_nvmem_device_put(struct device *dev, struct nvmem_device *nvmem);
+int nvmem_device_read(struct nvmem_device *nvmem, unsigned int offset,
+		      size_t bytes, void *buf);
+int nvmem_device_write(struct nvmem_device *nvmem, unsigned int offset,
+		       size_t bytes, void *buf);
+ssize_t nvmem_device_cell_read(struct nvmem_device *nvmem,
+			   struct nvmem_cell_info *info, void *buf);
+int nvmem_device_cell_write(struct nvmem_device *nvmem,
+			    struct nvmem_cell_info *info, void *buf);
+
 #else
 
 static inline struct nvmem_cell *nvmem_cell_get(struct device *dev,
@@ -68,17 +84,74 @@ static inline int nvmem_cell_write(struct nvmem_cell *cell,
 {
 	return -ENOSYS;
 }
+
+static inline struct nvmem_device *nvmem_device_get(struct device *dev,
+						    const char *name)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct nvmem_device *devm_nvmem_device_get(struct device *dev,
+							 const char *name)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline void nvmem_device_put(struct nvmem_device *nvmem)
+{
+}
+
+static inline void devm_nvmem_device_put(struct device *dev,
+					 struct nvmem_device *nvmem)
+{
+}
+
+static inline ssize_t nvmem_device_cell_read(struct nvmem_device *nvmem,
+					 struct nvmem_cell_info *info,
+					 void *buf)
+{
+	return -ENOSYS;
+}
+
+static inline int nvmem_device_cell_write(struct nvmem_device *nvmem,
+					  struct nvmem_cell_info *info,
+					  void *buf)
+{
+	return -ENOSYS;
+}
+
+static inline int nvmem_device_read(struct nvmem_device *nvmem,
+				    unsigned int offset, size_t bytes,
+				    void *buf)
+{
+	return -ENOSYS;
+}
+
+static inline int nvmem_device_write(struct nvmem_device *nvmem,
+				     unsigned int offset, size_t bytes,
+				     void *buf)
+{
+	return -ENOSYS;
+}
 #endif /* CONFIG_NVMEM */
 
 #if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF)
 struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
 				     const char *name);
+struct nvmem_device *of_nvmem_device_get(struct device_node *np,
+					 const char *name);
 #else
 static inline struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
 				     const char *name)
 {
 	return ERR_PTR(-ENOSYS);
 }
+
+static inline struct nvmem_device *of_nvmem_device_get(struct device_node *np,
+						       const char *name)
+{
+	return ERR_PTR(-ENOSYS);
+}
 #endif /* CONFIG_NVMEM && CONFIG_OF */
 
 #endif  /* ifndef _LINUX_NVMEM_CONSUMER_H */
-- 
cgit v1.2.3-70-g09d2


From f61ae4fb66a4f7ae49e3456003fc4328d6db09c9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 2 Aug 2015 20:38:26 +0000
Subject: genirq: Provide irq_desc_has_action

If we have a reference to irq_desc already, there is no point to do
another lookup.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/20150802203609.638130301@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdesc.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index fcea4e48e21f..5acfa26602e1 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -166,12 +166,16 @@ static inline int handle_domain_irq(struct irq_domain *domain,
 #endif
 
 /* Test to see if a driver has successfully requested an irq */
-static inline int irq_has_action(unsigned int irq)
+static inline int irq_desc_has_action(struct irq_desc *desc)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	return desc->action != NULL;
 }
 
+static inline int irq_has_action(unsigned int irq)
+{
+	return irq_desc_has_action(irq_to_desc(irq));
+}
+
 /* caller has locked the irq_desc and both params are valid */
 static inline void __irq_set_handler_locked(unsigned int irq,
 					    irq_flow_handler_t handler)
-- 
cgit v1.2.3-70-g09d2


From 71db87ba570038497db1227b7dc61113c4156565 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 30 Jul 2015 15:04:01 +0530
Subject: bus: subsys: update return type of ->remove_dev() to void

Its return value is not used by the subsys core and nothing meaningful
can be done with it, even if we want to use it. The subsys device is
anyway getting removed.

Update prototype of ->remove_dev() to make its return type as void. Fix
all usage sites as well.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/sh/kernel/cpu/sh4/sq.c          |  3 +--
 arch/tile/kernel/sysfs.c             | 11 ++++-------
 arch/x86/kernel/cpu/microcode/core.c |  5 ++---
 drivers/cpufreq/cpufreq.c            | 12 +++++-------
 drivers/net/rionet.c                 |  4 +---
 include/linux/device.h               |  2 +-
 6 files changed, 14 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index 0a47bd3e7bee..4ca78ed71ad2 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -355,13 +355,12 @@ static int sq_dev_add(struct device *dev, struct subsys_interface *sif)
 	return error;
 }
 
-static int sq_dev_remove(struct device *dev, struct subsys_interface *sif)
+static void sq_dev_remove(struct device *dev, struct subsys_interface *sif)
 {
 	unsigned int cpu = dev->id;
 	struct kobject *kobj = sq_kobject[cpu];
 
 	kobject_put(kobj);
-	return 0;
 }
 
 static struct subsys_interface sq_interface = {
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index a3ed12f8f83b..825867c53853 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -198,16 +198,13 @@ static int hv_stats_device_add(struct device *dev, struct subsys_interface *sif)
 	return err;
 }
 
-static int hv_stats_device_remove(struct device *dev,
-				  struct subsys_interface *sif)
+static void hv_stats_device_remove(struct device *dev,
+				   struct subsys_interface *sif)
 {
 	int cpu = dev->id;
 
-	if (!cpu_online(cpu))
-		return 0;
-
-	sysfs_remove_file(&dev->kobj, &dev_attr_hv_stats.attr);
-	return 0;
+	if (cpu_online(cpu))
+		sysfs_remove_file(&dev->kobj, &dev_attr_hv_stats.attr);
 }
 
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6236a54a63f4..3c986390058a 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -377,17 +377,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
 	return err;
 }
 
-static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
+static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
 {
 	int cpu = dev->id;
 
 	if (!cpu_online(cpu))
-		return 0;
+		return;
 
 	pr_debug("CPU%d removed\n", cpu);
 	microcode_fini_cpu(cpu);
 	sysfs_remove_group(&dev->kobj, &mc_attr_group);
-	return 0;
 }
 
 static struct subsys_interface mc_cpu_interface = {
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 26063afb3eba..6da25c10bdfd 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1518,7 +1518,7 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
  *
  * Removes the cpufreq interface for a CPU device.
  */
-static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
+static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
 	unsigned int cpu = dev->id;
 	int ret;
@@ -1533,7 +1533,7 @@ static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 		struct cpumask mask;
 
 		if (!policy)
-			return 0;
+			return;
 
 		cpumask_copy(&mask, policy->related_cpus);
 		cpumask_clear_cpu(cpu, &mask);
@@ -1544,19 +1544,17 @@ static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 		 */
 		if (cpumask_intersects(&mask, cpu_present_mask)) {
 			remove_cpu_dev_symlink(policy, cpu);
-			return 0;
+			return;
 		}
 
 		cpufreq_policy_free(policy, true);
-		return 0;
+		return;
 	}
 
 	ret = __cpufreq_remove_dev_prepare(dev, sif);
 
 	if (!ret)
-		ret = __cpufreq_remove_dev_finish(dev, sif);
-
-	return ret;
+		__cpufreq_remove_dev_finish(dev, sif);
 }
 
 static void handle_update(struct work_struct *work)
diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index dac7a0d9bb46..01f08a7751f7 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -396,7 +396,7 @@ static int rionet_close(struct net_device *ndev)
 	return 0;
 }
 
-static int rionet_remove_dev(struct device *dev, struct subsys_interface *sif)
+static void rionet_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
 	struct rio_dev *rdev = to_rio_dev(dev);
 	unsigned char netid = rdev->net->hport->id;
@@ -416,8 +416,6 @@ static int rionet_remove_dev(struct device *dev, struct subsys_interface *sif)
 			}
 		}
 	}
-
-	return 0;
 }
 
 static void rionet_get_drvinfo(struct net_device *ndev,
diff --git a/include/linux/device.h b/include/linux/device.h
index a2b4ea70a946..1225f98e9240 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -341,7 +341,7 @@ struct subsys_interface {
 	struct bus_type *subsys;
 	struct list_head node;
 	int (*add_dev)(struct device *dev, struct subsys_interface *sif);
-	int (*remove_dev)(struct device *dev, struct subsys_interface *sif);
+	void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
 };
 
 int subsys_interface_register(struct subsys_interface *sif);
-- 
cgit v1.2.3-70-g09d2


From 50c7cd2bd3786258606c6c7c8356064c08ab2383 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Mon, 6 Jul 2015 12:19:23 +0200
Subject: dmaengine: Add scatter-gathered memset

The current API allows the driver to accelerate memset by using the DMA
controller.

However, it does so over a contiguous memory area, which might proves
inefficient when you have to do it over a non-contiguous yet repititive
pattern, since you have to create a number of descriptors and then submit
each other.

Add a memset operation going over a scatter list to handle such cases in a
single call.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Acked-by: Ludovic Desroches <ludovic.desroches@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/dmaengine.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 03ed832adbc2..8ad9a4e839f6 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -66,6 +66,7 @@ enum dma_transaction_type {
 	DMA_XOR_VAL,
 	DMA_PQ_VAL,
 	DMA_MEMSET,
+	DMA_MEMSET_SG,
 	DMA_INTERRUPT,
 	DMA_SG,
 	DMA_PRIVATE,
@@ -630,6 +631,7 @@ enum dmaengine_alignment {
  * @device_prep_dma_pq: prepares a pq operation
  * @device_prep_dma_pq_val: prepares a pqzero_sum operation
  * @device_prep_dma_memset: prepares a memset operation
+ * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
  * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio.
@@ -696,6 +698,9 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
 		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
 		unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_memset_sg)(
+		struct dma_chan *chan, struct scatterlist *sg,
+		unsigned int nents, int value, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)(
 		struct dma_chan *chan, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_sg)(
-- 
cgit v1.2.3-70-g09d2


From 596c154d62330ea0bb4e3c3e50afa3682e50b617 Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Thu, 6 Aug 2015 14:11:10 +0200
Subject: usb: gadget: add 'ep_match' callback to usb_gadget_ops

Add callback that is called by epautoconf to allow UDC driver match the
best endpoint for specific descriptor. It's intended to supply mechanism
which allows to get rid of chip-specific endpoint matching code from
epautoconf.

If gadget has set 'ep_match' callback we prefer to call it first, and
if it fails to find matching endpoint, then we try to use default matching
algorithm.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/epautoconf.c | 6 ++++++
 include/linux/usb/gadget.h      | 3 +++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/epautoconf.c b/drivers/usb/gadget/epautoconf.c
index 95e12759af4d..f000c73319f4 100644
--- a/drivers/usb/gadget/epautoconf.c
+++ b/drivers/usb/gadget/epautoconf.c
@@ -165,6 +165,12 @@ struct usb_ep *usb_ep_autoconfig_ss(
 
 	type = desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK;
 
+	if (gadget->ops->match_ep) {
+		ep = gadget->ops->match_ep(gadget, desc, ep_comp);
+		if (ep)
+			goto found_ep;
+	}
+
 	/* First, apply chip-specific "best usage" knowledge.
 	 * This might make a good usb_gadget_ops hook ...
 	 */
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 82b5bcbd2c98..303214bb2f8b 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -534,6 +534,9 @@ struct usb_gadget_ops {
 	int	(*udc_start)(struct usb_gadget *,
 			struct usb_gadget_driver *);
 	int	(*udc_stop)(struct usb_gadget *);
+	struct usb_ep *(*match_ep)(struct usb_gadget *,
+			struct usb_endpoint_descriptor *,
+			struct usb_ss_ep_comp_descriptor *);
 };
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 4278c687f697b651ab0c771114564da5ed006f22 Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Thu, 6 Aug 2015 14:11:11 +0200
Subject: usb: gadget: move ep_matches() from epautoconf to udc-core

Move ep_matches() function to udc-core and rename it to
usb_gadget_ep_match_desc(). This function can be used by UDC drivers
in 'match_ep' callback to avoid writing lots of repetitive code.

Replace all calls of ep_matches() with usb_gadget_ep_match_desc().

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/epautoconf.c   | 95 +++++----------------------------------
 drivers/usb/gadget/udc/udc-core.c | 69 ++++++++++++++++++++++++++++
 include/linux/usb/gadget.h        |  8 ++++
 3 files changed, 88 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/epautoconf.c b/drivers/usb/gadget/epautoconf.c
index f000c73319f4..d49af4fc8667 100644
--- a/drivers/usb/gadget/epautoconf.c
+++ b/drivers/usb/gadget/epautoconf.c
@@ -22,82 +22,6 @@
 
 #include "gadget_chips.h"
 
-static int
-ep_matches (
-	struct usb_gadget		*gadget,
-	struct usb_ep			*ep,
-	struct usb_endpoint_descriptor	*desc,
-	struct usb_ss_ep_comp_descriptor *ep_comp
-)
-{
-	u8		type;
-	u16		max;
-	int		num_req_streams = 0;
-
-	/* endpoint already claimed? */
-	if (ep->claimed)
-		return 0;
-
-	type = usb_endpoint_type(desc);
-	max = 0x7ff & usb_endpoint_maxp(desc);
-
-	if (usb_endpoint_dir_in(desc) && !ep->caps.dir_in)
-		return 0;
-	if (usb_endpoint_dir_out(desc) && !ep->caps.dir_out)
-		return 0;
-
-	if (max > ep->maxpacket_limit)
-		return 0;
-
-	/* "high bandwidth" works only at high speed */
-	if (!gadget_is_dualspeed(gadget) && usb_endpoint_maxp(desc) & (3<<11))
-		return 0;
-
-	switch (type) {
-	case USB_ENDPOINT_XFER_CONTROL:
-		/* only support ep0 for portable CONTROL traffic */
-		return 0;
-	case USB_ENDPOINT_XFER_ISOC:
-		if (!ep->caps.type_iso)
-			return 0;
-		/* ISO:  limit 1023 bytes full speed,
-		 * 1024 high/super speed
-		 */
-		if (!gadget_is_dualspeed(gadget) && max > 1023)
-			return 0;
-		break;
-	case USB_ENDPOINT_XFER_BULK:
-		if (!ep->caps.type_bulk)
-			return 0;
-		if (ep_comp && gadget_is_superspeed(gadget)) {
-			/* Get the number of required streams from the
-			 * EP companion descriptor and see if the EP
-			 * matches it
-			 */
-			num_req_streams = ep_comp->bmAttributes & 0x1f;
-			if (num_req_streams > ep->max_streams)
-				return 0;
-		}
-		break;
-	case USB_ENDPOINT_XFER_INT:
-		/* Bulk endpoints handle interrupt transfers,
-		 * except the toggle-quirky iso-synch kind
-		 */
-		if (!ep->caps.type_int && !ep->caps.type_bulk)
-			return 0;
-		/* INT:  limit 64 bytes full speed,
-		 * 1024 high/super speed
-		 */
-		if (!gadget_is_dualspeed(gadget) && max > 64)
-			return 0;
-		break;
-	}
-
-	/* MATCH!! */
-
-	return 1;
-}
-
 static struct usb_ep *
 find_ep (struct usb_gadget *gadget, const char *name)
 {
@@ -180,10 +104,12 @@ struct usb_ep *usb_ep_autoconfig_ss(
 		if (type == USB_ENDPOINT_XFER_INT) {
 			/* ep-e, ep-f are PIO with only 64 byte fifos */
 			ep = find_ep(gadget, "ep-e");
-			if (ep && ep_matches(gadget, ep, desc, ep_comp))
+			if (ep && usb_gadget_ep_match_desc(gadget,
+					ep, desc, ep_comp))
 				goto found_ep;
 			ep = find_ep(gadget, "ep-f");
-			if (ep && ep_matches(gadget, ep, desc, ep_comp))
+			if (ep && usb_gadget_ep_match_desc(gadget,
+					ep, desc, ep_comp))
 				goto found_ep;
 		}
 
@@ -191,20 +117,21 @@ struct usb_ep *usb_ep_autoconfig_ss(
 		snprintf(name, sizeof(name), "ep%d%s", usb_endpoint_num(desc),
 				usb_endpoint_dir_in(desc) ? "in" : "out");
 		ep = find_ep(gadget, name);
-		if (ep && ep_matches(gadget, ep, desc, ep_comp))
+		if (ep && usb_gadget_ep_match_desc(gadget, ep, desc, ep_comp))
 			goto found_ep;
 	} else if (gadget_is_goku (gadget)) {
 		if (USB_ENDPOINT_XFER_INT == type) {
 			/* single buffering is enough */
 			ep = find_ep(gadget, "ep3-bulk");
-			if (ep && ep_matches(gadget, ep, desc, ep_comp))
+			if (ep && usb_gadget_ep_match_desc(gadget,
+					ep, desc, ep_comp))
 				goto found_ep;
 		} else if (USB_ENDPOINT_XFER_BULK == type
 				&& (USB_DIR_IN & desc->bEndpointAddress)) {
 			/* DMA may be available */
 			ep = find_ep(gadget, "ep2-bulk");
-			if (ep && ep_matches(gadget, ep, desc,
-					      ep_comp))
+			if (ep && usb_gadget_ep_match_desc(gadget,
+					ep, desc, ep_comp))
 				goto found_ep;
 		}
 
@@ -223,14 +150,14 @@ struct usb_ep *usb_ep_autoconfig_ss(
 				ep = find_ep(gadget, "ep2out");
 		} else
 			ep = NULL;
-		if (ep && ep_matches(gadget, ep, desc, ep_comp))
+		if (ep && usb_gadget_ep_match_desc(gadget, ep, desc, ep_comp))
 			goto found_ep;
 #endif
 	}
 
 	/* Second, look at endpoints until an unclaimed one looks usable */
 	list_for_each_entry (ep, &gadget->ep_list, ep_list) {
-		if (ep_matches(gadget, ep, desc, ep_comp))
+		if (usb_gadget_ep_match_desc(gadget, ep, desc, ep_comp))
 			goto found_ep;
 	}
 
diff --git a/drivers/usb/gadget/udc/udc-core.c b/drivers/usb/gadget/udc/udc-core.c
index 362ee8af5fce..b6427d1e9a6c 100644
--- a/drivers/usb/gadget/udc/udc-core.c
+++ b/drivers/usb/gadget/udc/udc-core.c
@@ -131,6 +131,75 @@ EXPORT_SYMBOL_GPL(usb_gadget_giveback_request);
 
 /* ------------------------------------------------------------------------- */
 
+int usb_gadget_ep_match_desc(struct usb_gadget *gadget,
+		struct usb_ep *ep, struct usb_endpoint_descriptor *desc,
+		struct usb_ss_ep_comp_descriptor *ep_comp)
+{
+	u8		type;
+	u16		max;
+	int		num_req_streams = 0;
+
+	/* endpoint already claimed? */
+	if (ep->claimed)
+		return 0;
+
+	type = usb_endpoint_type(desc);
+	max = 0x7ff & usb_endpoint_maxp(desc);
+
+	if (usb_endpoint_dir_in(desc) && !ep->caps.dir_in)
+		return 0;
+	if (usb_endpoint_dir_out(desc) && !ep->caps.dir_out)
+		return 0;
+
+	if (max > ep->maxpacket_limit)
+		return 0;
+
+	/* "high bandwidth" works only at high speed */
+	if (!gadget_is_dualspeed(gadget) && usb_endpoint_maxp(desc) & (3<<11))
+		return 0;
+
+	switch (type) {
+	case USB_ENDPOINT_XFER_CONTROL:
+		/* only support ep0 for portable CONTROL traffic */
+		return 0;
+	case USB_ENDPOINT_XFER_ISOC:
+		if (!ep->caps.type_iso)
+			return 0;
+		/* ISO:  limit 1023 bytes full speed, 1024 high/super speed */
+		if (!gadget_is_dualspeed(gadget) && max > 1023)
+			return 0;
+		break;
+	case USB_ENDPOINT_XFER_BULK:
+		if (!ep->caps.type_bulk)
+			return 0;
+		if (ep_comp && gadget_is_superspeed(gadget)) {
+			/* Get the number of required streams from the
+			 * EP companion descriptor and see if the EP
+			 * matches it
+			 */
+			num_req_streams = ep_comp->bmAttributes & 0x1f;
+			if (num_req_streams > ep->max_streams)
+				return 0;
+		}
+		break;
+	case USB_ENDPOINT_XFER_INT:
+		/* Bulk endpoints handle interrupt transfers,
+		 * except the toggle-quirky iso-synch kind
+		 */
+		if (!ep->caps.type_int && !ep->caps.type_bulk)
+			return 0;
+		/* INT:  limit 64 bytes full speed, 1024 high/super speed */
+		if (!gadget_is_dualspeed(gadget) && max > 64)
+			return 0;
+		break;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(usb_gadget_ep_match_desc);
+
+/* ------------------------------------------------------------------------- */
+
 static void usb_gadget_state_work(struct work_struct *work)
 {
 	struct usb_gadget *gadget = work_to_gadget(work);
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 303214bb2f8b..e04fd6381ae8 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -1204,6 +1204,14 @@ extern void usb_gadget_giveback_request(struct usb_ep *ep,
 
 /*-------------------------------------------------------------------------*/
 
+/* utility to check if endpoint caps match descriptor needs */
+
+extern int usb_gadget_ep_match_desc(struct usb_gadget *gadget,
+		struct usb_ep *ep, struct usb_endpoint_descriptor *desc,
+		struct usb_ss_ep_comp_descriptor *ep_comp);
+
+/*-------------------------------------------------------------------------*/
+
 /* utility to update vbus status for udc core, it may be scheduled */
 extern void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status);
 
-- 
cgit v1.2.3-70-g09d2


From b0aea0037c8896b8e69cad3f6e828782789c1edf Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Thu, 6 Aug 2015 14:11:12 +0200
Subject: usb: gadget: move find_ep() from epautoconf to udc-core

Move find_ep() to udc-core and rename it to gadget_find_ep_by_name().
It can be used in UDC drivers, especially in 'match_ep' callback after
moving chip-specific endpoint matching logic from epautoconf to UDC
drivers.

Replace all calls of find_ep() function with gadget_find_ep_by_name().

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/epautoconf.c   | 30 +++++++++---------------------
 drivers/usb/gadget/udc/udc-core.c | 21 +++++++++++++++++++++
 include/linux/usb/gadget.h        |  8 +++++++-
 3 files changed, 37 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/epautoconf.c b/drivers/usb/gadget/epautoconf.c
index d49af4fc8667..a39ca033b9ce 100644
--- a/drivers/usb/gadget/epautoconf.c
+++ b/drivers/usb/gadget/epautoconf.c
@@ -22,18 +22,6 @@
 
 #include "gadget_chips.h"
 
-static struct usb_ep *
-find_ep (struct usb_gadget *gadget, const char *name)
-{
-	struct usb_ep	*ep;
-
-	list_for_each_entry (ep, &gadget->ep_list, ep_list) {
-		if (0 == strcmp (ep->name, name))
-			return ep;
-	}
-	return NULL;
-}
-
 /**
  * usb_ep_autoconfig_ss() - choose an endpoint matching the ep
  * descriptor and ep companion descriptor
@@ -103,11 +91,11 @@ struct usb_ep *usb_ep_autoconfig_ss(
 
 		if (type == USB_ENDPOINT_XFER_INT) {
 			/* ep-e, ep-f are PIO with only 64 byte fifos */
-			ep = find_ep(gadget, "ep-e");
+			ep = gadget_find_ep_by_name(gadget, "ep-e");
 			if (ep && usb_gadget_ep_match_desc(gadget,
 					ep, desc, ep_comp))
 				goto found_ep;
-			ep = find_ep(gadget, "ep-f");
+			ep = gadget_find_ep_by_name(gadget, "ep-f");
 			if (ep && usb_gadget_ep_match_desc(gadget,
 					ep, desc, ep_comp))
 				goto found_ep;
@@ -116,20 +104,20 @@ struct usb_ep *usb_ep_autoconfig_ss(
 		/* USB3380: use same address for usb and hardware endpoints */
 		snprintf(name, sizeof(name), "ep%d%s", usb_endpoint_num(desc),
 				usb_endpoint_dir_in(desc) ? "in" : "out");
-		ep = find_ep(gadget, name);
+		ep = gadget_find_ep_by_name(gadget, name);
 		if (ep && usb_gadget_ep_match_desc(gadget, ep, desc, ep_comp))
 			goto found_ep;
 	} else if (gadget_is_goku (gadget)) {
 		if (USB_ENDPOINT_XFER_INT == type) {
 			/* single buffering is enough */
-			ep = find_ep(gadget, "ep3-bulk");
+			ep = gadget_find_ep_by_name(gadget, "ep3-bulk");
 			if (ep && usb_gadget_ep_match_desc(gadget,
 					ep, desc, ep_comp))
 				goto found_ep;
 		} else if (USB_ENDPOINT_XFER_BULK == type
 				&& (USB_DIR_IN & desc->bEndpointAddress)) {
 			/* DMA may be available */
-			ep = find_ep(gadget, "ep2-bulk");
+			ep = gadget_find_ep_by_name(gadget, "ep2-bulk");
 			if (ep && usb_gadget_ep_match_desc(gadget,
 					ep, desc, ep_comp))
 				goto found_ep;
@@ -140,14 +128,14 @@ struct usb_ep *usb_ep_autoconfig_ss(
 		if ((USB_ENDPOINT_XFER_BULK == type) ||
 		    (USB_ENDPOINT_XFER_ISOC == type)) {
 			if (USB_DIR_IN & desc->bEndpointAddress)
-				ep = find_ep (gadget, "ep5in");
+				ep = gadget_find_ep_by_name(gadget, "ep5in");
 			else
-				ep = find_ep (gadget, "ep6out");
+				ep = gadget_find_ep_by_name(gadget, "ep6out");
 		} else if (USB_ENDPOINT_XFER_INT == type) {
 			if (USB_DIR_IN & desc->bEndpointAddress)
-				ep = find_ep(gadget, "ep1in");
+				ep = gadget_find_ep_by_name(gadget, "ep1in");
 			else
-				ep = find_ep(gadget, "ep2out");
+				ep = gadget_find_ep_by_name(gadget, "ep2out");
 		} else
 			ep = NULL;
 		if (ep && usb_gadget_ep_match_desc(gadget, ep, desc, ep_comp))
diff --git a/drivers/usb/gadget/udc/udc-core.c b/drivers/usb/gadget/udc/udc-core.c
index b6427d1e9a6c..3c954b5fe4f3 100644
--- a/drivers/usb/gadget/udc/udc-core.c
+++ b/drivers/usb/gadget/udc/udc-core.c
@@ -131,6 +131,27 @@ EXPORT_SYMBOL_GPL(usb_gadget_giveback_request);
 
 /* ------------------------------------------------------------------------- */
 
+/**
+ * gadget_find_ep_by_name - returns ep whose name is the same as sting passed
+ *	in second parameter or NULL if searched endpoint not found
+ * @g: controller to check for quirk
+ * @name: name of searched endpoint
+ */
+struct usb_ep *gadget_find_ep_by_name(struct usb_gadget *g, const char *name)
+{
+	struct usb_ep *ep;
+
+	gadget_for_each_ep(ep, g) {
+		if (!strcmp(ep->name, name))
+			return ep;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(gadget_find_ep_by_name);
+
+/* ------------------------------------------------------------------------- */
+
 int usb_gadget_ep_match_desc(struct usb_gadget *gadget,
 		struct usb_ep *ep, struct usb_endpoint_descriptor *desc,
 		struct usb_ss_ep_comp_descriptor *ep_comp)
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index e04fd6381ae8..c14a69b36d27 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -639,7 +639,6 @@ static inline struct usb_gadget *dev_to_usb_gadget(struct device *dev)
 #define gadget_for_each_ep(tmp, gadget) \
 	list_for_each_entry(tmp, &(gadget)->ep_list, ep_list)
 
-
 /**
  * usb_ep_align_maybe - returns @len aligned to ep's maxpacketsize if gadget
  *	requires quirk_ep_out_aligned_size, otherwise reguens len.
@@ -1204,6 +1203,13 @@ extern void usb_gadget_giveback_request(struct usb_ep *ep,
 
 /*-------------------------------------------------------------------------*/
 
+/* utility to find endpoint by name */
+
+extern struct usb_ep *gadget_find_ep_by_name(struct usb_gadget *g,
+		const char *name);
+
+/*-------------------------------------------------------------------------*/
+
 /* utility to check if endpoint caps match descriptor needs */
 
 extern int usb_gadget_ep_match_desc(struct usb_gadget *gadget,
-- 
cgit v1.2.3-70-g09d2


From 098d2164e3441c252eaa28906d45e16b7bf1bd2b Mon Sep 17 00:00:00 2001
From: Wang Nan <wangnan0@huawei.com>
Date: Wed, 1 Jul 2015 02:13:49 +0000
Subject: bpf: Use correct #ifdef controller for trace_call_bpf()

Commit e1abf2cc8d5d80b41c4419368ec743ccadbb131e ("bpf: Fix the build on
BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable")
updated the building condition of bpf_trace.o from CONFIG_BPF_SYSCALL
to CONFIG_BPF_EVENTS, but the corresponding #ifdef controller in
trace_events.h for trace_call_bpf() was not changed. Which, in theory,
is incorrect.

With current Kconfigs, we can create a .config with CONFIG_BPF_SYSCALL=y
and CONFIG_BPF_EVENTS=n by unselecting CONFIG_KPROBE_EVENT and
selecting CONFIG_BPF_SYSCALL. With these options, trace_call_bpf() will
be defined as an extern function, but if anyone calls it a symbol missing
error will be triggered since bpf_trace.o was not built.

This patch changes the #ifdef controller for trace_call_bpf() from
CONFIG_BPF_SYSCALL to CONFIG_BPF_EVENTS. I'll show its correctness:

Before this patch:

   BPF_SYSCALL   BPF_EVENTS   trace_call_bpf   bpf_trace.o
   y             y           normal           compiled
   n             n           inline           not compiled
   y             n           normal           not compiled (incorrect)
   n             y          impossible (BPF_EVENTS depends on BPF_SYSCALL)

After this patch:

   BPF_SYSCALL   BPF_EVENTS   trace_call_bpf   bpf_trace.o
   y             y           normal           compiled
   n             n           inline           not compiled
   y             n           inline           not compiled (fixed)
   n             y          impossible (BPF_EVENTS depends on BPF_SYSCALL)

So this patch doesn't break anything. QED.

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kaixu Xia <xiakaixu@huawei.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1435716878-189507-2-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/trace_events.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 1063c850dbab..180dbf8720f9 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -542,7 +542,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
-#ifdef CONFIG_BPF_SYSCALL
+#ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
 #else
 static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
-- 
cgit v1.2.3-70-g09d2


From 04a22fae4cbc1f7d3f7471e9b36359f98bd3f043 Mon Sep 17 00:00:00 2001
From: Wang Nan <wangnan0@huawei.com>
Date: Wed, 1 Jul 2015 02:13:50 +0000
Subject: tracing, perf: Implement BPF programs attached to uprobes

By copying BPF related operation to uprobe processing path, this patch
allow users attach BPF programs to uprobes like what they are already
doing on kprobes.

After this patch, users are allowed to use PERF_EVENT_IOC_SET_BPF on a
uprobe perf event. Which make it possible to profile user space programs
and kernel events together using BPF.

Because of this patch, CONFIG_BPF_EVENTS should be selected by
CONFIG_UPROBE_EVENT to ensure trace_call_bpf() is compiled even if
KPROBE_EVENT is not set.

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kaixu Xia <xiakaixu@huawei.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1435716878-189507-3-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/trace_events.h | 5 +++++
 kernel/events/core.c         | 4 ++--
 kernel/trace/Kconfig         | 2 +-
 kernel/trace/trace_uprobe.c  | 5 +++++
 4 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 180dbf8720f9..ed27917cabc9 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -243,6 +243,7 @@ enum {
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
 	TRACE_EVENT_FL_KPROBE_BIT,
+	TRACE_EVENT_FL_UPROBE_BIT,
 };
 
 /*
@@ -257,6 +258,7 @@ enum {
  *  USE_CALL_FILTER - For trace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
  *  KPROBE        - Event is a kprobe
+ *  UPROBE        - Event is a uprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -267,8 +269,11 @@ enum {
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
 	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
+	TRACE_EVENT_FL_UPROBE		= (1 << TRACE_EVENT_FL_UPROBE_BIT),
 };
 
+#define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE)
+
 struct trace_event_call {
 	struct list_head	list;
 	struct trace_event_class *class;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bdea12924b11..77f9e5d0e2d1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6846,8 +6846,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 	if (event->tp_event->prog)
 		return -EEXIST;
 
-	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
-		/* bpf programs can only be attached to kprobes */
+	if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
+		/* bpf programs can only be attached to u/kprobes */
 		return -EINVAL;
 
 	prog = bpf_prog_get(prog_fd);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3b9a48ae153a..1153c43428f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@ config UPROBE_EVENT
 
 config BPF_EVENTS
 	depends on BPF_SYSCALL
-	depends on KPROBE_EVENT
+	depends on KPROBE_EVENT || UPROBE_EVENT
 	bool
 	default y
 	help
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index aa1ea7b36fa8..f97479f1ce35 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1095,11 +1095,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 {
 	struct trace_event_call *call = &tu->tp.call;
 	struct uprobe_trace_entry_head *entry;
+	struct bpf_prog *prog = call->prog;
 	struct hlist_head *head;
 	void *data;
 	int size, esize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 
 	size = esize + tu->tp.size + dsize;
@@ -1289,6 +1293,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 		return -ENODEV;
 	}
 
+	call->flags = TRACE_EVENT_FL_UPROBE;
 	call->class->reg = trace_uprobe_register;
 	call->data = tu;
 	ret = trace_add_event_call(call);
-- 
cgit v1.2.3-70-g09d2


From 84cb777e67814f2e06a99ff228f743409e9617e9 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 5 Aug 2015 23:48:20 -0400
Subject: audit: use macros for unset inode and device values

Clean up a number of places were casted magic numbers are used to represent
unset inode and device numbers in preparation for the audit by executable path
patch set.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: enclosed the _UNSET macros in parentheses for ./scripts/checkpatch]
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 include/linux/audit.h | 3 +++
 kernel/audit.c        | 2 +-
 kernel/audit_watch.c  | 8 ++++----
 kernel/auditsc.c      | 6 +++---
 4 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index c2e7e3a83965..759feb0e9d13 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -27,6 +27,9 @@
 #include <linux/ptrace.h>
 #include <uapi/linux/audit.h>
 
+#define AUDIT_INO_UNSET ((unsigned long)-1)
+#define AUDIT_DEV_UNSET ((dev_t)-1)
+
 struct audit_sig_info {
 	uid_t		uid;
 	pid_t		pid;
diff --git a/kernel/audit.c b/kernel/audit.c
index 7497a5a0fac0..060153dc47d4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1759,7 +1759,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
 	} else
 		audit_log_format(ab, " name=(null)");
 
-	if (n->ino != (unsigned long)-1) {
+	if (n->ino != AUDIT_INO_UNSET) {
 		audit_log_format(ab, " inode=%lu"
 				 " dev=%02x:%02x mode=%#ho"
 				 " ouid=%u ogid=%u rdev=%02x:%02x",
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index b81ad5bc7485..645c6884cee5 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch)
 
 int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
 {
-	return (watch->ino != (unsigned long)-1) &&
+	return (watch->ino != AUDIT_INO_UNSET) &&
 		(watch->ino == ino) &&
 		(watch->dev == dev);
 }
@@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path)
 	INIT_LIST_HEAD(&watch->rules);
 	atomic_set(&watch->count, 1);
 	watch->path = path;
-	watch->dev = (dev_t)-1;
-	watch->ino = (unsigned long)-1;
+	watch->dev = AUDIT_DEV_UNSET;
+	watch->ino = AUDIT_INO_UNSET;
 
 	return watch;
 }
@@ -493,7 +493,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 	if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
 		audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
 	else if (mask & (FS_DELETE|FS_MOVED_FROM))
-		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+		audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
 	else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
 		audit_remove_parent_watches(parent);
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f6bc31e7dca9..ea3fe2b748a8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
 		return 0;
 
 	list_for_each_entry(n, &ctx->names_list, list) {
-		if ((n->ino != -1) &&
+		if ((n->ino != AUDIT_INO_UNSET) &&
 		    ((n->mode & S_IFMT) == mode))
 			return 1;
 	}
@@ -1681,7 +1681,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
 		aname->should_free = true;
 	}
 
-	aname->ino = (unsigned long)-1;
+	aname->ino = AUDIT_INO_UNSET;
 	aname->type = type;
 	list_add_tail(&aname->list, &context->names_list);
 
@@ -1923,7 +1923,7 @@ void __audit_inode_child(const struct inode *parent,
 	if (inode)
 		audit_copy_inode(found_child, dentry, inode);
 	else
-		found_child->ino = (unsigned long)-1;
+		found_child->ino = AUDIT_INO_UNSET;
 }
 EXPORT_SYMBOL_GPL(__audit_inode_child);
 
-- 
cgit v1.2.3-70-g09d2


From 34d99af52ad40bd498ba66970579a5bc1fb1a3bc Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 5 Aug 2015 16:29:37 -0400
Subject: audit: implement audit by executable

This adds the ability audit the actions of a not-yet-running process.

This patch implements the ability to filter on the executable path.  Instead of
just hard coding the ino and dev of the executable we care about at the moment
the rule is inserted into the kernel, use the new audit_fsnotify
infrastructure to manage this dynamically.  This means that if the filename
does not yet exist but the containing directory does, or if the inode in
question is unlinked and creat'd (aka updated) the rule will just continue to
work.  If the containing directory is moved or deleted or the filesystem is
unmounted, the rule is deleted automatically.  A future enhancement would be to
have the rule survive across directory disruptions.

This is a heavily modified version of a patch originally submitted by Eric
Paris with some ideas from Peter Moody.

Cc: Peter Moody <peter@hda3.com>
Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: minor whitespace clean to satisfy ./scripts/checkpatch]
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 include/linux/audit.h      |  1 +
 include/uapi/linux/audit.h |  5 ++++-
 kernel/audit.h             |  4 ++++
 kernel/audit_tree.c        |  2 ++
 kernel/audit_watch.c       | 31 +++++++++++++++++++++++++++
 kernel/auditfilter.c       | 53 +++++++++++++++++++++++++++++++++++++++++++++-
 kernel/auditsc.c           |  3 +++
 7 files changed, 97 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 759feb0e9d13..b2abc996c25d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -62,6 +62,7 @@ struct audit_krule {
 	struct audit_field	*inode_f; /* quick access to an inode field */
 	struct audit_watch	*watch;	/* associated watch */
 	struct audit_tree	*tree;	/* associated watched tree */
+	struct audit_fsnotify_mark	*exe;
 	struct list_head	rlist;	/* entry in audit_{watch,tree}.rules list */
 	struct list_head	list;	/* for AUDIT_LIST* purposes only */
 	u64			prio;
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index d3475e1f15ec..f6ff62c24aba 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -266,6 +266,7 @@
 #define AUDIT_OBJ_UID	109
 #define AUDIT_OBJ_GID	110
 #define AUDIT_FIELD_COMPARE	111
+#define AUDIT_EXE	112
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
@@ -324,8 +325,10 @@ enum {
 
 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT	0x00000001
 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME	0x00000002
+#define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH	0x00000004
 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \
-				  AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME)
+				  AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \
+				  AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH)
 
 /* deprecated: AUDIT_VERSION_* */
 #define AUDIT_VERSION_LATEST 		AUDIT_FEATURE_BITMAP_ALL
diff --git a/kernel/audit.h b/kernel/audit.h
index 7102d538737b..24ec86145667 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -274,6 +274,8 @@ extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
 extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
 extern void audit_remove_mark_rule(struct audit_krule *krule);
 extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
+extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
 
 #else
 #define audit_put_watch(w) {}
@@ -289,6 +291,8 @@ extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long in
 #define audit_remove_mark(m)
 #define audit_remove_mark_rule(k)
 #define audit_mark_compare(m, i, d) 0
+#define audit_exe_compare(t, m) (-EINVAL)
+#define audit_dupe_exe(n, o) (-EINVAL)
 #endif /* CONFIG_AUDIT_WATCH */
 
 #ifdef CONFIG_AUDIT_TREE
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2e0c97427b33..f41722506808 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -478,6 +478,8 @@ static void kill_rules(struct audit_tree *tree)
 		if (rule->tree) {
 			/* not a half-baked one */
 			audit_tree_log_remove_rule(rule);
+			if (entry->rule.exe)
+				audit_remove_mark(entry->rule.exe);
 			rule->tree = NULL;
 			list_del_rcu(&entry->list);
 			list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 645c6884cee5..27ef8dcf7cd8 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -312,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent,
 				list_replace(&oentry->rule.list,
 					     &nentry->rule.list);
 			}
+			if (oentry->rule.exe)
+				audit_remove_mark(oentry->rule.exe);
 
 			audit_watch_log_rule_change(r, owatch, "updated_rules");
 
@@ -342,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
 			e = container_of(r, struct audit_entry, rule);
 			audit_watch_log_rule_change(r, w, "remove_rule");
+			if (e->rule.exe)
+				audit_remove_mark(e->rule.exe);
 			list_del(&r->rlist);
 			list_del(&r->list);
 			list_del_rcu(&e->list);
@@ -514,3 +518,30 @@ static int __init audit_watch_init(void)
 	return 0;
 }
 device_initcall(audit_watch_init);
+
+int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
+{
+	struct audit_fsnotify_mark *audit_mark;
+	char *pathname;
+
+	pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
+	if (!pathname)
+		return -ENOMEM;
+
+	audit_mark = audit_alloc_mark(new, pathname, strlen(pathname));
+	if (IS_ERR(audit_mark)) {
+		kfree(pathname);
+		return PTR_ERR(audit_mark);
+	}
+	new->exe = audit_mark;
+
+	return 0;
+}
+
+int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
+{
+	unsigned long ino = tsk->mm->exe_file->f_inode->i_ino;
+	dev_t dev = tsk->mm->exe_file->f_inode->i_sb->s_dev;
+
+	return audit_mark_compare(mark, ino, dev);
+}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b4d8c366ec30..7714d93edb85 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 		if (f->val > AUDIT_MAX_FIELD_COMPARE)
 			return -EINVAL;
 		break;
+	case AUDIT_EXE:
+		if (f->op != Audit_equal)
+			return -EINVAL;
+		if (entry->rule.listnr != AUDIT_FILTER_EXIT)
+			return -EINVAL;
+		break;
 	};
 	return 0;
 }
@@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
 	char *str;
+	struct audit_fsnotify_mark *audit_mark;
 
 	entry = audit_to_entry_common(data);
 	if (IS_ERR(entry))
@@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			entry->rule.buflen += f->val;
 			entry->rule.filterkey = str;
 			break;
+		case AUDIT_EXE:
+			if (entry->rule.exe || f->val > PATH_MAX)
+				goto exit_free;
+			str = audit_unpack_string(&bufp, &remain, f->val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
+				goto exit_free;
+			}
+			entry->rule.buflen += f->val;
+
+			audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+			if (IS_ERR(audit_mark)) {
+				kfree(str);
+				err = PTR_ERR(audit_mark);
+				goto exit_free;
+			}
+			entry->rule.exe = audit_mark;
+			break;
 		}
 	}
 
@@ -551,6 +576,8 @@ exit_nofree:
 exit_free:
 	if (entry->rule.tree)
 		audit_put_tree(entry->rule.tree); /* that's the temporary one */
+	if (entry->rule.exe)
+		audit_remove_mark(entry->rule.exe); /* that's the template one */
 	audit_free_rule(entry);
 	return ERR_PTR(err);
 }
@@ -615,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 			data->buflen += data->values[i] =
 				audit_pack_string(&bufp, krule->filterkey);
 			break;
+		case AUDIT_EXE:
+			data->buflen += data->values[i] =
+				audit_pack_string(&bufp, audit_mark_path(krule->exe));
+			break;
 		case AUDIT_LOGINUID_SET:
 			if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
 				data->fields[i] = AUDIT_LOGINUID;
@@ -678,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 			if (strcmp(a->filterkey, b->filterkey))
 				return 1;
 			break;
+		case AUDIT_EXE:
+			/* both paths exist based on above type compare */
+			if (strcmp(audit_mark_path(a->exe),
+				   audit_mark_path(b->exe)))
+				return 1;
+			break;
 		case AUDIT_UID:
 		case AUDIT_EUID:
 		case AUDIT_SUID:
@@ -799,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 				err = -ENOMEM;
 			else
 				new->filterkey = fk;
+			break;
+		case AUDIT_EXE:
+			err = audit_dupe_exe(new, old);
+			break;
 		}
 		if (err) {
+			if (new->exe)
+				audit_remove_mark(new->exe);
 			audit_free_rule(entry);
 			return ERR_PTR(err);
 		}
@@ -963,6 +1006,9 @@ int audit_del_rule(struct audit_entry *entry)
 	if (e->rule.tree)
 		audit_remove_tree_rule(&e->rule);
 
+	if (e->rule.exe)
+		audit_remove_mark_rule(&e->rule);
+
 #ifdef CONFIG_AUDITSYSCALL
 	if (!dont_count)
 		audit_n_rules--;
@@ -1067,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
 		WARN_ON(1);
 	}
 
-	if (err || type == AUDIT_DEL_RULE)
+	if (err || type == AUDIT_DEL_RULE) {
+		if (entry->rule.exe)
+			audit_remove_mark(entry->rule.exe);
 		audit_free_rule(entry);
+	}
 
 	return err;
 }
@@ -1360,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r)
 		return 0;
 
 	nentry = audit_dupe_rule(r);
+	if (entry->rule.exe)
+		audit_remove_mark(entry->rule.exe);
 	if (IS_ERR(nentry)) {
 		/* save the first error encountered for the
 		 * return value */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ea3fe2b748a8..9b56b7ae053f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk,
 				result = audit_comparator(ctx->ppid, f->op, f->val);
 			}
 			break;
+		case AUDIT_EXE:
+			result = audit_exe_compare(tsk, rule->exe);
+			break;
 		case AUDIT_UID:
 			result = audit_uid_comparator(cred->uid, f->op, f->uid);
 			break;
-- 
cgit v1.2.3-70-g09d2


From 3ca9bb33c627f22640cfca97fdf88eec0a120dfd Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 29 Jul 2015 16:23:03 +0530
Subject: PM / OPP: Add clock-latency-ns support

With "operating-points-v2" bindings, clock-latency is defined per OPP.
Users of this value expect a single value which defines the latency to
switch to any clock rate. Find maximum clock-latency-ns from the OPP
table to service requests from such users.

Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp.c | 41 +++++++++++++++++++++++++++++++++++++++--
 include/linux/pm_opp.h   |  6 ++++++
 2 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index 0e0eff4d9299..8638204c457e 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -57,6 +57,8 @@
  * @u_volt_min:	Minimum voltage in microvolts corresponding to this OPP
  * @u_volt_max:	Maximum voltage in microvolts corresponding to this OPP
  * @u_amp:	Maximum current drawn by the device in microamperes
+ * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
+ *		frequency from any other OPP's frequency.
  * @dev_opp:	points back to the device_opp struct this opp belongs to
  * @rcu_head:	RCU callback head used for deferred freeing
  * @np:		OPP's device node.
@@ -75,6 +77,7 @@ struct dev_pm_opp {
 	unsigned long u_volt_min;
 	unsigned long u_volt_max;
 	unsigned long u_amp;
+	unsigned long clock_latency_ns;
 
 	struct device_opp *dev_opp;
 	struct rcu_head rcu_head;
@@ -109,6 +112,8 @@ struct device_opp {
 	struct srcu_notifier_head srcu_head;
 	struct rcu_head rcu_head;
 	struct list_head opp_list;
+
+	unsigned long clock_latency_ns_max;
 };
 
 /*
@@ -225,6 +230,32 @@ unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
 
+/**
+ * dev_pm_opp_get_max_clock_latency() - Get max clock latency in nanoseconds
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the max clock latency in nanoseconds.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
+{
+	struct device_opp *dev_opp;
+	unsigned long clock_latency_ns;
+
+	rcu_read_lock();
+
+	dev_opp = _find_device_opp(dev);
+	if (IS_ERR(dev_opp))
+		clock_latency_ns = 0;
+	else
+		clock_latency_ns = dev_opp->clock_latency_ns_max;
+
+	rcu_read_unlock();
+	return clock_latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
+
 /**
  * dev_pm_opp_get_opp_count() - Get number of opps available in the opp list
  * @dev:	device for which we do this operation
@@ -779,6 +810,8 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
 	new_opp->np = np;
 	new_opp->dynamic = false;
 	new_opp->available = true;
+	of_property_read_u32(np, "clock-latency-ns",
+			     (u32 *)&new_opp->clock_latency_ns);
 
 	ret = opp_get_microvolt(new_opp, dev);
 	if (ret)
@@ -790,11 +823,15 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
 	if (ret)
 		goto free_opp;
 
+	if (new_opp->clock_latency_ns > dev_opp->clock_latency_ns_max)
+		dev_opp->clock_latency_ns_max = new_opp->clock_latency_ns;
+
 	mutex_unlock(&dev_opp_list_lock);
 
-	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu\n",
+	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
 		 __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt,
-		 new_opp->u_volt_min, new_opp->u_volt_max);
+		 new_opp->u_volt_min, new_opp->u_volt_max,
+		 new_opp->clock_latency_ns);
 
 	/*
 	 * Notify the changes in the availability of the operable
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index cec2d4540914..20324b579adc 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -31,6 +31,7 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
 unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp);
 
 int dev_pm_opp_get_opp_count(struct device *dev);
+unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
 
 struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					      unsigned long freq,
@@ -67,6 +68,11 @@ static inline int dev_pm_opp_get_opp_count(struct device *dev)
 	return 0;
 }
 
+static inline unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
+{
+	return 0;
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					unsigned long freq, bool available)
 {
-- 
cgit v1.2.3-70-g09d2


From 8d4d4e98acd68c31435ebb7beea591dbf60b9eb2 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 12 Jun 2015 17:10:38 +0530
Subject: PM / OPP: Add helpers for initializing CPU OPPs

With "operating-points-v2" its possible to tell which devices share
OPPs. We already have infrastructure to decode that information.

This patch adds following APIs:
 - of_get_cpus_sharing_opps: Returns cpumask of CPUs sharing OPPs (only
   valid with v2 bindings).
 - of_cpumask_init_opp_table: Initializes OPPs for all CPUs present in
   cpumask.
 - of_cpumask_free_opp_table: Frees OPPs for all CPUs present in cpumask.

 - set_cpus_sharing_opps: Sets which CPUs share OPPs (only valid with old
   OPP bindings, as this information isn't present in DT).

Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h   |  23 +++++++
 2 files changed, 198 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index 0ebcea49145a..663aae1c9834 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -11,6 +11,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/err.h>
@@ -1195,6 +1196,26 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(of_free_opp_table);
 
+void of_cpumask_free_opp_table(cpumask_var_t cpumask)
+{
+	struct device *cpu_dev;
+	int cpu;
+
+	WARN_ON(cpumask_empty(cpumask));
+
+	for_each_cpu(cpu, cpumask) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			continue;
+		}
+
+		of_free_opp_table(cpu_dev);
+	}
+}
+EXPORT_SYMBOL_GPL(of_cpumask_free_opp_table);
+
 /* Returns opp descriptor node from its phandle. Caller must do of_node_put() */
 static struct device_node *
 _of_get_opp_desc_node_from_prop(struct device *dev, const struct property *prop)
@@ -1211,6 +1232,31 @@ _of_get_opp_desc_node_from_prop(struct device *dev, const struct property *prop)
 	return opp_np;
 }
 
+/* Returns opp descriptor node for a device. Caller must do of_node_put() */
+static struct device_node *_of_get_opp_desc_node(struct device *dev)
+{
+	const struct property *prop;
+
+	prop = of_find_property(dev->of_node, "operating-points-v2", NULL);
+	if (!prop)
+		return ERR_PTR(-ENODEV);
+	if (!prop->value)
+		return ERR_PTR(-ENODATA);
+
+	/*
+	 * TODO: Support for multiple OPP tables.
+	 *
+	 * There should be only ONE phandle present in "operating-points-v2"
+	 * property.
+	 */
+	if (prop->length != sizeof(__be32)) {
+		dev_err(dev, "%s: Invalid opp desc phandle\n", __func__);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return _of_get_opp_desc_node_from_prop(dev, prop);
+}
+
 /* Initializes OPP tables based on new bindings */
 static int _of_init_opp_table_v2(struct device *dev,
 				 const struct property *prop)
@@ -1351,4 +1397,133 @@ int of_init_opp_table(struct device *dev)
 	return _of_init_opp_table_v2(dev, prop);
 }
 EXPORT_SYMBOL_GPL(of_init_opp_table);
+
+int of_cpumask_init_opp_table(cpumask_var_t cpumask)
+{
+	struct device *cpu_dev;
+	int cpu, ret = 0;
+
+	WARN_ON(cpumask_empty(cpumask));
+
+	for_each_cpu(cpu, cpumask) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			continue;
+		}
+
+		ret = of_init_opp_table(cpu_dev);
+		if (ret) {
+			pr_err("%s: couldn't find opp table for cpu:%d, %d\n",
+			       __func__, cpu, ret);
+
+			/* Free all other OPPs */
+			of_cpumask_free_opp_table(cpumask);
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(of_cpumask_init_opp_table);
+
+/* Required only for V1 bindings, as v2 can manage it from DT itself */
+int set_cpus_sharing_opps(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+	struct device_list_opp *list_dev;
+	struct device_opp *dev_opp;
+	struct device *dev;
+	int cpu, ret = 0;
+
+	rcu_read_lock();
+
+	dev_opp = _find_device_opp(cpu_dev);
+	if (IS_ERR(dev_opp)) {
+		ret = -EINVAL;
+		goto out_rcu_read_unlock;
+	}
+
+	for_each_cpu(cpu, cpumask) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		dev = get_cpu_device(cpu);
+		if (!dev) {
+			dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
+				__func__, cpu);
+			continue;
+		}
+
+		list_dev = _add_list_dev(dev, dev_opp);
+		if (!list_dev) {
+			dev_err(dev, "%s: failed to add list-dev for cpu%d device\n",
+				__func__, cpu);
+			continue;
+		}
+	}
+out_rcu_read_unlock:
+	rcu_read_unlock();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(set_cpus_sharing_opps);
+
+/*
+ * Works only for OPP v2 bindings.
+ *
+ * cpumask should be already set to mask of cpu_dev->id.
+ * Returns -ENOENT if operating-points-v2 bindings aren't supported.
+ */
+int of_get_cpus_sharing_opps(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+	struct device_node *np, *tmp_np;
+	struct device *tcpu_dev;
+	int cpu, ret = 0;
+
+	/* Get OPP descriptor node */
+	np = _of_get_opp_desc_node(cpu_dev);
+	if (IS_ERR(np)) {
+		dev_dbg(cpu_dev, "%s: Couldn't find opp node: %ld\n", __func__,
+			PTR_ERR(np));
+		return -ENOENT;
+	}
+
+	/* OPPs are shared ? */
+	if (!of_property_read_bool(np, "opp-shared"))
+		goto put_cpu_node;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		tcpu_dev = get_cpu_device(cpu);
+		if (!tcpu_dev) {
+			dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
+				__func__, cpu);
+			ret = -ENODEV;
+			goto put_cpu_node;
+		}
+
+		/* Get OPP descriptor node */
+		tmp_np = _of_get_opp_desc_node(tcpu_dev);
+		if (IS_ERR(tmp_np)) {
+			dev_err(tcpu_dev, "%s: Couldn't find opp node: %ld\n",
+				__func__, PTR_ERR(tmp_np));
+			ret = PTR_ERR(tmp_np);
+			goto put_cpu_node;
+		}
+
+		/* CPUs are sharing opp node */
+		if (np == tmp_np)
+			cpumask_set_cpu(cpu, cpumask);
+
+		of_node_put(tmp_np);
+	}
+
+put_cpu_node:
+	of_node_put(np);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(of_get_cpus_sharing_opps);
 #endif
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 20324b579adc..bb52fae5b921 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -121,6 +121,10 @@ static inline struct srcu_notifier_head *dev_pm_opp_get_notifier(
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
 int of_init_opp_table(struct device *dev);
 void of_free_opp_table(struct device *dev);
+int of_cpumask_init_opp_table(cpumask_var_t cpumask);
+void of_cpumask_free_opp_table(cpumask_var_t cpumask);
+int of_get_cpus_sharing_opps(struct device *cpu_dev, cpumask_var_t cpumask);
+int set_cpus_sharing_opps(struct device *cpu_dev, cpumask_var_t cpumask);
 #else
 static inline int of_init_opp_table(struct device *dev)
 {
@@ -130,6 +134,25 @@ static inline int of_init_opp_table(struct device *dev)
 static inline void of_free_opp_table(struct device *dev)
 {
 }
+
+static inline int of_cpumask_init_opp_table(cpumask_var_t cpumask)
+{
+	return -ENOSYS;
+}
+
+static inline void of_cpumask_free_opp_table(cpumask_var_t cpumask)
+{
+}
+
+static inline int of_get_cpus_sharing_opps(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+	return -ENOSYS;
+}
+
+static inline int set_cpus_sharing_opps(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+	return -ENOSYS;
+}
 #endif
 
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3-70-g09d2


From 19445b25e350ebebaa304bb2135619f643302947 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Thu, 9 Jul 2015 17:43:35 +0200
Subject: PM / OPP: add dev_pm_opp_is_turbo() helper

Add dev_pm_opp_is_turbo() helper to verify if an opp is to be used only
for turbo mode or not.

Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp.c | 34 ++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h   |  7 +++++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index 663aae1c9834..204c6c945168 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -280,6 +280,40 @@ unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
 
+/**
+ * dev_pm_opp_is_turbo() - Returns if opp is turbo OPP or not
+ * @opp: opp for which turbo mode is being verified
+ *
+ * Turbo OPPs are not for normal use, and can be enabled (under certain
+ * conditions) for short duration of times to finish high throughput work
+ * quickly. Running on them for longer times may overheat the chip.
+ *
+ * Return: true if opp is turbo opp, else false.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. This means that opp which could have been fetched by
+ * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
+ * under RCU lock. The pointer returned by the opp_find_freq family must be
+ * used in the same section as the usage of this function with the pointer
+ * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
+ * pointer.
+ */
+bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
+{
+	struct dev_pm_opp *tmp_opp;
+
+	opp_rcu_lockdep_assert();
+
+	tmp_opp = rcu_dereference(opp);
+	if (IS_ERR_OR_NULL(tmp_opp) || !tmp_opp->available) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return false;
+	}
+
+	return tmp_opp->turbo;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo);
+
 /**
  * dev_pm_opp_get_max_clock_latency() - Get max clock latency in nanoseconds
  * @dev:	device for which we do this operation
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index bb52fae5b921..cab7ba55bedb 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -30,6 +30,8 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
 
 unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp);
 
+bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
+
 int dev_pm_opp_get_opp_count(struct device *dev);
 unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
 
@@ -63,6 +65,11 @@ static inline unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
 	return 0;
 }
 
+static inline bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
+{
+	return false;
+}
+
 static inline int dev_pm_opp_get_opp_count(struct device *dev)
 {
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 44139ed4943ee8ec186eea3e9072ca16d2b48133 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 29 Jul 2015 16:23:09 +0530
Subject: cpufreq: Allow drivers to enable boost support after registering
 driver

In some cases it wouldn't be known at time of driver registration, if
the driver needs to support boost frequencies.

For example, while getting boost information from DT with opp-v2
bindings, we need to parse the bindings for all the CPUs to know if
turbo/boost OPPs are supported or not.

One way out to do that efficiently is to delay supporting boost mode
(i.e. creating /sys/devices/system/cpu/cpufreq/boost file), until the
time OPP bindings are parsed.

At that point, the driver can enable boost support. This can be done at
->init(), where the frequency table is created.

To do that, the driver requires few APIs from cpufreq core that let him
do this. This patch provides these APIs.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c    | 68 +++++++++++++++++++++++++++++++-------------
 drivers/cpufreq/freq_table.c | 15 ++++++++++
 include/linux/cpufreq.h      | 12 ++++++++
 3 files changed, 75 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 76a26609d96b..e48242119d77 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2437,6 +2437,49 @@ int cpufreq_boost_supported(void)
 }
 EXPORT_SYMBOL_GPL(cpufreq_boost_supported);
 
+static int create_boost_sysfs_file(void)
+{
+	int ret;
+
+	if (!cpufreq_boost_supported())
+		return 0;
+
+	/*
+	 * Check if driver provides function to enable boost -
+	 * if not, use cpufreq_boost_set_sw as default
+	 */
+	if (!cpufreq_driver->set_boost)
+		cpufreq_driver->set_boost = cpufreq_boost_set_sw;
+
+	ret = cpufreq_sysfs_create_file(&boost.attr);
+	if (ret)
+		pr_err("%s: cannot register global BOOST sysfs file\n",
+		       __func__);
+
+	return ret;
+}
+
+static void remove_boost_sysfs_file(void)
+{
+	if (cpufreq_boost_supported())
+		cpufreq_sysfs_remove_file(&boost.attr);
+}
+
+int cpufreq_enable_boost_support(void)
+{
+	if (!cpufreq_driver)
+		return -EINVAL;
+
+	if (cpufreq_boost_supported())
+		return 0;
+
+	cpufreq_driver->boost_supported = true;
+
+	/* This will get removed on driver unregister */
+	return create_boost_sysfs_file();
+}
+EXPORT_SYMBOL_GPL(cpufreq_enable_boost_support);
+
 int cpufreq_boost_enabled(void)
 {
 	return cpufreq_driver->boost_enabled;
@@ -2490,21 +2533,9 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	if (driver_data->setpolicy)
 		driver_data->flags |= CPUFREQ_CONST_LOOPS;
 
-	if (cpufreq_boost_supported()) {
-		/*
-		 * Check if driver provides function to enable boost -
-		 * if not, use cpufreq_boost_set_sw as default
-		 */
-		if (!cpufreq_driver->set_boost)
-			cpufreq_driver->set_boost = cpufreq_boost_set_sw;
-
-		ret = cpufreq_sysfs_create_file(&boost.attr);
-		if (ret) {
-			pr_err("%s: cannot register global BOOST sysfs file\n",
-			       __func__);
-			goto err_null_driver;
-		}
-	}
+	ret = create_boost_sysfs_file();
+	if (ret)
+		goto err_null_driver;
 
 	ret = subsys_interface_register(&cpufreq_interface);
 	if (ret)
@@ -2528,8 +2559,7 @@ out:
 err_if_unreg:
 	subsys_interface_unregister(&cpufreq_interface);
 err_boost_unreg:
-	if (cpufreq_boost_supported())
-		cpufreq_sysfs_remove_file(&boost.attr);
+	remove_boost_sysfs_file();
 err_null_driver:
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;
@@ -2558,9 +2588,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 	/* Protect against concurrent cpu hotplug */
 	get_online_cpus();
 	subsys_interface_unregister(&cpufreq_interface);
-	if (cpufreq_boost_supported())
-		cpufreq_sysfs_remove_file(&boost.attr);
-
+	remove_boost_sysfs_file();
 	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index dfbbf981ed56..a8f1daffc9bc 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -18,6 +18,21 @@
  *                     FREQUENCY TABLE HELPERS                       *
  *********************************************************************/
 
+bool policy_has_boost_freq(struct cpufreq_policy *policy)
+{
+	struct cpufreq_frequency_table *pos, *table = policy->freq_table;
+
+	if (!table)
+		return false;
+
+	cpufreq_for_each_valid_entry(pos, table)
+		if (pos->flags & CPUFREQ_BOOST_FREQ)
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(policy_has_boost_freq);
+
 int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 				    struct cpufreq_frequency_table *table)
 {
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index bde1e567b3a9..95f018649abf 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -578,6 +578,8 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf);
 int cpufreq_boost_trigger_state(int state);
 int cpufreq_boost_supported(void);
 int cpufreq_boost_enabled(void);
+int cpufreq_enable_boost_support(void);
+bool policy_has_boost_freq(struct cpufreq_policy *policy);
 #else
 static inline int cpufreq_boost_trigger_state(int state)
 {
@@ -591,6 +593,16 @@ static inline int cpufreq_boost_enabled(void)
 {
 	return 0;
 }
+
+static inline int cpufreq_enable_boost_support(void)
+{
+	return -EINVAL;
+}
+
+static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
+{
+	return false;
+}
 #endif
 /* the following funtion is for cpufreq core use only */
 struct cpufreq_frequency_table *cpufreq_frequency_get_table(unsigned int cpu);
-- 
cgit v1.2.3-70-g09d2


From 4248b0da460839e30eaaad78992b9a1dd3e63e21 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 6 Aug 2015 15:46:20 -0700
Subject: fs, file table: reinit files_stat.max_files after deferred memory
 initialisation

Dave Hansen reported the following;

	My laptop has been behaving strangely with 4.2-rc2.  Once I log
	in to my X session, I start getting all kinds of strange errors
	from applications and see this in my dmesg:

        	VFS: file-max limit 8192 reached

The problem is that the file-max is calculated before memory is fully
initialised and miscalculates how much memory the kernel is using.  This
patch recalculates file-max after deferred memory initialisation.  Note
that using memory hotplug infrastructure would not have avoided this
problem as the value is not recalculated after memory hot-add.

4.1:             files_stat.max_files = 6582781
4.2-rc2:         files_stat.max_files = 8192
4.2-rc2 patched: files_stat.max_files = 6562467

Small differences with the patch applied and 4.1 but not enough to matter.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reported-by: Dave Hansen <dave.hansen@intel.com>
Cc: Nicolai Stange <nicstange@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Alex Ng <alexng@microsoft.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c        | 13 +++----------
 fs/file_table.c    | 24 +++++++++++++++---------
 include/linux/fs.h |  5 +++--
 init/main.c        |  2 +-
 mm/page_alloc.c    |  3 +++
 5 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c8ea15e73a5..9b5fe503f6cb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3442,22 +3442,15 @@ void __init vfs_caches_init_early(void)
 	inode_init_early();
 }
 
-void __init vfs_caches_init(unsigned long mempages)
+void __init vfs_caches_init(void)
 {
-	unsigned long reserve;
-
-	/* Base hash sizes on available memory, with a reserve equal to
-           150% of current kernel size */
-
-	reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
-	mempages -= reserve;
-
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	dcache_init();
 	inode_init();
-	files_init(mempages);
+	files_init();
+	files_maxfiles_init();
 	mnt_init();
 	bdev_cache_init();
 	chrdev_init();
diff --git a/fs/file_table.c b/fs/file_table.c
index 7f9d407c7595..ad17e05ebf95 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,6 +25,7 @@
 #include <linux/hardirq.h>
 #include <linux/task_work.h>
 #include <linux/ima.h>
+#include <linux/swap.h>
 
 #include <linux/atomic.h>
 
@@ -308,19 +309,24 @@ void put_filp(struct file *file)
 	}
 }
 
-void __init files_init(unsigned long mempages)
+void __init files_init(void)
 { 
-	unsigned long n;
-
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+}
 
-	/*
-	 * One file with associated inode and dcache is very roughly 1K.
-	 * Per default don't use more than 10% of our memory for files. 
-	 */ 
+/*
+ * One file with associated inode and dcache is very roughly 1K. Per default
+ * do not use more than 10% of our memory for files.
+ */
+void __init files_maxfiles_init(void)
+{
+	unsigned long n;
+	unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+
+	memreserve = min(memreserve, totalram_pages - 1);
+	n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
-	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 } 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc008c338f5a..84b783f277f7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -55,7 +55,8 @@ struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
-extern void __init files_init(unsigned long);
+extern void __init files_init(void);
+extern void __init files_maxfiles_init(void);
 
 extern struct files_stat_struct files_stat;
 extern unsigned long get_max_files(void);
@@ -2245,7 +2246,7 @@ extern int ioctl_preallocate(struct file *filp, void __user *argp);
 
 /* fs/dcache.c */
 extern void __init vfs_caches_init_early(void);
-extern void __init vfs_caches_init(unsigned long);
+extern void __init vfs_caches_init(void);
 
 extern struct kmem_cache *names_cachep;
 
diff --git a/init/main.c b/init/main.c
index c5d5626289ce..56506553d4d8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -656,7 +656,7 @@ asmlinkage __visible void __init start_kernel(void)
 	key_init();
 	security_init();
 	dbg_late_init();
-	vfs_caches_init(totalram_pages);
+	vfs_caches_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 322628278ae4..cb61f44eb3fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1201,6 +1201,9 @@ void __init page_alloc_init_late(void)
 
 	/* Block until all are initialised */
 	wait_for_completion(&pgdat_init_all_done_comp);
+
+	/* Reinit limits that are based on free pages after the kernel is up */
+	files_maxfiles_init();
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-- 
cgit v1.2.3-70-g09d2


From f4c18e6f7b5bbb5b528b3334115806b0d76f50f9 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Thu, 6 Aug 2015 15:47:08 -0700
Subject: mm: check __PG_HWPOISON separately from PAGE_FLAGS_CHECK_AT_*

The race condition addressed in commit add05cecef80 ("mm: soft-offline:
don't free target page in successful page migration") was not closed
completely, because that can happen not only for soft-offline, but also
for hard-offline.  Consider that a slab page is about to be freed into
buddy pool, and then an uncorrected memory error hits the page just
after entering __free_one_page(), then VM_BUG_ON_PAGE(page->flags &
PAGE_FLAGS_CHECK_AT_PREP) is triggered, despite the fact that it's not
necessary because the data on the affected page is not consumed.

To solve it, this patch drops __PG_HWPOISON from page flag checks at
allocation/free time.  I think it's justified because __PG_HWPOISON
flags is defined to prevent the page from being reused, and setting it
outside the page's alloc-free cycle is a designed behavior (not a bug.)

For recent months, I was annoyed about BUG_ON when soft-offlined page
remains on lru cache list for a while, which is avoided by calling
put_page() instead of putback_lru_page() in page migration's success
path.  This means that this patch reverts a major change from commit
add05cecef80 about the new refcounting rule of soft-offlined pages, so
"reuse window" revives.  This will be closed by a subsequent patch.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dean Nelson <dnelson@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 10 +++++++---
 mm/huge_memory.c           |  7 +------
 mm/migrate.c               |  5 ++++-
 mm/page_alloc.c            |  4 ++++
 4 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f34e040b34e9..41c93844fb1d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -631,15 +631,19 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 	 1 << PG_private | 1 << PG_private_2 | \
 	 1 << PG_writeback | 1 << PG_reserved | \
 	 1 << PG_slab	 | 1 << PG_swapcache | 1 << PG_active | \
-	 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
+	 1 << PG_unevictable | __PG_MLOCKED | \
 	 __PG_COMPOUND_LOCK)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
- * Pages being prepped should not have any flags set.  It they are set,
+ * Pages being prepped should not have these flags set.  It they are set,
  * there has been a kernel bug or struct page corruption.
+ *
+ * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
+ * alloc-free cycle to prevent from reusing the page.
  */
-#define PAGE_FLAGS_CHECK_AT_PREP	((1 << NR_PAGEFLAGS) - 1)
+#define PAGE_FLAGS_CHECK_AT_PREP	\
+	(((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
 
 #define PAGE_FLAGS_PRIVATE				\
 	(1 << PG_private | 1 << PG_private_2)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c107094f79ba..097c7a4bfbd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1676,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page,
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb__after_atomic();
 
-		/*
-		 * retain hwpoison flag of the poisoned tail page:
-		 *   fix for the unsuitable process killed on Guest Machine(KVM)
-		 *   by the memory-failure.
-		 */
-		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 		page_tail->flags |= (page->flags &
 				     ((1L << PG_referenced) |
 				      (1L << PG_swapbacked) |
diff --git a/mm/migrate.c b/mm/migrate.c
index ee401e4e5ef1..f2415be7d93b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -950,7 +950,10 @@ out:
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (reason != MR_MEMORY_FAILURE)
+		/* Soft-offlined page shouldn't go through lru cache list */
+		if (reason == MR_MEMORY_FAILURE)
+			put_page(page);
+		else
 			putback_lru_page(page);
 	}
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb61f44eb3fc..beda41710802 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1296,6 +1296,10 @@ static inline int check_new_page(struct page *page)
 		bad_reason = "non-NULL mapping";
 	if (unlikely(atomic_read(&page->_count) != 0))
 		bad_reason = "nonzero _count";
+	if (unlikely(page->flags & __PG_HWPOISON)) {
+		bad_reason = "HWPoisoned (hardware-corrupted)";
+		bad_flags = __PG_HWPOISON;
+	}
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
 		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
-- 
cgit v1.2.3-70-g09d2


From d9eea403ca81f60cd535d354c77ada4c2bee8d66 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Tue, 4 Aug 2015 14:05:42 +0300
Subject: net/mlx5_core: Introduce access function to modify RSS/LRO params

To be used by the mlx5 Eth driver in following commit.

This is in preparation for netdev "light-weight" open/stop flow
change described in previous commit.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 12 ++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/transobj.h |  2 ++
 include/linux/mlx5/mlx5_ifc.h                      |  9 ++++++++-
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index c4f3f74908ec..e6453f61141e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -163,6 +163,18 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
 	return err;
 }
 
+int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
+			 int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_tir_out)];
+
+	MLX5_SET(modify_tir_in, in, tirn, tirn);
+	MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR);
+
+	memset(out, 0, sizeof(out));
+	return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+
 void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_tir_out)];
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
index 10bd75e7d9b1..d436c2d8b527 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
@@ -45,6 +45,8 @@ int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen);
 void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn);
 int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
 			 u32 *tirn);
+int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
+			 int inlen);
 void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn);
 int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
 			 u32 *tisn);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c60a62bba652..469b7bda3304 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4050,6 +4050,13 @@ struct mlx5_ifc_modify_tis_in_bits {
 	struct mlx5_ifc_tisc_bits ctx;
 };
 
+struct mlx5_ifc_modify_tir_bitmask_bits {
+	u8	   reserved[0x20];
+
+	u8         reserved1[0x1f];
+	u8         lro[0x1];
+};
+
 struct mlx5_ifc_modify_tir_out_bits {
 	u8         status[0x8];
 	u8         reserved_0[0x18];
@@ -4071,7 +4078,7 @@ struct mlx5_ifc_modify_tir_in_bits {
 
 	u8         reserved_3[0x20];
 
-	u8         modify_bitmask[0x40];
+	struct mlx5_ifc_modify_tir_bitmask_bits bitmask;
 
 	u8         reserved_4[0x40];
 
-- 
cgit v1.2.3-70-g09d2


From 5c50368f38317627421bf24a0b66b1af0d44eddc Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Tue, 4 Aug 2015 14:05:43 +0300
Subject: net/mlx5e: Light-weight netdev open/stop

Create/destroy TIRs, TISs and flow tables upon PCI probe/remove rather
than upon the netdev ndo_open/stop.

Upon ndo_stop(), redirect all RX traffic to the (lately introduced)
"Drop RQ" and then close only the RX/TX rings, leaving the TIRs,
TISs and flow tables alive.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 237 ++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c |  12 ++
 drivers/net/ethernet/mellanox/mlx5/core/transobj.h |   2 +
 include/linux/mlx5/mlx5_ifc.h                      |   9 +-
 4 files changed, 184 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index baa7a69bb694..33d08bb11f84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1301,14 +1301,18 @@ static void mlx5e_fill_rqt_rqns(struct mlx5e_priv *priv, void *rqtc,
 
 			ix = ix % priv->params.num_channels;
 			MLX5_SET(rqtc, rqtc, rq_num[i],
-				 priv->channel[ix]->rq.rqn);
+				 test_bit(MLX5E_STATE_OPENED, &priv->state) ?
+				 priv->channel[ix]->rq.rqn :
+				 priv->drop_rq.rqn);
 		}
 
 		break;
 
 	default: /* MLX5E_SINGLE_RQ_RQT */
 		MLX5_SET(rqtc, rqtc, rq_num[0],
-			 priv->channel[0]->rq.rqn);
+			 test_bit(MLX5E_STATE_OPENED, &priv->state) ?
+			 priv->channel[0]->rq.rqn :
+			 priv->drop_rq.rqn);
 
 		break;
 	}
@@ -1347,19 +1351,95 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix)
 	return err;
 }
 
+static int mlx5e_redirect_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 *in;
+	void *rqtc;
+	int inlen;
+	int log_sz;
+	int sz;
+	int err;
+
+	log_sz = (rqt_ix == MLX5E_SINGLE_RQ_RQT) ? 0 :
+		  priv->params.rx_hash_log_tbl_sz;
+	sz = 1 << log_sz;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz;
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
+
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+
+	mlx5e_fill_rqt_rqns(priv, rqtc, rqt_ix);
+
+	MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1);
+
+	err = mlx5_core_modify_rqt(mdev, priv->rqtn[rqt_ix], in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
 static void mlx5e_close_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix)
 {
 	mlx5_core_destroy_rqt(priv->mdev, priv->rqtn[rqt_ix]);
 }
 
+static void mlx5e_build_tir_ctx_lro(void *tirc, struct mlx5e_priv *priv)
+{
+	if (!priv->params.lro_en)
+		return;
+
+#define ROUGH_MAX_L2_L3_HDR_SZ 256
+
+	MLX5_SET(tirc, tirc, lro_enable_mask,
+		 MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
+		 MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
+	MLX5_SET(tirc, tirc, lro_max_ip_payload_size,
+		 (priv->params.lro_wqe_sz -
+		  ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
+	MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
+		 MLX5_CAP_ETH(priv->mdev,
+			      lro_timer_supported_periods[3]));
+}
+
+static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *tirc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_tir_in, in, bitmask.lro, 1);
+	tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx);
+
+	mlx5e_build_tir_ctx_lro(tirc, priv);
+
+	err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
 static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
 {
 	void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 
 	MLX5_SET(tirc, tirc, transport_domain, priv->tdn);
 
-#define ROUGH_MAX_L2_L3_HDR_SZ 256
-
 #define MLX5_HASH_IP            (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 				 MLX5_HASH_FIELD_SEL_DST_IP)
 
@@ -1372,17 +1452,7 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
 				 MLX5_HASH_FIELD_SEL_DST_IP   |\
 				 MLX5_HASH_FIELD_SEL_IPSEC_SPI)
 
-	if (priv->params.lro_en) {
-		MLX5_SET(tirc, tirc, lro_enable_mask,
-			 MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
-			 MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
-		MLX5_SET(tirc, tirc, lro_max_ip_payload_size,
-			 (priv->params.lro_wqe_sz -
-			  ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
-		MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
-			 MLX5_CAP_ETH(priv->mdev,
-				      lro_timer_supported_periods[3]));
-	}
+	mlx5e_build_tir_ctx_lro(tirc, priv);
 
 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
 
@@ -1568,12 +1638,20 @@ static int mlx5e_set_dev_port_mtu(struct net_device *netdev)
 	return 0;
 }
 
+static void mlx5e_redirect_rqts(struct mlx5e_priv *priv)
+{
+	mlx5e_redirect_rqt(priv, MLX5E_INDIRECTION_RQT);
+	mlx5e_redirect_rqt(priv, MLX5E_SINGLE_RQ_RQT);
+}
+
 int mlx5e_open_locked(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	int num_txqs;
 	int err;
 
+	set_bit(MLX5E_STATE_OPENED, &priv->state);
+
 	num_txqs = priv->params.num_channels * priv->params.num_tc;
 	netif_set_real_num_tx_queues(netdev, num_txqs);
 	netif_set_real_num_rx_queues(netdev, priv->params.num_channels);
@@ -1582,83 +1660,32 @@ int mlx5e_open_locked(struct net_device *netdev)
 	if (err)
 		return err;
 
-	err = mlx5e_open_tises(priv);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_tises failed, %d\n",
-			   __func__, err);
-		return err;
-	}
-
 	err = mlx5e_open_channels(priv);
 	if (err) {
 		netdev_err(netdev, "%s: mlx5e_open_channels failed, %d\n",
 			   __func__, err);
-		goto err_close_tises;
-	}
-
-	err = mlx5e_open_rqt(priv, MLX5E_INDIRECTION_RQT);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_rqt(INDIR) failed, %d\n",
-			   __func__, err);
-		goto err_close_channels;
-	}
-
-	err = mlx5e_open_rqt(priv, MLX5E_SINGLE_RQ_RQT);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_rqt(SINGLE) failed, %d\n",
-			   __func__, err);
-		goto err_close_rqt_indir;
-	}
-
-	err = mlx5e_open_tirs(priv);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_tir failed, %d\n",
-			   __func__, err);
-		goto err_close_rqt_single;
-	}
-
-	err = mlx5e_open_flow_table(priv);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_flow_table failed, %d\n",
-			   __func__, err);
-		goto err_close_tirs;
+		return err;
 	}
 
 	err = mlx5e_add_all_vlan_rules(priv);
 	if (err) {
 		netdev_err(netdev, "%s: mlx5e_add_all_vlan_rules failed, %d\n",
 			   __func__, err);
-		goto err_close_flow_table;
+		goto err_close_channels;
 	}
 
 	mlx5e_init_eth_addr(priv);
 
-	set_bit(MLX5E_STATE_OPENED, &priv->state);
-
 	mlx5e_update_carrier(priv);
+	mlx5e_redirect_rqts(priv);
 	mlx5e_set_rx_mode_core(priv);
 
 	schedule_delayed_work(&priv->update_stats_work, 0);
 	return 0;
 
-err_close_flow_table:
-	mlx5e_close_flow_table(priv);
-
-err_close_tirs:
-	mlx5e_close_tirs(priv);
-
-err_close_rqt_single:
-	mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT);
-
-err_close_rqt_indir:
-	mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT);
-
 err_close_channels:
 	mlx5e_close_channels(priv);
 
-err_close_tises:
-	mlx5e_close_tises(priv);
-
 	return err;
 }
 
@@ -1682,13 +1709,9 @@ int mlx5e_close_locked(struct net_device *netdev)
 
 	mlx5e_set_rx_mode_core(priv);
 	mlx5e_del_all_vlan_rules(priv);
+	mlx5e_redirect_rqts(priv);
 	netif_carrier_off(priv->netdev);
-	mlx5e_close_flow_table(priv);
-	mlx5e_close_tirs(priv);
-	mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT);
-	mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT);
 	mlx5e_close_channels(priv);
-	mlx5e_close_tises(priv);
 
 	return 0;
 }
@@ -1766,6 +1789,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 			mlx5e_close_locked(priv->netdev);
 
 		priv->params.lro_en = !!(features & NETIF_F_LRO);
+		mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV4_TCP);
+		mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV6_TCP);
 
 		if (was_opened)
 			err = mlx5e_open_locked(priv->netdev);
@@ -2026,16 +2051,72 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
 		goto err_dealloc_transport_domain;
 	}
 
+	err = mlx5e_open_tises(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "open tises failed, %d\n", err);
+		goto err_destroy_mkey;
+	}
+
+	err = mlx5e_open_drop_rq(priv);
+	if (err) {
+		mlx5_core_err(mdev, "open drop rq failed, %d\n", err);
+		goto err_close_tises;
+	}
+
+	err = mlx5e_open_rqt(priv, MLX5E_INDIRECTION_RQT);
+	if (err) {
+		mlx5_core_warn(mdev, "open rqt(INDIR) failed, %d\n", err);
+		goto err_close_drop_rq;
+	}
+
+	err = mlx5e_open_rqt(priv, MLX5E_SINGLE_RQ_RQT);
+	if (err) {
+		mlx5_core_warn(mdev, "open rqt(SINGLE) failed, %d\n", err);
+		goto err_close_rqt_indir;
+	}
+
+	err = mlx5e_open_tirs(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "open tirs failed, %d\n", err);
+		goto err_close_rqt_single;
+	}
+
+	err = mlx5e_open_flow_table(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "open flow table failed, %d\n", err);
+		goto err_close_tirs;
+	}
+
+	mlx5e_init_eth_addr(priv);
+
 	err = register_netdev(netdev);
 	if (err) {
 		mlx5_core_err(mdev, "register_netdev failed, %d\n", err);
-		goto err_destroy_mkey;
+		goto err_close_flow_table;
 	}
 
 	mlx5e_enable_async_events(priv);
 
 	return priv;
 
+err_close_flow_table:
+	mlx5e_close_flow_table(priv);
+
+err_close_tirs:
+	mlx5e_close_tirs(priv);
+
+err_close_rqt_single:
+	mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT);
+
+err_close_rqt_indir:
+	mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT);
+
+err_close_drop_rq:
+	mlx5e_close_drop_rq(priv);
+
+err_close_tises:
+	mlx5e_close_tises(priv);
+
 err_destroy_mkey:
 	mlx5_core_destroy_mkey(mdev, &priv->mr);
 
@@ -2060,6 +2141,12 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
 	struct net_device *netdev = priv->netdev;
 
 	unregister_netdev(netdev);
+	mlx5e_close_flow_table(priv);
+	mlx5e_close_tirs(priv);
+	mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT);
+	mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT);
+	mlx5e_close_drop_rq(priv);
+	mlx5e_close_tises(priv);
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 	mlx5_dealloc_transport_domain(priv->mdev, priv->tdn);
 	mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index e6453f61141e..b4c87c7b0cf0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -387,6 +387,18 @@ int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
 	return err;
 }
 
+int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
+			 int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_rqt_out)];
+
+	MLX5_SET(modify_rqt_in, in, rqtn, rqtn);
+	MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT);
+
+	memset(out, 0, sizeof(out));
+	return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+
 void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)];
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
index d436c2d8b527..74cae51436e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
@@ -65,6 +65,8 @@ int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
 
 int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
 			 u32 *rqtn);
+int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
+			 int inlen);
 void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn);
 
 #endif /* __TRANSOBJ_H__ */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 469b7bda3304..dd2097455a2e 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4123,6 +4123,13 @@ struct mlx5_ifc_modify_rqt_out_bits {
 	u8         reserved_1[0x40];
 };
 
+struct mlx5_ifc_rqt_bitmask_bits {
+	u8	   reserved[0x20];
+
+	u8         reserved1[0x1f];
+	u8         rqn_list[0x1];
+};
+
 struct mlx5_ifc_modify_rqt_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_0[0x10];
@@ -4135,7 +4142,7 @@ struct mlx5_ifc_modify_rqt_in_bits {
 
 	u8         reserved_3[0x20];
 
-	u8         modify_bitmask[0x40];
+	struct mlx5_ifc_rqt_bitmask_bits bitmask;
 
 	u8         reserved_4[0x40];
 
-- 
cgit v1.2.3-70-g09d2


From efea389d3cc6427a9a94e92b2d7bf4c862f2cfcf Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Tue, 4 Aug 2015 14:05:47 +0300
Subject: net/mlx5_core: Support physical port counters

Added physical port counters in the following standard formats to
ethtool statistics:
  - IEEE 802.3
  - RFC2863
  - RFC2819

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       | 75 ++++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 10 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 42 ++++++++++++
 include/linux/mlx5/device.h                        | 10 +++
 include/linux/mlx5/driver.h                        |  1 +
 5 files changed, 137 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 35c33907a9ff..e9d7d90363a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -138,6 +138,80 @@ struct mlx5e_vport_stats {
 #define NUM_VPORT_COUNTERS     31
 };
 
+static const char pport_strings[][ETH_GSTRING_LEN] = {
+	/* IEEE802.3 counters */
+	"frames_tx",
+	"frames_rx",
+	"check_seq_err",
+	"alignment_err",
+	"octets_tx",
+	"octets_received",
+	"multicast_xmitted",
+	"broadcast_xmitted",
+	"multicast_rx",
+	"broadcast_rx",
+	"in_range_len_errors",
+	"out_of_range_len",
+	"too_long_errors",
+	"symbol_err",
+	"mac_control_tx",
+	"mac_control_rx",
+	"unsupported_op_rx",
+	"pause_ctrl_rx",
+	"pause_ctrl_tx",
+
+	/* RFC2863 counters */
+	"in_octets",
+	"in_ucast_pkts",
+	"in_discards",
+	"in_errors",
+	"in_unknown_protos",
+	"out_octets",
+	"out_ucast_pkts",
+	"out_discards",
+	"out_errors",
+	"in_multicast_pkts",
+	"in_broadcast_pkts",
+	"out_multicast_pkts",
+	"out_broadcast_pkts",
+
+	/* RFC2819 counters */
+	"drop_events",
+	"octets",
+	"pkts",
+	"broadcast_pkts",
+	"multicast_pkts",
+	"crc_align_errors",
+	"undersize_pkts",
+	"oversize_pkts",
+	"fragments",
+	"jabbers",
+	"collisions",
+	"p64octets",
+	"p65to127octets",
+	"p128to255octets",
+	"p256to511octets",
+	"p512to1023octets",
+	"p1024to1518octets",
+	"p1519to2047octets",
+	"p2048to4095octets",
+	"p4096to8191octets",
+	"p8192to10239octets",
+};
+
+#define NUM_IEEE_802_3_COUNTERS		19
+#define NUM_RFC_2863_COUNTERS		13
+#define NUM_RFC_2819_COUNTERS		21
+#define NUM_PPORT_COUNTERS		(NUM_IEEE_802_3_COUNTERS + \
+					 NUM_RFC_2863_COUNTERS + \
+					 NUM_RFC_2819_COUNTERS)
+
+struct mlx5e_pport_stats {
+	__be64 IEEE_802_3_counters[NUM_IEEE_802_3_COUNTERS];
+	__be64 RFC_2863_counters[NUM_RFC_2863_COUNTERS];
+	__be64 RFC_2819_counters[NUM_RFC_2819_COUNTERS];
+};
+
 static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 	"packets",
 	"csum_none",
@@ -180,6 +254,7 @@ struct mlx5e_sq_stats {
 
 struct mlx5e_stats {
 	struct mlx5e_vport_stats   vport;
+	struct mlx5e_pport_stats   pport;
 };
 
 struct mlx5e_params {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index b95aa3384c36..b549797b315f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -171,7 +171,7 @@ static int mlx5e_get_sset_count(struct net_device *dev, int sset)
 
 	switch (sset) {
 	case ETH_SS_STATS:
-		return NUM_VPORT_COUNTERS +
+		return NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS +
 		       priv->params.num_channels * NUM_RQ_STATS +
 		       priv->params.num_channels * priv->params.num_tc *
 						   NUM_SQ_STATS;
@@ -200,6 +200,11 @@ static void mlx5e_get_strings(struct net_device *dev,
 			strcpy(data + (idx++) * ETH_GSTRING_LEN,
 			       vport_strings[i]);
 
+		/* PPORT counters */
+		for (i = 0; i < NUM_PPORT_COUNTERS; i++)
+			strcpy(data + (idx++) * ETH_GSTRING_LEN,
+			       pport_strings[i]);
+
 		/* per channel counters */
 		for (i = 0; i < priv->params.num_channels; i++)
 			for (j = 0; j < NUM_RQ_STATS; j++)
@@ -234,6 +239,9 @@ static void mlx5e_get_ethtool_stats(struct net_device *dev,
 	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
 		data[idx++] = ((u64 *)&priv->stats.vport)[i];
 
+	for (i = 0; i < NUM_PPORT_COUNTERS; i++)
+		data[idx++] = be64_to_cpu(((__be64 *)&priv->stats.pport)[i]);
+
 	/* per channel counters */
 	for (i = 0; i < priv->params.num_channels; i++)
 		for (j = 0; j < NUM_RQ_STATS; j++)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b8023a7484e0..111427b33ec8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -82,6 +82,47 @@ static void mlx5e_update_carrier_work(struct work_struct *work)
 	mutex_unlock(&priv->state_lock);
 }
 
+static void mlx5e_update_pport_counters(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_pport_stats *s = &priv->stats.pport;
+	u32 *in;
+	u32 *out;
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+
+	in  = mlx5_vzalloc(sz);
+	out = mlx5_vzalloc(sz);
+	if (!in || !out)
+		goto free_out;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out,
+			     sz, MLX5_REG_PPCNT, 0, 0);
+	memcpy(s->IEEE_802_3_counters,
+	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
+	       sizeof(s->IEEE_802_3_counters));
+
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out,
+			     sz, MLX5_REG_PPCNT, 0, 0);
+	memcpy(s->RFC_2863_counters,
+	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
+	       sizeof(s->RFC_2863_counters));
+
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out,
+			     sz, MLX5_REG_PPCNT, 0, 0);
+	memcpy(s->RFC_2819_counters,
+	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
+	       sizeof(s->RFC_2819_counters));
+
+free_out:
+	kvfree(in);
+	kvfree(out);
+}
+
 void mlx5e_update_stats(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
@@ -202,6 +243,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 	s->tx_csum_offload = s->tx_packets - tx_offload_none;
 	s->rx_csum_good    = s->rx_packets - s->rx_csum_none;
 
+	mlx5e_update_pport_counters(priv);
 free_out:
 	kvfree(out);
 }
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index b943cd9e2097..250b1ff8b48d 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1182,6 +1182,16 @@ enum {
 	MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR	= 0x40,
 };
 
+enum {
+	MLX5_IEEE_802_3_COUNTERS_GROUP	      = 0x0,
+	MLX5_RFC_2863_COUNTERS_GROUP	      = 0x1,
+	MLX5_RFC_2819_COUNTERS_GROUP	      = 0x2,
+	MLX5_RFC_3635_COUNTERS_GROUP	      = 0x3,
+	MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5,
+	MLX5_PER_PRIORITY_COUNTERS_GROUP      = 0x10,
+	MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11
+};
+
 static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 {
 	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5fe0cae1a515..2039546b0ec6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -103,6 +103,7 @@ enum {
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
 	MLX5_REG_PAOS		 = 0x5006,
+	MLX5_REG_PPCNT		 = 0x5008,
 	MLX5_REG_PMAOS		 = 0x5012,
 	MLX5_REG_PUDE		 = 0x5009,
 	MLX5_REG_PMPE		 = 0x5010,
-- 
cgit v1.2.3-70-g09d2


From d92cff89a0c80e7e49796366e441d97f07b5d321 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 4 Aug 2015 18:26:19 +0200
Subject: net_dbg_ratelimited: turn into no-op when !DEBUG

The pr_debug family of functions turns into a no-op when -DDEBUG is not
specified, opting instead to call "no_printk", which gets compiled to a
no-op (but retains gcc's nice warnings about printf-style arguments).

The problem with net_dbg_ratelimited is that it is defined to be a
variant of net_ratelimited_function, which expands to essentially:

    if (net_ratelimit())
        pr_debug(fmt, ...);

When DEBUG is not defined, then this becomes,

    if (net_ratelimit())
        ;

This seems benign, except it isn't. Firstly, there's the obvious
overhead of calling net_ratelimit needlessly, which does quite some book
keeping for the rate limiting. Given that the pr_debug and
net_dbg_ratelimited family of functions are sprinkled liberally through
performance critical code, with developers assuming they'll be compiled
out to a no-op most of the time, we certainly do not want this needless
book keeping. Secondly, and most visibly, even though no debug message
is printed when DEBUG is not defined, if there is a flood of
invocations, dmesg winds up peppered with messages such as
"net_ratelimit: 320 callbacks suppressed". This is because our
aforementioned net_ratelimit() function actually prints this text in
some circumstances. It's especially odd to see this when there isn't any
other accompanying debug message.

So, in sum, it doesn't make sense to have this function's current
behavior, and instead it should match what every other debug family of
functions in the kernel does with !DEBUG -- nothing.

This patch replaces calls to net_dbg_ratelimited when !DEBUG with
no_printk, keeping with the idiom of all the other debug print helpers.

Also, though not strictly neccessary, it guards the call with an if (0)
so that all evaluation of any arguments are sure to be compiled out.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 04aa06852771..049d4b03c4c4 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -239,8 +239,16 @@ do {								\
 	net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__)
 #define net_info_ratelimited(fmt, ...)				\
 	net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__)
+#if defined(DEBUG)
 #define net_dbg_ratelimited(fmt, ...)				\
 	net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
+#else
+#define net_dbg_ratelimited(fmt, ...)				\
+	do {							\
+		if (0)						\
+			no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
+	} while (0)
+#endif
 
 bool __net_get_random_once(void *buf, int nbytes, bool *done,
 			   struct static_key *done_key);
-- 
cgit v1.2.3-70-g09d2


From 3499abb249bb5ed9d21031944bc3059ec4aa2909 Mon Sep 17 00:00:00 2001
From: Andreas Schultz <aschultz@tpip.net>
Date: Wed, 5 Aug 2015 17:51:45 +0200
Subject: netfilter: nfacct: per network namespace support

- Move the nfnl_acct_list into the network namespace, initialize
  and destroy it per namespace
- Keep track of refcnt on nfacct objects, the old logic does not
  longer work with a per namespace list
- Adjust xt_nfacct to pass the namespace when registring objects

Signed-off-by: Andreas Schultz <aschultz@tpip.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink_acct.h |  3 +-
 include/net/net_namespace.h              |  3 ++
 net/netfilter/nfnetlink_acct.c           | 71 ++++++++++++++++++++++----------
 net/netfilter/xt_nfacct.c                |  2 +-
 4 files changed, 56 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h
index 6ec975748742..80ca889b164e 100644
--- a/include/linux/netfilter/nfnetlink_acct.h
+++ b/include/linux/netfilter/nfnetlink_acct.h
@@ -2,6 +2,7 @@
 #define _NFNL_ACCT_H_
 
 #include <uapi/linux/netfilter/nfnetlink_acct.h>
+#include <net/net_namespace.h>
 
 enum {
 	NFACCT_NO_QUOTA		= -1,
@@ -11,7 +12,7 @@ enum {
 
 struct nf_acct;
 
-struct nf_acct *nfnl_acct_find_get(const char *filter_name);
+struct nf_acct *nfnl_acct_find_get(struct net *net, const char *filter_name);
 void nfnl_acct_put(struct nf_acct *acct);
 void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct);
 extern int nfnl_acct_overquota(const struct sk_buff *skb,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index e951453e0a23..2dcea635ecce 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -118,6 +118,9 @@ struct net {
 #endif
 	struct sock		*nfnl;
 	struct sock		*nfnl_stash;
+#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT)
+	struct list_head        nfnl_acct_list;
+#endif
 #endif
 #ifdef CONFIG_WEXT_CORE
 	struct sk_buff_head	wext_nlevents;
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index c18af2f63eef..fefbf5f0b28d 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -27,8 +27,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
 MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
 
-static LIST_HEAD(nfnl_acct_list);
-
 struct nf_acct {
 	atomic64_t		pkts;
 	atomic64_t		bytes;
@@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
 	struct nf_acct *nfacct, *matching = NULL;
+	struct net *net = sock_net(nfnl);
 	char *acct_name;
 	unsigned int size = 0;
 	u32 flags = 0;
@@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 	if (strlen(acct_name) == 0)
 		return -EINVAL;
 
-	list_for_each_entry(nfacct, &nfnl_acct_list, head) {
+	list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
 		if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
 			continue;
 
@@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 			     be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
 	}
 	atomic_set(&nfacct->refcnt, 1);
-	list_add_tail_rcu(&nfacct->head, &nfnl_acct_list);
+	list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
 	return 0;
 }
 
@@ -185,6 +184,7 @@ nla_put_failure:
 static int
 nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct nf_acct *cur, *last;
 	const struct nfacct_filter *filter = cb->data;
 
@@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		cb->args[1] = 0;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+	list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
 		if (last) {
 			if (cur != last)
 				continue;
@@ -257,6 +257,7 @@ static int
 nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
+	struct net *net = sock_net(nfnl);
 	int ret = -ENOENT;
 	struct nf_acct *cur;
 	char *acct_name;
@@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
 		return -EINVAL;
 	acct_name = nla_data(tb[NFACCT_NAME]);
 
-	list_for_each_entry(cur, &nfnl_acct_list, head) {
+	list_for_each_entry(cur, &net->nfnl_acct_list, head) {
 		struct sk_buff *skb2;
 
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
@@ -336,19 +337,20 @@ static int
 nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
+	struct net *net = sock_net(nfnl);
 	char *acct_name;
 	struct nf_acct *cur;
 	int ret = -ENOENT;
 
 	if (!tb[NFACCT_NAME]) {
-		list_for_each_entry(cur, &nfnl_acct_list, head)
+		list_for_each_entry(cur, &net->nfnl_acct_list, head)
 			nfnl_acct_try_del(cur);
 
 		return 0;
 	}
 	acct_name = nla_data(tb[NFACCT_NAME]);
 
-	list_for_each_entry(cur, &nfnl_acct_list, head) {
+	list_for_each_entry(cur, &net->nfnl_acct_list, head) {
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
 			continue;
 
@@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = {
 
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
 
-struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
 {
 	struct nf_acct *cur, *acct = NULL;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+	list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
 			continue;
 
@@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
 
 void nfnl_acct_put(struct nf_acct *acct)
 {
-	atomic_dec(&acct->refcnt);
+	if (atomic_dec_and_test(&acct->refcnt))
+		kfree_rcu(acct, rcu_head);
+
 	module_put(THIS_MODULE);
 }
 EXPORT_SYMBOL_GPL(nfnl_acct_put);
@@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
 }
 EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
 
+static int __net_init nfnl_acct_net_init(struct net *net)
+{
+	INIT_LIST_HEAD(&net->nfnl_acct_list);
+
+	return 0;
+}
+
+static void __net_exit nfnl_acct_net_exit(struct net *net)
+{
+	struct nf_acct *cur, *tmp;
+
+	list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+		list_del_rcu(&cur->head);
+
+		if (atomic_dec_and_test(&cur->refcnt))
+			kfree_rcu(cur, rcu_head);
+	}
+}
+
+static struct pernet_operations nfnl_acct_ops = {
+        .init   = nfnl_acct_net_init,
+        .exit   = nfnl_acct_net_exit,
+};
+
 static int __init nfnl_acct_init(void)
 {
 	int ret;
 
+	ret = register_pernet_subsys(&nfnl_acct_ops);
+	if (ret < 0) {
+		pr_err("nfnl_acct_init: failed to register pernet ops\n");
+		goto err_out;
+	}
+
 	pr_info("nfnl_acct: registering with nfnetlink.\n");
 	ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
 	if (ret < 0) {
 		pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
-		goto err_out;
+		goto cleanup_pernet;
 	}
 	return 0;
+
+cleanup_pernet:
+	unregister_pernet_subsys(&nfnl_acct_ops);
 err_out:
 	return ret;
 }
 
 static void __exit nfnl_acct_exit(void)
 {
-	struct nf_acct *cur, *tmp;
-
 	pr_info("nfnl_acct: unregistering from nfnetlink.\n");
 	nfnetlink_subsys_unregister(&nfnl_acct_subsys);
-
-	list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) {
-		list_del_rcu(&cur->head);
-		/* We are sure that our objects have no clients at this point,
-		 * it's safe to release them all without checking refcnt. */
-		kfree_rcu(cur, rcu_head);
-	}
+	unregister_pernet_subsys(&nfnl_acct_ops);
 }
 
 module_init(nfnl_acct_init);
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 8c646ed9c921..3048a7e3a90a 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
 	struct xt_nfacct_match_info *info = par->matchinfo;
 	struct nf_acct *nfacct;
 
-	nfacct = nfnl_acct_find_get(info->name);
+	nfacct = nfnl_acct_find_get(par->net, info->name);
 	if (nfacct == NULL) {
 		pr_info("xt_nfacct: accounting object with name `%s' "
 			"does not exists\n", info->name);
-- 
cgit v1.2.3-70-g09d2


From 377a51a6b8c09b39b540e5a80ad7029a450cee71 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Mon, 6 Jul 2015 15:40:22 +0200
Subject: proportions: Spelling s/consitent/consistent/

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/proportions.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/proportions.h b/include/linux/proportions.h
index 00e8e8fa7358..5440f64d2942 100644
--- a/include/linux/proportions.h
+++ b/include/linux/proportions.h
@@ -33,7 +33,7 @@ struct prop_global {
 /*
  * global proportion descriptor
  *
- * this is needed to consitently flip prop_global structures.
+ * this is needed to consistently flip prop_global structures.
  */
 struct prop_descriptor {
 	int index;
-- 
cgit v1.2.3-70-g09d2


From 99d49e3af6dfde62caffb2913807fdaf293a9e3d Mon Sep 17 00:00:00 2001
From: Frans Klaver <fransklaver@gmail.com>
Date: Thu, 4 Sep 2014 00:58:23 +0200
Subject: mod_devicetable: add space before */

Match the style of the other one-line comments.

Signed-off-by: Frans Klaver <fransklaver@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/mod_devicetable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 34f25b7bf642..688997a24aad 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -253,7 +253,7 @@ struct pcmcia_device_id {
 
 	__u32 		prod_id_hash[4];
 
-	/* not matched against in kernelspace*/
+	/* not matched against in kernelspace */
 	const char *	prod_id[4];
 
 	/* not matched against */
-- 
cgit v1.2.3-70-g09d2


From a568231f463225eb31593f71446a267a03ae0528 Mon Sep 17 00:00:00 2001
From: Leilk Liu <leilk.liu@mediatek.com>
Date: Fri, 7 Aug 2015 15:19:50 +0800
Subject: spi: mediatek: Add spi bus for Mediatek MT8173

This patch adds basic spi bus for MT8173.

Signed-off-by: Leilk Liu <leilk.liu@mediatek.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/Kconfig                      |   9 +
 drivers/spi/Makefile                     |   1 +
 drivers/spi/spi-mt65xx.c                 | 749 +++++++++++++++++++++++++++++++
 include/linux/platform_data/spi-mt65xx.h |  22 +
 4 files changed, 781 insertions(+)
 create mode 100644 drivers/spi/spi-mt65xx.c
 create mode 100644 include/linux/platform_data/spi-mt65xx.h

(limited to 'include/linux')

diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index 0cae1694014d..38ddfba49d76 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -326,6 +326,15 @@ config SPI_MESON_SPIFC
 	  This enables master mode support for the SPIFC (SPI flash
 	  controller) available in Amlogic Meson SoCs.
 
+config SPI_MT65XX
+	tristate "MediaTek SPI controller"
+	depends on ARCH_MEDIATEK || COMPILE_TEST
+	help
+	  This selects the MediaTek(R) SPI bus driver.
+	  If you want to use MediaTek(R) SPI interface,
+	  say Y or M here.If you are not sure, say N.
+	  SPI drivers for Mediatek MT65XX and MT81XX series ARM SoCs.
+
 config SPI_OC_TINY
 	tristate "OpenCores tiny SPI"
 	depends on GPIOLIB || COMPILE_TEST
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
index 1154dbac8f2c..9746beb21769 100644
--- a/drivers/spi/Makefile
+++ b/drivers/spi/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_SPI_MESON_SPIFC)		+= spi-meson-spifc.o
 obj-$(CONFIG_SPI_MPC512x_PSC)		+= spi-mpc512x-psc.o
 obj-$(CONFIG_SPI_MPC52xx_PSC)		+= spi-mpc52xx-psc.o
 obj-$(CONFIG_SPI_MPC52xx)		+= spi-mpc52xx.o
+obj-$(CONFIG_SPI_MT65XX)                += spi-mt65xx.o
 obj-$(CONFIG_SPI_MXS)			+= spi-mxs.o
 obj-$(CONFIG_SPI_NUC900)		+= spi-nuc900.o
 obj-$(CONFIG_SPI_OC_TINY)		+= spi-oc-tiny.o
diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c
new file mode 100644
index 000000000000..4676b0122b89
--- /dev/null
+++ b/drivers/spi/spi-mt65xx.c
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ * Author: Leilk Liu <leilk.liu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/spi-mt65xx.h>
+#include <linux/pm_runtime.h>
+#include <linux/spi/spi.h>
+
+#define SPI_CFG0_REG                      0x0000
+#define SPI_CFG1_REG                      0x0004
+#define SPI_TX_SRC_REG                    0x0008
+#define SPI_RX_DST_REG                    0x000c
+#define SPI_TX_DATA_REG                   0x0010
+#define SPI_RX_DATA_REG                   0x0014
+#define SPI_CMD_REG                       0x0018
+#define SPI_STATUS0_REG                   0x001c
+#define SPI_PAD_SEL_REG                   0x0024
+
+#define SPI_CFG0_SCK_HIGH_OFFSET          0
+#define SPI_CFG0_SCK_LOW_OFFSET           8
+#define SPI_CFG0_CS_HOLD_OFFSET           16
+#define SPI_CFG0_CS_SETUP_OFFSET          24
+
+#define SPI_CFG1_CS_IDLE_OFFSET           0
+#define SPI_CFG1_PACKET_LOOP_OFFSET       8
+#define SPI_CFG1_PACKET_LENGTH_OFFSET     16
+#define SPI_CFG1_GET_TICK_DLY_OFFSET      30
+
+#define SPI_CFG1_CS_IDLE_MASK             0xff
+#define SPI_CFG1_PACKET_LOOP_MASK         0xff00
+#define SPI_CFG1_PACKET_LENGTH_MASK       0x3ff0000
+
+#define SPI_CMD_ACT_OFFSET                0
+#define SPI_CMD_RESUME_OFFSET             1
+#define SPI_CMD_CPHA_OFFSET               8
+#define SPI_CMD_CPOL_OFFSET               9
+#define SPI_CMD_TXMSBF_OFFSET             12
+#define SPI_CMD_RXMSBF_OFFSET             13
+#define SPI_CMD_RX_ENDIAN_OFFSET          14
+#define SPI_CMD_TX_ENDIAN_OFFSET          15
+
+#define SPI_CMD_RST                  BIT(2)
+#define SPI_CMD_PAUSE_EN             BIT(4)
+#define SPI_CMD_DEASSERT             BIT(5)
+#define SPI_CMD_CPHA                 BIT(8)
+#define SPI_CMD_CPOL                 BIT(9)
+#define SPI_CMD_RX_DMA               BIT(10)
+#define SPI_CMD_TX_DMA               BIT(11)
+#define SPI_CMD_TXMSBF               BIT(12)
+#define SPI_CMD_RXMSBF               BIT(13)
+#define SPI_CMD_RX_ENDIAN            BIT(14)
+#define SPI_CMD_TX_ENDIAN            BIT(15)
+#define SPI_CMD_FINISH_IE            BIT(16)
+#define SPI_CMD_PAUSE_IE             BIT(17)
+
+#define MTK_SPI_QUIRK_PAD_SELECT 1
+/* Must explicitly send dummy Tx bytes to do Rx only transfer */
+#define MTK_SPI_QUIRK_MUST_TX 1
+
+#define MT8173_SPI_MAX_PAD_SEL 3
+
+#define MTK_SPI_IDLE 0
+#define MTK_SPI_PAUSED 1
+
+#define MTK_SPI_MAX_FIFO_SIZE 32
+#define MTK_SPI_PACKET_SIZE 1024
+
+struct mtk_spi_compatible {
+	u32 need_pad_sel;
+	u32 must_tx;
+};
+
+struct mtk_spi {
+	void __iomem *base;
+	u32 state;
+	u32 pad_sel;
+	struct clk *spi_clk, *parent_clk;
+	struct spi_transfer *cur_transfer;
+	u32 xfer_len;
+	struct scatterlist *tx_sgl, *rx_sgl;
+	u32 tx_sgl_len, rx_sgl_len;
+	const struct mtk_spi_compatible *dev_comp;
+};
+
+static const struct mtk_spi_compatible mt6589_compat = {
+	.need_pad_sel = 0,
+	.must_tx = 0,
+};
+
+static const struct mtk_spi_compatible mt8135_compat = {
+	.need_pad_sel = 0,
+	.must_tx = 0,
+};
+
+static const struct mtk_spi_compatible mt8173_compat = {
+	.need_pad_sel = MTK_SPI_QUIRK_PAD_SELECT,
+	.must_tx = MTK_SPI_QUIRK_MUST_TX,
+};
+
+/*
+ * A piece of default chip info unless the platform
+ * supplies it.
+ */
+static const struct mtk_chip_config mtk_default_chip_info = {
+	.rx_mlsb = 1,
+	.tx_mlsb = 1,
+	.tx_endian = 0,
+	.rx_endian = 0,
+};
+
+static const struct of_device_id mtk_spi_of_match[] = {
+	{ .compatible = "mediatek,mt6589-spi", .data = (void *)&mt6589_compat },
+	{ .compatible = "mediatek,mt8135-spi", .data = (void *)&mt8135_compat },
+	{ .compatible = "mediatek,mt8173-spi", .data = (void *)&mt8173_compat },
+	{}
+};
+MODULE_DEVICE_TABLE(of, mtk_spi_of_match);
+
+static void mtk_spi_reset(struct mtk_spi *mdata)
+{
+	u32 reg_val;
+
+	/* set the software reset bit in SPI_CMD_REG. */
+	reg_val = readl(mdata->base + SPI_CMD_REG);
+	reg_val |= SPI_CMD_RST;
+	writel(reg_val, mdata->base + SPI_CMD_REG);
+
+	reg_val = readl(mdata->base + SPI_CMD_REG);
+	reg_val &= ~SPI_CMD_RST;
+	writel(reg_val, mdata->base + SPI_CMD_REG);
+}
+
+static void mtk_spi_config(struct mtk_spi *mdata,
+			   struct mtk_chip_config *chip_config)
+{
+	u32 reg_val;
+
+	reg_val = readl(mdata->base + SPI_CMD_REG);
+
+	/* set the mlsbx and mlsbtx */
+	reg_val &= ~(SPI_CMD_TXMSBF | SPI_CMD_RXMSBF);
+	reg_val |= (chip_config->tx_mlsb << SPI_CMD_TXMSBF_OFFSET);
+	reg_val |= (chip_config->rx_mlsb << SPI_CMD_RXMSBF_OFFSET);
+
+	/* set the tx/rx endian */
+	reg_val &= ~(SPI_CMD_TX_ENDIAN | SPI_CMD_RX_ENDIAN);
+	reg_val |= (chip_config->tx_endian << SPI_CMD_TX_ENDIAN_OFFSET);
+	reg_val |= (chip_config->rx_endian << SPI_CMD_RX_ENDIAN_OFFSET);
+
+	/* set finish and pause interrupt always enable */
+	reg_val |= SPI_CMD_FINISH_IE | SPI_CMD_PAUSE_EN;
+
+	/* disable dma mode */
+	reg_val &= ~(SPI_CMD_TX_DMA | SPI_CMD_RX_DMA);
+
+	/* disable deassert mode */
+	reg_val &= ~SPI_CMD_DEASSERT;
+
+	writel(reg_val, mdata->base + SPI_CMD_REG);
+
+	/* pad select */
+	if (mdata->dev_comp->need_pad_sel)
+		writel(mdata->pad_sel, mdata->base + SPI_PAD_SEL_REG);
+}
+
+static int mtk_spi_prepare_hardware(struct spi_master *master)
+{
+	struct spi_transfer *trans;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+	struct spi_message *msg = master->cur_msg;
+	int ret;
+
+	ret = clk_prepare_enable(mdata->spi_clk);
+	if (ret < 0) {
+		dev_err(&master->dev, "failed to enable clock (%d)\n", ret);
+		return ret;
+	}
+
+	trans = list_first_entry(&msg->transfers, struct spi_transfer,
+				 transfer_list);
+	if (trans->cs_change == 0) {
+		mdata->state = MTK_SPI_IDLE;
+		mtk_spi_reset(mdata);
+	}
+
+	return ret;
+}
+
+static int mtk_spi_unprepare_hardware(struct spi_master *master)
+{
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	clk_disable_unprepare(mdata->spi_clk);
+
+	return 0;
+}
+
+static int mtk_spi_prepare_message(struct spi_master *master,
+				   struct spi_message *msg)
+{
+	u32 reg_val;
+	u8 cpha, cpol;
+	struct mtk_chip_config *chip_config;
+	struct spi_device *spi = msg->spi;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	cpha = spi->mode & SPI_CPHA ? 1 : 0;
+	cpol = spi->mode & SPI_CPOL ? 1 : 0;
+
+	reg_val = readl(mdata->base + SPI_CMD_REG);
+	reg_val &= ~(SPI_CMD_CPHA | SPI_CMD_CPOL);
+	reg_val |= (cpha << SPI_CMD_CPHA_OFFSET);
+	reg_val |= (cpol << SPI_CMD_CPOL_OFFSET);
+	writel(reg_val, mdata->base + SPI_CMD_REG);
+
+	chip_config = spi->controller_data;
+	if (!chip_config) {
+		chip_config = (void *)&mtk_default_chip_info;
+		spi->controller_data = chip_config;
+	}
+	mtk_spi_config(mdata, chip_config);
+
+	return 0;
+}
+
+static void mtk_spi_set_cs(struct spi_device *spi, bool enable)
+{
+	u32 reg_val;
+	struct mtk_spi *mdata = spi_master_get_devdata(spi->master);
+
+	reg_val = readl(mdata->base + SPI_CMD_REG);
+	if (!enable)
+		reg_val |= SPI_CMD_PAUSE_EN;
+	else
+		reg_val &= ~SPI_CMD_PAUSE_EN;
+	writel(reg_val, mdata->base + SPI_CMD_REG);
+}
+
+static void mtk_spi_prepare_transfer(struct spi_master *master,
+				     struct spi_transfer *xfer)
+{
+	u32 spi_clk_hz, div, high_time, low_time, holdtime,
+	    setuptime, cs_idletime, reg_val = 0;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	spi_clk_hz = clk_get_rate(mdata->spi_clk);
+	if (xfer->speed_hz < spi_clk_hz / 2)
+		div = DIV_ROUND_UP(spi_clk_hz, xfer->speed_hz);
+	else
+		div = 1;
+
+	high_time = (div + 1) / 2;
+	low_time = (div + 1) / 2;
+	holdtime = (div + 1) / 2 * 2;
+	setuptime = (div + 1) / 2 * 2;
+	cs_idletime = (div + 1) / 2 * 2;
+
+	reg_val |= (((high_time - 1) & 0xff) << SPI_CFG0_SCK_HIGH_OFFSET);
+	reg_val |= (((low_time - 1) & 0xff) << SPI_CFG0_SCK_LOW_OFFSET);
+	reg_val |= (((holdtime - 1) & 0xff) << SPI_CFG0_CS_HOLD_OFFSET);
+	reg_val |= (((setuptime - 1) & 0xff) << SPI_CFG0_CS_SETUP_OFFSET);
+	writel(reg_val, mdata->base + SPI_CFG0_REG);
+
+	reg_val = readl(mdata->base + SPI_CFG1_REG);
+	reg_val &= ~SPI_CFG1_CS_IDLE_MASK;
+	reg_val |= (((cs_idletime - 1) & 0xff) << SPI_CFG1_CS_IDLE_OFFSET);
+	writel(reg_val, mdata->base + SPI_CFG1_REG);
+}
+
+static void mtk_spi_setup_packet(struct spi_master *master)
+{
+	u32 packet_size, packet_loop, reg_val;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	packet_size = min_t(unsigned, mdata->xfer_len, MTK_SPI_PACKET_SIZE);
+	packet_loop = mdata->xfer_len / packet_size;
+
+	reg_val = readl(mdata->base + SPI_CFG1_REG);
+	reg_val &= ~(SPI_CFG1_PACKET_LENGTH_MASK + SPI_CFG1_PACKET_LOOP_MASK);
+	reg_val |= (packet_size - 1) << SPI_CFG1_PACKET_LENGTH_OFFSET;
+	reg_val |= (packet_loop - 1) << SPI_CFG1_PACKET_LOOP_OFFSET;
+	writel(reg_val, mdata->base + SPI_CFG1_REG);
+}
+
+static void mtk_spi_enable_transfer(struct spi_master *master)
+{
+	int cmd;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	cmd = readl(mdata->base + SPI_CMD_REG);
+	if (mdata->state == MTK_SPI_IDLE)
+		cmd |= 1 << SPI_CMD_ACT_OFFSET;
+	else
+		cmd |= 1 << SPI_CMD_RESUME_OFFSET;
+	writel(cmd, mdata->base + SPI_CMD_REG);
+}
+
+static int mtk_spi_get_mult_delta(int xfer_len)
+{
+	int mult_delta;
+
+	if (xfer_len > MTK_SPI_PACKET_SIZE)
+		mult_delta = xfer_len % MTK_SPI_PACKET_SIZE;
+	else
+		mult_delta = 0;
+
+	return mult_delta;
+}
+
+static void mtk_spi_update_mdata_len(struct spi_master *master)
+{
+	int mult_delta;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	if (mdata->tx_sgl_len && mdata->rx_sgl_len) {
+		if (mdata->tx_sgl_len > mdata->rx_sgl_len) {
+			mult_delta = mtk_spi_get_mult_delta(mdata->rx_sgl_len);
+			mdata->xfer_len = mdata->rx_sgl_len - mult_delta;
+			mdata->rx_sgl_len = mult_delta;
+			mdata->tx_sgl_len -= mdata->xfer_len;
+		} else {
+			mult_delta = mtk_spi_get_mult_delta(mdata->tx_sgl_len);
+			mdata->xfer_len = mdata->tx_sgl_len - mult_delta;
+			mdata->tx_sgl_len = mult_delta;
+			mdata->rx_sgl_len -= mdata->xfer_len;
+		}
+	} else if (mdata->tx_sgl_len) {
+		mult_delta = mtk_spi_get_mult_delta(mdata->tx_sgl_len);
+		mdata->xfer_len = mdata->tx_sgl_len - mult_delta;
+		mdata->tx_sgl_len = mult_delta;
+	} else if (mdata->rx_sgl_len) {
+		mult_delta = mtk_spi_get_mult_delta(mdata->rx_sgl_len);
+		mdata->xfer_len = mdata->rx_sgl_len - mult_delta;
+		mdata->rx_sgl_len = mult_delta;
+	}
+}
+
+static void mtk_spi_setup_dma_addr(struct spi_master *master,
+				   struct spi_transfer *xfer)
+{
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	if (mdata->tx_sgl)
+		writel(cpu_to_le32(xfer->tx_dma), mdata->base + SPI_TX_SRC_REG);
+	if (mdata->rx_sgl)
+		writel(cpu_to_le32(xfer->rx_dma), mdata->base + SPI_RX_DST_REG);
+}
+
+static int mtk_spi_fifo_transfer(struct spi_master *master,
+				 struct spi_device *spi,
+				 struct spi_transfer *xfer)
+{
+	int cnt, i;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	mdata->cur_transfer = xfer;
+	mdata->xfer_len = xfer->len;
+	mtk_spi_prepare_transfer(master, xfer);
+	mtk_spi_setup_packet(master);
+
+	if (xfer->len % 4)
+		cnt = xfer->len / 4 + 1;
+	else
+		cnt = xfer->len / 4;
+
+	for (i = 0; i < cnt; i++)
+		writel(*((u32 *)xfer->tx_buf + i),
+		       mdata->base + SPI_TX_DATA_REG);
+
+	mtk_spi_enable_transfer(master);
+
+	return 1;
+}
+
+static int mtk_spi_dma_transfer(struct spi_master *master,
+				struct spi_device *spi,
+				struct spi_transfer *xfer)
+{
+	int cmd;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	mdata->tx_sgl = NULL;
+	mdata->rx_sgl = NULL;
+	mdata->tx_sgl_len = 0;
+	mdata->rx_sgl_len = 0;
+	mdata->cur_transfer = xfer;
+
+	mtk_spi_prepare_transfer(master, xfer);
+
+	cmd = readl(mdata->base + SPI_CMD_REG);
+	if (xfer->tx_buf)
+		cmd |= SPI_CMD_TX_DMA;
+	if (xfer->rx_buf)
+		cmd |= SPI_CMD_RX_DMA;
+	writel(cmd, mdata->base + SPI_CMD_REG);
+
+	if (xfer->tx_buf)
+		mdata->tx_sgl = xfer->tx_sg.sgl;
+	if (xfer->rx_buf)
+		mdata->rx_sgl = xfer->rx_sg.sgl;
+
+	if (mdata->tx_sgl) {
+		xfer->tx_dma = sg_dma_address(mdata->tx_sgl);
+		mdata->tx_sgl_len = sg_dma_len(mdata->tx_sgl);
+	}
+	if (mdata->rx_sgl) {
+		xfer->rx_dma = sg_dma_address(mdata->rx_sgl);
+		mdata->rx_sgl_len = sg_dma_len(mdata->rx_sgl);
+	}
+
+	mtk_spi_update_mdata_len(master);
+	mtk_spi_setup_packet(master);
+	mtk_spi_setup_dma_addr(master, xfer);
+	mtk_spi_enable_transfer(master);
+
+	return 1;
+}
+
+static int mtk_spi_transfer_one(struct spi_master *master,
+				struct spi_device *spi,
+				struct spi_transfer *xfer)
+{
+	if (master->can_dma(master, spi, xfer))
+		return mtk_spi_dma_transfer(master, spi, xfer);
+	else
+		return mtk_spi_fifo_transfer(master, spi, xfer);
+}
+
+static bool mtk_spi_can_dma(struct spi_master *master,
+			    struct spi_device *spi,
+			    struct spi_transfer *xfer)
+{
+	return xfer->len > MTK_SPI_MAX_FIFO_SIZE;
+}
+
+static irqreturn_t mtk_spi_interrupt(int irq, void *dev_id)
+{
+	u32 cmd, reg_val, i;
+	struct spi_master *master = dev_id;
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+	struct spi_transfer *trans = mdata->cur_transfer;
+
+	reg_val = readl(mdata->base + SPI_STATUS0_REG);
+	if (reg_val & 0x2)
+		mdata->state = MTK_SPI_PAUSED;
+	else
+		mdata->state = MTK_SPI_IDLE;
+
+	if (!master->can_dma(master, master->cur_msg->spi, trans)) {
+		/* xfer len is not N*4 bytes every time in a transfer,
+		 * but SPI_RX_DATA_REG must reads 4 bytes once,
+		 * so rx buffer byte by byte.
+		 */
+		if (trans->rx_buf) {
+			for (i = 0; i < mdata->xfer_len; i++) {
+				if (i % 4 == 0)
+					reg_val =
+					readl(mdata->base + SPI_RX_DATA_REG);
+				*((u8 *)(trans->rx_buf + i)) =
+					(reg_val >> ((i % 4) * 8)) & 0xff;
+			}
+		}
+		spi_finalize_current_transfer(master);
+		return IRQ_HANDLED;
+	}
+
+	if (mdata->tx_sgl)
+		trans->tx_dma += mdata->xfer_len;
+	if (mdata->rx_sgl)
+		trans->rx_dma += mdata->xfer_len;
+
+	if (mdata->tx_sgl && (mdata->tx_sgl_len == 0)) {
+		mdata->tx_sgl = sg_next(mdata->tx_sgl);
+		if (mdata->tx_sgl) {
+			trans->tx_dma = sg_dma_address(mdata->tx_sgl);
+			mdata->tx_sgl_len = sg_dma_len(mdata->tx_sgl);
+		}
+	}
+	if (mdata->rx_sgl && (mdata->rx_sgl_len == 0)) {
+		mdata->rx_sgl = sg_next(mdata->rx_sgl);
+		if (mdata->rx_sgl) {
+			trans->rx_dma = sg_dma_address(mdata->rx_sgl);
+			mdata->rx_sgl_len = sg_dma_len(mdata->rx_sgl);
+		}
+	}
+
+	if (!mdata->tx_sgl && !mdata->rx_sgl) {
+		/* spi disable dma */
+		cmd = readl(mdata->base + SPI_CMD_REG);
+		cmd &= ~SPI_CMD_TX_DMA;
+		cmd &= ~SPI_CMD_RX_DMA;
+		writel(cmd, mdata->base + SPI_CMD_REG);
+
+		spi_finalize_current_transfer(master);
+		return IRQ_HANDLED;
+	}
+
+	mtk_spi_update_mdata_len(master);
+	mtk_spi_setup_packet(master);
+	mtk_spi_setup_dma_addr(master, trans);
+	mtk_spi_enable_transfer(master);
+
+	return IRQ_HANDLED;
+}
+
+static int mtk_spi_probe(struct platform_device *pdev)
+{
+	struct spi_master *master;
+	struct mtk_spi *mdata;
+	const struct of_device_id *of_id;
+	struct resource *res;
+	int	irq, ret;
+
+	master = spi_alloc_master(&pdev->dev, sizeof(*mdata));
+	if (!master) {
+		dev_err(&pdev->dev, "failed to alloc spi master\n");
+		return -ENOMEM;
+	}
+
+	master->auto_runtime_pm = true;
+	master->dev.of_node = pdev->dev.of_node;
+	master->mode_bits = SPI_CPOL | SPI_CPHA;
+
+	master->set_cs = mtk_spi_set_cs;
+	master->prepare_transfer_hardware = mtk_spi_prepare_hardware;
+	master->unprepare_transfer_hardware = mtk_spi_unprepare_hardware;
+	master->prepare_message = mtk_spi_prepare_message;
+	master->transfer_one = mtk_spi_transfer_one;
+	master->can_dma = mtk_spi_can_dma;
+
+	of_id = of_match_node(mtk_spi_of_match, pdev->dev.of_node);
+	if (!of_id) {
+		dev_err(&pdev->dev, "failed to probe of_node\n");
+		ret = -EINVAL;
+		goto err_put_master;
+	}
+
+	mdata = spi_master_get_devdata(master);
+	mdata->dev_comp = of_id->data;
+	if (mdata->dev_comp->must_tx)
+		master->flags = SPI_MASTER_MUST_TX;
+
+	if (mdata->dev_comp->need_pad_sel) {
+		ret = of_property_read_u32(pdev->dev.of_node,
+					   "mediatek,pad-select",
+					   &mdata->pad_sel);
+		if (ret) {
+			dev_err(&pdev->dev, "failed to read pad select: %d\n",
+				ret);
+			goto err_put_master;
+		}
+
+		if (mdata->pad_sel > MT8173_SPI_MAX_PAD_SEL) {
+			dev_err(&pdev->dev, "wrong pad-select: %u\n",
+				mdata->pad_sel);
+			ret = -EINVAL;
+			goto err_put_master;
+		}
+	}
+
+	platform_set_drvdata(pdev, master);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		ret = -ENODEV;
+		dev_err(&pdev->dev, "failed to determine base address\n");
+		goto err_put_master;
+	}
+
+	mdata->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(mdata->base)) {
+		ret = PTR_ERR(mdata->base);
+		goto err_put_master;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "failed to get irq (%d)\n", irq);
+		ret = irq;
+		goto err_put_master;
+	}
+
+	if (!pdev->dev.dma_mask)
+		pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
+
+	ret = devm_request_irq(&pdev->dev, irq, mtk_spi_interrupt,
+			       IRQF_TRIGGER_NONE, dev_name(&pdev->dev), master);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register irq (%d)\n", ret);
+		goto err_put_master;
+	}
+
+	mdata->spi_clk = devm_clk_get(&pdev->dev, "spi-clk");
+	if (IS_ERR(mdata->spi_clk)) {
+		ret = PTR_ERR(mdata->spi_clk);
+		dev_err(&pdev->dev, "failed to get spi-clk: %d\n", ret);
+		goto err_put_master;
+	}
+
+	mdata->parent_clk = devm_clk_get(&pdev->dev, "parent-clk");
+	if (IS_ERR(mdata->parent_clk)) {
+		ret = PTR_ERR(mdata->parent_clk);
+		dev_err(&pdev->dev, "failed to get parent-clk: %d\n", ret);
+		goto err_put_master;
+	}
+
+	ret = clk_prepare_enable(mdata->spi_clk);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "failed to enable spi_clk (%d)\n", ret);
+		goto err_put_master;
+	}
+
+	ret = clk_set_parent(mdata->spi_clk, mdata->parent_clk);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "failed to clk_set_parent (%d)\n", ret);
+		goto err_disable_clk;
+	}
+
+	clk_disable_unprepare(mdata->spi_clk);
+
+	pm_runtime_enable(&pdev->dev);
+
+	ret = devm_spi_register_master(&pdev->dev, master);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register master (%d)\n", ret);
+		goto err_put_master;
+	}
+
+	return 0;
+
+err_disable_clk:
+	clk_disable_unprepare(mdata->spi_clk);
+err_put_master:
+	spi_master_put(master);
+
+	return ret;
+}
+
+static int mtk_spi_remove(struct platform_device *pdev)
+{
+	struct spi_master *master = platform_get_drvdata(pdev);
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	pm_runtime_disable(&pdev->dev);
+
+	mtk_spi_reset(mdata);
+	clk_disable_unprepare(mdata->spi_clk);
+	spi_master_put(master);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int mtk_spi_suspend(struct device *dev)
+{
+	int ret;
+	struct spi_master *master = dev_get_drvdata(dev);
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	ret = spi_master_suspend(master);
+	if (ret)
+		return ret;
+
+	if (!pm_runtime_suspended(dev))
+		clk_disable_unprepare(mdata->spi_clk);
+
+	return ret;
+}
+
+static int mtk_spi_resume(struct device *dev)
+{
+	int ret;
+	struct spi_master *master = dev_get_drvdata(dev);
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	if (!pm_runtime_suspended(dev)) {
+		ret = clk_prepare_enable(mdata->spi_clk);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = spi_master_resume(master);
+	if (ret < 0)
+		clk_disable_unprepare(mdata->spi_clk);
+
+	return ret;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+#ifdef CONFIG_PM
+static int mtk_spi_runtime_suspend(struct device *dev)
+{
+	struct spi_master *master = dev_get_drvdata(dev);
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	clk_disable_unprepare(mdata->spi_clk);
+
+	return 0;
+}
+
+static int mtk_spi_runtime_resume(struct device *dev)
+{
+	struct spi_master *master = dev_get_drvdata(dev);
+	struct mtk_spi *mdata = spi_master_get_devdata(master);
+
+	return clk_prepare_enable(mdata->spi_clk);
+}
+#endif /* CONFIG_PM */
+
+static const struct dev_pm_ops mtk_spi_pm = {
+	SET_SYSTEM_SLEEP_PM_OPS(mtk_spi_suspend, mtk_spi_resume)
+	SET_RUNTIME_PM_OPS(mtk_spi_runtime_suspend,
+			   mtk_spi_runtime_resume, NULL)
+};
+
+struct platform_driver mtk_spi_driver = {
+	.driver = {
+		.name = "mtk-spi",
+		.pm	= &mtk_spi_pm,
+		.of_match_table = mtk_spi_of_match,
+	},
+	.probe = mtk_spi_probe,
+	.remove = mtk_spi_remove,
+};
+
+module_platform_driver(mtk_spi_driver);
+
+MODULE_DESCRIPTION("MTK SPI Controller driver");
+MODULE_AUTHOR("Leilk Liu <leilk.liu@mediatek.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform: mtk_spi");
diff --git a/include/linux/platform_data/spi-mt65xx.h b/include/linux/platform_data/spi-mt65xx.h
new file mode 100644
index 000000000000..751225569d27
--- /dev/null
+++ b/include/linux/platform_data/spi-mt65xx.h
@@ -0,0 +1,22 @@
+/*
+ *  MTK SPI bus driver definitions
+ *
+ * Copyright (c) 2015 MediaTek Inc.
+ * Author: Leilk Liu <leilk.liu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef ____LINUX_PLATFORM_DATA_SPI_MTK_H
+#define ____LINUX_PLATFORM_DATA_SPI_MTK_H
+
+/* Board specific platform_data */
+struct mtk_chip_config {
+	u32 tx_mlsb;
+	u32 rx_mlsb;
+	u32 tx_endian;
+	u32 rx_endian;
+};
+#endif
-- 
cgit v1.2.3-70-g09d2


From 3cfe7a74d42b7e3644f8b2b26aa20146d4f90f0f Mon Sep 17 00:00:00 2001
From: Nicolas Boichat <drinkcat@chromium.org>
Date: Wed, 8 Jul 2015 14:30:18 +0800
Subject: regmap: Use different lockdep class for each regmap init call

Lockdep validator complains about recursive locking and deadlock
when two different regmap instances are called in a nested order.
That happens anytime a regmap read/write call needs to access
another regmap.

This is because, for performance reason, lockdep groups all locks
initialized by the same mutex_init() in the same lock class.
Therefore all regmap mutexes are in the same lock class, leading
to lockdep "nested locking" warnings if a regmap accesses another
regmap.

In general, it is impossible to establish in advance the hierarchy
of regmaps, so we make sure that each regmap init call initializes
its own static lock_class_key. This is done by wrapping all
regmap_init calls into macros.

This also allows us to give meaningful names to the lock_class_key.
For example, in rt5677 case, we have in /proc/lockdep_chains:
irq_context: 0
[ffffffc0018d2198] &dev->mutex
[ffffffc0018d2198] &dev->mutex
[ffffffc001bd7f60] rt5677:5104:(&rt5677_regmap)->_lock
[ffffffc001bd7f58] rt5677:5096:(&rt5677_regmap_physical)->_lock
[ffffffc001b95448] &(&base->lock)->rlock

The above would have resulted in a lockdep recursive warning
previously. This is not the case anymore as the lockdep validator
now clearly identifies the 2 regmaps as separate.

Signed-off-by: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-ac97.c |  22 +++--
 drivers/base/regmap/regmap-i2c.c  |  22 +++--
 drivers/base/regmap/regmap-mmio.c |  27 ++++--
 drivers/base/regmap/regmap-spi.c  |  22 +++--
 drivers/base/regmap/regmap-spmi.c |  44 +++++----
 drivers/base/regmap/regmap.c      |  31 +++---
 include/linux/regmap.h            | 192 ++++++++++++++++++++++++++++----------
 7 files changed, 250 insertions(+), 110 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap-ac97.c b/drivers/base/regmap/regmap-ac97.c
index 8d304e2a943d..aa631be8b821 100644
--- a/drivers/base/regmap/regmap-ac97.c
+++ b/drivers/base/regmap/regmap-ac97.c
@@ -87,12 +87,15 @@ static const struct regmap_bus ac97_regmap_bus = {
  * The return value will be an ERR_PTR() on error or a valid pointer to
  * a struct regmap.
  */
-struct regmap *regmap_init_ac97(struct snd_ac97 *ac97,
-				const struct regmap_config *config)
+struct regmap *__regmap_init_ac97(struct snd_ac97 *ac97,
+				  const struct regmap_config *config,
+				  struct lock_class_key *lock_key,
+				  const char *lock_name)
 {
-	return regmap_init(&ac97->dev, &ac97_regmap_bus, ac97, config);
+	return __regmap_init(&ac97->dev, &ac97_regmap_bus, ac97, config,
+			     lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_ac97);
+EXPORT_SYMBOL_GPL(__regmap_init_ac97);
 
 /**
  * devm_regmap_init_ac97(): Initialise AC'97 register map
@@ -104,11 +107,14 @@ EXPORT_SYMBOL_GPL(regmap_init_ac97);
  * to a struct regmap.  The regmap will be automatically freed by the
  * device management code.
  */
-struct regmap *devm_regmap_init_ac97(struct snd_ac97 *ac97,
-				     const struct regmap_config *config)
+struct regmap *__devm_regmap_init_ac97(struct snd_ac97 *ac97,
+				       const struct regmap_config *config,
+				       struct lock_class_key *lock_key,
+				       const char *lock_name)
 {
-	return devm_regmap_init(&ac97->dev, &ac97_regmap_bus, ac97, config);
+	return __devm_regmap_init(&ac97->dev, &ac97_regmap_bus, ac97, config,
+				  lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_ac97);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_ac97);
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/base/regmap/regmap-i2c.c b/drivers/base/regmap/regmap-i2c.c
index 4b76e33110a2..3163b22e2baf 100644
--- a/drivers/base/regmap/regmap-i2c.c
+++ b/drivers/base/regmap/regmap-i2c.c
@@ -242,17 +242,20 @@ static const struct regmap_bus *regmap_get_i2c_bus(struct i2c_client *i2c,
  * The return value will be an ERR_PTR() on error or a valid pointer to
  * a struct regmap.
  */
-struct regmap *regmap_init_i2c(struct i2c_client *i2c,
-			       const struct regmap_config *config)
+struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
+				 const struct regmap_config *config,
+				 struct lock_class_key *lock_key,
+				 const char *lock_name)
 {
 	const struct regmap_bus *bus = regmap_get_i2c_bus(i2c, config);
 
 	if (IS_ERR(bus))
 		return ERR_CAST(bus);
 
-	return regmap_init(&i2c->dev, bus, &i2c->dev, config);
+	return __regmap_init(&i2c->dev, bus, &i2c->dev, config,
+			     lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_i2c);
+EXPORT_SYMBOL_GPL(__regmap_init_i2c);
 
 /**
  * devm_regmap_init_i2c(): Initialise managed register map
@@ -264,16 +267,19 @@ EXPORT_SYMBOL_GPL(regmap_init_i2c);
  * to a struct regmap.  The regmap will be automatically freed by the
  * device management code.
  */
-struct regmap *devm_regmap_init_i2c(struct i2c_client *i2c,
-				    const struct regmap_config *config)
+struct regmap *__devm_regmap_init_i2c(struct i2c_client *i2c,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name)
 {
 	const struct regmap_bus *bus = regmap_get_i2c_bus(i2c, config);
 
 	if (IS_ERR(bus))
 		return ERR_CAST(bus);
 
-	return devm_regmap_init(&i2c->dev, bus, &i2c->dev, config);
+	return __devm_regmap_init(&i2c->dev, bus, &i2c->dev, config,
+				  lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_i2c);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_i2c);
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c
index 04a329a377e9..a1b2b270e4bc 100644
--- a/drivers/base/regmap/regmap-mmio.c
+++ b/drivers/base/regmap/regmap-mmio.c
@@ -307,9 +307,11 @@ err_free:
  * The return value will be an ERR_PTR() on error or a valid pointer to
  * a struct regmap.
  */
-struct regmap *regmap_init_mmio_clk(struct device *dev, const char *clk_id,
-				    void __iomem *regs,
-				    const struct regmap_config *config)
+struct regmap *__regmap_init_mmio_clk(struct device *dev, const char *clk_id,
+				      void __iomem *regs,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name)
 {
 	struct regmap_mmio_context *ctx;
 
@@ -317,9 +319,10 @@ struct regmap *regmap_init_mmio_clk(struct device *dev, const char *clk_id,
 	if (IS_ERR(ctx))
 		return ERR_CAST(ctx);
 
-	return regmap_init(dev, &regmap_mmio, ctx, config);
+	return __regmap_init(dev, &regmap_mmio, ctx, config,
+			     lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_mmio_clk);
+EXPORT_SYMBOL_GPL(__regmap_init_mmio_clk);
 
 /**
  * devm_regmap_init_mmio_clk(): Initialise managed register map with clock
@@ -333,9 +336,12 @@ EXPORT_SYMBOL_GPL(regmap_init_mmio_clk);
  * to a struct regmap.  The regmap will be automatically freed by the
  * device management code.
  */
-struct regmap *devm_regmap_init_mmio_clk(struct device *dev, const char *clk_id,
-					 void __iomem *regs,
-					 const struct regmap_config *config)
+struct regmap *__devm_regmap_init_mmio_clk(struct device *dev,
+					   const char *clk_id,
+					   void __iomem *regs,
+					   const struct regmap_config *config,
+					   struct lock_class_key *lock_key,
+					   const char *lock_name)
 {
 	struct regmap_mmio_context *ctx;
 
@@ -343,8 +349,9 @@ struct regmap *devm_regmap_init_mmio_clk(struct device *dev, const char *clk_id,
 	if (IS_ERR(ctx))
 		return ERR_CAST(ctx);
 
-	return devm_regmap_init(dev, &regmap_mmio, ctx, config);
+	return __devm_regmap_init(dev, &regmap_mmio, ctx, config,
+				  lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_mmio_clk);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_mmio_clk);
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/base/regmap/regmap-spi.c b/drivers/base/regmap/regmap-spi.c
index 53d1148e80a0..4c7850d660d1 100644
--- a/drivers/base/regmap/regmap-spi.c
+++ b/drivers/base/regmap/regmap-spi.c
@@ -122,12 +122,15 @@ static struct regmap_bus regmap_spi = {
  * The return value will be an ERR_PTR() on error or a valid pointer to
  * a struct regmap.
  */
-struct regmap *regmap_init_spi(struct spi_device *spi,
-			       const struct regmap_config *config)
+struct regmap *__regmap_init_spi(struct spi_device *spi,
+				 const struct regmap_config *config,
+				 struct lock_class_key *lock_key,
+				 const char *lock_name)
 {
-	return regmap_init(&spi->dev, &regmap_spi, &spi->dev, config);
+	return __regmap_init(&spi->dev, &regmap_spi, &spi->dev, config,
+			     lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_spi);
+EXPORT_SYMBOL_GPL(__regmap_init_spi);
 
 /**
  * devm_regmap_init_spi(): Initialise register map
@@ -139,11 +142,14 @@ EXPORT_SYMBOL_GPL(regmap_init_spi);
  * to a struct regmap.  The map will be automatically freed by the
  * device management code.
  */
-struct regmap *devm_regmap_init_spi(struct spi_device *spi,
-				    const struct regmap_config *config)
+struct regmap *__devm_regmap_init_spi(struct spi_device *spi,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name)
 {
-	return devm_regmap_init(&spi->dev, &regmap_spi, &spi->dev, config);
+	return __devm_regmap_init(&spi->dev, &regmap_spi, &spi->dev, config,
+				  lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_spi);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_spi);
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c
index d7026dc33388..7f50f5862d39 100644
--- a/drivers/base/regmap/regmap-spmi.c
+++ b/drivers/base/regmap/regmap-spmi.c
@@ -99,12 +99,15 @@ static struct regmap_bus regmap_spmi_base = {
  * The return value will be an ERR_PTR() on error or a valid pointer to
  * a struct regmap.
  */
-struct regmap *regmap_init_spmi_base(struct spmi_device *sdev,
-				     const struct regmap_config *config)
+struct regmap *__regmap_init_spmi_base(struct spmi_device *sdev,
+				       const struct regmap_config *config,
+				       struct lock_class_key *lock_key,
+				       const char *lock_name)
 {
-	return regmap_init(&sdev->dev, &regmap_spmi_base, sdev, config);
+	return __regmap_init(&sdev->dev, &regmap_spmi_base, sdev, config,
+			     lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_spmi_base);
+EXPORT_SYMBOL_GPL(__regmap_init_spmi_base);
 
 /**
  * devm_regmap_init_spmi_base(): Create managed regmap for Base register space
@@ -115,12 +118,15 @@ EXPORT_SYMBOL_GPL(regmap_init_spmi_base);
  * to a struct regmap.  The regmap will be automatically freed by the
  * device management code.
  */
-struct regmap *devm_regmap_init_spmi_base(struct spmi_device *sdev,
-					  const struct regmap_config *config)
+struct regmap *__devm_regmap_init_spmi_base(struct spmi_device *sdev,
+					    const struct regmap_config *config,
+					    struct lock_class_key *lock_key,
+					    const char *lock_name)
 {
-	return devm_regmap_init(&sdev->dev, &regmap_spmi_base, sdev, config);
+	return __devm_regmap_init(&sdev->dev, &regmap_spmi_base, sdev, config,
+				  lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_spmi_base);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_spmi_base);
 
 static int regmap_spmi_ext_read(void *context,
 				const void *reg, size_t reg_size,
@@ -230,12 +236,15 @@ static struct regmap_bus regmap_spmi_ext = {
  * The return value will be an ERR_PTR() on error or a valid pointer to
  * a struct regmap.
  */
-struct regmap *regmap_init_spmi_ext(struct spmi_device *sdev,
-				    const struct regmap_config *config)
+struct regmap *__regmap_init_spmi_ext(struct spmi_device *sdev,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name)
 {
-	return regmap_init(&sdev->dev, &regmap_spmi_ext, sdev, config);
+	return __regmap_init(&sdev->dev, &regmap_spmi_ext, sdev, config,
+			     lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_spmi_ext);
+EXPORT_SYMBOL_GPL(__regmap_init_spmi_ext);
 
 /**
  * devm_regmap_init_spmi_ext(): Create managed regmap for Ext register space
@@ -246,11 +255,14 @@ EXPORT_SYMBOL_GPL(regmap_init_spmi_ext);
  * to a struct regmap.  The regmap will be automatically freed by the
  * device management code.
  */
-struct regmap *devm_regmap_init_spmi_ext(struct spmi_device *sdev,
-				     const struct regmap_config *config)
+struct regmap *__devm_regmap_init_spmi_ext(struct spmi_device *sdev,
+					   const struct regmap_config *config,
+					   struct lock_class_key *lock_key,
+					   const char *lock_name)
 {
-	return devm_regmap_init(&sdev->dev, &regmap_spmi_ext, sdev, config);
+	return __devm_regmap_init(&sdev->dev, &regmap_spmi_ext, sdev, config,
+				  lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_spmi_ext);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_spmi_ext);
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 7111d04f2621..b9fddccd6e06 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -527,10 +527,12 @@ EXPORT_SYMBOL_GPL(regmap_get_val_endian);
  * a struct regmap.  This function should generally not be called
  * directly, it should be called by bus-specific init functions.
  */
-struct regmap *regmap_init(struct device *dev,
-			   const struct regmap_bus *bus,
-			   void *bus_context,
-			   const struct regmap_config *config)
+struct regmap *__regmap_init(struct device *dev,
+			     const struct regmap_bus *bus,
+			     void *bus_context,
+			     const struct regmap_config *config,
+			     struct lock_class_key *lock_key,
+			     const char *lock_name)
 {
 	struct regmap *map;
 	int ret = -EINVAL;
@@ -556,10 +558,14 @@ struct regmap *regmap_init(struct device *dev,
 			spin_lock_init(&map->spinlock);
 			map->lock = regmap_lock_spinlock;
 			map->unlock = regmap_unlock_spinlock;
+			lockdep_set_class_and_name(&map->spinlock,
+						   lock_key, lock_name);
 		} else {
 			mutex_init(&map->mutex);
 			map->lock = regmap_lock_mutex;
 			map->unlock = regmap_unlock_mutex;
+			lockdep_set_class_and_name(&map->mutex,
+						   lock_key, lock_name);
 		}
 		map->lock_arg = map;
 	}
@@ -899,7 +905,7 @@ err_map:
 err:
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(regmap_init);
+EXPORT_SYMBOL_GPL(__regmap_init);
 
 static void devm_regmap_release(struct device *dev, void *res)
 {
@@ -919,10 +925,12 @@ static void devm_regmap_release(struct device *dev, void *res)
  * directly, it should be called by bus-specific init functions.  The
  * map will be automatically freed by the device management code.
  */
-struct regmap *devm_regmap_init(struct device *dev,
-				const struct regmap_bus *bus,
-				void *bus_context,
-				const struct regmap_config *config)
+struct regmap *__devm_regmap_init(struct device *dev,
+				  const struct regmap_bus *bus,
+				  void *bus_context,
+				  const struct regmap_config *config,
+				  struct lock_class_key *lock_key,
+				  const char *lock_name)
 {
 	struct regmap **ptr, *regmap;
 
@@ -930,7 +938,8 @@ struct regmap *devm_regmap_init(struct device *dev,
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	regmap = regmap_init(dev, bus, bus_context, config);
+	regmap = __regmap_init(dev, bus, bus_context, config,
+			       lock_key, lock_name);
 	if (!IS_ERR(regmap)) {
 		*ptr = regmap;
 		devres_add(dev, ptr);
@@ -940,7 +949,7 @@ struct regmap *devm_regmap_init(struct device *dev,
 
 	return regmap;
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init);
+EXPORT_SYMBOL_GPL(__devm_regmap_init);
 
 static void regmap_field_init(struct regmap_field *rm_field,
 	struct regmap *regmap, struct reg_field reg_field)
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 59c55ea0f0b5..5d7027286032 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -17,6 +17,7 @@
 #include <linux/rbtree.h>
 #include <linux/err.h>
 #include <linux/bug.h>
+#include <linux/lockdep.h>
 
 struct module;
 struct device;
@@ -324,46 +325,147 @@ struct regmap_bus {
 	enum regmap_endian val_format_endian_default;
 };
 
-struct regmap *regmap_init(struct device *dev,
-			   const struct regmap_bus *bus,
-			   void *bus_context,
-			   const struct regmap_config *config);
-int regmap_attach_dev(struct device *dev, struct regmap *map,
-				 const struct regmap_config *config);
-struct regmap *regmap_init_i2c(struct i2c_client *i2c,
-			       const struct regmap_config *config);
-struct regmap *regmap_init_spi(struct spi_device *dev,
-			       const struct regmap_config *config);
-struct regmap *regmap_init_spmi_base(struct spmi_device *dev,
-				     const struct regmap_config *config);
-struct regmap *regmap_init_spmi_ext(struct spmi_device *dev,
-				    const struct regmap_config *config);
-struct regmap *regmap_init_mmio_clk(struct device *dev, const char *clk_id,
-				    void __iomem *regs,
-				    const struct regmap_config *config);
-struct regmap *regmap_init_ac97(struct snd_ac97 *ac97,
-				const struct regmap_config *config);
-
-struct regmap *devm_regmap_init(struct device *dev,
-				const struct regmap_bus *bus,
-				void *bus_context,
-				const struct regmap_config *config);
-struct regmap *devm_regmap_init_i2c(struct i2c_client *i2c,
-				    const struct regmap_config *config);
-struct regmap *devm_regmap_init_spi(struct spi_device *dev,
-				    const struct regmap_config *config);
-struct regmap *devm_regmap_init_spmi_base(struct spmi_device *dev,
-					  const struct regmap_config *config);
-struct regmap *devm_regmap_init_spmi_ext(struct spmi_device *dev,
-					 const struct regmap_config *config);
-struct regmap *devm_regmap_init_mmio_clk(struct device *dev, const char *clk_id,
-					 void __iomem *regs,
-					 const struct regmap_config *config);
-struct regmap *devm_regmap_init_ac97(struct snd_ac97 *ac97,
-				     const struct regmap_config *config);
+/*
+ * __regmap_init functions.
+ *
+ * These functions take a lock key and name parameter, and should not be called
+ * directly. Instead, use the regmap_init macros that generate a key and name
+ * for each call.
+ */
+struct regmap *__regmap_init(struct device *dev,
+			     const struct regmap_bus *bus,
+			     void *bus_context,
+			     const struct regmap_config *config,
+			     struct lock_class_key *lock_key,
+			     const char *lock_name);
+struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
+				 const struct regmap_config *config,
+				 struct lock_class_key *lock_key,
+				 const char *lock_name);
+struct regmap *__regmap_init_spi(struct spi_device *dev,
+				 const struct regmap_config *config,
+				 struct lock_class_key *lock_key,
+				 const char *lock_name);
+struct regmap *__regmap_init_spmi_base(struct spmi_device *dev,
+				       const struct regmap_config *config,
+				       struct lock_class_key *lock_key,
+				       const char *lock_name);
+struct regmap *__regmap_init_spmi_ext(struct spmi_device *dev,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name);
+struct regmap *__regmap_init_mmio_clk(struct device *dev, const char *clk_id,
+				      void __iomem *regs,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name);
+struct regmap *__regmap_init_ac97(struct snd_ac97 *ac97,
+				  const struct regmap_config *config,
+				  struct lock_class_key *lock_key,
+				  const char *lock_name);
+
+struct regmap *__devm_regmap_init(struct device *dev,
+				  const struct regmap_bus *bus,
+				  void *bus_context,
+				  const struct regmap_config *config,
+				  struct lock_class_key *lock_key,
+				  const char *lock_name);
+struct regmap *__devm_regmap_init_i2c(struct i2c_client *i2c,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name);
+struct regmap *__devm_regmap_init_spi(struct spi_device *dev,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name);
+struct regmap *__devm_regmap_init_spmi_base(struct spmi_device *dev,
+					    const struct regmap_config *config,
+					    struct lock_class_key *lock_key,
+					    const char *lock_name);
+struct regmap *__devm_regmap_init_spmi_ext(struct spmi_device *dev,
+					   const struct regmap_config *config,
+					   struct lock_class_key *lock_key,
+					   const char *lock_name);
+struct regmap *__devm_regmap_init_mmio_clk(struct device *dev,
+					   const char *clk_id,
+					   void __iomem *regs,
+					   const struct regmap_config *config,
+					   struct lock_class_key *lock_key,
+					   const char *lock_name);
+struct regmap *__devm_regmap_init_ac97(struct snd_ac97 *ac97,
+				       const struct regmap_config *config,
+				       struct lock_class_key *lock_key,
+				       const char *lock_name);
 
+/*
+ * Wrapper for regmap_init macros to include a unique lockdep key and name
+ * for each call. No-op if CONFIG_LOCKDEP is not set.
+ *
+ * @fn: Real function to call (in the form __[*_]regmap_init[_*])
+ * @name: Config variable name (#config in the calling macro)
+ **/
+#ifdef CONFIG_LOCKDEP
+#define __regmap_lockdep_wrapper(fn, name, ...)				\
+(									\
+	({								\
+		static struct lock_class_key _key;			\
+		fn(__VA_ARGS__, &_key,					\
+			KBUILD_BASENAME ":"				\
+			__stringify(__LINE__) ":"			\
+			"(" name ")->lock");				\
+	})								\
+)
+#else
+#define __regmap_lockdep_wrapper(fn, name, ...) fn(__VA_ARGS__, NULL, NULL)
+#endif
+
+#define regmap_init(dev, bus, bus_context, config)			\
+	__regmap_lockdep_wrapper(__regmap_init, #config,		\
+				dev, bus, bus_context, config)
+int regmap_attach_dev(struct device *dev, struct regmap *map,
+		      const struct regmap_config *config);
+#define regmap_init_i2c(i2c, config)					\
+	__regmap_lockdep_wrapper(__regmap_init_i2c, #config,		\
+				i2c, config)
+#define regmap_init_spi(dev, config)					\
+	__regmap_lockdep_wrapper(__regmap_init_spi, #config,		\
+				dev, config)
+#define regmap_init_spmi_base(dev, config)				\
+	__regmap_lockdep_wrapper(__regmap_init_spmi_base, #config,	\
+				dev, config)
+#define regmap_init_spmi_ext(dev, config)				\
+	__regmap_lockdep_wrapper(__regmap_init_spmi_ext, #config,	\
+				dev, config)
+#define regmap_init_mmio_clk(dev, clk_id, regs, config)			\
+	__regmap_lockdep_wrapper(__regmap_init_mmio_clk, #config,	\
+				dev, clk_id, regs, config)
+#define regmap_init_ac97(ac97, config)					\
+	__regmap_lockdep_wrapper(__regmap_init_ac97, #config,		\
+				ac97, config)
 bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 
+#define devm_regmap_init(dev, bus, bus_context, config)			\
+	__regmap_lockdep_wrapper(__devm_regmap_init, #config,		\
+				dev, bus, bus_context, config)
+#define devm_regmap_init_i2c(i2c, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_i2c, #config,	\
+				i2c, config)
+#define devm_regmap_init_spi(dev, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_spi, #config,	\
+				dev, config)
+#define devm_regmap_init_spmi_base(dev, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_spmi_base, #config,	\
+				dev, config)
+#define devm_regmap_init_spmi_ext(dev, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_spmi_ext, #config,	\
+				dev, config)
+#define devm_regmap_init_mmio_clk(dev, clk_id, regs, config)		\
+	__regmap_lockdep_wrapper(__devm_regmap_init_mmio_clk, #config,	\
+				dev, clk_id, regs, config)
+#define devm_regmap_init_ac97(ac97, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_ac97, #config,	\
+				ac97, config)
+
 /**
  * regmap_init_mmio(): Initialise register map
  *
@@ -374,12 +476,8 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
  * The return value will be an ERR_PTR() on error or a valid pointer to
  * a struct regmap.
  */
-static inline struct regmap *regmap_init_mmio(struct device *dev,
-					void __iomem *regs,
-					const struct regmap_config *config)
-{
-	return regmap_init_mmio_clk(dev, NULL, regs, config);
-}
+#define regmap_init_mmio(dev, regs, config)		\
+	regmap_init_mmio_clk(dev, NULL, regs, config)
 
 /**
  * devm_regmap_init_mmio(): Initialise managed register map
@@ -392,12 +490,8 @@ static inline struct regmap *regmap_init_mmio(struct device *dev,
  * to a struct regmap.  The regmap will be automatically freed by the
  * device management code.
  */
-static inline struct regmap *devm_regmap_init_mmio(struct device *dev,
-					void __iomem *regs,
-					const struct regmap_config *config)
-{
-	return devm_regmap_init_mmio_clk(dev, NULL, regs, config);
-}
+#define devm_regmap_init_mmio(dev, regs, config)		\
+	devm_regmap_init_mmio_clk(dev, NULL, regs, config)
 
 void regmap_exit(struct regmap *map);
 int regmap_reinit_cache(struct regmap *map,
-- 
cgit v1.2.3-70-g09d2


From 21c36d35711d24a7689b7fb9606ce78f3b4c3d3b Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Fri, 7 Aug 2015 13:59:16 +0200
Subject: cpufreq-dt: make scaling_boost_freqs sysfs attr available when boost
 is enabled

Make scaling_boost_freqs sysfs attribute is available when
cpufreq-dt driver is used and boost support is enabled.

Suggested-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt.c | 9 ++++++++-
 include/linux/cpufreq.h      | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index b9259abd25d4..c3583cdfadbd 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -36,6 +36,12 @@ struct private_data {
 	unsigned int voltage_tolerance; /* in percentage */
 };
 
+static struct freq_attr *cpufreq_dt_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,   /* Extra space for boost-attr if required */
+	NULL,
+};
+
 static int set_target(struct cpufreq_policy *policy, unsigned int index)
 {
 	struct dev_pm_opp *opp;
@@ -336,6 +342,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 		ret = cpufreq_enable_boost_support();
 		if (ret)
 			goto out_free_cpufreq_table;
+		cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs;
 	}
 
 	policy->cpuinfo.transition_latency = transition_latency;
@@ -411,7 +418,7 @@ static struct cpufreq_driver dt_cpufreq_driver = {
 	.exit = cpufreq_exit,
 	.ready = cpufreq_ready,
 	.name = "cpufreq-dt",
-	.attr = cpufreq_generic_attr,
+	.attr = cpufreq_dt_attr,
 };
 
 static int dt_cpufreq_probe(struct platform_device *pdev)
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 95f018649abf..657542d3e23b 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -609,6 +609,7 @@ struct cpufreq_frequency_table *cpufreq_frequency_get_table(unsigned int cpu);
 
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
+extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
 extern struct freq_attr *cpufreq_generic_attr[];
 int cpufreq_table_validate_and_show(struct cpufreq_policy *policy,
 				      struct cpufreq_frequency_table *table);
-- 
cgit v1.2.3-70-g09d2


From cf184dc2dd33847f4b211b01d8c7ec0526e6c5e4 Mon Sep 17 00:00:00 2001
From: Jaiprakash Singh <b44839@freescale.com>
Date: Wed, 20 May 2015 21:17:11 -0500
Subject: fsl_ifc: Change IO accessor based on endianness

IFC IO accressor are set at run time based
on IFC IP registers endianness.IFC node in
DTS file contains information about
endianness.

Signed-off-by: Jaiprakash Singh <b44839@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Acked-by: Brian Norris <computersforpeace@gmail.com>
---
 .../bindings/memory-controllers/fsl/ifc.txt        |   3 +
 drivers/memory/fsl_ifc.c                           |  43 ++--
 drivers/mtd/nand/fsl_ifc_nand.c                    | 258 +++++++++++----------
 include/linux/fsl_ifc.h                            |  50 ++++
 4 files changed, 213 insertions(+), 141 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/memory-controllers/fsl/ifc.txt b/Documentation/devicetree/bindings/memory-controllers/fsl/ifc.txt
index d5e370450ac0..89427b018ba7 100644
--- a/Documentation/devicetree/bindings/memory-controllers/fsl/ifc.txt
+++ b/Documentation/devicetree/bindings/memory-controllers/fsl/ifc.txt
@@ -18,6 +18,8 @@ Properties:
               interrupt (NAND_EVTER_STAT).  If there is only one,
               that interrupt reports both types of event.
 
+- little-endian : If this property is absent, the big-endian mode will
+                  be in use as default for registers.
 
 - ranges : Each range corresponds to a single chipselect, and covers
            the entire access window as configured.
@@ -34,6 +36,7 @@ Example:
 		#size-cells = <1>;
 		reg = <0x0 0xffe1e000 0 0x2000>;
 		interrupts = <16 2 19 2>;
+		little-endian;
 
 		/* NOR, NAND Flashes and CPLD on board */
 		ranges = <0x0 0x0 0x0 0xee000000 0x02000000
diff --git a/drivers/memory/fsl_ifc.c b/drivers/memory/fsl_ifc.c
index 410c39749872..e87459f6d686 100644
--- a/drivers/memory/fsl_ifc.c
+++ b/drivers/memory/fsl_ifc.c
@@ -62,7 +62,7 @@ int fsl_ifc_find(phys_addr_t addr_base)
 		return -ENODEV;
 
 	for (i = 0; i < fsl_ifc_ctrl_dev->banks; i++) {
-		u32 cspr = in_be32(&fsl_ifc_ctrl_dev->regs->cspr_cs[i].cspr);
+		u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->regs->cspr_cs[i].cspr);
 		if (cspr & CSPR_V && (cspr & CSPR_BA) ==
 				convert_ifc_address(addr_base))
 			return i;
@@ -79,16 +79,16 @@ static int fsl_ifc_ctrl_init(struct fsl_ifc_ctrl *ctrl)
 	/*
 	 * Clear all the common status and event registers
 	 */
-	if (in_be32(&ifc->cm_evter_stat) & IFC_CM_EVTER_STAT_CSER)
-		out_be32(&ifc->cm_evter_stat, IFC_CM_EVTER_STAT_CSER);
+	if (ifc_in32(&ifc->cm_evter_stat) & IFC_CM_EVTER_STAT_CSER)
+		ifc_out32(IFC_CM_EVTER_STAT_CSER, &ifc->cm_evter_stat);
 
 	/* enable all error and events */
-	out_be32(&ifc->cm_evter_en, IFC_CM_EVTER_EN_CSEREN);
+	ifc_out32(IFC_CM_EVTER_EN_CSEREN, &ifc->cm_evter_en);
 
 	/* enable all error and event interrupts */
-	out_be32(&ifc->cm_evter_intr_en, IFC_CM_EVTER_INTR_EN_CSERIREN);
-	out_be32(&ifc->cm_erattr0, 0x0);
-	out_be32(&ifc->cm_erattr1, 0x0);
+	ifc_out32(IFC_CM_EVTER_INTR_EN_CSERIREN, &ifc->cm_evter_intr_en);
+	ifc_out32(0x0, &ifc->cm_erattr0);
+	ifc_out32(0x0, &ifc->cm_erattr1);
 
 	return 0;
 }
@@ -127,9 +127,9 @@ static u32 check_nand_stat(struct fsl_ifc_ctrl *ctrl)
 
 	spin_lock_irqsave(&nand_irq_lock, flags);
 
-	stat = in_be32(&ifc->ifc_nand.nand_evter_stat);
+	stat = ifc_in32(&ifc->ifc_nand.nand_evter_stat);
 	if (stat) {
-		out_be32(&ifc->ifc_nand.nand_evter_stat, stat);
+		ifc_out32(stat, &ifc->ifc_nand.nand_evter_stat);
 		ctrl->nand_stat = stat;
 		wake_up(&ctrl->nand_wait);
 	}
@@ -161,16 +161,16 @@ static irqreturn_t fsl_ifc_ctrl_irq(int irqno, void *data)
 	irqreturn_t ret = IRQ_NONE;
 
 	/* read for chip select error */
-	cs_err = in_be32(&ifc->cm_evter_stat);
+	cs_err = ifc_in32(&ifc->cm_evter_stat);
 	if (cs_err) {
 		dev_err(ctrl->dev, "transaction sent to IFC is not mapped to"
 				"any memory bank 0x%08X\n", cs_err);
 		/* clear the chip select error */
-		out_be32(&ifc->cm_evter_stat, IFC_CM_EVTER_STAT_CSER);
+		ifc_out32(IFC_CM_EVTER_STAT_CSER, &ifc->cm_evter_stat);
 
 		/* read error attribute registers print the error information */
-		status = in_be32(&ifc->cm_erattr0);
-		err_addr = in_be32(&ifc->cm_erattr1);
+		status = ifc_in32(&ifc->cm_erattr0);
+		err_addr = ifc_in32(&ifc->cm_erattr1);
 
 		if (status & IFC_CM_ERATTR0_ERTYP_READ)
 			dev_err(ctrl->dev, "Read transaction error"
@@ -231,6 +231,23 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
 		goto err;
 	}
 
+	version = ifc_in32(&fsl_ifc_ctrl_dev->regs->ifc_rev) &
+			FSL_IFC_VERSION_MASK;
+	banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8;
+	dev_info(&dev->dev, "IFC version %d.%d, %d banks\n",
+		version >> 24, (version >> 16) & 0xf, banks);
+
+	fsl_ifc_ctrl_dev->version = version;
+	fsl_ifc_ctrl_dev->banks = banks;
+
+	if (of_property_read_bool(dev->dev.of_node, "little-endian")) {
+		fsl_ifc_ctrl_dev->little_endian = true;
+		dev_dbg(&dev->dev, "IFC REGISTERS are LITTLE endian\n");
+	} else {
+		fsl_ifc_ctrl_dev->little_endian = false;
+		dev_dbg(&dev->dev, "IFC REGISTERS are BIG endian\n");
+	}
+
 	version = ioread32be(&fsl_ifc_ctrl_dev->regs->ifc_rev) &
 			FSL_IFC_VERSION_MASK;
 	banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8;
diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c
index 51394e59901b..a4e27e891153 100644
--- a/drivers/mtd/nand/fsl_ifc_nand.c
+++ b/drivers/mtd/nand/fsl_ifc_nand.c
@@ -238,8 +238,8 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr, int oob)
 
 	ifc_nand_ctrl->page = page_addr;
 	/* Program ROW0/COL0 */
-	iowrite32be(page_addr, &ifc->ifc_nand.row0);
-	iowrite32be((oob ? IFC_NAND_COL_MS : 0) | column, &ifc->ifc_nand.col0);
+	ifc_out32(page_addr, &ifc->ifc_nand.row0);
+	ifc_out32((oob ? IFC_NAND_COL_MS : 0) | column, &ifc->ifc_nand.col0);
 
 	buf_num = page_addr & priv->bufnum_mask;
 
@@ -301,19 +301,19 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
 	int i;
 
 	/* set the chip select for NAND Transaction */
-	iowrite32be(priv->bank << IFC_NAND_CSEL_SHIFT,
-		    &ifc->ifc_nand.nand_csel);
+	ifc_out32(priv->bank << IFC_NAND_CSEL_SHIFT,
+		  &ifc->ifc_nand.nand_csel);
 
 	dev_vdbg(priv->dev,
 			"%s: fir0=%08x fcr0=%08x\n",
 			__func__,
-			ioread32be(&ifc->ifc_nand.nand_fir0),
-			ioread32be(&ifc->ifc_nand.nand_fcr0));
+			ifc_in32(&ifc->ifc_nand.nand_fir0),
+			ifc_in32(&ifc->ifc_nand.nand_fcr0));
 
 	ctrl->nand_stat = 0;
 
 	/* start read/write seq */
-	iowrite32be(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
+	ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
 
 	/* wait for command complete flag or timeout */
 	wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat,
@@ -336,7 +336,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
 		int sector_end = sector + chip->ecc.steps - 1;
 
 		for (i = sector / 4; i <= sector_end / 4; i++)
-			eccstat[i] = ioread32be(&ifc->ifc_nand.nand_eccstat[i]);
+			eccstat[i] = ifc_in32(&ifc->ifc_nand.nand_eccstat[i]);
 
 		for (i = sector; i <= sector_end; i++) {
 			errors = check_read_ecc(mtd, ctrl, eccstat, i);
@@ -376,33 +376,33 @@ static void fsl_ifc_do_read(struct nand_chip *chip,
 
 	/* Program FIR/IFC_NAND_FCR0 for Small/Large page */
 	if (mtd->writesize > 512) {
-		iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-			    (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
-			    (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
-			    (IFC_FIR_OP_CMD1 << IFC_NAND_FIR0_OP3_SHIFT) |
-			    (IFC_FIR_OP_RBCD << IFC_NAND_FIR0_OP4_SHIFT),
-			    &ifc->ifc_nand.nand_fir0);
-		iowrite32be(0x0, &ifc->ifc_nand.nand_fir1);
-
-		iowrite32be((NAND_CMD_READ0 << IFC_NAND_FCR0_CMD0_SHIFT) |
-			    (NAND_CMD_READSTART << IFC_NAND_FCR0_CMD1_SHIFT),
-			    &ifc->ifc_nand.nand_fcr0);
+		ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+			  (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
+			  (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
+			  (IFC_FIR_OP_CMD1 << IFC_NAND_FIR0_OP3_SHIFT) |
+			  (IFC_FIR_OP_RBCD << IFC_NAND_FIR0_OP4_SHIFT),
+			  &ifc->ifc_nand.nand_fir0);
+		ifc_out32(0x0, &ifc->ifc_nand.nand_fir1);
+
+		ifc_out32((NAND_CMD_READ0 << IFC_NAND_FCR0_CMD0_SHIFT) |
+			  (NAND_CMD_READSTART << IFC_NAND_FCR0_CMD1_SHIFT),
+			  &ifc->ifc_nand.nand_fcr0);
 	} else {
-		iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-			    (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
-			    (IFC_FIR_OP_RA0  << IFC_NAND_FIR0_OP2_SHIFT) |
-			    (IFC_FIR_OP_RBCD << IFC_NAND_FIR0_OP3_SHIFT),
-			    &ifc->ifc_nand.nand_fir0);
-		iowrite32be(0x0, &ifc->ifc_nand.nand_fir1);
+		ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+			  (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
+			  (IFC_FIR_OP_RA0  << IFC_NAND_FIR0_OP2_SHIFT) |
+			  (IFC_FIR_OP_RBCD << IFC_NAND_FIR0_OP3_SHIFT),
+			  &ifc->ifc_nand.nand_fir0);
+		ifc_out32(0x0, &ifc->ifc_nand.nand_fir1);
 
 		if (oob)
-			iowrite32be(NAND_CMD_READOOB <<
-				    IFC_NAND_FCR0_CMD0_SHIFT,
-				    &ifc->ifc_nand.nand_fcr0);
+			ifc_out32(NAND_CMD_READOOB <<
+				  IFC_NAND_FCR0_CMD0_SHIFT,
+				  &ifc->ifc_nand.nand_fcr0);
 		else
-			iowrite32be(NAND_CMD_READ0 <<
-				    IFC_NAND_FCR0_CMD0_SHIFT,
-				    &ifc->ifc_nand.nand_fcr0);
+			ifc_out32(NAND_CMD_READ0 <<
+				  IFC_NAND_FCR0_CMD0_SHIFT,
+				  &ifc->ifc_nand.nand_fcr0);
 	}
 }
 
@@ -422,7 +422,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 	switch (command) {
 	/* READ0 read the entire buffer to use hardware ECC. */
 	case NAND_CMD_READ0:
-		iowrite32be(0, &ifc->ifc_nand.nand_fbcr);
+		ifc_out32(0, &ifc->ifc_nand.nand_fbcr);
 		set_addr(mtd, 0, page_addr, 0);
 
 		ifc_nand_ctrl->read_bytes = mtd->writesize + mtd->oobsize;
@@ -437,7 +437,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 
 	/* READOOB reads only the OOB because no ECC is performed. */
 	case NAND_CMD_READOOB:
-		iowrite32be(mtd->oobsize - column, &ifc->ifc_nand.nand_fbcr);
+		ifc_out32(mtd->oobsize - column, &ifc->ifc_nand.nand_fbcr);
 		set_addr(mtd, column, page_addr, 1);
 
 		ifc_nand_ctrl->read_bytes = mtd->writesize + mtd->oobsize;
@@ -453,19 +453,19 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 		if (command == NAND_CMD_PARAM)
 			timing = IFC_FIR_OP_RBCD;
 
-		iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-			    (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
-			    (timing << IFC_NAND_FIR0_OP2_SHIFT),
-			    &ifc->ifc_nand.nand_fir0);
-		iowrite32be(command << IFC_NAND_FCR0_CMD0_SHIFT,
-			    &ifc->ifc_nand.nand_fcr0);
-		iowrite32be(column, &ifc->ifc_nand.row3);
+		ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+			  (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
+			  (timing << IFC_NAND_FIR0_OP2_SHIFT),
+			  &ifc->ifc_nand.nand_fir0);
+		ifc_out32(command << IFC_NAND_FCR0_CMD0_SHIFT,
+			  &ifc->ifc_nand.nand_fcr0);
+		ifc_out32(column, &ifc->ifc_nand.row3);
 
 		/*
 		 * although currently it's 8 bytes for READID, we always read
 		 * the maximum 256 bytes(for PARAM)
 		 */
-		iowrite32be(256, &ifc->ifc_nand.nand_fbcr);
+		ifc_out32(256, &ifc->ifc_nand.nand_fbcr);
 		ifc_nand_ctrl->read_bytes = 256;
 
 		set_addr(mtd, 0, 0, 0);
@@ -480,16 +480,16 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 
 	/* ERASE2 uses the block and page address from ERASE1 */
 	case NAND_CMD_ERASE2:
-		iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-			    (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP1_SHIFT) |
-			    (IFC_FIR_OP_CMD1 << IFC_NAND_FIR0_OP2_SHIFT),
-			    &ifc->ifc_nand.nand_fir0);
+		ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+			  (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP1_SHIFT) |
+			  (IFC_FIR_OP_CMD1 << IFC_NAND_FIR0_OP2_SHIFT),
+			  &ifc->ifc_nand.nand_fir0);
 
-		iowrite32be((NAND_CMD_ERASE1 << IFC_NAND_FCR0_CMD0_SHIFT) |
-			    (NAND_CMD_ERASE2 << IFC_NAND_FCR0_CMD1_SHIFT),
-			    &ifc->ifc_nand.nand_fcr0);
+		ifc_out32((NAND_CMD_ERASE1 << IFC_NAND_FCR0_CMD0_SHIFT) |
+			  (NAND_CMD_ERASE2 << IFC_NAND_FCR0_CMD1_SHIFT),
+			  &ifc->ifc_nand.nand_fcr0);
 
-		iowrite32be(0, &ifc->ifc_nand.nand_fbcr);
+		ifc_out32(0, &ifc->ifc_nand.nand_fbcr);
 		ifc_nand_ctrl->read_bytes = 0;
 		fsl_ifc_run_command(mtd);
 		return;
@@ -506,19 +506,18 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 				(NAND_CMD_STATUS << IFC_NAND_FCR0_CMD1_SHIFT) |
 				(NAND_CMD_PAGEPROG << IFC_NAND_FCR0_CMD2_SHIFT);
 
-			iowrite32be(
-				 (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-				 (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
-				 (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
-				 (IFC_FIR_OP_WBCD  << IFC_NAND_FIR0_OP3_SHIFT) |
-				 (IFC_FIR_OP_CMD2 << IFC_NAND_FIR0_OP4_SHIFT),
-				 &ifc->ifc_nand.nand_fir0);
-			iowrite32be(
-				 (IFC_FIR_OP_CW1 << IFC_NAND_FIR1_OP5_SHIFT) |
-				 (IFC_FIR_OP_RDSTAT <<
-					IFC_NAND_FIR1_OP6_SHIFT) |
-				 (IFC_FIR_OP_NOP << IFC_NAND_FIR1_OP7_SHIFT),
-				 &ifc->ifc_nand.nand_fir1);
+			ifc_out32(
+				(IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+				(IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
+				(IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
+				(IFC_FIR_OP_WBCD << IFC_NAND_FIR0_OP3_SHIFT) |
+				(IFC_FIR_OP_CMD2 << IFC_NAND_FIR0_OP4_SHIFT),
+				&ifc->ifc_nand.nand_fir0);
+			ifc_out32(
+				(IFC_FIR_OP_CW1 << IFC_NAND_FIR1_OP5_SHIFT) |
+				(IFC_FIR_OP_RDSTAT << IFC_NAND_FIR1_OP6_SHIFT) |
+				(IFC_FIR_OP_NOP << IFC_NAND_FIR1_OP7_SHIFT),
+				&ifc->ifc_nand.nand_fir1);
 		} else {
 			nand_fcr0 = ((NAND_CMD_PAGEPROG <<
 					IFC_NAND_FCR0_CMD1_SHIFT) |
@@ -527,20 +526,19 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 				    (NAND_CMD_STATUS <<
 					IFC_NAND_FCR0_CMD3_SHIFT));
 
-			iowrite32be(
+			ifc_out32(
 				(IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
 				(IFC_FIR_OP_CMD2 << IFC_NAND_FIR0_OP1_SHIFT) |
 				(IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP2_SHIFT) |
 				(IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP3_SHIFT) |
 				(IFC_FIR_OP_WBCD << IFC_NAND_FIR0_OP4_SHIFT),
 				&ifc->ifc_nand.nand_fir0);
-			iowrite32be(
-				 (IFC_FIR_OP_CMD1 << IFC_NAND_FIR1_OP5_SHIFT) |
-				 (IFC_FIR_OP_CW3 << IFC_NAND_FIR1_OP6_SHIFT) |
-				 (IFC_FIR_OP_RDSTAT <<
-					IFC_NAND_FIR1_OP7_SHIFT) |
-				 (IFC_FIR_OP_NOP << IFC_NAND_FIR1_OP8_SHIFT),
-				  &ifc->ifc_nand.nand_fir1);
+			ifc_out32(
+				(IFC_FIR_OP_CMD1 << IFC_NAND_FIR1_OP5_SHIFT) |
+				(IFC_FIR_OP_CW3 << IFC_NAND_FIR1_OP6_SHIFT) |
+				(IFC_FIR_OP_RDSTAT << IFC_NAND_FIR1_OP7_SHIFT) |
+				(IFC_FIR_OP_NOP << IFC_NAND_FIR1_OP8_SHIFT),
+				&ifc->ifc_nand.nand_fir1);
 
 			if (column >= mtd->writesize)
 				nand_fcr0 |=
@@ -555,7 +553,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 			column -= mtd->writesize;
 			ifc_nand_ctrl->oob = 1;
 		}
-		iowrite32be(nand_fcr0, &ifc->ifc_nand.nand_fcr0);
+		ifc_out32(nand_fcr0, &ifc->ifc_nand.nand_fcr0);
 		set_addr(mtd, column, page_addr, ifc_nand_ctrl->oob);
 		return;
 	}
@@ -563,24 +561,26 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 	/* PAGEPROG reuses all of the setup from SEQIN and adds the length */
 	case NAND_CMD_PAGEPROG: {
 		if (ifc_nand_ctrl->oob) {
-			iowrite32be(ifc_nand_ctrl->index -
-				    ifc_nand_ctrl->column,
-				    &ifc->ifc_nand.nand_fbcr);
+			ifc_out32(ifc_nand_ctrl->index -
+				  ifc_nand_ctrl->column,
+				  &ifc->ifc_nand.nand_fbcr);
 		} else {
-			iowrite32be(0, &ifc->ifc_nand.nand_fbcr);
+			ifc_out32(0, &ifc->ifc_nand.nand_fbcr);
 		}
 
 		fsl_ifc_run_command(mtd);
 		return;
 	}
 
-	case NAND_CMD_STATUS:
-		iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-			    (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP1_SHIFT),
-			    &ifc->ifc_nand.nand_fir0);
-		iowrite32be(NAND_CMD_STATUS << IFC_NAND_FCR0_CMD0_SHIFT,
-			    &ifc->ifc_nand.nand_fcr0);
-		iowrite32be(1, &ifc->ifc_nand.nand_fbcr);
+	case NAND_CMD_STATUS: {
+		void __iomem *addr;
+
+		ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+			  (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP1_SHIFT),
+			  &ifc->ifc_nand.nand_fir0);
+		ifc_out32(NAND_CMD_STATUS << IFC_NAND_FCR0_CMD0_SHIFT,
+			  &ifc->ifc_nand.nand_fcr0);
+		ifc_out32(1, &ifc->ifc_nand.nand_fbcr);
 		set_addr(mtd, 0, 0, 0);
 		ifc_nand_ctrl->read_bytes = 1;
 
@@ -590,17 +590,19 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 		 * The chip always seems to report that it is
 		 * write-protected, even when it is not.
 		 */
+		addr = ifc_nand_ctrl->addr;
 		if (chip->options & NAND_BUSWIDTH_16)
-			setbits16(ifc_nand_ctrl->addr, NAND_STATUS_WP);
+			ifc_out16(ifc_in16(addr) | (NAND_STATUS_WP), addr);
 		else
-			setbits8(ifc_nand_ctrl->addr, NAND_STATUS_WP);
+			ifc_out8(ifc_in8(addr) | (NAND_STATUS_WP), addr);
 		return;
+	}
 
 	case NAND_CMD_RESET:
-		iowrite32be(IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT,
-			    &ifc->ifc_nand.nand_fir0);
-		iowrite32be(NAND_CMD_RESET << IFC_NAND_FCR0_CMD0_SHIFT,
-			    &ifc->ifc_nand.nand_fcr0);
+		ifc_out32(IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT,
+			  &ifc->ifc_nand.nand_fir0);
+		ifc_out32(NAND_CMD_RESET << IFC_NAND_FCR0_CMD0_SHIFT,
+			  &ifc->ifc_nand.nand_fcr0);
 		fsl_ifc_run_command(mtd);
 		return;
 
@@ -658,7 +660,7 @@ static uint8_t fsl_ifc_read_byte(struct mtd_info *mtd)
 	 */
 	if (ifc_nand_ctrl->index < ifc_nand_ctrl->read_bytes) {
 		offset = ifc_nand_ctrl->index++;
-		return in_8(ifc_nand_ctrl->addr + offset);
+		return ifc_in8(ifc_nand_ctrl->addr + offset);
 	}
 
 	dev_err(priv->dev, "%s: beyond end of buffer\n", __func__);
@@ -680,7 +682,7 @@ static uint8_t fsl_ifc_read_byte16(struct mtd_info *mtd)
 	 * next byte.
 	 */
 	if (ifc_nand_ctrl->index < ifc_nand_ctrl->read_bytes) {
-		data = in_be16(ifc_nand_ctrl->addr + ifc_nand_ctrl->index);
+		data = ifc_in16(ifc_nand_ctrl->addr + ifc_nand_ctrl->index);
 		ifc_nand_ctrl->index += 2;
 		return (uint8_t) data;
 	}
@@ -726,18 +728,18 @@ static int fsl_ifc_wait(struct mtd_info *mtd, struct nand_chip *chip)
 	u32 nand_fsr;
 
 	/* Use READ_STATUS command, but wait for the device to be ready */
-	iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-		    (IFC_FIR_OP_RDSTAT << IFC_NAND_FIR0_OP1_SHIFT),
-		    &ifc->ifc_nand.nand_fir0);
-	iowrite32be(NAND_CMD_STATUS << IFC_NAND_FCR0_CMD0_SHIFT,
-		    &ifc->ifc_nand.nand_fcr0);
-	iowrite32be(1, &ifc->ifc_nand.nand_fbcr);
+	ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+		  (IFC_FIR_OP_RDSTAT << IFC_NAND_FIR0_OP1_SHIFT),
+		  &ifc->ifc_nand.nand_fir0);
+	ifc_out32(NAND_CMD_STATUS << IFC_NAND_FCR0_CMD0_SHIFT,
+		  &ifc->ifc_nand.nand_fcr0);
+	ifc_out32(1, &ifc->ifc_nand.nand_fbcr);
 	set_addr(mtd, 0, 0, 0);
 	ifc_nand_ctrl->read_bytes = 1;
 
 	fsl_ifc_run_command(mtd);
 
-	nand_fsr = ioread32be(&ifc->ifc_nand.nand_fsr);
+	nand_fsr = ifc_in32(&ifc->ifc_nand.nand_fsr);
 
 	/*
 	 * The chip always seems to report that it is
@@ -829,34 +831,34 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
 	uint32_t cs = priv->bank;
 
 	/* Save CSOR and CSOR_ext */
-	csor = ioread32be(&ifc->csor_cs[cs].csor);
-	csor_ext = ioread32be(&ifc->csor_cs[cs].csor_ext);
+	csor = ifc_in32(&ifc->csor_cs[cs].csor);
+	csor_ext = ifc_in32(&ifc->csor_cs[cs].csor_ext);
 
 	/* chage PageSize 8K and SpareSize 1K*/
 	csor_8k = (csor & ~(CSOR_NAND_PGS_MASK)) | 0x0018C000;
-	iowrite32be(csor_8k, &ifc->csor_cs[cs].csor);
-	iowrite32be(0x0000400, &ifc->csor_cs[cs].csor_ext);
+	ifc_out32(csor_8k, &ifc->csor_cs[cs].csor);
+	ifc_out32(0x0000400, &ifc->csor_cs[cs].csor_ext);
 
 	/* READID */
-	iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-		    (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
-		    (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
-		    &ifc->ifc_nand.nand_fir0);
-	iowrite32be(NAND_CMD_READID << IFC_NAND_FCR0_CMD0_SHIFT,
-		    &ifc->ifc_nand.nand_fcr0);
-	iowrite32be(0x0, &ifc->ifc_nand.row3);
+	ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+		  (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
+		  (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
+		  &ifc->ifc_nand.nand_fir0);
+	ifc_out32(NAND_CMD_READID << IFC_NAND_FCR0_CMD0_SHIFT,
+		  &ifc->ifc_nand.nand_fcr0);
+	ifc_out32(0x0, &ifc->ifc_nand.row3);
 
-	iowrite32be(0x0, &ifc->ifc_nand.nand_fbcr);
+	ifc_out32(0x0, &ifc->ifc_nand.nand_fbcr);
 
 	/* Program ROW0/COL0 */
-	iowrite32be(0x0, &ifc->ifc_nand.row0);
-	iowrite32be(0x0, &ifc->ifc_nand.col0);
+	ifc_out32(0x0, &ifc->ifc_nand.row0);
+	ifc_out32(0x0, &ifc->ifc_nand.col0);
 
 	/* set the chip select for NAND Transaction */
-	iowrite32be(cs << IFC_NAND_CSEL_SHIFT, &ifc->ifc_nand.nand_csel);
+	ifc_out32(cs << IFC_NAND_CSEL_SHIFT, &ifc->ifc_nand.nand_csel);
 
 	/* start read seq */
-	iowrite32be(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
+	ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
 
 	/* wait for command complete flag or timeout */
 	wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat,
@@ -866,8 +868,8 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
 		printk(KERN_ERR "fsl-ifc: Failed to Initialise SRAM\n");
 
 	/* Restore CSOR and CSOR_ext */
-	iowrite32be(csor, &ifc->csor_cs[cs].csor);
-	iowrite32be(csor_ext, &ifc->csor_cs[cs].csor_ext);
+	ifc_out32(csor, &ifc->csor_cs[cs].csor);
+	ifc_out32(csor_ext, &ifc->csor_cs[cs].csor_ext);
 }
 
 static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
@@ -884,7 +886,7 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 
 	/* fill in nand_chip structure */
 	/* set up function call table */
-	if ((ioread32be(&ifc->cspr_cs[priv->bank].cspr)) & CSPR_PORT_SIZE_16)
+	if ((ifc_in32(&ifc->cspr_cs[priv->bank].cspr)) & CSPR_PORT_SIZE_16)
 		chip->read_byte = fsl_ifc_read_byte16;
 	else
 		chip->read_byte = fsl_ifc_read_byte;
@@ -898,13 +900,13 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 	chip->bbt_td = &bbt_main_descr;
 	chip->bbt_md = &bbt_mirror_descr;
 
-	iowrite32be(0x0, &ifc->ifc_nand.ncfgr);
+	ifc_out32(0x0, &ifc->ifc_nand.ncfgr);
 
 	/* set up nand options */
 	chip->bbt_options = NAND_BBT_USE_FLASH;
 	chip->options = NAND_NO_SUBPAGE_WRITE;
 
-	if (ioread32be(&ifc->cspr_cs[priv->bank].cspr) & CSPR_PORT_SIZE_16) {
+	if (ifc_in32(&ifc->cspr_cs[priv->bank].cspr) & CSPR_PORT_SIZE_16) {
 		chip->read_byte = fsl_ifc_read_byte16;
 		chip->options |= NAND_BUSWIDTH_16;
 	} else {
@@ -917,7 +919,7 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 	chip->ecc.read_page = fsl_ifc_read_page;
 	chip->ecc.write_page = fsl_ifc_write_page;
 
-	csor = ioread32be(&ifc->csor_cs[priv->bank].csor);
+	csor = ifc_in32(&ifc->csor_cs[priv->bank].csor);
 
 	/* Hardware generates ECC per 512 Bytes */
 	chip->ecc.size = 512;
@@ -1006,7 +1008,7 @@ static int fsl_ifc_chip_remove(struct fsl_ifc_mtd *priv)
 static int match_bank(struct fsl_ifc_regs __iomem *ifc, int bank,
 		      phys_addr_t addr)
 {
-	u32 cspr = ioread32be(&ifc->cspr_cs[bank].cspr);
+	u32 cspr = ifc_in32(&ifc->cspr_cs[bank].cspr);
 
 	if (!(cspr & CSPR_V))
 		return 0;
@@ -1092,16 +1094,16 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
 
 	dev_set_drvdata(priv->dev, priv);
 
-	iowrite32be(IFC_NAND_EVTER_EN_OPC_EN |
-		    IFC_NAND_EVTER_EN_FTOER_EN |
-		    IFC_NAND_EVTER_EN_WPER_EN,
-		    &ifc->ifc_nand.nand_evter_en);
+	ifc_out32(IFC_NAND_EVTER_EN_OPC_EN |
+		  IFC_NAND_EVTER_EN_FTOER_EN |
+		  IFC_NAND_EVTER_EN_WPER_EN,
+		  &ifc->ifc_nand.nand_evter_en);
 
 	/* enable NAND Machine Interrupts */
-	iowrite32be(IFC_NAND_EVTER_INTR_OPCIR_EN |
-		    IFC_NAND_EVTER_INTR_FTOERIR_EN |
-		    IFC_NAND_EVTER_INTR_WPERIR_EN,
-		    &ifc->ifc_nand.nand_evter_intr_en);
+	ifc_out32(IFC_NAND_EVTER_INTR_OPCIR_EN |
+		  IFC_NAND_EVTER_INTR_FTOERIR_EN |
+		  IFC_NAND_EVTER_INTR_WPERIR_EN,
+		  &ifc->ifc_nand.nand_evter_intr_en);
 	priv->mtd.name = kasprintf(GFP_KERNEL, "%llx.flash", (u64)res.start);
 	if (!priv->mtd.name) {
 		ret = -ENOMEM;
diff --git a/include/linux/fsl_ifc.h b/include/linux/fsl_ifc.h
index bf0321eabbda..0023088b253b 100644
--- a/include/linux/fsl_ifc.h
+++ b/include/linux/fsl_ifc.h
@@ -841,9 +841,59 @@ struct fsl_ifc_ctrl {
 
 	u32 nand_stat;
 	wait_queue_head_t nand_wait;
+	bool little_endian;
 };
 
 extern struct fsl_ifc_ctrl *fsl_ifc_ctrl_dev;
 
+static inline u32 ifc_in32(void __iomem *addr)
+{
+	u32 val;
+
+	if (fsl_ifc_ctrl_dev->little_endian)
+		val = ioread32(addr);
+	else
+		val = ioread32be(addr);
+
+	return val;
+}
+
+static inline u16 ifc_in16(void __iomem *addr)
+{
+	u16 val;
+
+	if (fsl_ifc_ctrl_dev->little_endian)
+		val = ioread16(addr);
+	else
+		val = ioread16be(addr);
+
+	return val;
+}
+
+static inline u8 ifc_in8(void __iomem *addr)
+{
+	return ioread8(addr);
+}
+
+static inline void ifc_out32(u32 val, void __iomem *addr)
+{
+	if (fsl_ifc_ctrl_dev->little_endian)
+		iowrite32(val, addr);
+	else
+		iowrite32be(val, addr);
+}
+
+static inline void ifc_out16(u16 val, void __iomem *addr)
+{
+	if (fsl_ifc_ctrl_dev->little_endian)
+		iowrite16(val, addr);
+	else
+		iowrite16be(val, addr);
+}
+
+static inline void ifc_out8(u8 val, void __iomem *addr)
+{
+	iowrite8(val, addr);
+}
 
 #endif /* __ASM_FSL_IFC_H */
-- 
cgit v1.2.3-70-g09d2


From ad30bad3a5474f2585a7f2a35d4705c7f85210f3 Mon Sep 17 00:00:00 2001
From: Pengyu Ma <pengyu.ma@windriver.com>
Date: Tue, 4 Aug 2015 16:32:18 +0800
Subject: iio: declare struct to fix warning

When compile iio related driver the following warning shown:

include/linux/iio/trigger.h:35:34: warning: 'struct iio_trigger'
declared inside parameter list
  int (*set_trigger_state)(struct iio_trigger *trig, bool state);

include/linux/iio/trigger.h:38:18: warning: 'struct iio_dev'
declared inside parameter list
           struct iio_dev *indio_dev);

'struct iio_dev' and 'struct iio_trigger' was used before declaration,
forward declaration for these structs to fix warning.

Signed-off-by: Pengyu Ma <pengyu.ma@windriver.com>
Acked-by: Daniel Baluta <daniel.baluta@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/trigger.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h
index fa76c79a52a1..1c9e028e0d4a 100644
--- a/include/linux/iio/trigger.h
+++ b/include/linux/iio/trigger.h
@@ -18,6 +18,9 @@ struct iio_subirq {
 	bool enabled;
 };
 
+struct iio_dev;
+struct iio_trigger;
+
 /**
  * struct iio_trigger_ops - operations structure for an iio_trigger.
  * @owner:		used to monitor usage count of the trigger.
-- 
cgit v1.2.3-70-g09d2


From c689a923c867eac40ed3826c1d9328edea8b6bc7 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Wed, 5 Aug 2015 15:38:14 +0200
Subject: iio: Add inverse unit conversion macros

Add inverse unit conversion macro to convert from standard IIO units to
units that might be used by some devices.

Those are useful in combination with scale factors that are specified as
IIO_VAL_FRACTIONAL. Typically the denominator for those specifications will
contain the maximum raw value the sensor will generate and the numerator
the value it maps to in a specific unit. Sometimes datasheets specify those
in different units than the standard IIO units (e.g. degree/s instead of
rad/s) and so we need to do a unit conversion.

From a mathematical point of view it does not make a difference whether we
apply the unit conversion to the numerator or the inverse unit conversion
to the denominator since (x / y) / z = x / (y * z). But as the denominator
is typically a larger value and we are rounding both the numerator and
denominator to integer values using the later method gives us a better
precision (E.g. the relative error is smaller if we round 8000.3 to 8000
rather than rounding 8.3 to 8).

This is where in inverse unit conversion macros will be used.

Marked for stable as used by some upcoming fixes.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Cc: <Stable@vger.kernel.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/iio.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index f79148261d16..7bb7f673cb3f 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -644,6 +644,15 @@ int iio_str_to_fixpoint(const char *str, int fract_mult, int *integer,
  */
 #define IIO_DEGREE_TO_RAD(deg) (((deg) * 314159ULL + 9000000ULL) / 18000000ULL)
 
+/**
+ * IIO_RAD_TO_DEGREE() - Convert rad to degree
+ * @rad: A value in rad
+ *
+ * Returns the given value converted from rad to degree
+ */
+#define IIO_RAD_TO_DEGREE(rad) \
+	(((rad) * 18000000ULL + 314159ULL / 2) / 314159ULL)
+
 /**
  * IIO_G_TO_M_S_2() - Convert g to meter / second**2
  * @g: A value in g
@@ -652,4 +661,12 @@ int iio_str_to_fixpoint(const char *str, int fract_mult, int *integer,
  */
 #define IIO_G_TO_M_S_2(g) ((g) * 980665ULL / 100000ULL)
 
+/**
+ * IIO_M_S_2_TO_G() - Convert meter / second**2 to g
+ * @ms2: A value in meter / second**2
+ *
+ * Returns the given value converted from meter / second**2 to g
+ */
+#define IIO_M_S_2_TO_G(ms2) (((ms2) * 100000ULL + 980665ULL / 2) / 980665ULL)
+
 #endif /* _INDUSTRIAL_IO_H_ */
-- 
cgit v1.2.3-70-g09d2


From 7e34d70a7163b236f520ef4fc0d7c50093dd3746 Mon Sep 17 00:00:00 2001
From: Tal Shorer <tal.shorer@gmail.com>
Date: Fri, 7 Aug 2015 22:35:51 +0300
Subject: usb: hcd.h: Fix the values of SetHubDepth and GetPortErrorCount to
 match USB 3.1 specification

>From the usb 3.1 spec available at http://www.usb.org/developers/docs/
table 10-7 (Hub Class Requests) specifies the values for SetHubDepth and
GetPortErrorCount as:

Request			bmRequestType	bRequest		wValue		wIndex	wLength	Data
SetHubDepth		00100000B	SET_HUB_DEPTH		Hub Depth	Zero	Zero	None
GetPortErrorCount	10100011B	GET_PORT_ERR_COUNT	Zero		Port	Two	Number of Link Errors on this port

Fix these two values to match the spec.

Signed-off-by: Tal Shorer <tal.shorer@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/hcd.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index c9aa7792de10..d2784c10bfe2 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -564,9 +564,9 @@ extern void usb_ep0_reinit(struct usb_device *);
 
 /*-------------------------------------------------------------------------*/
 
-/* class requests from USB 3.0 hub spec, table 10-5 */
-#define SetHubDepth		(0x3000 | HUB_SET_DEPTH)
-#define GetPortErrorCount	(0x8000 | HUB_GET_PORT_ERR_COUNT)
+/* class requests from USB 3.1 hub spec, table 10-7 */
+#define SetHubDepth		(0x2000 | HUB_SET_DEPTH)
+#define GetPortErrorCount	(0xa300 | HUB_GET_PORT_ERR_COUNT)
 
 /*
  * Generic bandwidth allocation constants/support
-- 
cgit v1.2.3-70-g09d2


From 85a77ff0160ab0a70eb4e8b14200e29b4d35c355 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Thu, 11 Jun 2015 14:58:47 +0900
Subject: extcon: Remove duplicate header file in extcon.h

This patch removes the duplicate header file in extcon.h.

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 include/linux/extcon.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index b16d929fa75f..1656c98175f5 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -27,8 +27,6 @@
 #define __LINUX_EXTCON_H__
 
 #include <linux/device.h>
-#include <linux/notifier.h>
-#include <linux/sysfs.h>
 
 /*
  * Define the unique id of supported external connectors
-- 
cgit v1.2.3-70-g09d2


From 2519b7650e99d90643a7a20d623513de9c95a817 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Thu, 11 Jun 2015 20:17:02 +0900
Subject: extcon: Remove optional print_state() function pointer of struct
 extcon_dev

This patch removes the optional print_state() function pointer which included
in 'struct extcon_dev' because the extcon must maintain the consistent name
of extcon device on sysfs instead of inconsistent state of external connectors.

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/extcon-gpio.c | 18 ------------------
 drivers/extcon/extcon.c      |  8 --------
 include/linux/extcon.h       |  5 -----
 3 files changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-gpio.c b/drivers/extcon/extcon-gpio.c
index 355459a54e8b..57c24fa52edb 100644
--- a/drivers/extcon/extcon-gpio.c
+++ b/drivers/extcon/extcon-gpio.c
@@ -65,22 +65,6 @@ static irqreturn_t gpio_irq_handler(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static ssize_t extcon_gpio_print_state(struct extcon_dev *edev, char *buf)
-{
-	struct device *dev = edev->dev.parent;
-	struct gpio_extcon_data *extcon_data = dev_get_drvdata(dev);
-	const char *state;
-
-	if (extcon_get_state(edev))
-		state = extcon_data->state_on;
-	else
-		state = extcon_data->state_off;
-
-	if (state)
-		return sprintf(buf, "%s\n", state);
-	return -EINVAL;
-}
-
 static int gpio_extcon_probe(struct platform_device *pdev)
 {
 	struct gpio_extcon_platform_data *pdata = dev_get_platdata(&pdev->dev);
@@ -110,8 +94,6 @@ static int gpio_extcon_probe(struct platform_device *pdev)
 	extcon_data->state_on = pdata->state_on;
 	extcon_data->state_off = pdata->state_off;
 	extcon_data->check_on_resume = pdata->check_on_resume;
-	if (pdata->state_on && pdata->state_off)
-		extcon_data->edev->print_state = extcon_gpio_print_state;
 
 	ret = devm_gpio_request_one(&pdev->dev, extcon_data->gpio, GPIOF_DIR_IN,
 				    pdev->name);
diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index 43b57b02d050..d1fb5b4d024a 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -172,14 +172,6 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 	int i, count = 0;
 	struct extcon_dev *edev = dev_get_drvdata(dev);
 
-	if (edev->print_state) {
-		int ret = edev->print_state(edev, buf);
-
-		if (ret >= 0)
-			return ret;
-		/* Use default if failed */
-	}
-
 	if (edev->max_supported == 0)
 		return sprintf(buf, "%u\n", edev->state);
 
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 1656c98175f5..c0f8c4fc5d45 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -75,8 +75,6 @@ struct extcon_cable;
  *			be attached simulataneously. {0x7, 0} is equivalent to
  *			{0x3, 0x6, 0x5, 0}. If it is {0xFFFFFFFF, 0}, there
  *			can be no simultaneous connections.
- * @print_state:	An optional callback to override the method to print the
- *			status of the extcon device.
  * @dev:		Device of this extcon.
  * @state:		Attach/detach state of this extcon. Do not provide at
  *			register-time.
@@ -100,9 +98,6 @@ struct extcon_dev {
 	const unsigned int *supported_cable;
 	const u32 *mutually_exclusive;
 
-	/* Optional callbacks to override class functions */
-	ssize_t	(*print_state)(struct extcon_dev *edev, char *buf);
-
 	/* Internal data. Please do not set. */
 	struct device dev;
 	struct raw_notifier_head *nh;
-- 
cgit v1.2.3-70-g09d2


From ffe8690c85b8426db7783064724d106702f1b1e8 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Thu, 6 Aug 2015 07:02:32 +0000
Subject: perf: add the necessary core perf APIs when accessing events counters
 in eBPF programs

This patch add three core perf APIs:
 - perf_event_attrs(): export the struct perf_event_attr from struct
   perf_event;
 - perf_event_get(): get the struct perf_event from the given fd;
 - perf_event_read_local(): read the events counters active on the
   current CPU;
These APIs are needed when accessing events counters in eBPF programs.

The API perf_event_read_local() comes from Peter and I add the
corresponding SOB.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h | 10 ++++++
 kernel/events/core.c       | 78 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809433b3..092a0e8a479a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
+extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -659,6 +661,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
 				int src_cpu, int dst_cpu);
+extern u64 perf_event_read_local(struct perf_event *event);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
@@ -979,6 +982,12 @@ static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
+static inline struct perf_event *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
+static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+	return ERR_PTR(-EINVAL);
+}
+static inline u64 perf_event_read_local(struct perf_event *event)	{ return -EINVAL; }
 static inline void perf_event_print_debug(void)				{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
@@ -1011,6 +1020,7 @@ static inline void perf_event_enable(struct perf_event *event)		{ }
 static inline void perf_event_disable(struct perf_event *event)		{ }
 static inline int __perf_event_disable(void *info)			{ return -1; }
 static inline void perf_event_task_tick(void)				{ }
+static inline int perf_event_release_kernel(struct perf_event *event)	{ return 0; }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..e2c6a8886d4d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3212,6 +3212,59 @@ static inline u64 perf_event_count(struct perf_event *event)
 	return __perf_event_count(event);
 }
 
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ *   - either for the current task, or for this CPU
+ *   - does not have inherit set, for inherited task events
+ *     will not be local and we cannot read them atomically
+ *   - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+	unsigned long flags;
+	u64 val;
+
+	/*
+	 * Disabling interrupts avoids all counter scheduling (context
+	 * switches, timer based rotation and IPIs).
+	 */
+	local_irq_save(flags);
+
+	/* If this is a per-task event, it must be for current */
+	WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+		     event->hw.target != current);
+
+	/* If this is a per-CPU event, it must be for this CPU */
+	WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+		     event->cpu != smp_processor_id());
+
+	/*
+	 * It must not be an event with inherit set, we cannot read
+	 * all child counters from atomic context.
+	 */
+	WARN_ON_ONCE(event->attr.inherit);
+
+	/*
+	 * It must not have a pmu::count method, those are not
+	 * NMI safe.
+	 */
+	WARN_ON_ONCE(event->pmu->count);
+
+	/*
+	 * If the event is currently on this CPU, its either a per-task event,
+	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
+	 * oncpu == -1).
+	 */
+	if (event->oncpu == smp_processor_id())
+		event->pmu->read(event);
+
+	val = local64_read(&event->count);
+	local_irq_restore(flags);
+
+	return val;
+}
+
 static u64 perf_event_read(struct perf_event *event)
 {
 	/*
@@ -8574,6 +8627,31 @@ void perf_event_delayed_put(struct task_struct *task)
 		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
 }
 
+struct perf_event *perf_event_get(unsigned int fd)
+{
+	int err;
+	struct fd f;
+	struct perf_event *event;
+
+	err = perf_fget_light(fd, &f);
+	if (err)
+		return ERR_PTR(err);
+
+	event = f.file->private_data;
+	atomic_long_inc(&event->refcount);
+	fdput(f);
+
+	return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+	if (!event)
+		return ERR_PTR(-EINVAL);
+
+	return &event->attr;
+}
+
 /*
  * inherit a event from parent task to child task:
  */
-- 
cgit v1.2.3-70-g09d2


From 2a36f0b92eb638dd023870574eb471b1c56be9ad Mon Sep 17 00:00:00 2001
From: Wang Nan <wangnan0@huawei.com>
Date: Thu, 6 Aug 2015 07:02:33 +0000
Subject: bpf: Make the bpf_prog_array_map more generic

All the map backends are of generic nature. In order to avoid
adding much special code into the eBPF core, rewrite part of
the bpf_prog_array map code and make it more generic. So the
new perf_event_array map type can reuse most of code with
bpf_prog_array map and add fewer lines of special code.

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c |  6 ++--
 include/linux/bpf.h         |  8 +++--
 kernel/bpf/arraymap.c       | 80 +++++++++++++++++++++++++++------------------
 kernel/bpf/core.c           |  2 +-
 kernel/bpf/syscall.c        |  2 +-
 5 files changed, 60 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ec5214f39aa8..70efcd0940f9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
  *     goto out;
  *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
  *     goto out;
- *   prog = array->prog[index];
+ *   prog = array->ptrs[index];
  *   if (prog == NULL)
  *     goto out;
  *   goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
 	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
 
-	/* prog = array->prog[index]; */
+	/* prog = array->ptrs[index]; */
 	EMIT4_off32(0x48, 0x8D, 0x84, 0xD6,       /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
-		    offsetof(struct bpf_array, prog));
+		    offsetof(struct bpf_array, ptrs));
 	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */
 
 	/* if (prog == NULL)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 139d6d2e123f..d495211d63d1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -24,6 +24,10 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
+
+	/* funcs called by prog_array and perf_event_array map */
+	void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
+	void (*map_fd_put_ptr) (void *ptr);
 };
 
 struct bpf_map {
@@ -142,13 +146,13 @@ struct bpf_array {
 	bool owner_jited;
 	union {
 		char value[0] __aligned(8);
-		struct bpf_prog *prog[0] __aligned(8);
+		void *ptrs[0] __aligned(8);
 	};
 };
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-void bpf_prog_array_map_clear(struct bpf_map *map);
+void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..45df6572ecfd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
 }
 late_initcall(register_array_map);
 
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
 {
-	/* only bpf_prog file descriptors can be stored in prog_array map */
+	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
 		return ERR_PTR(-EINVAL);
 	return array_map_alloc(attr);
 }
 
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
 
 	/* make sure it's empty */
 	for (i = 0; i < array->map.max_entries; i++)
-		BUG_ON(array->prog[i] != NULL);
+		BUG_ON(array->ptrs[i] != NULL);
 	kvfree(array);
 }
 
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	return NULL;
 }
 
 /* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
-				      void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+				    void *value, u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct bpf_prog *prog, *old_prog;
+	void *new_ptr, *old_ptr;
 	u32 index = *(u32 *)key, ufd;
 
 	if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
 		return -E2BIG;
 
 	ufd = *(u32 *)value;
-	prog = bpf_prog_get(ufd);
-	if (IS_ERR(prog))
-		return PTR_ERR(prog);
-
-	if (!bpf_prog_array_compatible(array, prog)) {
-		bpf_prog_put(prog);
-		return -EINVAL;
-	}
+	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+	if (IS_ERR(new_ptr))
+		return PTR_ERR(new_ptr);
 
-	old_prog = xchg(array->prog + index, prog);
-	if (old_prog)
-		bpf_prog_put_rcu(old_prog);
+	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (old_ptr)
+		map->ops->map_fd_put_ptr(old_ptr);
 
 	return 0;
 }
 
-static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct bpf_prog *old_prog;
+	void *old_ptr;
 	u32 index = *(u32 *)key;
 
 	if (index >= array->map.max_entries)
 		return -E2BIG;
 
-	old_prog = xchg(array->prog + index, NULL);
-	if (old_prog) {
-		bpf_prog_put_rcu(old_prog);
+	old_ptr = xchg(array->ptrs + index, NULL);
+	if (old_ptr) {
+		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
 	} else {
 		return -ENOENT;
 	}
 }
 
+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog = bpf_prog_get(fd);
+	if (IS_ERR(prog))
+		return prog;
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return ERR_PTR(-EINVAL);
+	}
+	return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+	struct bpf_prog *prog = ptr;
+
+	bpf_prog_put_rcu(prog);
+}
+
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_prog_array_map_clear(struct bpf_map *map)
+void bpf_fd_array_map_clear(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
 	for (i = 0; i < array->map.max_entries; i++)
-		prog_array_map_delete_elem(map, &i);
+		fd_array_map_delete_elem(map, &i);
 }
 
 static const struct bpf_map_ops prog_array_ops = {
-	.map_alloc = prog_array_map_alloc,
-	.map_free = prog_array_map_free,
+	.map_alloc = fd_array_map_alloc,
+	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
-	.map_lookup_elem = prog_array_map_lookup_elem,
-	.map_update_elem = prog_array_map_update_elem,
-	.map_delete_elem = prog_array_map_delete_elem,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_update_elem = fd_array_map_update_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = prog_fd_array_get_ptr,
+	.map_fd_put_ptr = prog_fd_array_put_ptr,
 };
 
 static struct bpf_map_type_list prog_array_type __read_mostly = {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index fafa74161445..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -450,7 +450,7 @@ select_insn:
 
 		tail_call_cnt++;
 
-		prog = READ_ONCE(array->prog[index]);
+		prog = READ_ONCE(array->ptrs[index]);
 		if (unlikely(!prog))
 			goto out;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..dc9b464fefa9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 		/* prog_array stores refcnt-ed bpf_prog pointers
 		 * release them all when user space closes prog_array_fd
 		 */
-		bpf_prog_array_map_clear(map);
+		bpf_fd_array_map_clear(map);
 
 	bpf_map_put(map);
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From ea317b267e9d03a8241893aa176fba7661d07579 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Thu, 6 Aug 2015 07:02:34 +0000
Subject: bpf: Add new bpf map type to store the pointer to struct perf_event

Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores the pointer to struct perf_event. The
user space event FDs from perf_event_open() syscall are converted
to the pointer to struct perf_event and stored in map.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/arraymap.c    | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d495211d63d1..4fc1f4070789 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
 #include <uapi/linux/bpf.h>
 #include <linux/workqueue.h>
 #include <linux/file.h>
+#include <linux/perf_event.h>
 
 struct bpf_map;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2ce13c109b00..a1814e8e53a7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH,
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
+	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 45df6572ecfd..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -273,3 +273,60 @@ static int __init register_prog_array_map(void)
 	return 0;
 }
 late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+	bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+	struct perf_event *event;
+	const struct perf_event_attr *attr;
+
+	event = perf_event_get(fd);
+	if (IS_ERR(event))
+		return event;
+
+	attr = perf_event_attrs(event);
+	if (IS_ERR(attr))
+		return (void *)attr;
+
+	if (attr->type != PERF_TYPE_RAW &&
+	    attr->type != PERF_TYPE_HARDWARE) {
+		perf_event_release_kernel(event);
+		return ERR_PTR(-EINVAL);
+	}
+	return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+	struct perf_event *event = ptr;
+
+	perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+	.map_alloc = fd_array_map_alloc,
+	.map_free = perf_event_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_update_elem = fd_array_map_update_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
+	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+	.ops = &perf_event_array_ops,
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+	bpf_register_map_type(&perf_event_array_type);
+	return 0;
+}
+late_initcall(register_perf_event_array_map);
-- 
cgit v1.2.3-70-g09d2


From 35578d7984003097af2b1e34502bc943d40c1804 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Thu, 6 Aug 2015 07:02:35 +0000
Subject: bpf: Implement function bpf_perf_event_read() that get the selected
 hardware PMU conuter

According to the perf_event_map_fd and index, the function
bpf_perf_event_read() can convert the corresponding map
value to the pointer to struct perf_event and return the
Hardware PMU counter value.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/verifier.c    | 48 +++++++++++++++++++++++++++++++++---------------
 kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fc1f4070789..f57d7fed9ec3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
+extern const struct bpf_func_proto bpf_perf_event_read_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a1814e8e53a7..92a48e2d5461 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -271,6 +271,7 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_get_tunnel_key,
 	BPF_FUNC_skb_set_tunnel_key,
+	BPF_FUNC_perf_event_read,	/* u64 bpf_perf_event_read(&map, index) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cd307df98cb3..48e1c7192560 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
 	[CONST_IMM]		= "imm",
 };
 
+static const struct {
+	int map_type;
+	int func_id;
+} func_limit[] = {
+	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
 static void print_verifier_state(struct verifier_env *env)
 {
 	enum bpf_reg_type t;
@@ -837,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 	return err;
 }
 
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+	bool bool_map, bool_func;
+	int i;
+
+	if (!map)
+		return 0;
+
+	for (i = 0; i <= ARRAY_SIZE(func_limit); i++) {
+		bool_map = (map->map_type == func_limit[i].map_type);
+		bool_func = (func_id == func_limit[i].func_id);
+		/* only when map & func pair match it can continue.
+		 * don't allow any other map type to be passed into
+		 * the special func;
+		 */
+		if (bool_map != bool_func)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -912,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
-	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
-	    func_id != BPF_FUNC_tail_call)
-		/* prog_array map type needs extra care:
-		 * only allow to pass it into bpf_tail_call() for now.
-		 * bpf_map_delete_elem() can be allowed in the future,
-		 * while bpf_map_update_elem() must only be done via syscall
-		 */
-		return -EINVAL;
-
-	if (func_id == BPF_FUNC_tail_call &&
-	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
-		/* don't allow any other map type to be passed into
-		 * bpf_tail_call()
-		 */
-		return -EINVAL;
+	err = check_map_func_compatibility(map, func_id);
+	if (err)
+		return err;
 
 	return 0;
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..ef9936df1b04 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct perf_event *event;
+
+	if (unlikely(index >= array->map.max_entries))
+		return -E2BIG;
+
+	event = (struct perf_event *)array->ptrs[index];
+	if (!event)
+		return -ENOENT;
+
+	/*
+	 * we don't know if the function is run successfully by the
+	 * return value. It can be judged in other places, such as
+	 * eBPF programs.
+	 */
+	return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+	.func		= bpf_perf_event_read,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -183,6 +212,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return bpf_get_trace_printk_proto();
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_perf_event_read:
+		return &bpf_perf_event_read_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3-70-g09d2


From 48e9743dd6483c5fd3f10c8e42c60d52d64b0e27 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Date: Mon, 27 Jul 2015 17:30:50 +0300
Subject: i2c: core: add and export of_get_i2c_adapter_by_node() interface

of_find_i2c_adapter_by_node() call requires quite often missing
put_device(), and i2c_put_adapter() releases a device locked by
i2c_get_adapter() only. In general module_put(adapter->owner) and
put_device(dev) are not interchangeable.

This is a common error reproduction scenario as a result of the
misusage described above (for clearness this is run on iMX6 platform
with HDMI and I2C bus drivers compiled as kernel modules):

    root@mx6q:~# lsmod | grep i2c
    i2c_imx                10213  0
    root@mx6q:~# lsmod | grep dw_hdmi_imx
    dw_hdmi_imx             3631  0
    dw_hdmi                11846  1 dw_hdmi_imx
    imxdrm                  8674  3 dw_hdmi_imx,imx_ipuv3_crtc,imx_ldb
    drm_kms_helper        113765  5 dw_hdmi,imxdrm,imx_ipuv3_crtc,imx_ldb
    root@mx6q:~# rmmod dw_hdmi_imx
    root@mx6q:~# lsmod | grep i2c
    i2c_imx                10213  -1

                                 ^^^^^

    root@mx6q:~# rmmod i2c_imx
    rmmod: ERROR: Module i2c_imx is in use

To fix existing users of these interfaces and to avoid any further
confusion and misusage in future, add one more interface
of_get_i2c_adapter_by_node(), it is similar to i2c_get_adapter() in
sense that an I2C bus device driver found and locked by user can be
correctly unlocked by i2c_put_adapter().

Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core.c | 18 ++++++++++++++++++
 include/linux/i2c.h    |  7 +++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index f80992d0a608..07a83f34ed58 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -1371,6 +1371,24 @@ struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node *node)
 	return adapter;
 }
 EXPORT_SYMBOL(of_find_i2c_adapter_by_node);
+
+/* must call i2c_put_adapter() when done with returned i2c_adapter device */
+struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node)
+{
+	struct i2c_adapter *adapter;
+
+	adapter = of_find_i2c_adapter_by_node(node);
+	if (!adapter)
+		return NULL;
+
+	if (!try_module_get(adapter->owner)) {
+		put_device(&adapter->dev);
+		adapter = NULL;
+	}
+
+	return adapter;
+}
+EXPORT_SYMBOL(of_get_i2c_adapter_by_node);
 #else
 static void of_i2c_register_devices(struct i2c_adapter *adap) { }
 #endif /* CONFIG_OF */
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e83a738a3b87..e2c859b74f8b 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -638,6 +638,8 @@ extern struct i2c_client *of_find_i2c_device_by_node(struct device_node *node);
 /* must call put_device() when done with returned i2c_adapter device */
 extern struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node *node);
 
+/* must call i2c_put_adapter() when done with returned i2c_adapter device */
+struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node);
 #else
 
 static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node)
@@ -649,6 +651,11 @@ static inline struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node
 {
 	return NULL;
 }
+
+static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node)
+{
+	return NULL;
+}
 #endif /* CONFIG_OF */
 
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3-70-g09d2


From 0cc67945ea5933d53db69606312cf52f553d1b81 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Fri, 31 Jul 2015 11:48:05 +0100
Subject: mailbox: switch to hrtimer for tx_complete polling

The mailbox core uses jiffy based timer to handle polling for the
transmit completion. If the client/protocol have/support notification
of the last packet transmit completion via ACK packet, then we tick the
Tx state machine immediately in the callback. However if the client
doesn't support that mechanism we might end-up waiting for atleast a
jiffy even though the remote is ready to receive the next request.

This patch switches the timer used for that polling from jiffy-based
to hrtimer-based so that we can support polling at much higher time
resolution.

Reported-and-suggested-by: Juri Lelli <Juri.Lelli@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/mailbox.c          | 27 +++++++++++++++------------
 include/linux/mailbox_controller.h |  7 ++++---
 2 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
index c7fdb57fd166..6a4811f85705 100644
--- a/drivers/mailbox/mailbox.c
+++ b/drivers/mailbox/mailbox.c
@@ -26,8 +26,6 @@
 static LIST_HEAD(mbox_cons);
 static DEFINE_MUTEX(con_mutex);
 
-static void poll_txdone(unsigned long data);
-
 static int add_to_rbuf(struct mbox_chan *chan, void *mssg)
 {
 	int idx;
@@ -88,7 +86,9 @@ exit:
 	spin_unlock_irqrestore(&chan->lock, flags);
 
 	if (!err && (chan->txdone_method & TXDONE_BY_POLL))
-		poll_txdone((unsigned long)chan->mbox);
+		/* kick start the timer immediately to avoid delays */
+		hrtimer_start(&chan->mbox->poll_hrt, ktime_set(0, 0),
+			      HRTIMER_MODE_REL);
 }
 
 static void tx_tick(struct mbox_chan *chan, int r)
@@ -112,9 +112,10 @@ static void tx_tick(struct mbox_chan *chan, int r)
 		complete(&chan->tx_complete);
 }
 
-static void poll_txdone(unsigned long data)
+static enum hrtimer_restart txdone_hrtimer(struct hrtimer *hrtimer)
 {
-	struct mbox_controller *mbox = (struct mbox_controller *)data;
+	struct mbox_controller *mbox =
+		container_of(hrtimer, struct mbox_controller, poll_hrt);
 	bool txdone, resched = false;
 	int i;
 
@@ -130,9 +131,11 @@ static void poll_txdone(unsigned long data)
 		}
 	}
 
-	if (resched)
-		mod_timer(&mbox->poll, jiffies +
-				msecs_to_jiffies(mbox->txpoll_period));
+	if (resched) {
+		hrtimer_forward_now(hrtimer, ms_to_ktime(mbox->txpoll_period));
+		return HRTIMER_RESTART;
+	}
+	return HRTIMER_NORESTART;
 }
 
 /**
@@ -451,9 +454,9 @@ int mbox_controller_register(struct mbox_controller *mbox)
 		txdone = TXDONE_BY_ACK;
 
 	if (txdone == TXDONE_BY_POLL) {
-		mbox->poll.function = &poll_txdone;
-		mbox->poll.data = (unsigned long)mbox;
-		init_timer(&mbox->poll);
+		hrtimer_init(&mbox->poll_hrt, CLOCK_MONOTONIC,
+			     HRTIMER_MODE_REL);
+		mbox->poll_hrt.function = txdone_hrtimer;
 	}
 
 	for (i = 0; i < mbox->num_chans; i++) {
@@ -495,7 +498,7 @@ void mbox_controller_unregister(struct mbox_controller *mbox)
 		mbox_free_channel(&mbox->chans[i]);
 
 	if (mbox->txdone_poll)
-		del_timer_sync(&mbox->poll);
+		hrtimer_cancel(&mbox->poll_hrt);
 
 	mutex_unlock(&con_mutex);
 }
diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h
index 68c42454439b..74deadb42d76 100644
--- a/include/linux/mailbox_controller.h
+++ b/include/linux/mailbox_controller.h
@@ -9,7 +9,7 @@
 
 #include <linux/of.h>
 #include <linux/types.h>
-#include <linux/timer.h>
+#include <linux/hrtimer.h>
 #include <linux/device.h>
 #include <linux/completion.h>
 
@@ -67,7 +67,8 @@ struct mbox_chan_ops {
  * @txpoll_period:	If 'txdone_poll' is in effect, the API polls for
  *			last TX's status after these many millisecs
  * @of_xlate:		Controller driver specific mapping of channel via DT
- * @poll:		API private. Used to poll for TXDONE on all channels.
+ * @poll_hrt:		API private. hrtimer used to poll for TXDONE on all
+ *			channels.
  * @node:		API private. To hook into list of controllers.
  */
 struct mbox_controller {
@@ -81,7 +82,7 @@ struct mbox_controller {
 	struct mbox_chan *(*of_xlate)(struct mbox_controller *mbox,
 				      const struct of_phandle_args *sp);
 	/* Internal to API */
-	struct timer_list poll;
+	struct hrtimer poll_hrt;
 	struct list_head node;
 };
 
-- 
cgit v1.2.3-70-g09d2


From 92b7cb5dc885b38b21093eefed8028b615952965 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Mon, 3 Aug 2015 17:40:51 +0300
Subject: extcon: palmas: Support GPIO based USB ID detection

Some palmas based chip variants do not have OTG based ID logic.
For these variants we rely on GPIO based USB ID detection.

These chips do have VBUS comparator for VBUS detection so we
continue to use the old way of detecting VBUS.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 .../devicetree/bindings/extcon/extcon-palmas.txt   |   5 +-
 drivers/extcon/extcon-palmas.c                     | 129 ++++++++++++++++++---
 drivers/extcon/extcon-usb-gpio.c                   |   1 +
 include/linux/mfd/palmas.h                         |   7 ++
 4 files changed, 126 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/extcon/extcon-palmas.txt b/Documentation/devicetree/bindings/extcon/extcon-palmas.txt
index 45414bbcd945..f61d5af44a27 100644
--- a/Documentation/devicetree/bindings/extcon/extcon-palmas.txt
+++ b/Documentation/devicetree/bindings/extcon/extcon-palmas.txt
@@ -10,8 +10,11 @@ Required Properties:
 
 Optional Properties:
  - ti,wakeup : To enable the wakeup comparator in probe
- - ti,enable-id-detection: Perform ID detection.
+ - ti,enable-id-detection: Perform ID detection. If id-gpio is specified
+		it performs id-detection using GPIO else using OTG core.
  - ti,enable-vbus-detection: Perform VBUS detection.
+ - id-gpio: gpio for GPIO ID detection. See gpio binding.
+ - debounce-delay-ms: debounce delay for GPIO ID pin in milliseconds.
 
 palmas-usb {
        compatible = "ti,twl6035-usb", "ti,palmas-usb";
diff --git a/drivers/extcon/extcon-palmas.c b/drivers/extcon/extcon-palmas.c
index 8933e7e0d9da..662e91778cb0 100644
--- a/drivers/extcon/extcon-palmas.c
+++ b/drivers/extcon/extcon-palmas.c
@@ -28,6 +28,10 @@
 #include <linux/mfd/palmas.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
+#include <linux/of_gpio.h>
+#include <linux/workqueue.h>
+
+#define USB_GPIO_DEBOUNCE_MS	20	/* ms */
 
 static const unsigned int palmas_extcon_cable[] = {
 	EXTCON_USB,
@@ -118,19 +122,54 @@ static irqreturn_t palmas_id_irq_handler(int irq, void *_palmas_usb)
 	return IRQ_HANDLED;
 }
 
+static void palmas_gpio_id_detect(struct work_struct *work)
+{
+	int id;
+	struct palmas_usb *palmas_usb = container_of(to_delayed_work(work),
+						     struct palmas_usb,
+						     wq_detectid);
+	struct extcon_dev *edev = palmas_usb->edev;
+
+	if (!palmas_usb->id_gpiod)
+		return;
+
+	id = gpiod_get_value_cansleep(palmas_usb->id_gpiod);
+
+	if (id) {
+		extcon_set_cable_state_(edev, EXTCON_USB_HOST, false);
+		dev_info(palmas_usb->dev, "USB-HOST cable is detached\n");
+	} else {
+		extcon_set_cable_state_(edev, EXTCON_USB_HOST, true);
+		dev_info(palmas_usb->dev, "USB-HOST cable is attached\n");
+	}
+}
+
+static irqreturn_t palmas_gpio_id_irq_handler(int irq, void *_palmas_usb)
+{
+	struct palmas_usb *palmas_usb = _palmas_usb;
+
+	queue_delayed_work(system_power_efficient_wq, &palmas_usb->wq_detectid,
+			   palmas_usb->sw_debounce_jiffies);
+
+	return IRQ_HANDLED;
+}
+
 static void palmas_enable_irq(struct palmas_usb *palmas_usb)
 {
 	palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
 		PALMAS_USB_VBUS_CTRL_SET,
 		PALMAS_USB_VBUS_CTRL_SET_VBUS_ACT_COMP);
 
-	palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
-		PALMAS_USB_ID_CTRL_SET, PALMAS_USB_ID_CTRL_SET_ID_ACT_COMP);
+	if (palmas_usb->enable_id_detection) {
+		palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+			     PALMAS_USB_ID_CTRL_SET,
+			     PALMAS_USB_ID_CTRL_SET_ID_ACT_COMP);
 
-	palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
-		PALMAS_USB_ID_INT_EN_HI_SET,
-		PALMAS_USB_ID_INT_EN_HI_SET_ID_GND |
-		PALMAS_USB_ID_INT_EN_HI_SET_ID_FLOAT);
+		palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+			     PALMAS_USB_ID_INT_EN_HI_SET,
+			     PALMAS_USB_ID_INT_EN_HI_SET_ID_GND |
+			     PALMAS_USB_ID_INT_EN_HI_SET_ID_FLOAT);
+	}
 
 	if (palmas_usb->enable_vbus_detection)
 		palmas_vbus_irq_handler(palmas_usb->vbus_irq, palmas_usb);
@@ -169,20 +208,36 @@ static int palmas_usb_probe(struct platform_device *pdev)
 			palmas_usb->wakeup = pdata->wakeup;
 	}
 
+	palmas_usb->id_gpiod = devm_gpiod_get_optional(&pdev->dev, "id");
+	if (IS_ERR(palmas_usb->id_gpiod)) {
+		dev_err(&pdev->dev, "failed to get id gpio\n");
+		return PTR_ERR(palmas_usb->id_gpiod);
+	}
+
+	if (palmas_usb->enable_id_detection && palmas_usb->id_gpiod) {
+		palmas_usb->enable_id_detection = false;
+		palmas_usb->enable_gpio_id_detection = true;
+	}
+
+	if (palmas_usb->enable_gpio_id_detection) {
+		u32 debounce;
+
+		if (of_property_read_u32(node, "debounce-delay-ms", &debounce))
+			debounce = USB_GPIO_DEBOUNCE_MS;
+
+		status = gpiod_set_debounce(palmas_usb->id_gpiod,
+					    debounce * 1000);
+		if (status < 0)
+			palmas_usb->sw_debounce_jiffies = msecs_to_jiffies(debounce);
+	}
+
+	INIT_DELAYED_WORK(&palmas_usb->wq_detectid, palmas_gpio_id_detect);
+
 	palmas->usb = palmas_usb;
 	palmas_usb->palmas = palmas;
 
 	palmas_usb->dev	 = &pdev->dev;
 
-	palmas_usb->id_otg_irq = regmap_irq_get_virq(palmas->irq_data,
-						PALMAS_ID_OTG_IRQ);
-	palmas_usb->id_irq = regmap_irq_get_virq(palmas->irq_data,
-						PALMAS_ID_IRQ);
-	palmas_usb->vbus_otg_irq = regmap_irq_get_virq(palmas->irq_data,
-						PALMAS_VBUS_OTG_IRQ);
-	palmas_usb->vbus_irq = regmap_irq_get_virq(palmas->irq_data,
-						PALMAS_VBUS_IRQ);
-
 	palmas_usb_wakeup(palmas, palmas_usb->wakeup);
 
 	platform_set_drvdata(pdev, palmas_usb);
@@ -201,6 +256,10 @@ static int palmas_usb_probe(struct platform_device *pdev)
 	}
 
 	if (palmas_usb->enable_id_detection) {
+		palmas_usb->id_otg_irq = regmap_irq_get_virq(palmas->irq_data,
+							     PALMAS_ID_OTG_IRQ);
+		palmas_usb->id_irq = regmap_irq_get_virq(palmas->irq_data,
+							 PALMAS_ID_IRQ);
 		status = devm_request_threaded_irq(palmas_usb->dev,
 				palmas_usb->id_irq,
 				NULL, palmas_id_irq_handler,
@@ -212,9 +271,33 @@ static int palmas_usb_probe(struct platform_device *pdev)
 					palmas_usb->id_irq, status);
 			return status;
 		}
+	} else if (palmas_usb->enable_gpio_id_detection) {
+		palmas_usb->gpio_id_irq = gpiod_to_irq(palmas_usb->id_gpiod);
+		if (palmas_usb->gpio_id_irq < 0) {
+			dev_err(&pdev->dev, "failed to get id irq\n");
+			return palmas_usb->gpio_id_irq;
+		}
+		status = devm_request_threaded_irq(&pdev->dev,
+						   palmas_usb->gpio_id_irq,
+						   NULL,
+						   palmas_gpio_id_irq_handler,
+						   IRQF_TRIGGER_RISING |
+						   IRQF_TRIGGER_FALLING |
+						   IRQF_ONESHOT,
+						   "palmas_usb_id",
+						   palmas_usb);
+		if (status < 0) {
+			dev_err(&pdev->dev,
+				"failed to request handler for id irq\n");
+			return status;
+		}
 	}
 
 	if (palmas_usb->enable_vbus_detection) {
+		palmas_usb->vbus_otg_irq = regmap_irq_get_virq(palmas->irq_data,
+						       PALMAS_VBUS_OTG_IRQ);
+		palmas_usb->vbus_irq = regmap_irq_get_virq(palmas->irq_data,
+							   PALMAS_VBUS_IRQ);
 		status = devm_request_threaded_irq(palmas_usb->dev,
 				palmas_usb->vbus_irq, NULL,
 				palmas_vbus_irq_handler,
@@ -229,10 +312,21 @@ static int palmas_usb_probe(struct platform_device *pdev)
 	}
 
 	palmas_enable_irq(palmas_usb);
+	/* perform initial detection */
+	palmas_gpio_id_detect(&palmas_usb->wq_detectid.work);
 	device_set_wakeup_capable(&pdev->dev, true);
 	return 0;
 }
 
+static int palmas_usb_remove(struct platform_device *pdev)
+{
+	struct palmas_usb *palmas_usb = platform_get_drvdata(pdev);
+
+	cancel_delayed_work_sync(&palmas_usb->wq_detectid);
+
+	return 0;
+}
+
 #ifdef CONFIG_PM_SLEEP
 static int palmas_usb_suspend(struct device *dev)
 {
@@ -243,6 +337,8 @@ static int palmas_usb_suspend(struct device *dev)
 			enable_irq_wake(palmas_usb->vbus_irq);
 		if (palmas_usb->enable_id_detection)
 			enable_irq_wake(palmas_usb->id_irq);
+		if (palmas_usb->enable_gpio_id_detection)
+			enable_irq_wake(palmas_usb->gpio_id_irq);
 	}
 	return 0;
 }
@@ -256,6 +352,8 @@ static int palmas_usb_resume(struct device *dev)
 			disable_irq_wake(palmas_usb->vbus_irq);
 		if (palmas_usb->enable_id_detection)
 			disable_irq_wake(palmas_usb->id_irq);
+		if (palmas_usb->enable_gpio_id_detection)
+			disable_irq_wake(palmas_usb->gpio_id_irq);
 	}
 	return 0;
 };
@@ -273,6 +371,7 @@ static const struct of_device_id of_palmas_match_tbl[] = {
 
 static struct platform_driver palmas_usb_driver = {
 	.probe = palmas_usb_probe,
+	.remove = palmas_usb_remove,
 	.driver = {
 		.name = "palmas-usb",
 		.of_match_table = of_palmas_match_tbl,
diff --git a/drivers/extcon/extcon-usb-gpio.c b/drivers/extcon/extcon-usb-gpio.c
index a2a44536a608..2b2fecffb1ad 100644
--- a/drivers/extcon/extcon-usb-gpio.c
+++ b/drivers/extcon/extcon-usb-gpio.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/extcon.h>
+#include <linux/gpio.h>
 #include <linux/gpio/consumer.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h
index bb270bd03eed..13e1d96935ed 100644
--- a/include/linux/mfd/palmas.h
+++ b/include/linux/mfd/palmas.h
@@ -21,6 +21,7 @@
 #include <linux/regmap.h>
 #include <linux/regulator/driver.h>
 #include <linux/extcon.h>
+#include <linux/of_gpio.h>
 #include <linux/usb/phy_companion.h>
 
 #define PALMAS_NUM_CLIENTS		3
@@ -551,10 +552,16 @@ struct palmas_usb {
 	int vbus_otg_irq;
 	int vbus_irq;
 
+	int gpio_id_irq;
+	struct gpio_desc *id_gpiod;
+	unsigned long sw_debounce_jiffies;
+	struct delayed_work wq_detectid;
+
 	enum palmas_usb_state linkstat;
 	int wakeup;
 	bool enable_vbus_detection;
 	bool enable_id_detection;
+	bool enable_gpio_id_detection;
 };
 
 #define comparator_to_palmas(x) container_of((x), struct palmas_usb, comparator)
-- 
cgit v1.2.3-70-g09d2


From cc9a903d915c21626b6b2fbf8ed0ff16a7f82210 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 7 Aug 2015 16:55:46 -0400
Subject: svcrdma: Change maximum server payload back to RPCSVC_MAXPAYLOAD

Both commit 0380a3f375 ("svcrdma: Add a separate "max data segs"
macro for svcrdma") and commit 7e5be28827bf ("svcrdma: advertise
the correct max payload") are incorrect. This commit reverts both
changes, restoring the server's maximum payload size to 1MB.

Commit 7e5be28827bf based the server's maximum payload on the
_client's_ RPCRDMA_MAX_DATA_SEGS value. That was wrong.

Commit 0380a3f375 tried to fix this so that the client maximum
payload size could be raised without affecting the server, but
managed to confuse matters more on the server side.

More importantly, limiting the advertised maximum payload size was
meant to be a workaround, not the actual fix. We need to revisit

  https://bugzilla.linux-nfs.org/show_bug.cgi?id=270

A Linux client on a platform with 64KB pages can overrun and crash
an x86_64 NFS/RDMA server when the r/wsize is 1MB. An x86/64 Linux
client seems to work fine using 1MB reads and writes when the Linux
server's maximum payload size is restored to 1MB.

BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=270
Fixes: 0380a3f375 ("svcrdma: Add a separate "max data segs" macro")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          | 9 ++-------
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +-
 net/sunrpc/xprtrdma/xprt_rdma.h          | 1 -
 3 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 13af61b70417..d5ee6d8b7c58 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -172,13 +172,6 @@ struct svcxprt_rdma {
 #define RDMAXPRT_SQ_PENDING	2
 #define RDMAXPRT_CONN_PENDING	3
 
-#define RPCRDMA_MAX_SVC_SEGS	(64)	/* server max scatter/gather */
-#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_SVC_SEGS << PAGE_SHIFT)
-#define RPCRDMA_MAXPAYLOAD	RPCSVC_MAXPAYLOAD
-#else
-#define RPCRDMA_MAXPAYLOAD	(RPCRDMA_MAX_SVC_SEGS << PAGE_SHIFT)
-#endif
-
 #define RPCRDMA_LISTEN_BACKLOG  10
 /* The default ORD value is based on two outstanding full-size writes with a
  * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ.  */
@@ -187,6 +180,8 @@ struct svcxprt_rdma {
 #define RPCRDMA_MAX_REQUESTS    32
 #define RPCRDMA_MAX_REQ_SIZE    4096
 
+#define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
+
 /* svc_rdma_marshal.c */
 extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4054a9de6a91..21e40365042c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -91,7 +91,7 @@ struct svc_xprt_class svc_rdma_class = {
 	.xcl_name = "rdma",
 	.xcl_owner = THIS_MODULE,
 	.xcl_ops = &svc_rdma_ops,
-	.xcl_max_payload = RPCRDMA_MAXPAYLOAD,
+	.xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA,
 	.xcl_ident = XPRT_TRANSPORT_RDMA,
 };
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index f49dd8b38122..e718d0959af3 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -51,7 +51,6 @@
 #include <linux/sunrpc/clnt.h> 		/* rpc_xprt */
 #include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */
 #include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */
-#include <linux/sunrpc/svc.h>		/* RPCSVC_MAXPAYLOAD */
 
 #define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
 #define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
-- 
cgit v1.2.3-70-g09d2


From ea126e74353453d15fc0a181910ae1e25162f2a1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 8 Jun 2015 12:03:32 -0700
Subject: nfsd/sunrpc: add a new svc_serv_ops struct and move sv_shutdown into
 it

In later patches we'll need to abstract out more operations on a
per-service level, besides sv_shutdown and sv_function.

Declare a new svc_serv_ops struct to hold these operations, and move
sv_shutdown into this struct.

Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Tested-by: Shirley Ma <shirley.ma@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc.c             |  6 +++++-
 fs/nfs/callback.c          |  5 ++++-
 fs/nfsd/nfssvc.c           |  6 +++++-
 include/linux/sunrpc/svc.h | 20 ++++++++++----------
 net/sunrpc/svc.c           | 18 +++++++++---------
 5 files changed, 33 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 55505cbe11af..4182b2f925cd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -322,6 +322,10 @@ out_rqst:
 	return error;
 }
 
+static struct svc_serv_ops lockd_sv_ops = {
+	.svo_shutdown	= svc_rpcb_cleanup,
+};
+
 static struct svc_serv *lockd_create_svc(void)
 {
 	struct svc_serv *serv;
@@ -350,7 +354,7 @@ static struct svc_serv *lockd_create_svc(void)
 		nlm_timeout = LOCKD_DFLT_TIMEO;
 	nlmsvc_timeout = nlm_timeout * HZ;
 
-	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup);
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops);
 	if (!serv) {
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 682529c00996..182792d115fc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -308,6 +308,9 @@ err_bind:
 	return ret;
 }
 
+static struct svc_serv_ops nfs_cb_sv_ops = {
+};
+
 static struct svc_serv *nfs_callback_create_svc(int minorversion)
 {
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
@@ -333,7 +336,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
 		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
 			cb_info->users);
 
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops);
 	if (!serv) {
 		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9277cc91c21b..7311677330b2 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -391,6 +391,10 @@ static int nfsd_get_default_max_blksize(void)
 	return ret;
 }
 
+static struct svc_serv_ops nfsd_sv_ops = {
+	.svo_shutdown = nfsd_last_thread,
+};
+
 int nfsd_create_serv(struct net *net)
 {
 	int error;
@@ -405,7 +409,7 @@ int nfsd_create_serv(struct net *net)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions();
 	nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
-				      nfsd_last_thread, nfsd, THIS_MODULE);
+				      &nfsd_sv_ops, nfsd, THIS_MODULE);
 	if (nn->nfsd_serv == NULL)
 		return -ENOMEM;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index fae6fb947fc8..2e682f636b13 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -54,6 +54,13 @@ struct svc_pool {
 	unsigned long		sp_flags;
 } ____cacheline_aligned_in_smp;
 
+struct svc_serv;
+
+struct svc_serv_ops {
+	/* Callback to use when last thread exits. */
+	void	(*svo_shutdown)(struct svc_serv *serv, struct net *net);
+};
+
 /*
  * RPC service.
  *
@@ -85,13 +92,7 @@ struct svc_serv {
 
 	unsigned int		sv_nrpools;	/* number of thread pools */
 	struct svc_pool *	sv_pools;	/* array of thread pools */
-
-	void			(*sv_shutdown)(struct svc_serv *serv,
-					       struct net *net);
-						/* Callback to use when last thread
-						 * exits.
-						 */
-
+	struct svc_serv_ops	*sv_ops;	/* server operations */
 	struct module *		sv_module;	/* optional module to count when
 						 * adding threads */
 	svc_thread_fn		sv_function;	/* main function for threads */
@@ -429,13 +430,12 @@ int svc_rpcb_setup(struct svc_serv *serv, struct net *net);
 void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net);
 int svc_bind(struct svc_serv *serv, struct net *net);
 struct svc_serv *svc_create(struct svc_program *, unsigned int,
-			    void (*shutdown)(struct svc_serv *, struct net *net));
+			    struct svc_serv_ops *);
 struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
 					struct svc_pool *pool, int node);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
-			void (*shutdown)(struct svc_serv *, struct net *net),
-			svc_thread_fn, struct module *);
+			struct svc_serv_ops *, svc_thread_fn, struct module *);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_destroy(struct svc_serv *);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5a16d8d8c831..36eee907696b 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -423,7 +423,7 @@ EXPORT_SYMBOL_GPL(svc_bind);
  */
 static struct svc_serv *
 __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
-	     void (*shutdown)(struct svc_serv *serv, struct net *net))
+	     struct svc_serv_ops *ops)
 {
 	struct svc_serv	*serv;
 	unsigned int vers;
@@ -440,7 +440,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		bufsize = RPCSVC_MAXPAYLOAD;
 	serv->sv_max_payload = bufsize? bufsize : 4096;
 	serv->sv_max_mesg  = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE);
-	serv->sv_shutdown  = shutdown;
+	serv->sv_ops = ops;
 	xdrsize = 0;
 	while (prog) {
 		prog->pg_lovers = prog->pg_nvers-1;
@@ -486,21 +486,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 
 struct svc_serv *
 svc_create(struct svc_program *prog, unsigned int bufsize,
-	   void (*shutdown)(struct svc_serv *serv, struct net *net))
+	   struct svc_serv_ops *ops)
 {
-	return __svc_create(prog, bufsize, /*npools*/1, shutdown);
+	return __svc_create(prog, bufsize, /*npools*/1, ops);
 }
 EXPORT_SYMBOL_GPL(svc_create);
 
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
-		  void (*shutdown)(struct svc_serv *serv, struct net *net),
-		  svc_thread_fn func, struct module *mod)
+		  struct svc_serv_ops *ops, svc_thread_fn func,
+		  struct module *mod)
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
 
-	serv = __svc_create(prog, bufsize, npools, shutdown);
+	serv = __svc_create(prog, bufsize, npools, ops);
 	if (!serv)
 		goto out_err;
 
@@ -517,8 +517,8 @@ void svc_shutdown_net(struct svc_serv *serv, struct net *net)
 {
 	svc_close_net(serv, net);
 
-	if (serv->sv_shutdown)
-		serv->sv_shutdown(serv, net);
+	if (serv->sv_ops->svo_shutdown)
+		serv->sv_ops->svo_shutdown(serv, net);
 }
 EXPORT_SYMBOL_GPL(svc_shutdown_net);
 
-- 
cgit v1.2.3-70-g09d2


From c369014f1776367269c8fbb5ea8932826d89ce2f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 8 Jun 2015 12:04:46 -0700
Subject: nfsd/sunrpc: move sv_function into sv_ops

Since we now have a container for holding svc_serv operations, move the
sv_function into it as well.

Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Tested-by: Shirley Ma <shirley.ma@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfssvc.c           |  3 ++-
 include/linux/sunrpc/svc.h | 11 +++--------
 net/sunrpc/svc.c           |  8 +++-----
 3 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7311677330b2..bd03968363ff 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -393,6 +393,7 @@ static int nfsd_get_default_max_blksize(void)
 
 static struct svc_serv_ops nfsd_sv_ops = {
 	.svo_shutdown = nfsd_last_thread,
+	.svo_function = nfsd,
 };
 
 int nfsd_create_serv(struct net *net)
@@ -409,7 +410,7 @@ int nfsd_create_serv(struct net *net)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions();
 	nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
-				      &nfsd_sv_ops, nfsd, THIS_MODULE);
+				      &nfsd_sv_ops, THIS_MODULE);
 	if (nn->nfsd_serv == NULL)
 		return -ENOMEM;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 2e682f636b13..7c51b21ce9d6 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -19,11 +19,6 @@
 #include <linux/wait.h>
 #include <linux/mm.h>
 
-/*
- * This is the RPC server thread function prototype
- */
-typedef int		(*svc_thread_fn)(void *);
-
 /* statistics for svc_pool structures */
 struct svc_pool_stats {
 	atomic_long_t	packets;
@@ -58,7 +53,8 @@ struct svc_serv;
 
 struct svc_serv_ops {
 	/* Callback to use when last thread exits. */
-	void	(*svo_shutdown)(struct svc_serv *serv, struct net *net);
+	void		(*svo_shutdown)(struct svc_serv *, struct net *);
+	int		(*svo_function)(void *);
 };
 
 /*
@@ -95,7 +91,6 @@ struct svc_serv {
 	struct svc_serv_ops	*sv_ops;	/* server operations */
 	struct module *		sv_module;	/* optional module to count when
 						 * adding threads */
-	svc_thread_fn		sv_function;	/* main function for threads */
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	struct list_head	sv_cb_list;	/* queue for callback requests
 						 * that arrive over the same
@@ -435,7 +430,7 @@ struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
 					struct svc_pool *pool, int node);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
-			struct svc_serv_ops *, svc_thread_fn, struct module *);
+			struct svc_serv_ops *, struct module *);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_destroy(struct svc_serv *);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 36eee907696b..5b8726030c24 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -34,7 +34,7 @@
 
 static void svc_unregister(const struct svc_serv *serv, struct net *net);
 
-#define svc_serv_is_pooled(serv)    ((serv)->sv_function)
+#define svc_serv_is_pooled(serv)    ((serv)->sv_ops->svo_function)
 
 /*
  * Mode for mapping cpus to pools.
@@ -494,8 +494,7 @@ EXPORT_SYMBOL_GPL(svc_create);
 
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
-		  struct svc_serv_ops *ops, svc_thread_fn func,
-		  struct module *mod)
+		  struct svc_serv_ops *ops, struct module *mod)
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
@@ -504,7 +503,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 	if (!serv)
 		goto out_err;
 
-	serv->sv_function = func;
 	serv->sv_module = mod;
 	return serv;
 out_err:
@@ -740,7 +738,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 		}
 
 		__module_get(serv->sv_module);
-		task = kthread_create_on_node(serv->sv_function, rqstp,
+		task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
 					      node, "%s", serv->sv_name);
 		if (IS_ERR(task)) {
 			error = PTR_ERR(task);
-- 
cgit v1.2.3-70-g09d2


From 758f62fff9ad630f05866a1dd6ae9453a7730c2e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 8 Jun 2015 12:05:56 -0700
Subject: nfsd/sunrpc: move sv_module parm into sv_ops

...not technically an operation, but it's more convenient and cleaner
to pass the module pointer in this struct.

Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Tested-by: Shirley Ma <shirley.ma@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfssvc.c           | 3 ++-
 include/linux/sunrpc/svc.h | 9 ++++++---
 net/sunrpc/svc.c           | 8 +++-----
 3 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index bd03968363ff..17ceaad5f80a 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -394,6 +394,7 @@ static int nfsd_get_default_max_blksize(void)
 static struct svc_serv_ops nfsd_sv_ops = {
 	.svo_shutdown = nfsd_last_thread,
 	.svo_function = nfsd,
+	.svo_module = THIS_MODULE,
 };
 
 int nfsd_create_serv(struct net *net)
@@ -410,7 +411,7 @@ int nfsd_create_serv(struct net *net)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions();
 	nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
-				      &nfsd_sv_ops, THIS_MODULE);
+						&nfsd_sv_ops);
 	if (nn->nfsd_serv == NULL)
 		return -ENOMEM;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 7c51b21ce9d6..0150003d584b 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -54,7 +54,12 @@ struct svc_serv;
 struct svc_serv_ops {
 	/* Callback to use when last thread exits. */
 	void		(*svo_shutdown)(struct svc_serv *, struct net *);
+
+	/* function for service threads to run */
 	int		(*svo_function)(void *);
+
+	/* optional module to count when adding threads (pooled svcs only) */
+	struct module	*svo_module;
 };
 
 /*
@@ -89,8 +94,6 @@ struct svc_serv {
 	unsigned int		sv_nrpools;	/* number of thread pools */
 	struct svc_pool *	sv_pools;	/* array of thread pools */
 	struct svc_serv_ops	*sv_ops;	/* server operations */
-	struct module *		sv_module;	/* optional module to count when
-						 * adding threads */
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	struct list_head	sv_cb_list;	/* queue for callback requests
 						 * that arrive over the same
@@ -430,7 +433,7 @@ struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
 					struct svc_pool *pool, int node);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
-			struct svc_serv_ops *, struct module *);
+			struct svc_serv_ops *);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_destroy(struct svc_serv *);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5b8726030c24..5a6be22a7904 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -494,7 +494,7 @@ EXPORT_SYMBOL_GPL(svc_create);
 
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
-		  struct svc_serv_ops *ops, struct module *mod)
+		  struct svc_serv_ops *ops)
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
@@ -502,8 +502,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 	serv = __svc_create(prog, bufsize, npools, ops);
 	if (!serv)
 		goto out_err;
-
-	serv->sv_module = mod;
 	return serv;
 out_err:
 	svc_pool_map_put();
@@ -737,12 +735,12 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 			break;
 		}
 
-		__module_get(serv->sv_module);
+		__module_get(serv->sv_ops->svo_module);
 		task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
 					      node, "%s", serv->sv_name);
 		if (IS_ERR(task)) {
 			error = PTR_ERR(task);
-			module_put(serv->sv_module);
+			module_put(serv->sv_ops->svo_module);
 			svc_exit_thread(rqstp);
 			break;
 		}
-- 
cgit v1.2.3-70-g09d2


From b9e13cdfac70e38ade17b53810a36968c5842339 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 8 Jun 2015 12:06:51 -0700
Subject: nfsd/sunrpc: turn enqueueing a svc_xprt into a svc_serv operation

For now, all services use svc_xprt_do_enqueue, but once we add
workqueue-based service support, we'll need to do something different.

Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Tested-by: Shirley Ma <shirley.ma@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc.c                  |  3 ++-
 fs/nfs/callback.c               |  1 +
 fs/nfsd/nfssvc.c                | 11 ++++++-----
 include/linux/sunrpc/svc.h      |  3 +++
 include/linux/sunrpc/svc_xprt.h |  1 +
 net/sunrpc/svc_xprt.c           | 10 +++++-----
 6 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 4182b2f925cd..530914b5c455 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -323,7 +323,8 @@ out_rqst:
 }
 
 static struct svc_serv_ops lockd_sv_ops = {
-	.svo_shutdown	= svc_rpcb_cleanup,
+	.svo_shutdown		= svc_rpcb_cleanup,
+	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
 };
 
 static struct svc_serv *lockd_create_svc(void)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 182792d115fc..2c4a0b565d28 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -309,6 +309,7 @@ err_bind:
 }
 
 static struct svc_serv_ops nfs_cb_sv_ops = {
+	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
 };
 
 static struct svc_serv *nfs_callback_create_svc(int minorversion)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 17ceaad5f80a..d8b9b4cd37c6 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -391,10 +391,11 @@ static int nfsd_get_default_max_blksize(void)
 	return ret;
 }
 
-static struct svc_serv_ops nfsd_sv_ops = {
-	.svo_shutdown = nfsd_last_thread,
-	.svo_function = nfsd,
-	.svo_module = THIS_MODULE,
+static struct svc_serv_ops nfsd_thread_sv_ops = {
+	.svo_shutdown		= nfsd_last_thread,
+	.svo_function		= nfsd,
+	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
+	.svo_module		= THIS_MODULE,
 };
 
 int nfsd_create_serv(struct net *net)
@@ -411,7 +412,7 @@ int nfsd_create_serv(struct net *net)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions();
 	nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
-						&nfsd_sv_ops);
+						&nfsd_thread_sv_ops);
 	if (nn->nfsd_serv == NULL)
 		return -ENOMEM;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 0150003d584b..97609d0f68f6 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -58,6 +58,9 @@ struct svc_serv_ops {
 	/* function for service threads to run */
 	int		(*svo_function)(void *);
 
+	/* queue up a transport for servicing */
+	void		(*svo_enqueue_xprt)(struct svc_xprt *);
+
 	/* optional module to count when adding threads (pooled svcs only) */
 	struct module	*svo_module;
 };
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 79f6f8f3dc0a..78512cfe1fe6 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -116,6 +116,7 @@ void	svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *,
 		      struct svc_serv *);
 int	svc_create_xprt(struct svc_serv *, const char *, struct net *,
 			const int, const unsigned short, int);
+void	svc_xprt_do_enqueue(struct svc_xprt *xprt);
 void	svc_xprt_enqueue(struct svc_xprt *xprt);
 void	svc_xprt_put(struct svc_xprt *xprt);
 void	svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 163ac45c3639..a6cbb2104667 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -24,7 +24,6 @@ static int svc_deferred_recv(struct svc_rqst *rqstp);
 static struct cache_deferred_req *svc_defer(struct cache_req *req);
 static void svc_age_temp_xprts(unsigned long closure);
 static void svc_delete_xprt(struct svc_xprt *xprt);
-static void svc_xprt_do_enqueue(struct svc_xprt *xprt);
 
 /* apparently the "standard" is that clients close
  * idle connections after 5 minutes, servers after
@@ -225,12 +224,12 @@ static void svc_xprt_received(struct svc_xprt *xprt)
 	}
 
 	/* As soon as we clear busy, the xprt could be closed and
-	 * 'put', so we need a reference to call svc_xprt_do_enqueue with:
+	 * 'put', so we need a reference to call svc_enqueue_xprt with:
 	 */
 	svc_xprt_get(xprt);
 	smp_mb__before_atomic();
 	clear_bit(XPT_BUSY, &xprt->xpt_flags);
-	svc_xprt_do_enqueue(xprt);
+	xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
 	svc_xprt_put(xprt);
 }
 
@@ -320,7 +319,7 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
 	return false;
 }
 
-static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
+void svc_xprt_do_enqueue(struct svc_xprt *xprt)
 {
 	struct svc_pool *pool;
 	struct svc_rqst	*rqstp = NULL;
@@ -402,6 +401,7 @@ redo_search:
 out:
 	trace_svc_xprt_do_enqueue(xprt, rqstp);
 }
+EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);
 
 /*
  * Queue up a transport with data pending. If there are idle nfsd
@@ -412,7 +412,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 {
 	if (test_bit(XPT_BUSY, &xprt->xpt_flags))
 		return;
-	svc_xprt_do_enqueue(xprt);
+	xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
 }
 EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
 
-- 
cgit v1.2.3-70-g09d2


From 598e2359090d393b01a8e10386dc3056ccfa47ae Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 8 Jun 2015 12:08:33 -0700
Subject: nfsd/sunrpc: abstract out svc_set_num_threads to sv_ops

Add an operation that will do setup of the service. In the case of a
classic thread-based service that means starting up threads. In the case
of a workqueue-based service, the setup will do something different.

Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Tested-by: Shirley Ma <shirliey.ma@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfssvc.c           | 8 +++++---
 include/linux/sunrpc/svc.h | 3 +++
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d8b9b4cd37c6..ad4e2377dd63 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -395,6 +395,7 @@ static struct svc_serv_ops nfsd_thread_sv_ops = {
 	.svo_shutdown		= nfsd_last_thread,
 	.svo_function		= nfsd,
 	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
+	.svo_setup		= svc_set_num_threads,
 	.svo_module		= THIS_MODULE,
 };
 
@@ -507,8 +508,8 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
 	/* apply the new numbers */
 	svc_get(nn->nfsd_serv);
 	for (i = 0; i < n; i++) {
-		err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
-				    	  nthreads[i]);
+		err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+				&nn->nfsd_serv->sv_pools[i], nthreads[i]);
 		if (err)
 			break;
 	}
@@ -547,7 +548,8 @@ nfsd_svc(int nrservs, struct net *net)
 	error = nfsd_startup_net(nrservs, net);
 	if (error)
 		goto out_destroy;
-	error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
+	error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+			NULL, nrservs);
 	if (error)
 		goto out_shutdown;
 	/* We are holding a reference to nn->nfsd_serv which
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 97609d0f68f6..fd5bb9922545 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -61,6 +61,9 @@ struct svc_serv_ops {
 	/* queue up a transport for servicing */
 	void		(*svo_enqueue_xprt)(struct svc_xprt *);
 
+	/* set up thread (or whatever) execution context */
+	int		(*svo_setup)(struct svc_serv *, struct svc_pool *, int);
+
 	/* optional module to count when adding threads (pooled svcs only) */
 	struct module	*svo_module;
 };
-- 
cgit v1.2.3-70-g09d2


From d70bc0c67c7aaf0d00084b2f91b44fe1a8ae4e15 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 8 Jun 2015 12:09:54 -0700
Subject: nfsd/sunrpc: move pool_mode definitions into svc.h

In later patches, we're going to need to allow code external to svc.c
to figure out what pool_mode is in use. Move these definitions into
svc.h to prepare for that.

Also, make the svc_pool_map object available and exported so that other
modules can peek in there to get insight into what pool mode is in use.
Likewise, export svc_pool_map_get/put function to make it safe to do so.

Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Tested-by: Shirley Ma <shirley.ma@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc.h | 25 +++++++++++++++++++++++++
 net/sunrpc/svc.c           | 31 +++++++------------------------
 2 files changed, 32 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index fd5bb9922545..3a9baead5c3e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -427,6 +427,29 @@ struct svc_procedure {
 	unsigned int		pc_xdrressize;	/* maximum size of XDR reply */
 };
 
+/*
+ * Mode for mapping cpus to pools.
+ */
+enum {
+	SVC_POOL_AUTO = -1,	/* choose one of the others */
+	SVC_POOL_GLOBAL,	/* no mapping, just a single global pool
+				 * (legacy & UP mode) */
+	SVC_POOL_PERCPU,	/* one pool per cpu */
+	SVC_POOL_PERNODE	/* one pool per numa node */
+};
+
+struct svc_pool_map {
+	int count;			/* How many svc_servs use us */
+	int mode;			/* Note: int not enum to avoid
+					 * warnings about "enumeration value
+					 * not handled in switch" */
+	unsigned int npools;
+	unsigned int *pool_to;		/* maps pool id to cpu or node */
+	unsigned int *to_pool;		/* maps cpu or node to pool id */
+};
+
+extern struct svc_pool_map svc_pool_map;
+
 /*
  * Function prototypes.
  */
@@ -438,6 +461,8 @@ struct svc_serv *svc_create(struct svc_program *, unsigned int,
 struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
 					struct svc_pool *pool, int node);
 void		   svc_exit_thread(struct svc_rqst *);
+unsigned int	   svc_pool_map_get(void);
+void		   svc_pool_map_put(void);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 			struct svc_serv_ops *);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5a6be22a7904..486c14bf4e49 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -36,34 +36,17 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net);
 
 #define svc_serv_is_pooled(serv)    ((serv)->sv_ops->svo_function)
 
-/*
- * Mode for mapping cpus to pools.
- */
-enum {
-	SVC_POOL_AUTO = -1,	/* choose one of the others */
-	SVC_POOL_GLOBAL,	/* no mapping, just a single global pool
-				 * (legacy & UP mode) */
-	SVC_POOL_PERCPU,	/* one pool per cpu */
-	SVC_POOL_PERNODE	/* one pool per numa node */
-};
 #define SVC_POOL_DEFAULT	SVC_POOL_GLOBAL
 
 /*
  * Structure for mapping cpus to pools and vice versa.
  * Setup once during sunrpc initialisation.
  */
-static struct svc_pool_map {
-	int count;			/* How many svc_servs use us */
-	int mode;			/* Note: int not enum to avoid
-					 * warnings about "enumeration value
-					 * not handled in switch" */
-	unsigned int npools;
-	unsigned int *pool_to;		/* maps pool id to cpu or node */
-	unsigned int *to_pool;		/* maps cpu or node to pool id */
-} svc_pool_map = {
-	.count = 0,
+struct svc_pool_map svc_pool_map = {
 	.mode = SVC_POOL_DEFAULT
 };
+EXPORT_SYMBOL_GPL(svc_pool_map);
+
 static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
 
 static int
@@ -236,7 +219,7 @@ svc_pool_map_init_pernode(struct svc_pool_map *m)
  * vice versa).  Initialise the map if we're the first user.
  * Returns the number of pools.
  */
-static unsigned int
+unsigned int
 svc_pool_map_get(void)
 {
 	struct svc_pool_map *m = &svc_pool_map;
@@ -271,7 +254,7 @@ svc_pool_map_get(void)
 	mutex_unlock(&svc_pool_map_mutex);
 	return m->npools;
 }
-
+EXPORT_SYMBOL_GPL(svc_pool_map_get);
 
 /*
  * Drop a reference to the global map of cpus to pools.
@@ -280,7 +263,7 @@ svc_pool_map_get(void)
  * mode using the pool_mode module option without
  * rebooting or re-loading sunrpc.ko.
  */
-static void
+void
 svc_pool_map_put(void)
 {
 	struct svc_pool_map *m = &svc_pool_map;
@@ -297,7 +280,7 @@ svc_pool_map_put(void)
 
 	mutex_unlock(&svc_pool_map_mutex);
 }
-
+EXPORT_SYMBOL_GPL(svc_pool_map_put);
 
 static int svc_pool_map_get_node(unsigned int pidx)
 {
-- 
cgit v1.2.3-70-g09d2


From 1b6dc1dffbb142de60eb65f6155276ac31ff5474 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 8 Jun 2015 12:11:10 -0700
Subject: nfsd/sunrpc: factor svc_rqst allocation and freeing from sv_nrthreads
 refcounting

In later patches, we'll want to be able to allocate and free svc_rqst
structures without monkeying with the serv->sv_nrthreads refcount.

Factor those pieces out of their respective functions.

Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Tested-by: Shirley Ma <shirley.ma@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc.h |  3 +++
 net/sunrpc/svc.c           | 54 ++++++++++++++++++++++++++++++----------------
 2 files changed, 39 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3a9baead5c3e..cc0fc712bb82 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -458,8 +458,11 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net);
 int svc_bind(struct svc_serv *serv, struct net *net);
 struct svc_serv *svc_create(struct svc_program *, unsigned int,
 			    struct svc_serv_ops *);
+struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv,
+					struct svc_pool *pool, int node);
 struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
 					struct svc_pool *pool, int node);
+void		   svc_rqst_free(struct svc_rqst *);
 void		   svc_exit_thread(struct svc_rqst *);
 unsigned int	   svc_pool_map_get(void);
 void		   svc_pool_map_put(void);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 486c14bf4e49..a8f579df14d8 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -583,40 +583,52 @@ svc_release_buffer(struct svc_rqst *rqstp)
 }
 
 struct svc_rqst *
-svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
+svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
 {
 	struct svc_rqst	*rqstp;
 
 	rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node);
 	if (!rqstp)
-		goto out_enomem;
+		return rqstp;
 
-	serv->sv_nrthreads++;
 	__set_bit(RQ_BUSY, &rqstp->rq_flags);
 	spin_lock_init(&rqstp->rq_lock);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
-	spin_lock_bh(&pool->sp_lock);
-	pool->sp_nrthreads++;
-	list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
-	spin_unlock_bh(&pool->sp_lock);
 
 	rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
 	if (!rqstp->rq_argp)
-		goto out_thread;
+		goto out_enomem;
 
 	rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
 	if (!rqstp->rq_resp)
-		goto out_thread;
+		goto out_enomem;
 
 	if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
-		goto out_thread;
+		goto out_enomem;
 
 	return rqstp;
-out_thread:
-	svc_exit_thread(rqstp);
 out_enomem:
-	return ERR_PTR(-ENOMEM);
+	svc_rqst_free(rqstp);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(svc_rqst_alloc);
+
+struct svc_rqst *
+svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
+{
+	struct svc_rqst	*rqstp;
+
+	rqstp = svc_rqst_alloc(serv, pool, node);
+	if (!rqstp)
+		return ERR_PTR(-ENOMEM);
+
+	serv->sv_nrthreads++;
+	spin_lock_bh(&pool->sp_lock);
+	pool->sp_nrthreads++;
+	list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
+	spin_unlock_bh(&pool->sp_lock);
+	return rqstp;
 }
 EXPORT_SYMBOL_GPL(svc_prepare_thread);
 
@@ -751,15 +763,21 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads);
  * mutex" for the service.
  */
 void
-svc_exit_thread(struct svc_rqst *rqstp)
+svc_rqst_free(struct svc_rqst *rqstp)
 {
-	struct svc_serv	*serv = rqstp->rq_server;
-	struct svc_pool	*pool = rqstp->rq_pool;
-
 	svc_release_buffer(rqstp);
 	kfree(rqstp->rq_resp);
 	kfree(rqstp->rq_argp);
 	kfree(rqstp->rq_auth_data);
+	kfree_rcu(rqstp, rq_rcu_head);
+}
+EXPORT_SYMBOL_GPL(svc_rqst_free);
+
+void
+svc_exit_thread(struct svc_rqst *rqstp)
+{
+	struct svc_serv	*serv = rqstp->rq_server;
+	struct svc_pool	*pool = rqstp->rq_pool;
 
 	spin_lock_bh(&pool->sp_lock);
 	pool->sp_nrthreads--;
@@ -767,7 +785,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
 		list_del_rcu(&rqstp->rq_all);
 	spin_unlock_bh(&pool->sp_lock);
 
-	kfree_rcu(rqstp, rq_rcu_head);
+	svc_rqst_free(rqstp);
 
 	/* Release the server */
 	if (serv)
-- 
cgit v1.2.3-70-g09d2


From 7a76a021cd5a292be875fbc616daf03eab1e6996 Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@suse.com>
Date: Fri, 7 Aug 2015 09:32:21 -0700
Subject: net-timestamp: Update skb_complete_tx_timestamp comment

After "62bccb8 net-timestamp: Make the clone operation stand-alone from phy
timestamping" the hwtstamps parameter of skb_complete_tx_timestamp() may no
longer be NULL.

Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Cc: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d6cdd6e87d53..22b6d9ca1654 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2884,11 +2884,11 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
  *
  * PHY drivers may accept clones of transmitted packets for
  * timestamping via their phy_driver.txtstamp method. These drivers
- * must call this function to return the skb back to the stack, with
- * or without a timestamp.
+ * must call this function to return the skb back to the stack with a
+ * timestamp.
  *
  * @skb: clone of the the original outgoing packet
- * @hwtstamps: hardware time stamps, may be NULL if not available
+ * @hwtstamps: hardware time stamps
  *
  */
 void skb_complete_tx_timestamp(struct sk_buff *skb,
-- 
cgit v1.2.3-70-g09d2


From 124fe20d94630b6f173dae5eb815e6e6e350c72d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 10 Aug 2015 23:07:05 -0400
Subject: mm: enhance region_is_ram() to region_intersects()

region_is_ram() is used to prevent the establishment of aliased mappings
to physical "System RAM" with incompatible cache settings.  However, it
uses "-1" to indicate both "unknown" memory ranges (ranges not described
by platform firmware) and "mixed" ranges (where the parameters describe
a range that partially overlaps "System RAM").

Fix this up by explicitly tracking the "unknown" vs "mixed" resource
cases and returning REGION_INTERSECTS, REGION_MIXED, or REGION_DISJOINT.
This re-write also adds support for detecting when the requested region
completely eclipses all of a resource.  Note, the implementation treats
overlaps between "unknown" and the requested memory type as
REGION_INTERSECTS.

Finally, other memory types can be passed in by name, for now the only
usage "System RAM".

Suggested-by: Luis R. Rodriguez <mcgrof@suse.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/mm.h |  9 +++++++-
 kernel/resource.c  | 61 ++++++++++++++++++++++++++++++++----------------------
 2 files changed, 44 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e872f92dbac..84b05ebedb2d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -369,7 +369,14 @@ static inline int put_page_unless_one(struct page *page)
 }
 
 extern int page_is_ram(unsigned long pfn);
-extern int region_is_ram(resource_size_t phys_addr, unsigned long size);
+
+enum {
+	REGION_INTERSECTS,
+	REGION_DISJOINT,
+	REGION_MIXED,
+};
+
+int region_intersects(resource_size_t offset, size_t size, const char *type);
 
 /* Support for virtually mapped pages */
 struct page *vmalloc_to_page(const void *addr);
diff --git a/kernel/resource.c b/kernel/resource.c
index fed052a1bc9f..f150dbbe6f62 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
-/*
- * Search for a resouce entry that fully contains the specified region.
- * If found, return 1 if it is RAM, 0 if not.
- * If not found, or region is not fully contained, return -1
+/**
+ * region_intersects() - determine intersection of region with known resources
+ * @start: region start address
+ * @size: size of region
+ * @name: name of resource (in iomem_resource)
  *
- * Used by the ioremap functions to ensure the user is not remapping RAM and is
- * a vast speed up over walking through the resource table page by page.
+ * Check if the specified region partially overlaps or fully eclipses a
+ * resource identified by @name.  Return REGION_DISJOINT if the region
+ * does not overlap @name, return REGION_MIXED if the region overlaps
+ * @type and another resource, and return REGION_INTERSECTS if the
+ * region overlaps @type and no other defined resource. Note, that
+ * REGION_INTERSECTS is also returned in the case when the specified
+ * region overlaps RAM and undefined memory holes.
+ *
+ * region_intersect() is used by memory remapping functions to ensure
+ * the user is not remapping RAM and is a vast speed up over walking
+ * through the resource table page by page.
  */
-int region_is_ram(resource_size_t start, unsigned long size)
+int region_intersects(resource_size_t start, size_t size, const char *name)
 {
-	struct resource *p;
-	resource_size_t end = start + size - 1;
 	unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-	const char *name = "System RAM";
-	int ret = -1;
+	resource_size_t end = start + size - 1;
+	int type = 0; int other = 0;
+	struct resource *p;
 
 	read_lock(&resource_lock);
 	for (p = iomem_resource.child; p ; p = p->sibling) {
-		if (p->end < start)
-			continue;
-
-		if (p->start <= start && end <= p->end) {
-			/* resource fully contains region */
-			if ((p->flags != flags) || strcmp(p->name, name))
-				ret = 0;
-			else
-				ret = 1;
-			break;
-		}
-		if (end < p->start)
-			break;	/* not found */
+		bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+
+		if (start >= p->start && start <= p->end)
+			is_type ? type++ : other++;
+		if (end >= p->start && end <= p->end)
+			is_type ? type++ : other++;
+		if (p->start >= start && p->end <= end)
+			is_type ? type++ : other++;
 	}
 	read_unlock(&resource_lock);
-	return ret;
+
+	if (other == 0)
+		return type ? REGION_INTERSECTS : REGION_DISJOINT;
+
+	if (type)
+		return REGION_MIXED;
+
+	return REGION_DISJOINT;
 }
 
 void __weak arch_remove_reservations(struct resource *avail)
-- 
cgit v1.2.3-70-g09d2


From 2584cf83578c26db144730ef498f4070f82ee3ea Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 10 Aug 2015 23:07:05 -0400
Subject: arch, drivers: don't include <asm/io.h> directly, use <linux/io.h>
 instead

Preparation for uniform definition of ioremap, ioremap_wc, ioremap_wt,
and ioremap_cache, tree-wide.

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/arm/mach-shmobile/pm-rcar.c            | 2 +-
 arch/ia64/kernel/cyclone.c                  | 2 +-
 drivers/isdn/icn/icn.h                      | 2 +-
 drivers/mtd/devices/slram.c                 | 2 +-
 drivers/mtd/nand/diskonchip.c               | 2 +-
 drivers/mtd/onenand/generic.c               | 2 +-
 drivers/scsi/sun3x_esp.c                    | 2 +-
 drivers/staging/comedi/drivers/ii_pci20kc.c | 1 +
 drivers/tty/serial/8250/8250_core.c         | 2 +-
 drivers/video/fbdev/s1d13xxxfb.c            | 3 +--
 drivers/video/fbdev/stifb.c                 | 1 +
 include/linux/io-mapping.h                  | 2 +-
 include/linux/mtd/map.h                     | 2 +-
 include/video/vga.h                         | 2 +-
 14 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-shmobile/pm-rcar.c b/arch/arm/mach-shmobile/pm-rcar.c
index 00022ee56f80..9d3dde00c2fe 100644
--- a/arch/arm/mach-shmobile/pm-rcar.c
+++ b/arch/arm/mach-shmobile/pm-rcar.c
@@ -12,7 +12,7 @@
 #include <linux/err.h>
 #include <linux/mm.h>
 #include <linux/spinlock.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include "pm-rcar.h"
 
 /* SYSC */
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index 4826ff957a3d..5fa3848ba224 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -4,7 +4,7 @@
 #include <linux/errno.h>
 #include <linux/timex.h>
 #include <linux/clocksource.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 /* IBM Summit (EXA) Cyclone counter code*/
 #define CYCLONE_CBAR_ADDR 0xFEB00CD0
diff --git a/drivers/isdn/icn/icn.h b/drivers/isdn/icn/icn.h
index b713466997a0..f8f2e76d34bf 100644
--- a/drivers/isdn/icn/icn.h
+++ b/drivers/isdn/icn/icn.h
@@ -38,7 +38,7 @@ typedef struct icn_cdef {
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/major.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
diff --git a/drivers/mtd/devices/slram.c b/drivers/mtd/devices/slram.c
index 2fc4957cbe7f..a70eb83e68f1 100644
--- a/drivers/mtd/devices/slram.c
+++ b/drivers/mtd/devices/slram.c
@@ -41,7 +41,7 @@
 #include <linux/fs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include <linux/mtd/mtd.h>
 
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index 7da266a53979..0802158a3f75 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -24,7 +24,7 @@
 #include <linux/rslib.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
diff --git a/drivers/mtd/onenand/generic.c b/drivers/mtd/onenand/generic.c
index 32a216d31141..ab7bda0bb245 100644
--- a/drivers/mtd/onenand/generic.c
+++ b/drivers/mtd/onenand/generic.c
@@ -18,7 +18,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/onenand.h>
 #include <linux/mtd/partitions.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 /*
  * Note: Driver name and platform data format have been updated!
diff --git a/drivers/scsi/sun3x_esp.c b/drivers/scsi/sun3x_esp.c
index e26e81de7c45..d50c5ed8f428 100644
--- a/drivers/scsi/sun3x_esp.c
+++ b/drivers/scsi/sun3x_esp.c
@@ -12,9 +12,9 @@
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
 #include <linux/interrupt.h>
+#include <linux/io.h>
 
 #include <asm/sun3x.h>
-#include <asm/io.h>
 #include <asm/dma.h>
 #include <asm/dvma.h>
 
diff --git a/drivers/staging/comedi/drivers/ii_pci20kc.c b/drivers/staging/comedi/drivers/ii_pci20kc.c
index 0768bc42a5db..14ef1f67dd42 100644
--- a/drivers/staging/comedi/drivers/ii_pci20kc.c
+++ b/drivers/staging/comedi/drivers/ii_pci20kc.c
@@ -28,6 +28,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/io.h>
 #include "../comedidev.h"
 
 /*
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 37fff12dd4d0..fe902ff52e58 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -38,11 +38,11 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
+#include <linux/io.h>
 #ifdef CONFIG_SPARC
 #include <linux/sunserialcore.h>
 #endif
 
-#include <asm/io.h>
 #include <asm/irq.h>
 
 #include "8250.h"
diff --git a/drivers/video/fbdev/s1d13xxxfb.c b/drivers/video/fbdev/s1d13xxxfb.c
index 83433cb0dfba..96aa46dc696c 100644
--- a/drivers/video/fbdev/s1d13xxxfb.c
+++ b/drivers/video/fbdev/s1d13xxxfb.c
@@ -32,8 +32,7 @@
 #include <linux/spinlock_types.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
-
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include <video/s1d13xxxfb.h>
 
diff --git a/drivers/video/fbdev/stifb.c b/drivers/video/fbdev/stifb.c
index 735355b0e023..7df4228e25f0 100644
--- a/drivers/video/fbdev/stifb.c
+++ b/drivers/video/fbdev/stifb.c
@@ -64,6 +64,7 @@
 #include <linux/fb.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
+#include <linux/io.h>
 
 #include <asm/grfioctl.h>	/* for HP-UX compatibility */
 #include <asm/uaccess.h>
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index c27dde7215b5..e399029b68c5 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -21,7 +21,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/bug.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <asm/page.h>
 
 /*
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index 29975c73a953..366cf77953b5 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -27,9 +27,9 @@
 #include <linux/string.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
+#include <linux/io.h>
 
 #include <asm/unaligned.h>
-#include <asm/io.h>
 #include <asm/barrier.h>
 
 #ifdef CONFIG_MTD_MAP_BANK_WIDTH_1
diff --git a/include/video/vga.h b/include/video/vga.h
index cac567f22e62..d334e64c1c19 100644
--- a/include/video/vga.h
+++ b/include/video/vga.h
@@ -18,7 +18,7 @@
 #define __linux_video_vga_h__
 
 #include <linux/types.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <asm/vga.h>
 #include <asm/byteorder.h>
 
-- 
cgit v1.2.3-70-g09d2


From 420b54de25828c45f3fc1f12d52d9657f5e90a53 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Thu, 6 Aug 2015 13:46:24 +0100
Subject: mfd: watchdog: iTCO_wdt: Expose watchdog properties using platform
 data

Intel Sunrisepoint (Skylake PCH) has the iTCO watchdog accessible across
the SMBus, unlike previous generations of PCH/ICH where it was on the
LPC bus. Because it's on the SMBus, it doesn't make sense to pass around
a 'struct lpc_ich_info', and leaking the type of bus into the iTCO
watchdog driver is kind of backwards anyway.

This change introduces a new 'struct itco_wdt_platform_data' for use
inside the iTCO watchdog driver and by the upcoming Intel Sunrisepoint
code, which neatly avoids having to include lpc_ich headers in the i801
i2c driver.

This change is overdue because lpc_ich_info has already found its way
into other TCO watchdog users, notably the intel_pmc_ipc driver where
the watchdog actually isn't on the LPC bus as far as I can see.

A simple translation layer is provided for converting from the existing
'struct lpc_ich_info' inside the lpc_ich mfd driver.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Acked-by: Darren Hart <dvhart@linux.intel.com> [drivers/x86 refactoring]
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/lpc_ich.c                  | 32 +++++++++++++++++++++++++++++---
 drivers/platform/x86/intel_pmc_ipc.c   |  9 ++++-----
 drivers/watchdog/iTCO_wdt.c            | 11 +++++------
 include/linux/mfd/lpc_ich.h            |  6 ------
 include/linux/platform_data/itco_wdt.h | 19 +++++++++++++++++++
 5 files changed, 57 insertions(+), 20 deletions(-)
 create mode 100644 include/linux/platform_data/itco_wdt.h

(limited to 'include/linux')

diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c
index 8de34398abc0..c5a9a08b5dfb 100644
--- a/drivers/mfd/lpc_ich.c
+++ b/drivers/mfd/lpc_ich.c
@@ -66,6 +66,7 @@
 #include <linux/pci.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/lpc_ich.h>
+#include <linux/platform_data/itco_wdt.h>
 
 #define ACPIBASE		0x40
 #define ACPIBASE_GPE_OFF	0x28
@@ -835,9 +836,31 @@ static void lpc_ich_enable_pmc_space(struct pci_dev *dev)
 	priv->actrl_pbase_save = reg_save;
 }
 
-static void lpc_ich_finalize_cell(struct pci_dev *dev, struct mfd_cell *cell)
+static int lpc_ich_finalize_wdt_cell(struct pci_dev *dev)
 {
+	struct itco_wdt_platform_data *pdata;
 	struct lpc_ich_priv *priv = pci_get_drvdata(dev);
+	struct lpc_ich_info *info;
+	struct mfd_cell *cell = &lpc_ich_cells[LPC_WDT];
+
+	pdata = devm_kzalloc(&dev->dev, sizeof(*pdata), GFP_KERNEL);
+	if (!pdata)
+		return -ENOMEM;
+
+	info = &lpc_chipset_info[priv->chipset];
+
+	pdata->version = info->iTCO_version;
+	strlcpy(pdata->name, info->name, sizeof(pdata->name));
+
+	cell->platform_data = pdata;
+	cell->pdata_size = sizeof(*pdata);
+	return 0;
+}
+
+static void lpc_ich_finalize_gpio_cell(struct pci_dev *dev)
+{
+	struct lpc_ich_priv *priv = pci_get_drvdata(dev);
+	struct mfd_cell *cell = &lpc_ich_cells[LPC_GPIO];
 
 	cell->platform_data = &lpc_chipset_info[priv->chipset];
 	cell->pdata_size = sizeof(struct lpc_ich_info);
@@ -933,7 +956,7 @@ gpe0_done:
 	lpc_chipset_info[priv->chipset].use_gpio = ret;
 	lpc_ich_enable_gpio_space(dev);
 
-	lpc_ich_finalize_cell(dev, &lpc_ich_cells[LPC_GPIO]);
+	lpc_ich_finalize_gpio_cell(dev);
 	ret = mfd_add_devices(&dev->dev, PLATFORM_DEVID_AUTO,
 			      &lpc_ich_cells[LPC_GPIO], 1, NULL, 0, NULL);
 
@@ -1007,7 +1030,10 @@ static int lpc_ich_init_wdt(struct pci_dev *dev)
 		res->end = base_addr + ACPIBASE_PMC_END;
 	}
 
-	lpc_ich_finalize_cell(dev, &lpc_ich_cells[LPC_WDT]);
+	ret = lpc_ich_finalize_wdt_cell(dev);
+	if (ret)
+		goto wdt_done;
+
 	ret = mfd_add_devices(&dev->dev, PLATFORM_DEVID_AUTO,
 			      &lpc_ich_cells[LPC_WDT], 1, NULL, 0, NULL);
 
diff --git a/drivers/platform/x86/intel_pmc_ipc.c b/drivers/platform/x86/intel_pmc_ipc.c
index d734763dab69..fbd1cc7d5de3 100644
--- a/drivers/platform/x86/intel_pmc_ipc.c
+++ b/drivers/platform/x86/intel_pmc_ipc.c
@@ -33,7 +33,7 @@
 #include <linux/suspend.h>
 #include <linux/acpi.h>
 #include <asm/intel_pmc_ipc.h>
-#include <linux/mfd/lpc_ich.h>
+#include <linux/platform_data/itco_wdt.h>
 
 /*
  * IPC registers
@@ -460,9 +460,9 @@ static struct resource tco_res[] = {
 	},
 };
 
-static struct lpc_ich_info tco_info = {
+static struct itco_wdt_platform_data tco_info = {
 	.name = "Apollo Lake SoC",
-	.iTCO_version = 3,
+	.version = 3,
 };
 
 static int ipc_create_punit_device(void)
@@ -539,8 +539,7 @@ static int ipc_create_tco_device(void)
 		goto err;
 	}
 
-	ret = platform_device_add_data(pdev, &tco_info,
-				       sizeof(struct lpc_ich_info));
+	ret = platform_device_add_data(pdev, &tco_info, sizeof(tco_info));
 	if (ret) {
 		dev_err(ipcdev.dev, "Failed to add tco platform data\n");
 		goto err;
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 3c3fd417ddeb..a94401b2deca 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -66,8 +66,7 @@
 #include <linux/spinlock.h>		/* For spin_lock/spin_unlock/... */
 #include <linux/uaccess.h>		/* For copy_to_user/put_user/... */
 #include <linux/io.h>			/* For inb/outb/... */
-#include <linux/mfd/core.h>
-#include <linux/mfd/lpc_ich.h>
+#include <linux/platform_data/itco_wdt.h>
 
 #include "iTCO_vendor.h"
 
@@ -418,9 +417,9 @@ static int iTCO_wdt_probe(struct platform_device *dev)
 {
 	int ret = -ENODEV;
 	unsigned long val32;
-	struct lpc_ich_info *ich_info = dev_get_platdata(&dev->dev);
+	struct itco_wdt_platform_data *pdata = dev_get_platdata(&dev->dev);
 
-	if (!ich_info)
+	if (!pdata)
 		goto out;
 
 	spin_lock_init(&iTCO_wdt_private.io_lock);
@@ -435,7 +434,7 @@ static int iTCO_wdt_probe(struct platform_device *dev)
 	if (!iTCO_wdt_private.smi_res)
 		goto out;
 
-	iTCO_wdt_private.iTCO_version = ich_info->iTCO_version;
+	iTCO_wdt_private.iTCO_version = pdata->version;
 	iTCO_wdt_private.dev = dev;
 	iTCO_wdt_private.pdev = to_pci_dev(dev->dev.parent);
 
@@ -501,7 +500,7 @@ static int iTCO_wdt_probe(struct platform_device *dev)
 	}
 
 	pr_info("Found a %s TCO device (Version=%d, TCOBASE=0x%04llx)\n",
-		ich_info->name, ich_info->iTCO_version, (u64)TCOBASE);
+		pdata->name, pdata->version, (u64)TCOBASE);
 
 	/* Clear out the (probably old) status */
 	if (iTCO_wdt_private.iTCO_version == 3) {
diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
index 8feac782fa83..2b300b44f994 100644
--- a/include/linux/mfd/lpc_ich.h
+++ b/include/linux/mfd/lpc_ich.h
@@ -20,12 +20,6 @@
 #ifndef LPC_ICH_H
 #define LPC_ICH_H
 
-/* Watchdog resources */
-#define ICH_RES_IO_TCO		0
-#define ICH_RES_IO_SMI		1
-#define ICH_RES_MEM_OFF		2
-#define ICH_RES_MEM_GCS_PMC	0
-
 /* GPIO resources */
 #define ICH_RES_GPIO	0
 #define ICH_RES_GPE0	1
diff --git a/include/linux/platform_data/itco_wdt.h b/include/linux/platform_data/itco_wdt.h
new file mode 100644
index 000000000000..f16542c77ff7
--- /dev/null
+++ b/include/linux/platform_data/itco_wdt.h
@@ -0,0 +1,19 @@
+/*
+ * Platform data for the Intel TCO Watchdog
+ */
+
+#ifndef _ITCO_WDT_H_
+#define _ITCO_WDT_H_
+
+/* Watchdog resources */
+#define ICH_RES_IO_TCO		0
+#define ICH_RES_IO_SMI		1
+#define ICH_RES_MEM_OFF		2
+#define ICH_RES_MEM_GCS_PMC	0
+
+struct itco_wdt_platform_data {
+	char name[32];
+	unsigned int version;
+};
+
+#endif /* _ITCO_WDT_H_ */
-- 
cgit v1.2.3-70-g09d2


From 00b6e9ff01a8f41816060b2f70752f66f100fecd Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Date: Fri, 19 Jun 2015 11:24:27 +0100
Subject: mfd: arizona: Update several pdata members to unsigned

Device tree and ACPI primarily deal with unsigned ints, many of the
pdata members in the Arizona driver are signed ints but are only ever
assigned positive values. Changing these pdata fields to unsigned ints
avoids us having to choose between overly verbose code and Sparse
warnings.

Signed-off-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/arizona/pdata.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h
index 43db4faad143..dc3c81252de7 100644
--- a/include/linux/mfd/arizona/pdata.h
+++ b/include/linux/mfd/arizona/pdata.h
@@ -101,7 +101,7 @@ struct arizona_pdata {
 	 * useful for systems where and I2S bus with multiple data
 	 * lines is mastered.
 	 */
-	int max_channels_clocked[ARIZONA_MAX_AIF];
+	unsigned int max_channels_clocked[ARIZONA_MAX_AIF];
 
 	/** GPIO5 is used for jack detection */
 	bool jd_gpio5;
@@ -125,22 +125,22 @@ struct arizona_pdata {
 	unsigned int hpdet_channel;
 
 	/** Extra debounce timeout used during initial mic detection (ms) */
-	int micd_detect_debounce;
+	unsigned int micd_detect_debounce;
 
 	/** GPIO for mic detection polarity */
 	int micd_pol_gpio;
 
 	/** Mic detect ramp rate */
-	int micd_bias_start_time;
+	unsigned int micd_bias_start_time;
 
 	/** Mic detect sample rate */
-	int micd_rate;
+	unsigned int micd_rate;
 
 	/** Mic detect debounce level */
-	int micd_dbtime;
+	unsigned int micd_dbtime;
 
 	/** Mic detect timeout (ms) */
-	int micd_timeout;
+	unsigned int micd_timeout;
 
 	/** Force MICBIAS on for mic detect */
 	bool micd_force_micbias;
-- 
cgit v1.2.3-70-g09d2


From bc00d68f2f209dd7ad01f64c3bdf67e608c363f1 Mon Sep 17 00:00:00 2001
From: Vaibhav Hiremath <vaibhav.hiremath@linaro.org>
Date: Fri, 26 Jun 2015 18:38:11 +0530
Subject: mfd: 880m80x: Make use of BIT() macro

Instead of hard coding the shift for bit definition, use
BIT() macro.

Signed-off-by: Vaibhav Hiremath <vaibhav.hiremath@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/88pm80x.h | 162 ++++++++++++++++++++++----------------------
 1 file changed, 81 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/88pm80x.h b/include/linux/mfd/88pm80x.h
index 97cb283cc8e1..8fcad63fab55 100644
--- a/include/linux/mfd/88pm80x.h
+++ b/include/linux/mfd/88pm80x.h
@@ -60,60 +60,60 @@ enum {
 /* page 0 basic: slave adder 0x60 */
 
 #define PM800_STATUS_1			(0x01)
-#define PM800_ONKEY_STS1		(1 << 0)
-#define PM800_EXTON_STS1		(1 << 1)
-#define PM800_CHG_STS1			(1 << 2)
-#define PM800_BAT_STS1			(1 << 3)
-#define PM800_VBUS_STS1			(1 << 4)
-#define PM800_LDO_PGOOD_STS1	(1 << 5)
-#define PM800_BUCK_PGOOD_STS1	(1 << 6)
+#define PM800_ONKEY_STS1		BIT(0)
+#define PM800_EXTON_STS1		BIT(1)
+#define PM800_CHG_STS1			BIT(2)
+#define PM800_BAT_STS1			BIT(3)
+#define PM800_VBUS_STS1			BIT(4)
+#define PM800_LDO_PGOOD_STS1		BIT(5)
+#define PM800_BUCK_PGOOD_STS1		BIT(6)
 
 #define PM800_STATUS_2			(0x02)
-#define PM800_RTC_ALARM_STS2	(1 << 0)
+#define PM800_RTC_ALARM_STS2		BIT(0)
 
 /* Wakeup Registers */
-#define PM800_WAKEUP1		(0x0D)
+#define PM800_WAKEUP1			(0x0D)
 
-#define PM800_WAKEUP2		(0x0E)
-#define PM800_WAKEUP2_INV_INT		(1 << 0)
-#define PM800_WAKEUP2_INT_CLEAR		(1 << 1)
-#define PM800_WAKEUP2_INT_MASK		(1 << 2)
+#define PM800_WAKEUP2			(0x0E)
+#define PM800_WAKEUP2_INV_INT		BIT(0)
+#define PM800_WAKEUP2_INT_CLEAR		BIT(1)
+#define PM800_WAKEUP2_INT_MASK		BIT(2)
 
-#define PM800_POWER_UP_LOG	(0x10)
+#define PM800_POWER_UP_LOG		(0x10)
 
 /* Referance and low power registers */
 #define PM800_LOW_POWER1		(0x20)
 #define PM800_LOW_POWER2		(0x21)
-#define PM800_LOW_POWER_CONFIG3	(0x22)
-#define PM800_LOW_POWER_CONFIG4	(0x23)
+#define PM800_LOW_POWER_CONFIG3		(0x22)
+#define PM800_LOW_POWER_CONFIG4		(0x23)
 
 /* GPIO register */
 #define PM800_GPIO_0_1_CNTRL		(0x30)
-#define PM800_GPIO0_VAL				(1 << 0)
+#define PM800_GPIO0_VAL			BIT(0)
 #define PM800_GPIO0_GPIO_MODE(x)	(x << 1)
-#define PM800_GPIO1_VAL				(1 << 4)
+#define PM800_GPIO1_VAL			BIT(4)
 #define PM800_GPIO1_GPIO_MODE(x)	(x << 5)
 
 #define PM800_GPIO_2_3_CNTRL		(0x31)
-#define PM800_GPIO2_VAL				(1 << 0)
+#define PM800_GPIO2_VAL			BIT(0)
 #define PM800_GPIO2_GPIO_MODE(x)	(x << 1)
-#define PM800_GPIO3_VAL				(1 << 4)
+#define PM800_GPIO3_VAL			BIT(4)
 #define PM800_GPIO3_GPIO_MODE(x)	(x << 5)
 #define PM800_GPIO3_MODE_MASK		0x1F
 #define PM800_GPIO3_HEADSET_MODE	PM800_GPIO3_GPIO_MODE(6)
 
-#define PM800_GPIO_4_CNTRL			(0x32)
-#define PM800_GPIO4_VAL				(1 << 0)
+#define PM800_GPIO_4_CNTRL		(0x32)
+#define PM800_GPIO4_VAL			BIT(0)
 #define PM800_GPIO4_GPIO_MODE(x)	(x << 1)
 
 #define PM800_HEADSET_CNTRL		(0x38)
-#define PM800_HEADSET_DET_EN		(1 << 7)
-#define PM800_HSDET_SLP			(1 << 1)
+#define PM800_HEADSET_DET_EN		BIT(7)
+#define PM800_HSDET_SLP			BIT(1)
 /* PWM register */
-#define PM800_PWM1		(0x40)
-#define PM800_PWM2		(0x41)
-#define PM800_PWM3		(0x42)
-#define PM800_PWM4		(0x43)
+#define PM800_PWM1			(0x40)
+#define PM800_PWM2			(0x41)
+#define PM800_PWM3			(0x42)
+#define PM800_PWM4			(0x43)
 
 /* RTC Registers */
 #define PM800_RTC_CONTROL		(0xD0)
@@ -123,55 +123,55 @@ enum {
 #define PM800_RTC_MISC4			(0xE4)
 #define PM800_RTC_MISC5			(0xE7)
 /* bit definitions of RTC Register 1 (0xD0) */
-#define PM800_ALARM1_EN			(1 << 0)
-#define PM800_ALARM_WAKEUP		(1 << 4)
-#define PM800_ALARM			(1 << 5)
-#define PM800_RTC1_USE_XO		(1 << 7)
+#define PM800_ALARM1_EN			BIT(0)
+#define PM800_ALARM_WAKEUP		BIT(4)
+#define PM800_ALARM			BIT(5)
+#define PM800_RTC1_USE_XO		BIT(7)
 
 /* Regulator Control Registers: BUCK1,BUCK5,LDO1 have DVC */
 
 /* buck registers */
-#define PM800_SLEEP_BUCK1	(0x30)
+#define PM800_SLEEP_BUCK1		(0x30)
 
 /* BUCK Sleep Mode Register 1: BUCK[1..4] */
-#define PM800_BUCK_SLP1		(0x5A)
-#define PM800_BUCK1_SLP1_SHIFT	0
-#define PM800_BUCK1_SLP1_MASK	(0x3 << PM800_BUCK1_SLP1_SHIFT)
+#define PM800_BUCK_SLP1			(0x5A)
+#define PM800_BUCK1_SLP1_SHIFT		0
+#define PM800_BUCK1_SLP1_MASK		(0x3 << PM800_BUCK1_SLP1_SHIFT)
 
 /* page 2 GPADC: slave adder 0x02 */
 #define PM800_GPADC_MEAS_EN1		(0x01)
-#define PM800_MEAS_EN1_VBAT         (1 << 2)
+#define PM800_MEAS_EN1_VBAT		BIT(2)
 #define PM800_GPADC_MEAS_EN2		(0x02)
-#define PM800_MEAS_EN2_RFTMP        (1 << 0)
-#define PM800_MEAS_GP0_EN			(1 << 2)
-#define PM800_MEAS_GP1_EN			(1 << 3)
-#define PM800_MEAS_GP2_EN			(1 << 4)
-#define PM800_MEAS_GP3_EN			(1 << 5)
-#define PM800_MEAS_GP4_EN			(1 << 6)
+#define PM800_MEAS_EN2_RFTMP		BIT(0)
+#define PM800_MEAS_GP0_EN		BIT(2)
+#define PM800_MEAS_GP1_EN		BIT(3)
+#define PM800_MEAS_GP2_EN		BIT(4)
+#define PM800_MEAS_GP3_EN		BIT(5)
+#define PM800_MEAS_GP4_EN		BIT(6)
 
 #define PM800_GPADC_MISC_CONFIG1	(0x05)
 #define PM800_GPADC_MISC_CONFIG2	(0x06)
-#define PM800_GPADC_MISC_GPFSM_EN	(1 << 0)
+#define PM800_GPADC_MISC_GPFSM_EN	BIT(0)
 #define PM800_GPADC_SLOW_MODE(x)	(x << 3)
 
-#define PM800_GPADC_MISC_CONFIG3		(0x09)
-#define PM800_GPADC_MISC_CONFIG4		(0x0A)
+#define PM800_GPADC_MISC_CONFIG3	(0x09)
+#define PM800_GPADC_MISC_CONFIG4	(0x0A)
 
-#define PM800_GPADC_PREBIAS1			(0x0F)
+#define PM800_GPADC_PREBIAS1		(0x0F)
 #define PM800_GPADC0_GP_PREBIAS_TIME(x)	(x << 0)
-#define PM800_GPADC_PREBIAS2			(0x10)
+#define PM800_GPADC_PREBIAS2		(0x10)
 
-#define PM800_GP_BIAS_ENA1				(0x14)
-#define PM800_GPADC_GP_BIAS_EN0			(1 << 0)
-#define PM800_GPADC_GP_BIAS_EN1			(1 << 1)
-#define PM800_GPADC_GP_BIAS_EN2			(1 << 2)
-#define PM800_GPADC_GP_BIAS_EN3			(1 << 3)
+#define PM800_GP_BIAS_ENA1		(0x14)
+#define PM800_GPADC_GP_BIAS_EN0		BIT(0)
+#define PM800_GPADC_GP_BIAS_EN1		BIT(1)
+#define PM800_GPADC_GP_BIAS_EN2		BIT(2)
+#define PM800_GPADC_GP_BIAS_EN3		BIT(3)
 
 #define PM800_GP_BIAS_OUT1		(0x15)
-#define PM800_BIAS_OUT_GP0		(1 << 0)
-#define PM800_BIAS_OUT_GP1		(1 << 1)
-#define PM800_BIAS_OUT_GP2		(1 << 2)
-#define PM800_BIAS_OUT_GP3		(1 << 3)
+#define PM800_BIAS_OUT_GP0		BIT(0)
+#define PM800_BIAS_OUT_GP1		BIT(1)
+#define PM800_BIAS_OUT_GP2		BIT(2)
+#define PM800_BIAS_OUT_GP3		BIT(3)
 
 #define PM800_GPADC0_LOW_TH		0x20
 #define PM800_GPADC1_LOW_TH		0x21
@@ -222,37 +222,37 @@ enum {
 
 #define PM805_INT_STATUS1		(0x03)
 
-#define PM805_INT1_HP1_SHRT		(1 << 0)
-#define PM805_INT1_HP2_SHRT		(1 << 1)
-#define PM805_INT1_MIC_CONFLICT		(1 << 2)
-#define PM805_INT1_CLIP_FAULT		(1 << 3)
-#define PM805_INT1_LDO_OFF			(1 << 4)
-#define PM805_INT1_SRC_DPLL_LOCK	(1 << 5)
+#define PM805_INT1_HP1_SHRT		BIT(0)
+#define PM805_INT1_HP2_SHRT		BIT(1)
+#define PM805_INT1_MIC_CONFLICT		BIT(2)
+#define PM805_INT1_CLIP_FAULT		BIT(3)
+#define PM805_INT1_LDO_OFF		BIT(4)
+#define PM805_INT1_SRC_DPLL_LOCK	BIT(5)
 
 #define PM805_INT_STATUS2		(0x04)
 
-#define PM805_INT2_MIC_DET			(1 << 0)
-#define PM805_INT2_SHRT_BTN_DET		(1 << 1)
-#define PM805_INT2_VOLM_BTN_DET		(1 << 2)
-#define PM805_INT2_VOLP_BTN_DET		(1 << 3)
-#define PM805_INT2_RAW_PLL_FAULT	(1 << 4)
-#define PM805_INT2_FINE_PLL_FAULT	(1 << 5)
+#define PM805_INT2_MIC_DET		BIT(0)
+#define PM805_INT2_SHRT_BTN_DET		BIT(1)
+#define PM805_INT2_VOLM_BTN_DET		BIT(2)
+#define PM805_INT2_VOLP_BTN_DET		BIT(3)
+#define PM805_INT2_RAW_PLL_FAULT	BIT(4)
+#define PM805_INT2_FINE_PLL_FAULT	BIT(5)
 
 #define PM805_INT_MASK1			(0x05)
 #define PM805_INT_MASK2			(0x06)
-#define PM805_SHRT_BTN_DET		(1 << 1)
+#define PM805_SHRT_BTN_DET		BIT(1)
 
 /* number of status and int reg in a row */
 #define PM805_INT_REG_NUM		(2)
 
 #define PM805_MIC_DET1			(0x07)
-#define PM805_MIC_DET_EN_MIC_DET (1 << 0)
+#define PM805_MIC_DET_EN_MIC_DET	BIT(0)
 #define PM805_MIC_DET2			(0x08)
-#define PM805_MIC_DET_STATUS1	(0x09)
+#define PM805_MIC_DET_STATUS1		(0x09)
 
-#define PM805_MIC_DET_STATUS3	(0x0A)
-#define PM805_AUTO_SEQ_STATUS1	(0x0B)
-#define PM805_AUTO_SEQ_STATUS2	(0x0C)
+#define PM805_MIC_DET_STATUS3		(0x0A)
+#define PM805_AUTO_SEQ_STATUS1		(0x0B)
+#define PM805_AUTO_SEQ_STATUS2		(0x0C)
 
 #define PM805_ADC_SETTING1		(0x10)
 #define PM805_ADC_SETTING2		(0x11)
@@ -261,7 +261,7 @@ enum {
 #define PM805_ADC_GAIN2			(0x13)
 #define PM805_DMIC_SETTING		(0x15)
 #define PM805_DWS_SETTING		(0x16)
-#define PM805_MIC_CONFLICT_STS	(0x17)
+#define PM805_MIC_CONFLICT_STS		(0x17)
 
 #define PM805_PDM_SETTING1		(0x20)
 #define PM805_PDM_SETTING2		(0x21)
@@ -270,11 +270,11 @@ enum {
 #define PM805_PDM_CONTROL2		(0x24)
 #define PM805_PDM_CONTROL3		(0x25)
 
-#define PM805_HEADPHONE_SETTING			(0x26)
-#define PM805_HEADPHONE_GAIN_A2A		(0x27)
-#define PM805_HEADPHONE_SHORT_STATE		(0x28)
-#define PM805_EARPHONE_SETTING			(0x29)
-#define PM805_AUTO_SEQ_SETTING			(0x2A)
+#define PM805_HEADPHONE_SETTING		(0x26)
+#define PM805_HEADPHONE_GAIN_A2A	(0x27)
+#define PM805_HEADPHONE_SHORT_STATE	(0x28)
+#define PM805_EARPHONE_SETTING		(0x29)
+#define PM805_AUTO_SEQ_SETTING		(0x2A)
 
 struct pm80x_rtc_pdata {
 	int		vrtc;
-- 
cgit v1.2.3-70-g09d2


From 6887b042c52ee05a405bae859f410c2f63b45339 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
Date: Fri, 3 Jul 2015 16:16:35 +0100
Subject: mfd: arizona: Add support for WM8998 and WM1814

Signed-off-by: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig                   |    6 +
 drivers/mfd/Makefile                  |    3 +
 drivers/mfd/arizona-core.c            |  105 ++-
 drivers/mfd/arizona-i2c.c             |    8 +
 drivers/mfd/arizona-irq.c             |    9 +
 drivers/mfd/arizona.h                 |    5 +
 drivers/mfd/wm8998-tables.c           | 1592 +++++++++++++++++++++++++++++++++
 include/linux/mfd/arizona/core.h      |    3 +
 include/linux/mfd/arizona/pdata.h     |    2 +
 include/linux/mfd/arizona/registers.h |  220 +++++
 10 files changed, 1945 insertions(+), 8 deletions(-)
 create mode 100644 drivers/mfd/wm8998-tables.c

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 076f593f90d3..5f746ef84418 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1379,6 +1379,12 @@ config MFD_WM8997
 	help
 	  Support for Wolfson Microelectronics WM8997 low power audio SoC
 
+config MFD_WM8998
+	bool "Wolfson Microelectronics WM8998"
+	depends on MFD_ARIZONA
+	help
+	  Support for Wolfson Microelectronics WM8998 low power audio SoC
+
 config MFD_WM8400
 	bool "Wolfson Microelectronics WM8400"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 9d730a2d1878..530620aa2f70 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -48,6 +48,9 @@ endif
 ifeq ($(CONFIG_MFD_WM8997),y)
 obj-$(CONFIG_MFD_ARIZONA)	+= wm8997-tables.o
 endif
+ifeq ($(CONFIG_MFD_WM8998),y)
+obj-$(CONFIG_MFD_ARIZONA)	+= wm8998-tables.o
+endif
 obj-$(CONFIG_MFD_WM8400)	+= wm8400-core.o
 wm831x-objs			:= wm831x-core.o wm831x-irq.o wm831x-otp.o
 wm831x-objs			+= wm831x-auxadc.o
diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c
index e60bcd901d02..bc814b023c05 100644
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -146,17 +146,31 @@ static irqreturn_t arizona_underclocked(int irq, void *data)
 static irqreturn_t arizona_overclocked(int irq, void *data)
 {
 	struct arizona *arizona = data;
-	unsigned int val[2];
+	unsigned int val[3];
 	int ret;
 	
 	ret = regmap_bulk_read(arizona->regmap, ARIZONA_INTERRUPT_RAW_STATUS_6,
-			       &val[0], 2);
+			       &val[0], 3);
 	if (ret != 0) {
 		dev_err(arizona->dev, "Failed to read overclock status: %d\n",
 			ret);
 		return IRQ_NONE;
 	}
 
+	switch (arizona->type) {
+	case WM8998:
+	case WM1814:
+		/* Some bits are shifted on WM8998,
+		 * rearrange to match the standard bit layout
+		 */
+		val[0] = ((val[0] & 0x60e0) >> 1) |
+			 ((val[0] & 0x1e00) >> 2) |
+			 (val[0] & 0x000f);
+		break;
+	default:
+		break;
+	}
+
 	if (val[0] & ARIZONA_PWM_OVERCLOCKED_STS)
 		dev_err(arizona->dev, "PWM overclocked\n");
 	if (val[0] & ARIZONA_FX_CORE_OVERCLOCKED_STS)
@@ -201,6 +215,9 @@ static irqreturn_t arizona_overclocked(int irq, void *data)
 	if (val[1] & ARIZONA_ISRC1_OVERCLOCKED_STS)
 		dev_err(arizona->dev, "ISRC1 overclocked\n");
 
+	if (val[2] & ARIZONA_SPDIF_OVERCLOCKED_STS)
+		dev_err(arizona->dev, "SPDIF overclocked\n");
+
 	return IRQ_HANDLED;
 }
 
@@ -806,6 +823,8 @@ const struct of_device_id arizona_of_match[] = {
 	{ .compatible = "wlf,wm5110", .data = (void *)WM5110 },
 	{ .compatible = "wlf,wm8280", .data = (void *)WM8280 },
 	{ .compatible = "wlf,wm8997", .data = (void *)WM8997 },
+	{ .compatible = "wlf,wm8998", .data = (void *)WM8998 },
+	{ .compatible = "wlf,wm1814", .data = (void *)WM1814 },
 	{},
 };
 EXPORT_SYMBOL_GPL(arizona_of_match);
@@ -887,11 +906,28 @@ static const struct mfd_cell wm8997_devs[] = {
 	},
 };
 
+static const struct mfd_cell wm8998_devs[] = {
+	{
+		.name = "arizona-extcon",
+		.parent_supplies = wm5102_supplies,
+		.num_parent_supplies = 1, /* We only need MICVDD */
+	},
+	{ .name = "arizona-gpio" },
+	{ .name = "arizona-haptics" },
+	{ .name = "arizona-pwm" },
+	{
+		.name = "wm8998-codec",
+		.parent_supplies = wm5102_supplies,
+		.num_parent_supplies = ARRAY_SIZE(wm5102_supplies),
+	},
+	{ .name = "arizona-micsupp" },
+};
+
 int arizona_dev_init(struct arizona *arizona)
 {
 	struct device *dev = arizona->dev;
 	const char *type_name;
-	unsigned int reg, val;
+	unsigned int reg, val, mask;
 	int (*apply_patch)(struct arizona *) = NULL;
 	int ret, i;
 
@@ -911,6 +947,8 @@ int arizona_dev_init(struct arizona *arizona)
 	case WM5110:
 	case WM8280:
 	case WM8997:
+	case WM8998:
+	case WM1814:
 		for (i = 0; i < ARRAY_SIZE(wm5102_core_supplies); i++)
 			arizona->core_supplies[i].supply
 				= wm5102_core_supplies[i];
@@ -992,6 +1030,7 @@ int arizona_dev_init(struct arizona *arizona)
 	switch (reg) {
 	case 0x5102:
 	case 0x5110:
+	case 0x6349:
 	case 0x8997:
 		break;
 	default:
@@ -1092,6 +1131,27 @@ int arizona_dev_init(struct arizona *arizona)
 		}
 		apply_patch = wm8997_patch;
 		break;
+#endif
+#ifdef CONFIG_MFD_WM8998
+	case 0x6349:
+		switch (arizona->type) {
+		case WM8998:
+			type_name = "WM8998";
+			break;
+
+		case WM1814:
+			type_name = "WM1814";
+			break;
+
+		default:
+			type_name = "WM8998";
+			dev_err(arizona->dev, "WM8998 registered as %d\n",
+				arizona->type);
+			arizona->type = WM8998;
+		}
+
+		apply_patch = wm8998_patch;
+		break;
 #endif
 	default:
 		dev_err(arizona->dev, "Unknown device ID %x\n", reg);
@@ -1208,14 +1268,38 @@ int arizona_dev_init(struct arizona *arizona)
 			<< ARIZONA_IN1_DMIC_SUP_SHIFT;
 		if (arizona->pdata.inmode[i] & ARIZONA_INMODE_DMIC)
 			val |= 1 << ARIZONA_IN1_MODE_SHIFT;
-		if (arizona->pdata.inmode[i] & ARIZONA_INMODE_SE)
-			val |= 1 << ARIZONA_IN1_SINGLE_ENDED_SHIFT;
+
+		switch (arizona->type) {
+		case WM8998:
+		case WM1814:
+			regmap_update_bits(arizona->regmap,
+				ARIZONA_ADC_DIGITAL_VOLUME_1L + (i * 8),
+				ARIZONA_IN1L_SRC_SE_MASK,
+				(arizona->pdata.inmode[i] & ARIZONA_INMODE_SE)
+					<< ARIZONA_IN1L_SRC_SE_SHIFT);
+
+			regmap_update_bits(arizona->regmap,
+				ARIZONA_ADC_DIGITAL_VOLUME_1R + (i * 8),
+				ARIZONA_IN1R_SRC_SE_MASK,
+				(arizona->pdata.inmode[i] & ARIZONA_INMODE_SE)
+					<< ARIZONA_IN1R_SRC_SE_SHIFT);
+
+			mask = ARIZONA_IN1_DMIC_SUP_MASK |
+				ARIZONA_IN1_MODE_MASK;
+			break;
+		default:
+			if (arizona->pdata.inmode[i] & ARIZONA_INMODE_SE)
+				val |= 1 << ARIZONA_IN1_SINGLE_ENDED_SHIFT;
+
+			mask = ARIZONA_IN1_DMIC_SUP_MASK |
+				ARIZONA_IN1_MODE_MASK |
+				ARIZONA_IN1_SINGLE_ENDED_MASK;
+			break;
+		}
 
 		regmap_update_bits(arizona->regmap,
 				   ARIZONA_IN1L_CONTROL + (i * 8),
-				   ARIZONA_IN1_DMIC_SUP_MASK |
-				   ARIZONA_IN1_MODE_MASK |
-				   ARIZONA_IN1_SINGLE_ENDED_MASK, val);
+				   mask, val);
 	}
 
 	for (i = 0; i < ARIZONA_MAX_OUTPUT; i++) {
@@ -1271,6 +1355,11 @@ int arizona_dev_init(struct arizona *arizona)
 		ret = mfd_add_devices(arizona->dev, -1, wm8997_devs,
 				      ARRAY_SIZE(wm8997_devs), NULL, 0, NULL);
 		break;
+	case WM8998:
+	case WM1814:
+		ret = mfd_add_devices(arizona->dev, -1, wm8998_devs,
+				      ARRAY_SIZE(wm8998_devs), NULL, 0, NULL);
+		break;
 	}
 
 	if (ret != 0) {
diff --git a/drivers/mfd/arizona-i2c.c b/drivers/mfd/arizona-i2c.c
index ff782a5de235..920276f53326 100644
--- a/drivers/mfd/arizona-i2c.c
+++ b/drivers/mfd/arizona-i2c.c
@@ -52,6 +52,12 @@ static int arizona_i2c_probe(struct i2c_client *i2c,
 	case WM8997:
 		regmap_config = &wm8997_i2c_regmap;
 		break;
+#endif
+#ifdef CONFIG_MFD_WM8998
+	case WM8998:
+	case WM1814:
+		regmap_config = &wm8998_i2c_regmap;
+		break;
 #endif
 	default:
 		dev_err(&i2c->dev, "Unknown device type %ld\n",
@@ -90,6 +96,8 @@ static const struct i2c_device_id arizona_i2c_id[] = {
 	{ "wm5110", WM5110 },
 	{ "wm8280", WM8280 },
 	{ "wm8997", WM8997 },
+	{ "wm8998", WM8998 },
+	{ "wm1814", WM1814 },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, arizona_i2c_id);
diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c
index 2b9965d53e4e..bb0063db6c4e 100644
--- a/drivers/mfd/arizona-irq.c
+++ b/drivers/mfd/arizona-irq.c
@@ -234,6 +234,15 @@ int arizona_irq_init(struct arizona *arizona)
 		arizona->ctrlif_error = false;
 		break;
 #endif
+#ifdef CONFIG_MFD_WM8998
+	case WM8998:
+	case WM1814:
+		aod = &wm8998_aod;
+		irq = &wm8998_irq;
+
+		arizona->ctrlif_error = false;
+		break;
+#endif
 	default:
 		BUG_ON("Unknown Arizona class device" == NULL);
 		return -EINVAL;
diff --git a/drivers/mfd/arizona.h b/drivers/mfd/arizona.h
index fbe2843271c5..3af12e938f57 100644
--- a/drivers/mfd/arizona.h
+++ b/drivers/mfd/arizona.h
@@ -27,6 +27,8 @@ extern const struct regmap_config wm5110_spi_regmap;
 
 extern const struct regmap_config wm8997_i2c_regmap;
 
+extern const struct regmap_config wm8998_i2c_regmap;
+
 extern const struct dev_pm_ops arizona_pm_ops;
 
 extern const struct of_device_id arizona_of_match[];
@@ -41,6 +43,9 @@ extern const struct regmap_irq_chip wm5110_revd_irq;
 extern const struct regmap_irq_chip wm8997_aod;
 extern const struct regmap_irq_chip wm8997_irq;
 
+extern struct regmap_irq_chip wm8998_aod;
+extern struct regmap_irq_chip wm8998_irq;
+
 int arizona_dev_init(struct arizona *arizona);
 int arizona_dev_exit(struct arizona *arizona);
 int arizona_irq_init(struct arizona *arizona);
diff --git a/drivers/mfd/wm8998-tables.c b/drivers/mfd/wm8998-tables.c
new file mode 100644
index 000000000000..60e8622df962
--- /dev/null
+++ b/drivers/mfd/wm8998-tables.c
@@ -0,0 +1,1592 @@
+/*
+ * wm8998-tables.c  --  data tables for wm8998-class codecs
+ *
+ * Copyright 2014 Wolfson Microelectronics plc
+ *
+ * Author: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+
+#include <linux/mfd/arizona/core.h>
+#include <linux/mfd/arizona/registers.h>
+#include <linux/device.h>
+
+#include "arizona.h"
+
+#define WM8998_NUM_AOD_ISR 2
+#define WM8998_NUM_ISR 5
+
+static const struct reg_default wm8998_rev_a_patch[] = {
+	{ 0x0212, 0x0000 },
+	{ 0x0211, 0x0014 },
+	{ 0x04E4, 0x0E0D },
+	{ 0x04E5, 0x0E0D },
+	{ 0x04E6, 0x0E0D },
+	{ 0x04EB, 0x060E },
+	{ 0x0441, 0xC759 },
+	{ 0x0442, 0x2A08 },
+	{ 0x0443, 0x5CFA },
+	{ 0x026E, 0x0064 },
+	{ 0x026F, 0x00EA },
+	{ 0x0270, 0x1F16 },
+	{ 0x0410, 0x2080 },
+	{ 0x0418, 0x2080 },
+	{ 0x0420, 0x2080 },
+	{ 0x04B8, 0x1120 },
+	{ 0x047E, 0x080E },
+	{ 0x0448, 0x03EF },
+};
+
+/* We use a function so we can use ARRAY_SIZE() */
+int wm8998_patch(struct arizona *arizona)
+{
+	return regmap_register_patch(arizona->regmap,
+				     wm8998_rev_a_patch,
+				     ARRAY_SIZE(wm8998_rev_a_patch));
+}
+
+static const struct regmap_irq wm8998_aod_irqs[ARIZONA_NUM_IRQ] = {
+	[ARIZONA_IRQ_MICD_CLAMP_FALL] = {
+		.mask = ARIZONA_MICD_CLAMP_FALL_EINT1
+	},
+	[ARIZONA_IRQ_MICD_CLAMP_RISE] = {
+		.mask = ARIZONA_MICD_CLAMP_RISE_EINT1
+	},
+	[ARIZONA_IRQ_GP5_FALL] = { .mask = ARIZONA_GP5_FALL_EINT1 },
+	[ARIZONA_IRQ_GP5_RISE] = { .mask = ARIZONA_GP5_RISE_EINT1 },
+	[ARIZONA_IRQ_JD_FALL] = { .mask = ARIZONA_JD1_FALL_EINT1 },
+	[ARIZONA_IRQ_JD_RISE] = { .mask = ARIZONA_JD1_RISE_EINT1 },
+};
+
+struct regmap_irq_chip wm8998_aod = {
+	.name = "wm8998 AOD",
+	.status_base = ARIZONA_AOD_IRQ1,
+	.mask_base = ARIZONA_AOD_IRQ_MASK_IRQ1,
+	.ack_base = ARIZONA_AOD_IRQ1,
+	.wake_base = ARIZONA_WAKE_CONTROL,
+	.wake_invert = 1,
+	.num_regs = 1,
+	.irqs = wm8998_aod_irqs,
+	.num_irqs = ARRAY_SIZE(wm8998_aod_irqs),
+};
+
+static const struct regmap_irq wm8998_irqs[ARIZONA_NUM_IRQ] = {
+	[ARIZONA_IRQ_GP4] = { .reg_offset = 0, .mask = ARIZONA_GP4_EINT1 },
+	[ARIZONA_IRQ_GP3] = { .reg_offset = 0, .mask = ARIZONA_GP3_EINT1 },
+	[ARIZONA_IRQ_GP2] = { .reg_offset = 0, .mask = ARIZONA_GP2_EINT1 },
+	[ARIZONA_IRQ_GP1] = { .reg_offset = 0, .mask = ARIZONA_GP1_EINT1 },
+
+	[ARIZONA_IRQ_SPK_OVERHEAT_WARN] = {
+		.reg_offset = 2, .mask = ARIZONA_SPK_OVERHEAT_WARN_EINT1
+	},
+	[ARIZONA_IRQ_SPK_OVERHEAT] = {
+		.reg_offset = 2, .mask = ARIZONA_SPK_OVERHEAT_EINT1
+	},
+	[ARIZONA_IRQ_HPDET] = {
+		.reg_offset = 2, .mask = ARIZONA_HPDET_EINT1
+	},
+	[ARIZONA_IRQ_MICDET] = {
+		.reg_offset = 2, .mask = ARIZONA_MICDET_EINT1
+	},
+	[ARIZONA_IRQ_WSEQ_DONE] = {
+		.reg_offset = 2, .mask = ARIZONA_WSEQ_DONE_EINT1
+	},
+	[ARIZONA_IRQ_DRC1_SIG_DET] = {
+		.reg_offset = 2, .mask = ARIZONA_DRC1_SIG_DET_EINT1
+	},
+	[ARIZONA_IRQ_ASRC2_LOCK] = {
+		.reg_offset = 2, .mask = ARIZONA_ASRC2_LOCK_EINT1
+	},
+	[ARIZONA_IRQ_ASRC1_LOCK] = {
+		.reg_offset = 2, .mask = ARIZONA_ASRC1_LOCK_EINT1
+	},
+	[ARIZONA_IRQ_UNDERCLOCKED] = {
+		.reg_offset = 2, .mask = ARIZONA_UNDERCLOCKED_EINT1
+	},
+	[ARIZONA_IRQ_OVERCLOCKED] = {
+		.reg_offset = 2, .mask = ARIZONA_OVERCLOCKED_EINT1
+	},
+	[ARIZONA_IRQ_FLL2_LOCK] = {
+		.reg_offset = 2, .mask = ARIZONA_FLL2_LOCK_EINT1
+	},
+	[ARIZONA_IRQ_FLL1_LOCK] = {
+		.reg_offset = 2, .mask = ARIZONA_FLL1_LOCK_EINT1
+	},
+	[ARIZONA_IRQ_CLKGEN_ERR] = {
+		.reg_offset = 2, .mask = ARIZONA_CLKGEN_ERR_EINT1
+	},
+	[ARIZONA_IRQ_CLKGEN_ERR_ASYNC] = {
+		.reg_offset = 2, .mask = ARIZONA_CLKGEN_ERR_ASYNC_EINT1
+	},
+
+	[ARIZONA_IRQ_ASRC_CFG_ERR] = {
+		.reg_offset = 3, .mask = ARIZONA_ASRC_CFG_ERR_EINT1
+	},
+	[ARIZONA_IRQ_AIF3_ERR] = {
+		.reg_offset = 3, .mask = ARIZONA_AIF3_ERR_EINT1
+	},
+	[ARIZONA_IRQ_AIF2_ERR] = {
+		.reg_offset = 3, .mask = ARIZONA_AIF2_ERR_EINT1
+	},
+	[ARIZONA_IRQ_AIF1_ERR] = {
+		.reg_offset = 3, .mask = ARIZONA_AIF1_ERR_EINT1
+	},
+	[ARIZONA_IRQ_CTRLIF_ERR] = {
+		.reg_offset = 3, .mask = ARIZONA_CTRLIF_ERR_EINT1
+	},
+	[ARIZONA_IRQ_MIXER_DROPPED_SAMPLES] = {
+		.reg_offset = 3, .mask = ARIZONA_MIXER_DROPPED_SAMPLE_EINT1
+	},
+	[ARIZONA_IRQ_ASYNC_CLK_ENA_LOW] = {
+		.reg_offset = 3, .mask = ARIZONA_ASYNC_CLK_ENA_LOW_EINT1
+	},
+	[ARIZONA_IRQ_SYSCLK_ENA_LOW] = {
+		.reg_offset = 3, .mask = ARIZONA_SYSCLK_ENA_LOW_EINT1
+	},
+	[ARIZONA_IRQ_ISRC1_CFG_ERR] = {
+		.reg_offset = 3, .mask = ARIZONA_ISRC1_CFG_ERR_EINT1
+	},
+	[ARIZONA_IRQ_ISRC2_CFG_ERR] = {
+		.reg_offset = 3, .mask = ARIZONA_ISRC2_CFG_ERR_EINT1
+	},
+
+	[ARIZONA_IRQ_BOOT_DONE] = {
+		.reg_offset = 4, .mask = ARIZONA_BOOT_DONE_EINT1
+	},
+	[ARIZONA_IRQ_FLL2_CLOCK_OK] = {
+		.reg_offset = 4, .mask = ARIZONA_FLL2_CLOCK_OK_EINT1
+	},
+	[ARIZONA_IRQ_FLL1_CLOCK_OK] = {
+		.reg_offset = 4, .mask = ARIZONA_FLL1_CLOCK_OK_EINT1
+	},
+};
+
+struct regmap_irq_chip wm8998_irq = {
+	.name = "wm8998 IRQ",
+	.status_base = ARIZONA_INTERRUPT_STATUS_1,
+	.mask_base = ARIZONA_INTERRUPT_STATUS_1_MASK,
+	.ack_base = ARIZONA_INTERRUPT_STATUS_1,
+	.num_regs = 5,
+	.irqs = wm8998_irqs,
+	.num_irqs = ARRAY_SIZE(wm8998_irqs),
+};
+
+static const struct reg_default wm8998_reg_default[] = {
+	{ 0x00000009, 0x0001 },    /* R9     - Ctrl IF I2C1 CFG 1 */
+	{ 0x0000000B, 0x001A },    /* R11    - Ctrl IF I2C1 CFG 2 */
+	{ 0x00000020, 0x0000 },    /* R32    - Tone Generator 1 */
+	{ 0x00000021, 0x1000 },    /* R33    - Tone Generator 2 */
+	{ 0x00000022, 0x0000 },    /* R34    - Tone Generator 3 */
+	{ 0x00000023, 0x1000 },    /* R35    - Tone Generator 4 */
+	{ 0x00000024, 0x0000 },    /* R36    - Tone Generator 5 */
+	{ 0x00000030, 0x0000 },    /* R48    - PWM Drive 1 */
+	{ 0x00000031, 0x0100 },    /* R49    - PWM Drive 2 */
+	{ 0x00000032, 0x0100 },    /* R50    - PWM Drive 3 */
+	{ 0x00000040, 0x0000 },    /* R64    - Wake control */
+	{ 0x00000041, 0x0000 },    /* R65    - Sequence control */
+	{ 0x00000061, 0x01FF },    /* R97    - Sample Rate Sequence Select 1 */
+	{ 0x00000062, 0x01FF },    /* R98    - Sample Rate Sequence Select 2 */
+	{ 0x00000063, 0x01FF },    /* R99    - Sample Rate Sequence Select 3 */
+	{ 0x00000064, 0x01FF },    /* R100   - Sample Rate Sequence Select 4 */
+	{ 0x00000066, 0x01FF },    /* R102   - Always On Triggers Sequence Select 1 */
+	{ 0x00000067, 0x01FF },    /* R103   - Always On Triggers Sequence Select 2 */
+	{ 0x00000068, 0x01FF },    /* R104   - Always On Triggers Sequence Select 3 */
+	{ 0x00000069, 0x01FF },    /* R105   - Always On Triggers Sequence Select 4 */
+	{ 0x0000006A, 0x01FF },    /* R106   - Always On Triggers Sequence Select 5 */
+	{ 0x0000006B, 0x01FF },    /* R107   - Always On Triggers Sequence Select 6 */
+	{ 0x0000006E, 0x01FF },    /* R110   - Trigger Sequence Select 32 */
+	{ 0x0000006F, 0x01FF },    /* R111   - Trigger Sequence Select 33 */
+	{ 0x00000090, 0x0000 },    /* R144   - Haptics Control 1 */
+	{ 0x00000091, 0x7FFF },    /* R145   - Haptics Control 2 */
+	{ 0x00000092, 0x0000 },    /* R146   - Haptics phase 1 intensity */
+	{ 0x00000093, 0x0000 },    /* R147   - Haptics phase 1 duration */
+	{ 0x00000094, 0x0000 },    /* R148   - Haptics phase 2 intensity */
+	{ 0x00000095, 0x0000 },    /* R149   - Haptics phase 2 duration */
+	{ 0x00000096, 0x0000 },    /* R150   - Haptics phase 3 intensity */
+	{ 0x00000097, 0x0000 },    /* R151   - Haptics phase 3 duration */
+	{ 0x00000100, 0x0002 },    /* R256   - Clock 32k 1 */
+	{ 0x00000101, 0x0304 },    /* R257   - System Clock 1 */
+	{ 0x00000102, 0x0011 },    /* R258   - Sample rate 1 */
+	{ 0x00000103, 0x0011 },    /* R259   - Sample rate 2 */
+	{ 0x00000104, 0x0011 },    /* R260   - Sample rate 3 */
+	{ 0x00000112, 0x0305 },    /* R274   - Async clock 1 */
+	{ 0x00000113, 0x0011 },    /* R275   - Async sample rate 1 */
+	{ 0x00000114, 0x0011 },    /* R276   - Async sample rate 2 */
+	{ 0x00000149, 0x0000 },    /* R329   - Output system clock */
+	{ 0x0000014A, 0x0000 },    /* R330   - Output async clock */
+	{ 0x00000152, 0x0000 },    /* R338   - Rate Estimator 1 */
+	{ 0x00000153, 0x0000 },    /* R339   - Rate Estimator 2 */
+	{ 0x00000154, 0x0000 },    /* R340   - Rate Estimator 3 */
+	{ 0x00000155, 0x0000 },    /* R341   - Rate Estimator 4 */
+	{ 0x00000156, 0x0000 },    /* R342   - Rate Estimator 5 */
+	{ 0x00000161, 0x0000 },    /* R353   - Dynamic Frequency Scaling 1 */
+	{ 0x00000171, 0x0002 },    /* R369   - FLL1 Control 1 */
+	{ 0x00000172, 0x0008 },    /* R370   - FLL1 Control 2 */
+	{ 0x00000173, 0x0018 },    /* R371   - FLL1 Control 3 */
+	{ 0x00000174, 0x007D },    /* R372   - FLL1 Control 4 */
+	{ 0x00000175, 0x0004 },    /* R373   - FLL1 Control 5 */
+	{ 0x00000176, 0x0000 },    /* R374   - FLL1 Control 6 */
+	{ 0x00000177, 0x0181 },    /* R375   - FLL1 Loop Filter Test 1 */
+	{ 0x00000178, 0x0000 },    /* R376   - FLL1 NCO Test 0 */
+	{ 0x00000179, 0x0000 },    /* R377   - FLL1 Control 7 */
+	{ 0x00000181, 0x0000 },    /* R385   - FLL1 Synchroniser 1 */
+	{ 0x00000182, 0x0000 },    /* R386   - FLL1 Synchroniser 2 */
+	{ 0x00000183, 0x0000 },    /* R387   - FLL1 Synchroniser 3 */
+	{ 0x00000184, 0x0000 },    /* R388   - FLL1 Synchroniser 4 */
+	{ 0x00000185, 0x0000 },    /* R389   - FLL1 Synchroniser 5 */
+	{ 0x00000186, 0x0000 },    /* R390   - FLL1 Synchroniser 6 */
+	{ 0x00000187, 0x0001 },    /* R391   - FLL1 Synchroniser 7 */
+	{ 0x00000189, 0x0000 },    /* R393   - FLL1 Spread Spectrum */
+	{ 0x0000018A, 0x0004 },    /* R394   - FLL1 GPIO Clock */
+	{ 0x00000191, 0x0000 },    /* R401   - FLL2 Control 1 */
+	{ 0x00000192, 0x0008 },    /* R402   - FLL2 Control 2 */
+	{ 0x00000193, 0x0018 },    /* R403   - FLL2 Control 3 */
+	{ 0x00000194, 0x007D },    /* R404   - FLL2 Control 4 */
+	{ 0x00000195, 0x0004 },    /* R405   - FLL2 Control 5 */
+	{ 0x00000196, 0x0000 },    /* R406   - FLL2 Control 6 */
+	{ 0x00000197, 0x0000 },    /* R407   - FLL2 Loop Filter Test 1 */
+	{ 0x00000198, 0x0000 },    /* R408   - FLL2 NCO Test 0 */
+	{ 0x00000199, 0x0000 },    /* R409   - FLL2 Control 7 */
+	{ 0x000001A1, 0x0000 },    /* R417   - FLL2 Synchroniser 1 */
+	{ 0x000001A2, 0x0000 },    /* R418   - FLL2 Synchroniser 2 */
+	{ 0x000001A3, 0x0000 },    /* R419   - FLL2 Synchroniser 3 */
+	{ 0x000001A4, 0x0000 },    /* R420   - FLL2 Synchroniser 4 */
+	{ 0x000001A5, 0x0000 },    /* R421   - FLL2 Synchroniser 5 */
+	{ 0x000001A6, 0x0000 },    /* R422   - FLL2 Synchroniser 6 */
+	{ 0x000001A7, 0x0001 },    /* R423   - FLL2 Synchroniser 7 */
+	{ 0x000001A9, 0x0000 },    /* R425   - FLL2 Spread Spectrum */
+	{ 0x000001AA, 0x0004 },    /* R426   - FLL2 GPIO Clock */
+	{ 0x00000200, 0x0006 },    /* R512   - Mic Charge Pump 1 */
+	{ 0x00000210, 0x00D4 },    /* R528   - LDO1 Control 1 */
+	{ 0x00000212, 0x0000 },    /* R530   - LDO1 Control 2 */
+	{ 0x00000213, 0x0344 },    /* R531   - LDO2 Control 1 */
+	{ 0x00000218, 0x01A6 },    /* R536   - Mic Bias Ctrl 1 */
+	{ 0x00000219, 0x01A6 },    /* R537   - Mic Bias Ctrl 2 */
+	{ 0x0000021A, 0x01A6 },    /* R538   - Mic Bias Ctrl 3 */
+	{ 0x00000293, 0x0080 },    /* R659   - Accessory Detect Mode 1 */
+	{ 0x0000029B, 0x0000 },    /* R667   - Headphone Detect 1 */
+	{ 0x0000029C, 0x0000 },    /* R668   - Headphone Detect 2 */
+	{ 0x000002A2, 0x0000 },    /* R674   - Micd Clamp control */
+	{ 0x000002A3, 0x1102 },    /* R675   - Mic Detect 1 */
+	{ 0x000002A4, 0x009F },    /* R676   - Mic Detect 2 */
+	{ 0x000002A5, 0x0000 },    /* R677   - Mic Detect 3 */
+	{ 0x000002A6, 0x3737 },    /* R678   - Mic Detect Level 1 */
+	{ 0x000002A7, 0x2C37 },    /* R679   - Mic Detect Level 2 */
+	{ 0x000002A8, 0x1422 },    /* R680   - Mic Detect Level 3 */
+	{ 0x000002A9, 0x030A },    /* R681   - Mic Detect Level 4 */
+	{ 0x000002AB, 0x0000 },    /* R683   - Mic Detect 4 */
+	{ 0x000002CB, 0x0000 },    /* R715   - Isolation control */
+	{ 0x000002D3, 0x0000 },    /* R723   - Jack detect analogue */
+	{ 0x00000300, 0x0000 },    /* R768   - Input Enables */
+	{ 0x00000308, 0x0000 },    /* R776   - Input Rate */
+	{ 0x00000309, 0x0022 },    /* R777   - Input Volume Ramp */
+	{ 0x0000030C, 0x0002 },    /* R780   - HPF Control */
+	{ 0x00000310, 0x2080 },    /* R784   - IN1L Control */
+	{ 0x00000311, 0x0180 },    /* R785   - ADC Digital Volume 1L */
+	{ 0x00000312, 0x0000 },    /* R786   - DMIC1L Control */
+	{ 0x00000314, 0x0080 },    /* R788   - IN1R Control */
+	{ 0x00000315, 0x0180 },    /* R789   - ADC Digital Volume 1R */
+	{ 0x00000316, 0x0000 },    /* R790   - DMIC1R Control */
+	{ 0x00000318, 0x2080 },    /* R792   - IN2L Control */
+	{ 0x00000319, 0x0180 },    /* R793   - ADC Digital Volume 2L */
+	{ 0x0000031A, 0x0000 },    /* R794   - DMIC2L Control */
+	{ 0x00000400, 0x0000 },    /* R1024  - Output Enables 1 */
+	{ 0x00000408, 0x0000 },    /* R1032  - Output Rate 1 */
+	{ 0x00000409, 0x0022 },    /* R1033  - Output Volume Ramp */
+	{ 0x00000410, 0x2080 },    /* R1040  - Output Path Config 1L */
+	{ 0x00000411, 0x0180 },    /* R1041  - DAC Digital Volume 1L */
+	{ 0x00000413, 0x0001 },    /* R1043  - Noise Gate Select 1L */
+	{ 0x00000414, 0x0080 },    /* R1044  - Output Path Config 1R */
+	{ 0x00000415, 0x0180 },    /* R1045  - DAC Digital Volume 1R */
+	{ 0x00000417, 0x0002 },    /* R1047  - Noise Gate Select 1R */
+	{ 0x00000418, 0x2080 },    /* R1048  - Output Path Config 2L */
+	{ 0x00000419, 0x0180 },    /* R1049  - DAC Digital Volume 2L */
+	{ 0x0000041B, 0x0004 },    /* R1051  - Noise Gate Select 2L */
+	{ 0x0000041C, 0x0080 },    /* R1052  - Output Path Config 2R */
+	{ 0x0000041D, 0x0180 },    /* R1053  - DAC Digital Volume 2R */
+	{ 0x0000041F, 0x0008 },    /* R1055  - Noise Gate Select 2R */
+	{ 0x00000420, 0x2080 },    /* R1056  - Output Path Config 3L */
+	{ 0x00000421, 0x0180 },    /* R1057  - DAC Digital Volume 3L */
+	{ 0x00000423, 0x0010 },    /* R1059  - Noise Gate Select 3L */
+	{ 0x00000428, 0x0000 },    /* R1064  - Output Path Config 4L */
+	{ 0x00000429, 0x0180 },    /* R1065  - DAC Digital Volume 4L */
+	{ 0x0000042B, 0x0040 },    /* R1067  - Noise Gate Select 4L */
+	{ 0x0000042C, 0x0000 },    /* R1068  - Output Path Config 4R */
+	{ 0x0000042D, 0x0180 },    /* R1069  - DAC Digital Volume 4R */
+	{ 0x0000042F, 0x0080 },    /* R1071  - Noise Gate Select 4R */
+	{ 0x00000430, 0x0000 },    /* R1072  - Output Path Config 5L */
+	{ 0x00000431, 0x0180 },    /* R1073  - DAC Digital Volume 5L */
+	{ 0x00000433, 0x0100 },    /* R1075  - Noise Gate Select 5L */
+	{ 0x00000434, 0x0000 },    /* R1076  - Output Path Config 5R */
+	{ 0x00000435, 0x0180 },    /* R1077  - DAC Digital Volume 5R */
+	{ 0x00000437, 0x0200 },    /* R1079  - Noise Gate Select 5R */
+	{ 0x00000440, 0x8FFF },    /* R1088  - DRE Enable */
+	{ 0x00000441, 0xC759 },    /* R1089  - DRE Control 1 */
+	{ 0x00000442, 0x2A08 },    /* R1089  - DRE Control 2 */
+	{ 0x00000443, 0x5CFA },    /* R1089  - DRE Control 3 */
+	{ 0x00000448, 0x03EF },    /* R1096  - EDRE Enable */
+	{ 0x00000450, 0x0000 },    /* R1104  - DAC AEC Control 1 */
+	{ 0x00000451, 0x0000 },    /* R1105  - DAC AEC Control 2 */
+	{ 0x00000458, 0x0000 },    /* R1112  - Noise Gate Control */
+	{ 0x00000490, 0x0069 },    /* R1168  - PDM SPK1 CTRL 1 */
+	{ 0x00000491, 0x0000 },    /* R1169  - PDM SPK1 CTRL 2 */
+	{ 0x0000049A, 0x0000 },    /* R1178  - HP_TEST_CTRL_13 */
+	{ 0x00000500, 0x000C },    /* R1280  - AIF1 BCLK Ctrl */
+	{ 0x00000501, 0x0008 },    /* R1281  - AIF1 Tx Pin Ctrl */
+	{ 0x00000502, 0x0000 },    /* R1282  - AIF1 Rx Pin Ctrl */
+	{ 0x00000503, 0x0000 },    /* R1283  - AIF1 Rate Ctrl */
+	{ 0x00000504, 0x0000 },    /* R1284  - AIF1 Format */
+	{ 0x00000506, 0x0040 },    /* R1286  - AIF1 Rx BCLK Rate */
+	{ 0x00000507, 0x1818 },    /* R1287  - AIF1 Frame Ctrl 1 */
+	{ 0x00000508, 0x1818 },    /* R1288  - AIF1 Frame Ctrl 2 */
+	{ 0x00000509, 0x0000 },    /* R1289  - AIF1 Frame Ctrl 3 */
+	{ 0x0000050A, 0x0001 },    /* R1290  - AIF1 Frame Ctrl 4 */
+	{ 0x0000050B, 0x0002 },    /* R1291  - AIF1 Frame Ctrl 5 */
+	{ 0x0000050C, 0x0003 },    /* R1292  - AIF1 Frame Ctrl 6 */
+	{ 0x0000050D, 0x0004 },    /* R1293  - AIF1 Frame Ctrl 7 */
+	{ 0x0000050E, 0x0005 },    /* R1294  - AIF1 Frame Ctrl 8 */
+	{ 0x00000511, 0x0000 },    /* R1297  - AIF1 Frame Ctrl 11 */
+	{ 0x00000512, 0x0001 },    /* R1298  - AIF1 Frame Ctrl 12 */
+	{ 0x00000513, 0x0002 },    /* R1299  - AIF1 Frame Ctrl 13 */
+	{ 0x00000514, 0x0003 },    /* R1300  - AIF1 Frame Ctrl 14 */
+	{ 0x00000515, 0x0004 },    /* R1301  - AIF1 Frame Ctrl 15 */
+	{ 0x00000516, 0x0005 },    /* R1302  - AIF1 Frame Ctrl 16 */
+	{ 0x00000519, 0x0000 },    /* R1305  - AIF1 Tx Enables */
+	{ 0x0000051A, 0x0000 },    /* R1306  - AIF1 Rx Enables */
+	{ 0x00000540, 0x000C },    /* R1344  - AIF2 BCLK Ctrl */
+	{ 0x00000541, 0x0008 },    /* R1345  - AIF2 Tx Pin Ctrl */
+	{ 0x00000542, 0x0000 },    /* R1346  - AIF2 Rx Pin Ctrl */
+	{ 0x00000543, 0x0000 },    /* R1347  - AIF2 Rate Ctrl */
+	{ 0x00000544, 0x0000 },    /* R1348  - AIF2 Format */
+	{ 0x00000546, 0x0040 },    /* R1350  - AIF2 Rx BCLK Rate */
+	{ 0x00000547, 0x1818 },    /* R1351  - AIF2 Frame Ctrl 1 */
+	{ 0x00000548, 0x1818 },    /* R1352  - AIF2 Frame Ctrl 2 */
+	{ 0x00000549, 0x0000 },    /* R1353  - AIF2 Frame Ctrl 3 */
+	{ 0x0000054A, 0x0001 },    /* R1354  - AIF2 Frame Ctrl 4 */
+	{ 0x0000054B, 0x0002 },    /* R1355  - AIF2 Frame Ctrl 5 */
+	{ 0x0000054C, 0x0003 },    /* R1356  - AIF2 Frame Ctrl 6 */
+	{ 0x0000054D, 0x0004 },    /* R1357  - AIF2 Frame Ctrl 7 */
+	{ 0x0000054E, 0x0005 },    /* R1358  - AIF2 Frame Ctrl 8 */
+	{ 0x00000551, 0x0000 },    /* R1361  - AIF2 Frame Ctrl 11 */
+	{ 0x00000552, 0x0001 },    /* R1362  - AIF2 Frame Ctrl 12 */
+	{ 0x00000553, 0x0002 },    /* R1363  - AIF2 Frame Ctrl 13 */
+	{ 0x00000554, 0x0003 },    /* R1364  - AIF2 Frame Ctrl 14 */
+	{ 0x00000555, 0x0004 },    /* R1365  - AIF2 Frame Ctrl 15 */
+	{ 0x00000556, 0x0005 },    /* R1366  - AIF2 Frame Ctrl 16 */
+	{ 0x00000559, 0x0000 },    /* R1369  - AIF2 Tx Enables */
+	{ 0x0000055A, 0x0000 },    /* R1370  - AIF2 Rx Enables */
+	{ 0x00000580, 0x000C },    /* R1408  - AIF3 BCLK Ctrl */
+	{ 0x00000581, 0x0008 },    /* R1409  - AIF3 Tx Pin Ctrl */
+	{ 0x00000582, 0x0000 },    /* R1410  - AIF3 Rx Pin Ctrl */
+	{ 0x00000583, 0x0000 },    /* R1411  - AIF3 Rate Ctrl */
+	{ 0x00000584, 0x0000 },    /* R1412  - AIF3 Format */
+	{ 0x00000586, 0x0040 },    /* R1414  - AIF3 Rx BCLK Rate */
+	{ 0x00000587, 0x1818 },    /* R1415  - AIF3 Frame Ctrl 1 */
+	{ 0x00000588, 0x1818 },    /* R1416  - AIF3 Frame Ctrl 2 */
+	{ 0x00000589, 0x0000 },    /* R1417  - AIF3 Frame Ctrl 3 */
+	{ 0x0000058A, 0x0001 },    /* R1418  - AIF3 Frame Ctrl 4 */
+	{ 0x00000591, 0x0000 },    /* R1425  - AIF3 Frame Ctrl 11 */
+	{ 0x00000592, 0x0001 },    /* R1426  - AIF3 Frame Ctrl 12 */
+	{ 0x00000599, 0x0000 },    /* R1433  - AIF3 Tx Enables */
+	{ 0x0000059A, 0x0000 },    /* R1434  - AIF3 Rx Enables */
+	{ 0x000005C2, 0x0000 },    /* R1474  - SPD1 TX Control */
+	{ 0x000005C3, 0x0000 },    /* R1475  - SPD1 TX Channel Status 1 */
+	{ 0x000005C4, 0x0B01 },    /* R1476  - SPD1 TX Channel Status 2 */
+	{ 0x000005C5, 0x0000 },    /* R1477  - SPD1 TX Channel Status 3 */
+	{ 0x000005E3, 0x0004 },    /* R1507  - SLIMbus Framer Ref Gear */
+	{ 0x000005E5, 0x0000 },    /* R1509  - SLIMbus Rates 1 */
+	{ 0x000005E6, 0x0000 },    /* R1510  - SLIMbus Rates 2 */
+	{ 0x000005E9, 0x0000 },    /* R1513  - SLIMbus Rates 5 */
+	{ 0x000005EA, 0x0000 },    /* R1514  - SLIMbus Rates 6 */
+	{ 0x000005EB, 0x0000 },    /* R1515  - SLIMbus Rates 7 */
+	{ 0x000005F5, 0x0000 },    /* R1525  - SLIMbus RX Channel Enable */
+	{ 0x000005F6, 0x0000 },    /* R1526  - SLIMbus TX Channel Enable */
+	{ 0x00000640, 0x0000 },    /* R1600  - PWM1MIX Input 1 Source */
+	{ 0x00000641, 0x0080 },    /* R1601  - PWM1MIX Input 1 Volume */
+	{ 0x00000642, 0x0000 },    /* R1602  - PWM1MIX Input 2 Source */
+	{ 0x00000643, 0x0080 },    /* R1603  - PWM1MIX Input 2 Volume */
+	{ 0x00000644, 0x0000 },    /* R1604  - PWM1MIX Input 3 Source */
+	{ 0x00000645, 0x0080 },    /* R1605  - PWM1MIX Input 3 Volume */
+	{ 0x00000646, 0x0000 },    /* R1606  - PWM1MIX Input 4 Source */
+	{ 0x00000647, 0x0080 },    /* R1607  - PWM1MIX Input 4 Volume */
+	{ 0x00000648, 0x0000 },    /* R1608  - PWM2MIX Input 1 Source */
+	{ 0x00000649, 0x0080 },    /* R1609  - PWM2MIX Input 1 Volume */
+	{ 0x0000064A, 0x0000 },    /* R1610  - PWM2MIX Input 2 Source */
+	{ 0x0000064B, 0x0080 },    /* R1611  - PWM2MIX Input 2 Volume */
+	{ 0x0000064C, 0x0000 },    /* R1612  - PWM2MIX Input 3 Source */
+	{ 0x0000064D, 0x0080 },    /* R1613  - PWM2MIX Input 3 Volume */
+	{ 0x0000064E, 0x0000 },    /* R1614  - PWM2MIX Input 4 Source */
+	{ 0x0000064F, 0x0080 },    /* R1615  - PWM2MIX Input 4 Volume */
+	{ 0x00000680, 0x0000 },    /* R1664  - OUT1LMIX Input 1 Source */
+	{ 0x00000681, 0x0080 },    /* R1665  - OUT1LMIX Input 1 Volume */
+	{ 0x00000682, 0x0000 },    /* R1666  - OUT1LMIX Input 2 Source */
+	{ 0x00000683, 0x0080 },    /* R1667  - OUT1LMIX Input 2 Volume */
+	{ 0x00000684, 0x0000 },    /* R1668  - OUT1LMIX Input 3 Source */
+	{ 0x00000685, 0x0080 },    /* R1669  - OUT1LMIX Input 3 Volume */
+	{ 0x00000686, 0x0000 },    /* R1670  - OUT1LMIX Input 4 Source */
+	{ 0x00000687, 0x0080 },    /* R1671  - OUT1LMIX Input 4 Volume */
+	{ 0x00000688, 0x0000 },    /* R1672  - OUT1RMIX Input 1 Source */
+	{ 0x00000689, 0x0080 },    /* R1673  - OUT1RMIX Input 1 Volume */
+	{ 0x0000068A, 0x0000 },    /* R1674  - OUT1RMIX Input 2 Source */
+	{ 0x0000068B, 0x0080 },    /* R1675  - OUT1RMIX Input 2 Volume */
+	{ 0x0000068C, 0x0000 },    /* R1676  - OUT1RMIX Input 3 Source */
+	{ 0x0000068D, 0x0080 },    /* R1677  - OUT1RMIX Input 3 Volume */
+	{ 0x0000068E, 0x0000 },    /* R1678  - OUT1RMIX Input 4 Source */
+	{ 0x0000068F, 0x0080 },    /* R1679  - OUT1RMIX Input 4 Volume */
+	{ 0x00000690, 0x0000 },    /* R1680  - OUT2LMIX Input 1 Source */
+	{ 0x00000691, 0x0080 },    /* R1681  - OUT2LMIX Input 1 Volume */
+	{ 0x00000692, 0x0000 },    /* R1682  - OUT2LMIX Input 2 Source */
+	{ 0x00000693, 0x0080 },    /* R1683  - OUT2LMIX Input 2 Volume */
+	{ 0x00000694, 0x0000 },    /* R1684  - OUT2LMIX Input 3 Source */
+	{ 0x00000695, 0x0080 },    /* R1685  - OUT2LMIX Input 3 Volume */
+	{ 0x00000696, 0x0000 },    /* R1686  - OUT2LMIX Input 4 Source */
+	{ 0x00000697, 0x0080 },    /* R1687  - OUT2LMIX Input 4 Volume */
+	{ 0x00000698, 0x0000 },    /* R1688  - OUT2RMIX Input 1 Source */
+	{ 0x00000699, 0x0080 },    /* R1689  - OUT2RMIX Input 1 Volume */
+	{ 0x0000069A, 0x0000 },    /* R1690  - OUT2RMIX Input 2 Source */
+	{ 0x0000069B, 0x0080 },    /* R1691  - OUT2RMIX Input 2 Volume */
+	{ 0x0000069C, 0x0000 },    /* R1692  - OUT2RMIX Input 3 Source */
+	{ 0x0000069D, 0x0080 },    /* R1693  - OUT2RMIX Input 3 Volume */
+	{ 0x0000069E, 0x0000 },    /* R1694  - OUT2RMIX Input 4 Source */
+	{ 0x0000069F, 0x0080 },    /* R1695  - OUT2RMIX Input 4 Volume */
+	{ 0x000006A0, 0x0000 },    /* R1696  - OUT3LMIX Input 1 Source */
+	{ 0x000006A1, 0x0080 },    /* R1697  - OUT3LMIX Input 1 Volume */
+	{ 0x000006A2, 0x0000 },    /* R1698  - OUT3LMIX Input 2 Source */
+	{ 0x000006A3, 0x0080 },    /* R1699  - OUT3LMIX Input 2 Volume */
+	{ 0x000006A4, 0x0000 },    /* R1700  - OUT3LMIX Input 3 Source */
+	{ 0x000006A5, 0x0080 },    /* R1701  - OUT3LMIX Input 3 Volume */
+	{ 0x000006A6, 0x0000 },    /* R1702  - OUT3LMIX Input 4 Source */
+	{ 0x000006A7, 0x0080 },    /* R1703  - OUT3LMIX Input 4 Volume */
+	{ 0x000006B0, 0x0000 },    /* R1712  - OUT4LMIX Input 1 Source */
+	{ 0x000006B1, 0x0080 },    /* R1713  - OUT4LMIX Input 1 Volume */
+	{ 0x000006B2, 0x0000 },    /* R1714  - OUT4LMIX Input 2 Source */
+	{ 0x000006B3, 0x0080 },    /* R1715  - OUT4LMIX Input 2 Volume */
+	{ 0x000006B4, 0x0000 },    /* R1716  - OUT4LMIX Input 3 Source */
+	{ 0x000006B5, 0x0080 },    /* R1717  - OUT4LMIX Input 3 Volume */
+	{ 0x000006B6, 0x0000 },    /* R1718  - OUT4LMIX Input 4 Source */
+	{ 0x000006B7, 0x0080 },    /* R1719  - OUT4LMIX Input 4 Volume */
+	{ 0x000006B8, 0x0000 },    /* R1720  - OUT4RMIX Input 1 Source */
+	{ 0x000006B9, 0x0080 },    /* R1721  - OUT4RMIX Input 1 Volume */
+	{ 0x000006BA, 0x0000 },    /* R1722  - OUT4RMIX Input 2 Source */
+	{ 0x000006BB, 0x0080 },    /* R1723  - OUT4RMIX Input 2 Volume */
+	{ 0x000006BC, 0x0000 },    /* R1724  - OUT4RMIX Input 3 Source */
+	{ 0x000006BD, 0x0080 },    /* R1725  - OUT4RMIX Input 3 Volume */
+	{ 0x000006BE, 0x0000 },    /* R1726  - OUT4RMIX Input 4 Source */
+	{ 0x000006BF, 0x0080 },    /* R1727  - OUT4RMIX Input 4 Volume */
+	{ 0x000006C0, 0x0000 },    /* R1728  - OUT5LMIX Input 1 Source */
+	{ 0x000006C1, 0x0080 },    /* R1729  - OUT5LMIX Input 1 Volume */
+	{ 0x000006C2, 0x0000 },    /* R1730  - OUT5LMIX Input 2 Source */
+	{ 0x000006C3, 0x0080 },    /* R1731  - OUT5LMIX Input 2 Volume */
+	{ 0x000006C4, 0x0000 },    /* R1732  - OUT5LMIX Input 3 Source */
+	{ 0x000006C5, 0x0080 },    /* R1733  - OUT5LMIX Input 3 Volume */
+	{ 0x000006C6, 0x0000 },    /* R1734  - OUT5LMIX Input 4 Source */
+	{ 0x000006C7, 0x0080 },    /* R1735  - OUT5LMIX Input 4 Volume */
+	{ 0x000006C8, 0x0000 },    /* R1736  - OUT5RMIX Input 1 Source */
+	{ 0x000006C9, 0x0080 },    /* R1737  - OUT5RMIX Input 1 Volume */
+	{ 0x000006CA, 0x0000 },    /* R1738  - OUT5RMIX Input 2 Source */
+	{ 0x000006CB, 0x0080 },    /* R1739  - OUT5RMIX Input 2 Volume */
+	{ 0x000006CC, 0x0000 },    /* R1740  - OUT5RMIX Input 3 Source */
+	{ 0x000006CD, 0x0080 },    /* R1741  - OUT5RMIX Input 3 Volume */
+	{ 0x000006CE, 0x0000 },    /* R1742  - OUT5RMIX Input 4 Source */
+	{ 0x000006CF, 0x0080 },    /* R1743  - OUT5RMIX Input 4 Volume */
+	{ 0x00000700, 0x0000 },    /* R1792  - AIF1TX1MIX Input 1 Source */
+	{ 0x00000701, 0x0080 },    /* R1793  - AIF1TX1MIX Input 1 Volume */
+	{ 0x00000702, 0x0000 },    /* R1794  - AIF1TX1MIX Input 2 Source */
+	{ 0x00000703, 0x0080 },    /* R1795  - AIF1TX1MIX Input 2 Volume */
+	{ 0x00000704, 0x0000 },    /* R1796  - AIF1TX1MIX Input 3 Source */
+	{ 0x00000705, 0x0080 },    /* R1797  - AIF1TX1MIX Input 3 Volume */
+	{ 0x00000706, 0x0000 },    /* R1798  - AIF1TX1MIX Input 4 Source */
+	{ 0x00000707, 0x0080 },    /* R1799  - AIF1TX1MIX Input 4 Volume */
+	{ 0x00000708, 0x0000 },    /* R1800  - AIF1TX2MIX Input 1 Source */
+	{ 0x00000709, 0x0080 },    /* R1801  - AIF1TX2MIX Input 1 Volume */
+	{ 0x0000070A, 0x0000 },    /* R1802  - AIF1TX2MIX Input 2 Source */
+	{ 0x0000070B, 0x0080 },    /* R1803  - AIF1TX2MIX Input 2 Volume */
+	{ 0x0000070C, 0x0000 },    /* R1804  - AIF1TX2MIX Input 3 Source */
+	{ 0x0000070D, 0x0080 },    /* R1805  - AIF1TX2MIX Input 3 Volume */
+	{ 0x0000070E, 0x0000 },    /* R1806  - AIF1TX2MIX Input 4 Source */
+	{ 0x0000070F, 0x0080 },    /* R1807  - AIF1TX2MIX Input 4 Volume */
+	{ 0x00000710, 0x0000 },    /* R1808  - AIF1TX3MIX Input 1 Source */
+	{ 0x00000711, 0x0080 },    /* R1809  - AIF1TX3MIX Input 1 Volume */
+	{ 0x00000712, 0x0000 },    /* R1810  - AIF1TX3MIX Input 2 Source */
+	{ 0x00000713, 0x0080 },    /* R1811  - AIF1TX3MIX Input 2 Volume */
+	{ 0x00000714, 0x0000 },    /* R1812  - AIF1TX3MIX Input 3 Source */
+	{ 0x00000715, 0x0080 },    /* R1813  - AIF1TX3MIX Input 3 Volume */
+	{ 0x00000716, 0x0000 },    /* R1814  - AIF1TX3MIX Input 4 Source */
+	{ 0x00000717, 0x0080 },    /* R1815  - AIF1TX3MIX Input 4 Volume */
+	{ 0x00000718, 0x0000 },    /* R1816  - AIF1TX4MIX Input 1 Source */
+	{ 0x00000719, 0x0080 },    /* R1817  - AIF1TX4MIX Input 1 Volume */
+	{ 0x0000071A, 0x0000 },    /* R1818  - AIF1TX4MIX Input 2 Source */
+	{ 0x0000071B, 0x0080 },    /* R1819  - AIF1TX4MIX Input 2 Volume */
+	{ 0x0000071C, 0x0000 },    /* R1820  - AIF1TX4MIX Input 3 Source */
+	{ 0x0000071D, 0x0080 },    /* R1821  - AIF1TX4MIX Input 3 Volume */
+	{ 0x0000071E, 0x0000 },    /* R1822  - AIF1TX4MIX Input 4 Source */
+	{ 0x0000071F, 0x0080 },    /* R1823  - AIF1TX4MIX Input 4 Volume */
+	{ 0x00000720, 0x0000 },    /* R1824  - AIF1TX5MIX Input 1 Source */
+	{ 0x00000721, 0x0080 },    /* R1825  - AIF1TX5MIX Input 1 Volume */
+	{ 0x00000722, 0x0000 },    /* R1826  - AIF1TX5MIX Input 2 Source */
+	{ 0x00000723, 0x0080 },    /* R1827  - AIF1TX5MIX Input 2 Volume */
+	{ 0x00000724, 0x0000 },    /* R1828  - AIF1TX5MIX Input 3 Source */
+	{ 0x00000725, 0x0080 },    /* R1829  - AIF1TX5MIX Input 3 Volume */
+	{ 0x00000726, 0x0000 },    /* R1830  - AIF1TX5MIX Input 4 Source */
+	{ 0x00000727, 0x0080 },    /* R1831  - AIF1TX5MIX Input 4 Volume */
+	{ 0x00000728, 0x0000 },    /* R1832  - AIF1TX6MIX Input 1 Source */
+	{ 0x00000729, 0x0080 },    /* R1833  - AIF1TX6MIX Input 1 Volume */
+	{ 0x0000072A, 0x0000 },    /* R1834  - AIF1TX6MIX Input 2 Source */
+	{ 0x0000072B, 0x0080 },    /* R1835  - AIF1TX6MIX Input 2 Volume */
+	{ 0x0000072C, 0x0000 },    /* R1836  - AIF1TX6MIX Input 3 Source */
+	{ 0x0000072D, 0x0080 },    /* R1837  - AIF1TX6MIX Input 3 Volume */
+	{ 0x0000072E, 0x0000 },    /* R1838  - AIF1TX6MIX Input 4 Source */
+	{ 0x0000072F, 0x0080 },    /* R1839  - AIF1TX6MIX Input 4 Volume */
+	{ 0x00000740, 0x0000 },    /* R1856  - AIF2TX1MIX Input 1 Source */
+	{ 0x00000741, 0x0080 },    /* R1857  - AIF2TX1MIX Input 1 Volume */
+	{ 0x00000742, 0x0000 },    /* R1858  - AIF2TX1MIX Input 2 Source */
+	{ 0x00000743, 0x0080 },    /* R1859  - AIF2TX1MIX Input 2 Volume */
+	{ 0x00000744, 0x0000 },    /* R1860  - AIF2TX1MIX Input 3 Source */
+	{ 0x00000745, 0x0080 },    /* R1861  - AIF2TX1MIX Input 3 Volume */
+	{ 0x00000746, 0x0000 },    /* R1862  - AIF2TX1MIX Input 4 Source */
+	{ 0x00000747, 0x0080 },    /* R1863  - AIF2TX1MIX Input 4 Volume */
+	{ 0x00000748, 0x0000 },    /* R1864  - AIF2TX2MIX Input 1 Source */
+	{ 0x00000749, 0x0080 },    /* R1865  - AIF2TX2MIX Input 1 Volume */
+	{ 0x0000074A, 0x0000 },    /* R1866  - AIF2TX2MIX Input 2 Source */
+	{ 0x0000074B, 0x0080 },    /* R1867  - AIF2TX2MIX Input 2 Volume */
+	{ 0x0000074C, 0x0000 },    /* R1868  - AIF2TX2MIX Input 3 Source */
+	{ 0x0000074D, 0x0080 },    /* R1869  - AIF2TX2MIX Input 3 Volume */
+	{ 0x0000074E, 0x0000 },    /* R1870  - AIF2TX2MIX Input 4 Source */
+	{ 0x0000074F, 0x0080 },    /* R1871  - AIF2TX2MIX Input 4 Volume */
+	{ 0x00000750, 0x0000 },    /* R1872  - AIF2TX3MIX Input 1 Source */
+	{ 0x00000751, 0x0080 },    /* R1873  - AIF2TX3MIX Input 1 Volume */
+	{ 0x00000752, 0x0000 },    /* R1874  - AIF2TX3MIX Input 2 Source */
+	{ 0x00000753, 0x0080 },    /* R1875  - AIF2TX3MIX Input 2 Volume */
+	{ 0x00000754, 0x0000 },    /* R1876  - AIF2TX3MIX Input 3 Source */
+	{ 0x00000755, 0x0080 },    /* R1877  - AIF2TX3MIX Input 3 Volume */
+	{ 0x00000756, 0x0000 },    /* R1878  - AIF2TX3MIX Input 4 Source */
+	{ 0x00000757, 0x0080 },    /* R1879  - AIF2TX3MIX Input 4 Volume */
+	{ 0x00000758, 0x0000 },    /* R1880  - AIF2TX4MIX Input 1 Source */
+	{ 0x00000759, 0x0080 },    /* R1881  - AIF2TX4MIX Input 1 Volume */
+	{ 0x0000075A, 0x0000 },    /* R1882  - AIF2TX4MIX Input 2 Source */
+	{ 0x0000075B, 0x0080 },    /* R1883  - AIF2TX4MIX Input 2 Volume */
+	{ 0x0000075C, 0x0000 },    /* R1884  - AIF2TX4MIX Input 3 Source */
+	{ 0x0000075D, 0x0080 },    /* R1885  - AIF2TX4MIX Input 3 Volume */
+	{ 0x0000075E, 0x0000 },    /* R1886  - AIF2TX4MIX Input 4 Source */
+	{ 0x0000075F, 0x0080 },    /* R1887  - AIF2TX4MIX Input 4 Volume */
+	{ 0x00000760, 0x0000 },    /* R1888  - AIF2TX5MIX Input 1 Source */
+	{ 0x00000761, 0x0080 },    /* R1889  - AIF2TX5MIX Input 1 Volume */
+	{ 0x00000762, 0x0000 },    /* R1890  - AIF2TX5MIX Input 2 Source */
+	{ 0x00000763, 0x0080 },    /* R1891  - AIF2TX5MIX Input 2 Volume */
+	{ 0x00000764, 0x0000 },    /* R1892  - AIF2TX5MIX Input 3 Source */
+	{ 0x00000765, 0x0080 },    /* R1893  - AIF2TX5MIX Input 3 Volume */
+	{ 0x00000766, 0x0000 },    /* R1894  - AIF2TX5MIX Input 4 Source */
+	{ 0x00000767, 0x0080 },    /* R1895  - AIF2TX5MIX Input 4 Volume */
+	{ 0x00000768, 0x0000 },    /* R1896  - AIF2TX6MIX Input 1 Source */
+	{ 0x00000769, 0x0080 },    /* R1897  - AIF2TX6MIX Input 1 Volume */
+	{ 0x0000076A, 0x0000 },    /* R1898  - AIF2TX6MIX Input 2 Source */
+	{ 0x0000076B, 0x0080 },    /* R1899  - AIF2TX6MIX Input 2 Volume */
+	{ 0x0000076C, 0x0000 },    /* R1900  - AIF2TX6MIX Input 3 Source */
+	{ 0x0000076D, 0x0080 },    /* R1901  - AIF2TX6MIX Input 3 Volume */
+	{ 0x0000076E, 0x0000 },    /* R1902  - AIF2TX6MIX Input 4 Source */
+	{ 0x0000076F, 0x0080 },    /* R1903  - AIF2TX6MIX Input 4 Volume */
+	{ 0x00000780, 0x0000 },    /* R1920  - AIF3TX1MIX Input 1 Source */
+	{ 0x00000781, 0x0080 },    /* R1921  - AIF3TX1MIX Input 1 Volume */
+	{ 0x00000782, 0x0000 },    /* R1922  - AIF3TX1MIX Input 2 Source */
+	{ 0x00000783, 0x0080 },    /* R1923  - AIF3TX1MIX Input 2 Volume */
+	{ 0x00000784, 0x0000 },    /* R1924  - AIF3TX1MIX Input 3 Source */
+	{ 0x00000785, 0x0080 },    /* R1925  - AIF3TX1MIX Input 3 Volume */
+	{ 0x00000786, 0x0000 },    /* R1926  - AIF3TX1MIX Input 4 Source */
+	{ 0x00000787, 0x0080 },    /* R1927  - AIF3TX1MIX Input 4 Volume */
+	{ 0x00000788, 0x0000 },    /* R1928  - AIF3TX2MIX Input 1 Source */
+	{ 0x00000789, 0x0080 },    /* R1929  - AIF3TX2MIX Input 1 Volume */
+	{ 0x0000078A, 0x0000 },    /* R1930  - AIF3TX2MIX Input 2 Source */
+	{ 0x0000078B, 0x0080 },    /* R1931  - AIF3TX2MIX Input 2 Volume */
+	{ 0x0000078C, 0x0000 },    /* R1932  - AIF3TX2MIX Input 3 Source */
+	{ 0x0000078D, 0x0080 },    /* R1933  - AIF3TX2MIX Input 3 Volume */
+	{ 0x0000078E, 0x0000 },    /* R1934  - AIF3TX2MIX Input 4 Source */
+	{ 0x0000078F, 0x0080 },    /* R1935  - AIF3TX2MIX Input 4 Volume */
+	{ 0x000007C0, 0x0000 },    /* R1984  - SLIMTX1MIX Input 1 Source */
+	{ 0x000007C1, 0x0080 },    /* R1985  - SLIMTX1MIX Input 1 Volume */
+	{ 0x000007C8, 0x0000 },    /* R1992  - SLIMTX2MIX Input 1 Source */
+	{ 0x000007C9, 0x0080 },    /* R1993  - SLIMTX2MIX Input 1 Volume */
+	{ 0x000007D0, 0x0000 },    /* R2000  - SLIMTX3MIX Input 1 Source */
+	{ 0x000007D1, 0x0080 },    /* R2001  - SLIMTX3MIX Input 1 Volume */
+	{ 0x000007D8, 0x0000 },    /* R2008  - SLIMTX4MIX Input 1 Source */
+	{ 0x000007D9, 0x0080 },    /* R2009  - SLIMTX4MIX Input 1 Volume */
+	{ 0x000007E0, 0x0000 },    /* R2016  - SLIMTX5MIX Input 1 Source */
+	{ 0x000007E1, 0x0080 },    /* R2017  - SLIMTX5MIX Input 1 Volume */
+	{ 0x000007E8, 0x0000 },    /* R2024  - SLIMTX6MIX Input 1 Source */
+	{ 0x000007E9, 0x0080 },    /* R2025  - SLIMTX6MIX Input 1 Volume */
+	{ 0x00000800, 0x0000 },    /* R2048  - SPDIF1TX1MIX Input 1 Source */
+	{ 0x00000801, 0x0080 },    /* R2049  - SPDIF1TX1MIX Input 1 Volume */
+	{ 0x00000808, 0x0000 },    /* R2056  - SPDIF1TX2MIX Input 1 Source */
+	{ 0x00000809, 0x0080 },    /* R2057  - SPDIF1TX2MIX Input 1 Volume */
+	{ 0x00000880, 0x0000 },    /* R2176  - EQ1MIX Input 1 Source */
+	{ 0x00000881, 0x0080 },    /* R2177  - EQ1MIX Input 1 Volume */
+	{ 0x00000888, 0x0000 },    /* R2184  - EQ2MIX Input 1 Source */
+	{ 0x00000889, 0x0080 },    /* R2185  - EQ2MIX Input 1 Volume */
+	{ 0x00000890, 0x0000 },    /* R2192  - EQ3MIX Input 1 Source */
+	{ 0x00000891, 0x0080 },    /* R2193  - EQ3MIX Input 1 Volume */
+	{ 0x00000898, 0x0000 },    /* R2200  - EQ4MIX Input 1 Source */
+	{ 0x00000899, 0x0080 },    /* R2201  - EQ4MIX Input 1 Volume */
+	{ 0x000008C0, 0x0000 },    /* R2240  - DRC1LMIX Input 1 Source */
+	{ 0x000008C1, 0x0080 },    /* R2241  - DRC1LMIX Input 1 Volume */
+	{ 0x000008C8, 0x0000 },    /* R2248  - DRC1RMIX Input 1 Source */
+	{ 0x000008C9, 0x0080 },    /* R2249  - DRC1RMIX Input 1 Volume */
+	{ 0x00000900, 0x0000 },    /* R2304  - HPLP1MIX Input 1 Source */
+	{ 0x00000901, 0x0080 },    /* R2305  - HPLP1MIX Input 1 Volume */
+	{ 0x00000902, 0x0000 },    /* R2306  - HPLP1MIX Input 2 Source */
+	{ 0x00000903, 0x0080 },    /* R2307  - HPLP1MIX Input 2 Volume */
+	{ 0x00000904, 0x0000 },    /* R2308  - HPLP1MIX Input 3 Source */
+	{ 0x00000905, 0x0080 },    /* R2309  - HPLP1MIX Input 3 Volume */
+	{ 0x00000906, 0x0000 },    /* R2310  - HPLP1MIX Input 4 Source */
+	{ 0x00000907, 0x0080 },    /* R2311  - HPLP1MIX Input 4 Volume */
+	{ 0x00000908, 0x0000 },    /* R2312  - HPLP2MIX Input 1 Source */
+	{ 0x00000909, 0x0080 },    /* R2313  - HPLP2MIX Input 1 Volume */
+	{ 0x0000090A, 0x0000 },    /* R2314  - HPLP2MIX Input 2 Source */
+	{ 0x0000090B, 0x0080 },    /* R2315  - HPLP2MIX Input 2 Volume */
+	{ 0x0000090C, 0x0000 },    /* R2316  - HPLP2MIX Input 3 Source */
+	{ 0x0000090D, 0x0080 },    /* R2317  - HPLP2MIX Input 3 Volume */
+	{ 0x0000090E, 0x0000 },    /* R2318  - HPLP2MIX Input 4 Source */
+	{ 0x0000090F, 0x0080 },    /* R2319  - HPLP2MIX Input 4 Volume */
+	{ 0x00000910, 0x0000 },    /* R2320  - HPLP3MIX Input 1 Source */
+	{ 0x00000911, 0x0080 },    /* R2321  - HPLP3MIX Input 1 Volume */
+	{ 0x00000912, 0x0000 },    /* R2322  - HPLP3MIX Input 2 Source */
+	{ 0x00000913, 0x0080 },    /* R2323  - HPLP3MIX Input 2 Volume */
+	{ 0x00000914, 0x0000 },    /* R2324  - HPLP3MIX Input 3 Source */
+	{ 0x00000915, 0x0080 },    /* R2325  - HPLP3MIX Input 3 Volume */
+	{ 0x00000916, 0x0000 },    /* R2326  - HPLP3MIX Input 4 Source */
+	{ 0x00000917, 0x0080 },    /* R2327  - HPLP3MIX Input 4 Volume */
+	{ 0x00000918, 0x0000 },    /* R2328  - HPLP4MIX Input 1 Source */
+	{ 0x00000919, 0x0080 },    /* R2329  - HPLP4MIX Input 1 Volume */
+	{ 0x0000091A, 0x0000 },    /* R2330  - HPLP4MIX Input 2 Source */
+	{ 0x0000091B, 0x0080 },    /* R2331  - HPLP4MIX Input 2 Volume */
+	{ 0x0000091C, 0x0000 },    /* R2332  - HPLP4MIX Input 3 Source */
+	{ 0x0000091D, 0x0080 },    /* R2333  - HPLP4MIX Input 3 Volume */
+	{ 0x0000091E, 0x0000 },    /* R2334  - HPLP4MIX Input 4 Source */
+	{ 0x0000091F, 0x0080 },    /* R2335  - HPLP4MIX Input 4 Volume */
+	{ 0x00000A80, 0x0000 },    /* R2688  - ASRC1LMIX Input 1 Source */
+	{ 0x00000A88, 0x0000 },    /* R2696  - ASRC1RMIX Input 1 Source */
+	{ 0x00000A90, 0x0000 },    /* R2704  - ASRC2LMIX Input 1 Source */
+	{ 0x00000A98, 0x0000 },    /* R2712  - ASRC2RMIX Input 1 Source */
+	{ 0x00000B00, 0x0000 },    /* R2816  - ISRC1DEC1MIX Input 1 Source */
+	{ 0x00000B08, 0x0000 },    /* R2824  - ISRC1DEC2MIX Input 1 Source */
+	{ 0x00000B10, 0x0000 },    /* R2832  - ISRC1DEC3MIX Input 1 Source */
+	{ 0x00000B18, 0x0000 },    /* R2840  - ISRC1DEC4MIX Input 1 Source */
+	{ 0x00000B20, 0x0000 },    /* R2848  - ISRC1INT1MIX Input 1 Source */
+	{ 0x00000B28, 0x0000 },    /* R2856  - ISRC1INT2MIX Input 1 Source */
+	{ 0x00000B30, 0x0000 },    /* R2864  - ISRC1INT3MIX Input 1 Source */
+	{ 0x00000B38, 0x0000 },    /* R2872  - ISRC1INT4MIX Input 1 Source */
+	{ 0x00000B40, 0x0000 },    /* R2880  - ISRC2DEC1MIX Input 1 Source */
+	{ 0x00000B48, 0x0000 },    /* R2888  - ISRC2DEC2MIX Input 1 Source */
+	{ 0x00000B60, 0x0000 },    /* R2912  - ISRC2INT1MIX Input 1 Source */
+	{ 0x00000B68, 0x0000 },    /* R2920  - ISRC2INT2MIX Input 1 Source */
+	{ 0x00000C00, 0xA101 },    /* R3072  - GPIO1 CTRL */
+	{ 0x00000C01, 0xA101 },    /* R3073  - GPIO2 CTRL */
+	{ 0x00000C02, 0xA101 },    /* R3074  - GPIO3 CTRL */
+	{ 0x00000C03, 0xA101 },    /* R3075  - GPIO4 CTRL */
+	{ 0x00000C04, 0xA101 },    /* R3076  - GPIO5 CTRL */
+	{ 0x00000C0F, 0x0400 },    /* R3087  - IRQ CTRL 1 */
+	{ 0x00000C10, 0x1000 },    /* R3088  - GPIO Debounce Config */
+	{ 0x00000C18, 0x0000 },    /* R3096  - GP Switch 1 */
+	{ 0x00000C20, 0x8002 },    /* R3104  - Misc Pad Ctrl 1 */
+	{ 0x00000C21, 0x8001 },    /* R3105  - Misc Pad Ctrl 2 */
+	{ 0x00000C22, 0x0000 },    /* R3106  - Misc Pad Ctrl 3 */
+	{ 0x00000C23, 0x0000 },    /* R3107  - Misc Pad Ctrl 4 */
+	{ 0x00000C24, 0x0000 },    /* R3108  - Misc Pad Ctrl 5 */
+	{ 0x00000C25, 0x0000 },    /* R3109  - Misc Pad Ctrl 6 */
+	{ 0x00000D08, 0xFFFF },    /* R3336  - Interrupt Status 1 Mask */
+	{ 0x00000D09, 0xFFFF },    /* R3337  - Interrupt Status 2 Mask */
+	{ 0x00000D0A, 0xFFFF },    /* R3338  - Interrupt Status 3 Mask */
+	{ 0x00000D0B, 0xFFFF },    /* R3339  - Interrupt Status 4 Mask */
+	{ 0x00000D0C, 0xFEFF },    /* R3340  - Interrupt Status 5 Mask */
+	{ 0x00000D0F, 0x0000 },    /* R3343  - Interrupt Control */
+	{ 0x00000D18, 0xFFFF },    /* R3352  - IRQ2 Status 1 Mask */
+	{ 0x00000D19, 0xFFFF },    /* R3353  - IRQ2 Status 2 Mask */
+	{ 0x00000D1A, 0xFFFF },    /* R3354  - IRQ2 Status 3 Mask */
+	{ 0x00000D1B, 0xFFFF },    /* R3355  - IRQ2 Status 4 Mask */
+	{ 0x00000D1C, 0xFEFF },    /* R3356  - IRQ2 Status 5 Mask */
+	{ 0x00000D1D, 0xFFFF },    /* R3357  - IRQ2 Status 6 Mask */
+	{ 0x00000D1F, 0x0000 },    /* R3359  - IRQ2 Control */
+	{ 0x00000D53, 0xFFFF },    /* R3411  - AOD IRQ Mask IRQ1 */
+	{ 0x00000D54, 0xFFFF },    /* R3412  - AOD IRQ Mask IRQ2 */
+	{ 0x00000D56, 0x0000 },    /* R3414  - Jack detect debounce */
+	{ 0x00000E00, 0x0000 },    /* R3584  - FX_Ctrl1 */
+	{ 0x00000E01, 0x0000 },    /* R3585  - FX_Ctrl2 */
+	{ 0x00000E10, 0x6318 },    /* R3600  - EQ1_1 */
+	{ 0x00000E11, 0x6300 },    /* R3601  - EQ1_2 */
+	{ 0x00000E12, 0x0FC8 },    /* R3602  - EQ1_3 */
+	{ 0x00000E13, 0x03FE },    /* R3603  - EQ1_4 */
+	{ 0x00000E14, 0x00E0 },    /* R3604  - EQ1_5 */
+	{ 0x00000E15, 0x1EC4 },    /* R3605  - EQ1_6 */
+	{ 0x00000E16, 0xF136 },    /* R3606  - EQ1_7 */
+	{ 0x00000E17, 0x0409 },    /* R3607  - EQ1_8 */
+	{ 0x00000E18, 0x04CC },    /* R3608  - EQ1_9 */
+	{ 0x00000E19, 0x1C9B },    /* R3609  - EQ1_10 */
+	{ 0x00000E1A, 0xF337 },    /* R3610  - EQ1_11 */
+	{ 0x00000E1B, 0x040B },    /* R3611  - EQ1_12 */
+	{ 0x00000E1C, 0x0CBB },    /* R3612  - EQ1_13 */
+	{ 0x00000E1D, 0x16F8 },    /* R3613  - EQ1_14 */
+	{ 0x00000E1E, 0xF7D9 },    /* R3614  - EQ1_15 */
+	{ 0x00000E1F, 0x040A },    /* R3615  - EQ1_16 */
+	{ 0x00000E20, 0x1F14 },    /* R3616  - EQ1_17 */
+	{ 0x00000E21, 0x058C },    /* R3617  - EQ1_18 */
+	{ 0x00000E22, 0x0563 },    /* R3618  - EQ1_19 */
+	{ 0x00000E23, 0x4000 },    /* R3619  - EQ1_20 */
+	{ 0x00000E24, 0x0B75 },    /* R3620  - EQ1_21 */
+	{ 0x00000E26, 0x6318 },    /* R3622  - EQ2_1 */
+	{ 0x00000E27, 0x6300 },    /* R3623  - EQ2_2 */
+	{ 0x00000E28, 0x0FC8 },    /* R3624  - EQ2_3 */
+	{ 0x00000E29, 0x03FE },    /* R3625  - EQ2_4 */
+	{ 0x00000E2A, 0x00E0 },    /* R3626  - EQ2_5 */
+	{ 0x00000E2B, 0x1EC4 },    /* R3627  - EQ2_6 */
+	{ 0x00000E2C, 0xF136 },    /* R3628  - EQ2_7 */
+	{ 0x00000E2D, 0x0409 },    /* R3629  - EQ2_8 */
+	{ 0x00000E2E, 0x04CC },    /* R3630  - EQ2_9 */
+	{ 0x00000E2F, 0x1C9B },    /* R3631  - EQ2_10 */
+	{ 0x00000E30, 0xF337 },    /* R3632  - EQ2_11 */
+	{ 0x00000E31, 0x040B },    /* R3633  - EQ2_12 */
+	{ 0x00000E32, 0x0CBB },    /* R3634  - EQ2_13 */
+	{ 0x00000E33, 0x16F8 },    /* R3635  - EQ2_14 */
+	{ 0x00000E34, 0xF7D9 },    /* R3636  - EQ2_15 */
+	{ 0x00000E35, 0x040A },    /* R3637  - EQ2_16 */
+	{ 0x00000E36, 0x1F14 },    /* R3638  - EQ2_17 */
+	{ 0x00000E37, 0x058C },    /* R3639  - EQ2_18 */
+	{ 0x00000E38, 0x0563 },    /* R3640  - EQ2_19 */
+	{ 0x00000E39, 0x4000 },    /* R3641  - EQ2_20 */
+	{ 0x00000E3A, 0x0B75 },    /* R3642  - EQ2_21 */
+	{ 0x00000E3C, 0x6318 },    /* R3644  - EQ3_1 */
+	{ 0x00000E3D, 0x6300 },    /* R3645  - EQ3_2 */
+	{ 0x00000E3E, 0x0FC8 },    /* R3646  - EQ3_3 */
+	{ 0x00000E3F, 0x03FE },    /* R3647  - EQ3_4 */
+	{ 0x00000E40, 0x00E0 },    /* R3648  - EQ3_5 */
+	{ 0x00000E41, 0x1EC4 },    /* R3649  - EQ3_6 */
+	{ 0x00000E42, 0xF136 },    /* R3650  - EQ3_7 */
+	{ 0x00000E43, 0x0409 },    /* R3651  - EQ3_8 */
+	{ 0x00000E44, 0x04CC },    /* R3652  - EQ3_9 */
+	{ 0x00000E45, 0x1C9B },    /* R3653  - EQ3_10 */
+	{ 0x00000E46, 0xF337 },    /* R3654  - EQ3_11 */
+	{ 0x00000E47, 0x040B },    /* R3655  - EQ3_12 */
+	{ 0x00000E48, 0x0CBB },    /* R3656  - EQ3_13 */
+	{ 0x00000E49, 0x16F8 },    /* R3657  - EQ3_14 */
+	{ 0x00000E4A, 0xF7D9 },    /* R3658  - EQ3_15 */
+	{ 0x00000E4B, 0x040A },    /* R3659  - EQ3_16 */
+	{ 0x00000E4C, 0x1F14 },    /* R3660  - EQ3_17 */
+	{ 0x00000E4D, 0x058C },    /* R3661  - EQ3_18 */
+	{ 0x00000E4E, 0x0563 },    /* R3662  - EQ3_19 */
+	{ 0x00000E4F, 0x4000 },    /* R3663  - EQ3_20 */
+	{ 0x00000E50, 0x0B75 },    /* R3664  - EQ3_21 */
+	{ 0x00000E52, 0x6318 },    /* R3666  - EQ4_1 */
+	{ 0x00000E53, 0x6300 },    /* R3667  - EQ4_2 */
+	{ 0x00000E54, 0x0FC8 },    /* R3668  - EQ4_3 */
+	{ 0x00000E55, 0x03FE },    /* R3669  - EQ4_4 */
+	{ 0x00000E56, 0x00E0 },    /* R3670  - EQ4_5 */
+	{ 0x00000E57, 0x1EC4 },    /* R3671  - EQ4_6 */
+	{ 0x00000E58, 0xF136 },    /* R3672  - EQ4_7 */
+	{ 0x00000E59, 0x0409 },    /* R3673  - EQ4_8 */
+	{ 0x00000E5A, 0x04CC },    /* R3674  - EQ4_9 */
+	{ 0x00000E5B, 0x1C9B },    /* R3675  - EQ4_10 */
+	{ 0x00000E5C, 0xF337 },    /* R3676  - EQ4_11 */
+	{ 0x00000E5D, 0x040B },    /* R3677  - EQ4_12 */
+	{ 0x00000E5E, 0x0CBB },    /* R3678  - EQ4_13 */
+	{ 0x00000E5F, 0x16F8 },    /* R3679  - EQ4_14 */
+	{ 0x00000E60, 0xF7D9 },    /* R3680  - EQ4_15 */
+	{ 0x00000E61, 0x040A },    /* R3681  - EQ4_16 */
+	{ 0x00000E62, 0x1F14 },    /* R3682  - EQ4_17 */
+	{ 0x00000E63, 0x058C },    /* R3683  - EQ4_18 */
+	{ 0x00000E64, 0x0563 },    /* R3684  - EQ4_19 */
+	{ 0x00000E65, 0x4000 },    /* R3685  - EQ4_20 */
+	{ 0x00000E66, 0x0B75 },    /* R3686  - EQ4_21 */
+	{ 0x00000E80, 0x0018 },    /* R3712  - DRC1 ctrl1 */
+	{ 0x00000E81, 0x0933 },    /* R3713  - DRC1 ctrl2 */
+	{ 0x00000E82, 0x0018 },    /* R3714  - DRC1 ctrl3 */
+	{ 0x00000E83, 0x0000 },    /* R3715  - DRC1 ctrl4 */
+	{ 0x00000E84, 0x0000 },    /* R3716  - DRC1 ctrl5 */
+	{ 0x00000EC0, 0x0000 },    /* R3776  - HPLPF1_1 */
+	{ 0x00000EC1, 0x0000 },    /* R3777  - HPLPF1_2 */
+	{ 0x00000EC4, 0x0000 },    /* R3780  - HPLPF2_1 */
+	{ 0x00000EC5, 0x0000 },    /* R3781  - HPLPF2_2 */
+	{ 0x00000EC8, 0x0000 },    /* R3784  - HPLPF3_1 */
+	{ 0x00000EC9, 0x0000 },    /* R3785  - HPLPF3_2 */
+	{ 0x00000ECC, 0x0000 },    /* R3788  - HPLPF4_1 */
+	{ 0x00000ECD, 0x0000 },    /* R3789  - HPLPF4_2 */
+	{ 0x00000EE0, 0x0000 },    /* R3808  - ASRC_ENABLE */
+	{ 0x00000EE2, 0x0000 },    /* R3810  - ASRC_RATE1 */
+	{ 0x00000EE3, 0x4000 },    /* R3811  - ASRC_RATE2 */
+	{ 0x00000EF0, 0x0000 },    /* R3824  - ISRC 1 CTRL 1 */
+	{ 0x00000EF1, 0x0001 },    /* R3825  - ISRC 1 CTRL 2 */
+	{ 0x00000EF2, 0x0000 },    /* R3826  - ISRC 1 CTRL 3 */
+	{ 0x00000EF3, 0x0000 },    /* R3827  - ISRC 2 CTRL 1 */
+	{ 0x00000EF4, 0x0001 },    /* R3828  - ISRC 2 CTRL 2 */
+	{ 0x00000EF5, 0x0000 },    /* R3829  - ISRC 2 CTRL 3 */
+	{ 0x00001700, 0x0000 },    /* R5888  - FRF_COEFF_1 */
+	{ 0x00001701, 0x0000 },    /* R5889  - FRF_COEFF_2 */
+	{ 0x00001702, 0x0000 },    /* R5890  - FRF_COEFF_3 */
+	{ 0x00001703, 0x0000 },    /* R5891  - FRF_COEFF_4 */
+	{ 0x00001704, 0x0000 },    /* R5892  - DAC_COMP_1 */
+	{ 0x00001705, 0x0000 },    /* R5893  - DAC_COMP_2 */
+};
+
+static bool wm8998_readable_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case ARIZONA_SOFTWARE_RESET:
+	case ARIZONA_DEVICE_REVISION:
+	case ARIZONA_CTRL_IF_SPI_CFG_1:
+	case ARIZONA_CTRL_IF_I2C1_CFG_1:
+	case ARIZONA_CTRL_IF_I2C1_CFG_2:
+	case ARIZONA_WRITE_SEQUENCER_CTRL_0:
+	case ARIZONA_WRITE_SEQUENCER_CTRL_1:
+	case ARIZONA_WRITE_SEQUENCER_CTRL_2:
+	case ARIZONA_TONE_GENERATOR_1:
+	case ARIZONA_TONE_GENERATOR_2:
+	case ARIZONA_TONE_GENERATOR_3:
+	case ARIZONA_TONE_GENERATOR_4:
+	case ARIZONA_TONE_GENERATOR_5:
+	case ARIZONA_PWM_DRIVE_1:
+	case ARIZONA_PWM_DRIVE_2:
+	case ARIZONA_PWM_DRIVE_3:
+	case ARIZONA_WAKE_CONTROL:
+	case ARIZONA_SEQUENCE_CONTROL:
+	case ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_1:
+	case ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_2:
+	case ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_3:
+	case ARIZONA_SAMPLE_RATE_SEQUENCE_SELECT_4:
+	case ARIZONA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_1:
+	case ARIZONA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_2:
+	case ARIZONA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_3:
+	case ARIZONA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_4:
+	case ARIZONA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_5:
+	case ARIZONA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_6:
+	case ARIZONA_HAPTICS_CONTROL_1:
+	case ARIZONA_HAPTICS_CONTROL_2:
+	case ARIZONA_HAPTICS_PHASE_1_INTENSITY:
+	case ARIZONA_HAPTICS_PHASE_1_DURATION:
+	case ARIZONA_HAPTICS_PHASE_2_INTENSITY:
+	case ARIZONA_HAPTICS_PHASE_2_DURATION:
+	case ARIZONA_HAPTICS_PHASE_3_INTENSITY:
+	case ARIZONA_HAPTICS_PHASE_3_DURATION:
+	case ARIZONA_HAPTICS_STATUS:
+	case ARIZONA_CLOCK_32K_1:
+	case ARIZONA_SYSTEM_CLOCK_1:
+	case ARIZONA_SAMPLE_RATE_1:
+	case ARIZONA_SAMPLE_RATE_2:
+	case ARIZONA_SAMPLE_RATE_3:
+	case ARIZONA_SAMPLE_RATE_1_STATUS:
+	case ARIZONA_SAMPLE_RATE_2_STATUS:
+	case ARIZONA_SAMPLE_RATE_3_STATUS:
+	case ARIZONA_ASYNC_CLOCK_1:
+	case ARIZONA_ASYNC_SAMPLE_RATE_1:
+	case ARIZONA_ASYNC_SAMPLE_RATE_1_STATUS:
+	case ARIZONA_ASYNC_SAMPLE_RATE_2:
+	case ARIZONA_ASYNC_SAMPLE_RATE_2_STATUS:
+	case ARIZONA_OUTPUT_SYSTEM_CLOCK:
+	case ARIZONA_OUTPUT_ASYNC_CLOCK:
+	case ARIZONA_RATE_ESTIMATOR_1:
+	case ARIZONA_RATE_ESTIMATOR_2:
+	case ARIZONA_RATE_ESTIMATOR_3:
+	case ARIZONA_RATE_ESTIMATOR_4:
+	case ARIZONA_RATE_ESTIMATOR_5:
+	case ARIZONA_DYNAMIC_FREQUENCY_SCALING_1:
+	case ARIZONA_FLL1_CONTROL_1:
+	case ARIZONA_FLL1_CONTROL_2:
+	case ARIZONA_FLL1_CONTROL_3:
+	case ARIZONA_FLL1_CONTROL_4:
+	case ARIZONA_FLL1_CONTROL_5:
+	case ARIZONA_FLL1_CONTROL_6:
+	case ARIZONA_FLL1_CONTROL_7:
+	case ARIZONA_FLL1_LOOP_FILTER_TEST_1:
+	case ARIZONA_FLL1_NCO_TEST_0:
+	case ARIZONA_FLL1_SYNCHRONISER_1:
+	case ARIZONA_FLL1_SYNCHRONISER_2:
+	case ARIZONA_FLL1_SYNCHRONISER_3:
+	case ARIZONA_FLL1_SYNCHRONISER_4:
+	case ARIZONA_FLL1_SYNCHRONISER_5:
+	case ARIZONA_FLL1_SYNCHRONISER_6:
+	case ARIZONA_FLL1_SYNCHRONISER_7:
+	case ARIZONA_FLL1_SPREAD_SPECTRUM:
+	case ARIZONA_FLL1_GPIO_CLOCK:
+	case ARIZONA_FLL2_CONTROL_1:
+	case ARIZONA_FLL2_CONTROL_2:
+	case ARIZONA_FLL2_CONTROL_3:
+	case ARIZONA_FLL2_CONTROL_4:
+	case ARIZONA_FLL2_CONTROL_5:
+	case ARIZONA_FLL2_CONTROL_6:
+	case ARIZONA_FLL2_CONTROL_7:
+	case ARIZONA_FLL2_LOOP_FILTER_TEST_1:
+	case ARIZONA_FLL2_NCO_TEST_0:
+	case ARIZONA_FLL2_SYNCHRONISER_1:
+	case ARIZONA_FLL2_SYNCHRONISER_2:
+	case ARIZONA_FLL2_SYNCHRONISER_3:
+	case ARIZONA_FLL2_SYNCHRONISER_4:
+	case ARIZONA_FLL2_SYNCHRONISER_5:
+	case ARIZONA_FLL2_SYNCHRONISER_6:
+	case ARIZONA_FLL2_SYNCHRONISER_7:
+	case ARIZONA_FLL2_SPREAD_SPECTRUM:
+	case ARIZONA_FLL2_GPIO_CLOCK:
+	case ARIZONA_MIC_CHARGE_PUMP_1:
+	case ARIZONA_LDO1_CONTROL_1:
+	case ARIZONA_LDO1_CONTROL_2:
+	case ARIZONA_LDO2_CONTROL_1:
+	case ARIZONA_MIC_BIAS_CTRL_1:
+	case ARIZONA_MIC_BIAS_CTRL_2:
+	case ARIZONA_MIC_BIAS_CTRL_3:
+	case ARIZONA_ACCESSORY_DETECT_MODE_1:
+	case ARIZONA_HEADPHONE_DETECT_1:
+	case ARIZONA_HEADPHONE_DETECT_2:
+	case ARIZONA_MICD_CLAMP_CONTROL:
+	case ARIZONA_MIC_DETECT_1:
+	case ARIZONA_MIC_DETECT_2:
+	case ARIZONA_MIC_DETECT_3:
+	case ARIZONA_MIC_DETECT_4:
+	case ARIZONA_MIC_DETECT_LEVEL_1:
+	case ARIZONA_MIC_DETECT_LEVEL_2:
+	case ARIZONA_MIC_DETECT_LEVEL_3:
+	case ARIZONA_MIC_DETECT_LEVEL_4:
+	case ARIZONA_ISOLATION_CONTROL:
+	case ARIZONA_JACK_DETECT_ANALOGUE:
+	case ARIZONA_INPUT_ENABLES:
+	case ARIZONA_INPUT_ENABLES_STATUS:
+	case ARIZONA_INPUT_RATE:
+	case ARIZONA_INPUT_VOLUME_RAMP:
+	case ARIZONA_HPF_CONTROL:
+	case ARIZONA_IN1L_CONTROL:
+	case ARIZONA_ADC_DIGITAL_VOLUME_1L:
+	case ARIZONA_DMIC1L_CONTROL:
+	case ARIZONA_IN1R_CONTROL:
+	case ARIZONA_ADC_DIGITAL_VOLUME_1R:
+	case ARIZONA_DMIC1R_CONTROL:
+	case ARIZONA_IN2L_CONTROL:
+	case ARIZONA_ADC_DIGITAL_VOLUME_2L:
+	case ARIZONA_DMIC2L_CONTROL:
+	case ARIZONA_OUTPUT_ENABLES_1:
+	case ARIZONA_OUTPUT_STATUS_1:
+	case ARIZONA_RAW_OUTPUT_STATUS_1:
+	case ARIZONA_OUTPUT_RATE_1:
+	case ARIZONA_OUTPUT_VOLUME_RAMP:
+	case ARIZONA_OUTPUT_PATH_CONFIG_1L:
+	case ARIZONA_DAC_DIGITAL_VOLUME_1L:
+	case ARIZONA_NOISE_GATE_SELECT_1L:
+	case ARIZONA_OUTPUT_PATH_CONFIG_1R:
+	case ARIZONA_DAC_DIGITAL_VOLUME_1R:
+	case ARIZONA_NOISE_GATE_SELECT_1R:
+	case ARIZONA_OUTPUT_PATH_CONFIG_2L:
+	case ARIZONA_DAC_DIGITAL_VOLUME_2L:
+	case ARIZONA_NOISE_GATE_SELECT_2L:
+	case ARIZONA_OUTPUT_PATH_CONFIG_2R:
+	case ARIZONA_DAC_DIGITAL_VOLUME_2R:
+	case ARIZONA_NOISE_GATE_SELECT_2R:
+	case ARIZONA_OUTPUT_PATH_CONFIG_3L:
+	case ARIZONA_DAC_DIGITAL_VOLUME_3L:
+	case ARIZONA_NOISE_GATE_SELECT_3L:
+	case ARIZONA_OUTPUT_PATH_CONFIG_4L:
+	case ARIZONA_DAC_DIGITAL_VOLUME_4L:
+	case ARIZONA_NOISE_GATE_SELECT_4L:
+	case ARIZONA_OUTPUT_PATH_CONFIG_4R:
+	case ARIZONA_DAC_DIGITAL_VOLUME_4R:
+	case ARIZONA_NOISE_GATE_SELECT_4R:
+	case ARIZONA_OUTPUT_PATH_CONFIG_5L:
+	case ARIZONA_DAC_DIGITAL_VOLUME_5L:
+	case ARIZONA_NOISE_GATE_SELECT_5L:
+	case ARIZONA_OUTPUT_PATH_CONFIG_5R:
+	case ARIZONA_DAC_DIGITAL_VOLUME_5R:
+	case ARIZONA_NOISE_GATE_SELECT_5R:
+	case ARIZONA_DRE_ENABLE:
+	case ARIZONA_DRE_CONTROL_1:
+	case ARIZONA_DRE_CONTROL_2:
+	case ARIZONA_DRE_CONTROL_3:
+	case ARIZONA_EDRE_ENABLE:
+	case ARIZONA_DAC_AEC_CONTROL_1:
+	case ARIZONA_DAC_AEC_CONTROL_2:
+	case ARIZONA_NOISE_GATE_CONTROL:
+	case ARIZONA_PDM_SPK1_CTRL_1:
+	case ARIZONA_PDM_SPK1_CTRL_2:
+	case ARIZONA_HP_TEST_CTRL_13:
+	case ARIZONA_AIF1_BCLK_CTRL:
+	case ARIZONA_AIF1_TX_PIN_CTRL:
+	case ARIZONA_AIF1_RX_PIN_CTRL:
+	case ARIZONA_AIF1_RATE_CTRL:
+	case ARIZONA_AIF1_FORMAT:
+	case ARIZONA_AIF1_RX_BCLK_RATE:
+	case ARIZONA_AIF1_FRAME_CTRL_1:
+	case ARIZONA_AIF1_FRAME_CTRL_2:
+	case ARIZONA_AIF1_FRAME_CTRL_3:
+	case ARIZONA_AIF1_FRAME_CTRL_4:
+	case ARIZONA_AIF1_FRAME_CTRL_5:
+	case ARIZONA_AIF1_FRAME_CTRL_6:
+	case ARIZONA_AIF1_FRAME_CTRL_7:
+	case ARIZONA_AIF1_FRAME_CTRL_8:
+	case ARIZONA_AIF1_FRAME_CTRL_11:
+	case ARIZONA_AIF1_FRAME_CTRL_12:
+	case ARIZONA_AIF1_FRAME_CTRL_13:
+	case ARIZONA_AIF1_FRAME_CTRL_14:
+	case ARIZONA_AIF1_FRAME_CTRL_15:
+	case ARIZONA_AIF1_FRAME_CTRL_16:
+	case ARIZONA_AIF1_TX_ENABLES:
+	case ARIZONA_AIF1_RX_ENABLES:
+	case ARIZONA_AIF2_BCLK_CTRL:
+	case ARIZONA_AIF2_TX_PIN_CTRL:
+	case ARIZONA_AIF2_RX_PIN_CTRL:
+	case ARIZONA_AIF2_RATE_CTRL:
+	case ARIZONA_AIF2_FORMAT:
+	case ARIZONA_AIF2_RX_BCLK_RATE:
+	case ARIZONA_AIF2_FRAME_CTRL_1:
+	case ARIZONA_AIF2_FRAME_CTRL_2:
+	case ARIZONA_AIF2_FRAME_CTRL_3:
+	case ARIZONA_AIF2_FRAME_CTRL_4:
+	case ARIZONA_AIF2_FRAME_CTRL_5:
+	case ARIZONA_AIF2_FRAME_CTRL_6:
+	case ARIZONA_AIF2_FRAME_CTRL_7:
+	case ARIZONA_AIF2_FRAME_CTRL_8:
+	case ARIZONA_AIF2_FRAME_CTRL_11:
+	case ARIZONA_AIF2_FRAME_CTRL_12:
+	case ARIZONA_AIF2_FRAME_CTRL_13:
+	case ARIZONA_AIF2_FRAME_CTRL_14:
+	case ARIZONA_AIF2_FRAME_CTRL_15:
+	case ARIZONA_AIF2_FRAME_CTRL_16:
+	case ARIZONA_AIF2_TX_ENABLES:
+	case ARIZONA_AIF2_RX_ENABLES:
+	case ARIZONA_AIF3_BCLK_CTRL:
+	case ARIZONA_AIF3_TX_PIN_CTRL:
+	case ARIZONA_AIF3_RX_PIN_CTRL:
+	case ARIZONA_AIF3_RATE_CTRL:
+	case ARIZONA_AIF3_FORMAT:
+	case ARIZONA_AIF3_RX_BCLK_RATE:
+	case ARIZONA_AIF3_FRAME_CTRL_1:
+	case ARIZONA_AIF3_FRAME_CTRL_2:
+	case ARIZONA_AIF3_FRAME_CTRL_3:
+	case ARIZONA_AIF3_FRAME_CTRL_4:
+	case ARIZONA_AIF3_FRAME_CTRL_11:
+	case ARIZONA_AIF3_FRAME_CTRL_12:
+	case ARIZONA_AIF3_TX_ENABLES:
+	case ARIZONA_AIF3_RX_ENABLES:
+	case ARIZONA_SPD1_TX_CONTROL:
+	case ARIZONA_SPD1_TX_CHANNEL_STATUS_1:
+	case ARIZONA_SPD1_TX_CHANNEL_STATUS_2:
+	case ARIZONA_SPD1_TX_CHANNEL_STATUS_3:
+	case ARIZONA_SLIMBUS_FRAMER_REF_GEAR:
+	case ARIZONA_SLIMBUS_RATES_1:
+	case ARIZONA_SLIMBUS_RATES_2:
+	case ARIZONA_SLIMBUS_RATES_5:
+	case ARIZONA_SLIMBUS_RATES_6:
+	case ARIZONA_SLIMBUS_RATES_7:
+	case ARIZONA_SLIMBUS_RX_CHANNEL_ENABLE:
+	case ARIZONA_SLIMBUS_TX_CHANNEL_ENABLE:
+	case ARIZONA_SLIMBUS_RX_PORT_STATUS:
+	case ARIZONA_SLIMBUS_TX_PORT_STATUS:
+	case ARIZONA_PWM1MIX_INPUT_1_SOURCE:
+	case ARIZONA_PWM1MIX_INPUT_1_VOLUME:
+	case ARIZONA_PWM1MIX_INPUT_2_SOURCE:
+	case ARIZONA_PWM1MIX_INPUT_2_VOLUME:
+	case ARIZONA_PWM1MIX_INPUT_3_SOURCE:
+	case ARIZONA_PWM1MIX_INPUT_3_VOLUME:
+	case ARIZONA_PWM1MIX_INPUT_4_SOURCE:
+	case ARIZONA_PWM1MIX_INPUT_4_VOLUME:
+	case ARIZONA_PWM2MIX_INPUT_1_SOURCE:
+	case ARIZONA_PWM2MIX_INPUT_1_VOLUME:
+	case ARIZONA_PWM2MIX_INPUT_2_SOURCE:
+	case ARIZONA_PWM2MIX_INPUT_2_VOLUME:
+	case ARIZONA_PWM2MIX_INPUT_3_SOURCE:
+	case ARIZONA_PWM2MIX_INPUT_3_VOLUME:
+	case ARIZONA_PWM2MIX_INPUT_4_SOURCE:
+	case ARIZONA_PWM2MIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT1LMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT1LMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT1LMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT1LMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT1LMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT1LMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT1LMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT1LMIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT1RMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT1RMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT1RMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT1RMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT1RMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT1RMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT1RMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT1RMIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT2LMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT2LMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT2LMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT2LMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT2LMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT2LMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT2LMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT2LMIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT2RMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT2RMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT2RMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT2RMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT2RMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT2RMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT2RMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT2RMIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT3LMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT3LMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT3LMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT3LMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT3LMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT3LMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT3LMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT3LMIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT4LMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT4LMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT4LMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT4LMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT4LMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT4LMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT4LMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT4LMIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT4RMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT4RMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT4RMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT4RMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT4RMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT4RMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT4RMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT4RMIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT5LMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT5LMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT5LMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT5LMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT5LMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT5LMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT5LMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT5LMIX_INPUT_4_VOLUME:
+	case ARIZONA_OUT5RMIX_INPUT_1_SOURCE:
+	case ARIZONA_OUT5RMIX_INPUT_1_VOLUME:
+	case ARIZONA_OUT5RMIX_INPUT_2_SOURCE:
+	case ARIZONA_OUT5RMIX_INPUT_2_VOLUME:
+	case ARIZONA_OUT5RMIX_INPUT_3_SOURCE:
+	case ARIZONA_OUT5RMIX_INPUT_3_VOLUME:
+	case ARIZONA_OUT5RMIX_INPUT_4_SOURCE:
+	case ARIZONA_OUT5RMIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF1TX1MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF1TX1MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF1TX1MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF1TX1MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF1TX1MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF1TX1MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF1TX1MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF1TX1MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF1TX2MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF1TX2MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF1TX2MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF1TX2MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF1TX2MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF1TX2MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF1TX2MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF1TX2MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF1TX3MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF1TX3MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF1TX3MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF1TX3MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF1TX3MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF1TX3MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF1TX3MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF1TX3MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF1TX4MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF1TX4MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF1TX4MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF1TX4MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF1TX4MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF1TX4MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF1TX4MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF1TX4MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF1TX5MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF1TX5MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF1TX5MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF1TX5MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF1TX5MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF1TX5MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF1TX5MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF1TX5MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF1TX6MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF1TX6MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF1TX6MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF1TX6MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF1TX6MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF1TX6MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF1TX6MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF1TX6MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF2TX1MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF2TX1MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF2TX1MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF2TX1MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF2TX1MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF2TX1MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF2TX1MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF2TX1MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF2TX2MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF2TX2MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF2TX2MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF2TX2MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF2TX2MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF2TX2MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF2TX2MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF2TX2MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF2TX3MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF2TX3MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF2TX3MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF2TX3MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF2TX3MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF2TX3MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF2TX3MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF2TX3MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF2TX4MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF2TX4MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF2TX4MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF2TX4MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF2TX4MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF2TX4MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF2TX4MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF2TX4MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF2TX5MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF2TX5MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF2TX5MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF2TX5MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF2TX5MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF2TX5MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF2TX5MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF2TX5MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF2TX6MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF2TX6MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF2TX6MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF2TX6MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF2TX6MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF2TX6MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF2TX6MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF2TX6MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF3TX1MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF3TX1MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF3TX1MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF3TX1MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF3TX1MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF3TX1MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF3TX1MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF3TX1MIX_INPUT_4_VOLUME:
+	case ARIZONA_AIF3TX2MIX_INPUT_1_SOURCE:
+	case ARIZONA_AIF3TX2MIX_INPUT_1_VOLUME:
+	case ARIZONA_AIF3TX2MIX_INPUT_2_SOURCE:
+	case ARIZONA_AIF3TX2MIX_INPUT_2_VOLUME:
+	case ARIZONA_AIF3TX2MIX_INPUT_3_SOURCE:
+	case ARIZONA_AIF3TX2MIX_INPUT_3_VOLUME:
+	case ARIZONA_AIF3TX2MIX_INPUT_4_SOURCE:
+	case ARIZONA_AIF3TX2MIX_INPUT_4_VOLUME:
+	case ARIZONA_SLIMTX1MIX_INPUT_1_SOURCE:
+	case ARIZONA_SLIMTX1MIX_INPUT_1_VOLUME:
+	case ARIZONA_SLIMTX2MIX_INPUT_1_SOURCE:
+	case ARIZONA_SLIMTX2MIX_INPUT_1_VOLUME:
+	case ARIZONA_SLIMTX3MIX_INPUT_1_SOURCE:
+	case ARIZONA_SLIMTX3MIX_INPUT_1_VOLUME:
+	case ARIZONA_SLIMTX4MIX_INPUT_1_SOURCE:
+	case ARIZONA_SLIMTX4MIX_INPUT_1_VOLUME:
+	case ARIZONA_SLIMTX5MIX_INPUT_1_SOURCE:
+	case ARIZONA_SLIMTX5MIX_INPUT_1_VOLUME:
+	case ARIZONA_SLIMTX6MIX_INPUT_1_SOURCE:
+	case ARIZONA_SLIMTX6MIX_INPUT_1_VOLUME:
+	case ARIZONA_SPDIFTX1MIX_INPUT_1_SOURCE:
+	case ARIZONA_SPDIFTX1MIX_INPUT_1_VOLUME:
+	case ARIZONA_SPDIFTX2MIX_INPUT_1_SOURCE:
+	case ARIZONA_SPDIFTX2MIX_INPUT_1_VOLUME:
+	case ARIZONA_EQ1MIX_INPUT_1_SOURCE:
+	case ARIZONA_EQ1MIX_INPUT_1_VOLUME:
+	case ARIZONA_EQ2MIX_INPUT_1_SOURCE:
+	case ARIZONA_EQ2MIX_INPUT_1_VOLUME:
+	case ARIZONA_EQ3MIX_INPUT_1_SOURCE:
+	case ARIZONA_EQ3MIX_INPUT_1_VOLUME:
+	case ARIZONA_EQ4MIX_INPUT_1_SOURCE:
+	case ARIZONA_EQ4MIX_INPUT_1_VOLUME:
+	case ARIZONA_DRC1LMIX_INPUT_1_SOURCE:
+	case ARIZONA_DRC1LMIX_INPUT_1_VOLUME:
+	case ARIZONA_DRC1RMIX_INPUT_1_SOURCE:
+	case ARIZONA_DRC1RMIX_INPUT_1_VOLUME:
+	case ARIZONA_HPLP1MIX_INPUT_1_SOURCE:
+	case ARIZONA_HPLP1MIX_INPUT_1_VOLUME:
+	case ARIZONA_HPLP1MIX_INPUT_2_SOURCE:
+	case ARIZONA_HPLP1MIX_INPUT_2_VOLUME:
+	case ARIZONA_HPLP1MIX_INPUT_3_SOURCE:
+	case ARIZONA_HPLP1MIX_INPUT_3_VOLUME:
+	case ARIZONA_HPLP1MIX_INPUT_4_SOURCE:
+	case ARIZONA_HPLP1MIX_INPUT_4_VOLUME:
+	case ARIZONA_HPLP2MIX_INPUT_1_SOURCE:
+	case ARIZONA_HPLP2MIX_INPUT_1_VOLUME:
+	case ARIZONA_HPLP2MIX_INPUT_2_SOURCE:
+	case ARIZONA_HPLP2MIX_INPUT_2_VOLUME:
+	case ARIZONA_HPLP2MIX_INPUT_3_SOURCE:
+	case ARIZONA_HPLP2MIX_INPUT_3_VOLUME:
+	case ARIZONA_HPLP2MIX_INPUT_4_SOURCE:
+	case ARIZONA_HPLP2MIX_INPUT_4_VOLUME:
+	case ARIZONA_HPLP3MIX_INPUT_1_SOURCE:
+	case ARIZONA_HPLP3MIX_INPUT_1_VOLUME:
+	case ARIZONA_HPLP3MIX_INPUT_2_SOURCE:
+	case ARIZONA_HPLP3MIX_INPUT_2_VOLUME:
+	case ARIZONA_HPLP3MIX_INPUT_3_SOURCE:
+	case ARIZONA_HPLP3MIX_INPUT_3_VOLUME:
+	case ARIZONA_HPLP3MIX_INPUT_4_SOURCE:
+	case ARIZONA_HPLP3MIX_INPUT_4_VOLUME:
+	case ARIZONA_HPLP4MIX_INPUT_1_SOURCE:
+	case ARIZONA_HPLP4MIX_INPUT_1_VOLUME:
+	case ARIZONA_HPLP4MIX_INPUT_2_SOURCE:
+	case ARIZONA_HPLP4MIX_INPUT_2_VOLUME:
+	case ARIZONA_HPLP4MIX_INPUT_3_SOURCE:
+	case ARIZONA_HPLP4MIX_INPUT_3_VOLUME:
+	case ARIZONA_HPLP4MIX_INPUT_4_SOURCE:
+	case ARIZONA_HPLP4MIX_INPUT_4_VOLUME:
+	case ARIZONA_ASRC1LMIX_INPUT_1_SOURCE:
+	case ARIZONA_ASRC1RMIX_INPUT_1_SOURCE:
+	case ARIZONA_ASRC2LMIX_INPUT_1_SOURCE:
+	case ARIZONA_ASRC2RMIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC1DEC1MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC1DEC2MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC1DEC3MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC1DEC4MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC1INT1MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC1INT2MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC1INT3MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC1INT4MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC2DEC1MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC2DEC2MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC2INT1MIX_INPUT_1_SOURCE:
+	case ARIZONA_ISRC2INT2MIX_INPUT_1_SOURCE:
+	case ARIZONA_GPIO1_CTRL:
+	case ARIZONA_GPIO2_CTRL:
+	case ARIZONA_GPIO3_CTRL:
+	case ARIZONA_GPIO4_CTRL:
+	case ARIZONA_GPIO5_CTRL:
+	case ARIZONA_IRQ_CTRL_1:
+	case ARIZONA_GPIO_DEBOUNCE_CONFIG:
+	case ARIZONA_GP_SWITCH_1:
+	case ARIZONA_MISC_PAD_CTRL_1:
+	case ARIZONA_MISC_PAD_CTRL_2:
+	case ARIZONA_MISC_PAD_CTRL_3:
+	case ARIZONA_MISC_PAD_CTRL_4:
+	case ARIZONA_MISC_PAD_CTRL_5:
+	case ARIZONA_MISC_PAD_CTRL_6:
+	case ARIZONA_INTERRUPT_STATUS_1:
+	case ARIZONA_INTERRUPT_STATUS_2:
+	case ARIZONA_INTERRUPT_STATUS_3:
+	case ARIZONA_INTERRUPT_STATUS_4:
+	case ARIZONA_INTERRUPT_STATUS_5:
+	case ARIZONA_INTERRUPT_STATUS_1_MASK:
+	case ARIZONA_INTERRUPT_STATUS_2_MASK:
+	case ARIZONA_INTERRUPT_STATUS_3_MASK:
+	case ARIZONA_INTERRUPT_STATUS_4_MASK:
+	case ARIZONA_INTERRUPT_STATUS_5_MASK:
+	case ARIZONA_INTERRUPT_CONTROL:
+	case ARIZONA_IRQ2_STATUS_1:
+	case ARIZONA_IRQ2_STATUS_2:
+	case ARIZONA_IRQ2_STATUS_3:
+	case ARIZONA_IRQ2_STATUS_4:
+	case ARIZONA_IRQ2_STATUS_5:
+	case ARIZONA_IRQ2_STATUS_1_MASK:
+	case ARIZONA_IRQ2_STATUS_2_MASK:
+	case ARIZONA_IRQ2_STATUS_3_MASK:
+	case ARIZONA_IRQ2_STATUS_4_MASK:
+	case ARIZONA_IRQ2_STATUS_5_MASK:
+	case ARIZONA_IRQ2_CONTROL:
+	case ARIZONA_INTERRUPT_RAW_STATUS_2:
+	case ARIZONA_INTERRUPT_RAW_STATUS_3:
+	case ARIZONA_INTERRUPT_RAW_STATUS_4:
+	case ARIZONA_INTERRUPT_RAW_STATUS_5:
+	case ARIZONA_INTERRUPT_RAW_STATUS_6:
+	case ARIZONA_INTERRUPT_RAW_STATUS_7:
+	case ARIZONA_INTERRUPT_RAW_STATUS_8:
+	case ARIZONA_IRQ_PIN_STATUS:
+	case ARIZONA_AOD_WKUP_AND_TRIG:
+	case ARIZONA_AOD_IRQ1:
+	case ARIZONA_AOD_IRQ2:
+	case ARIZONA_AOD_IRQ_MASK_IRQ1:
+	case ARIZONA_AOD_IRQ_MASK_IRQ2:
+	case ARIZONA_AOD_IRQ_RAW_STATUS:
+	case ARIZONA_JACK_DETECT_DEBOUNCE:
+	case ARIZONA_FX_CTRL1:
+	case ARIZONA_FX_CTRL2:
+	case ARIZONA_EQ1_1:
+	case ARIZONA_EQ1_2:
+	case ARIZONA_EQ1_3:
+	case ARIZONA_EQ1_4:
+	case ARIZONA_EQ1_5:
+	case ARIZONA_EQ1_6:
+	case ARIZONA_EQ1_7:
+	case ARIZONA_EQ1_8:
+	case ARIZONA_EQ1_9:
+	case ARIZONA_EQ1_10:
+	case ARIZONA_EQ1_11:
+	case ARIZONA_EQ1_12:
+	case ARIZONA_EQ1_13:
+	case ARIZONA_EQ1_14:
+	case ARIZONA_EQ1_15:
+	case ARIZONA_EQ1_16:
+	case ARIZONA_EQ1_17:
+	case ARIZONA_EQ1_18:
+	case ARIZONA_EQ1_19:
+	case ARIZONA_EQ1_20:
+	case ARIZONA_EQ1_21:
+	case ARIZONA_EQ2_1:
+	case ARIZONA_EQ2_2:
+	case ARIZONA_EQ2_3:
+	case ARIZONA_EQ2_4:
+	case ARIZONA_EQ2_5:
+	case ARIZONA_EQ2_6:
+	case ARIZONA_EQ2_7:
+	case ARIZONA_EQ2_8:
+	case ARIZONA_EQ2_9:
+	case ARIZONA_EQ2_10:
+	case ARIZONA_EQ2_11:
+	case ARIZONA_EQ2_12:
+	case ARIZONA_EQ2_13:
+	case ARIZONA_EQ2_14:
+	case ARIZONA_EQ2_15:
+	case ARIZONA_EQ2_16:
+	case ARIZONA_EQ2_17:
+	case ARIZONA_EQ2_18:
+	case ARIZONA_EQ2_19:
+	case ARIZONA_EQ2_20:
+	case ARIZONA_EQ2_21:
+	case ARIZONA_EQ3_1:
+	case ARIZONA_EQ3_2:
+	case ARIZONA_EQ3_3:
+	case ARIZONA_EQ3_4:
+	case ARIZONA_EQ3_5:
+	case ARIZONA_EQ3_6:
+	case ARIZONA_EQ3_7:
+	case ARIZONA_EQ3_8:
+	case ARIZONA_EQ3_9:
+	case ARIZONA_EQ3_10:
+	case ARIZONA_EQ3_11:
+	case ARIZONA_EQ3_12:
+	case ARIZONA_EQ3_13:
+	case ARIZONA_EQ3_14:
+	case ARIZONA_EQ3_15:
+	case ARIZONA_EQ3_16:
+	case ARIZONA_EQ3_17:
+	case ARIZONA_EQ3_18:
+	case ARIZONA_EQ3_19:
+	case ARIZONA_EQ3_20:
+	case ARIZONA_EQ3_21:
+	case ARIZONA_EQ4_1:
+	case ARIZONA_EQ4_2:
+	case ARIZONA_EQ4_3:
+	case ARIZONA_EQ4_4:
+	case ARIZONA_EQ4_5:
+	case ARIZONA_EQ4_6:
+	case ARIZONA_EQ4_7:
+	case ARIZONA_EQ4_8:
+	case ARIZONA_EQ4_9:
+	case ARIZONA_EQ4_10:
+	case ARIZONA_EQ4_11:
+	case ARIZONA_EQ4_12:
+	case ARIZONA_EQ4_13:
+	case ARIZONA_EQ4_14:
+	case ARIZONA_EQ4_15:
+	case ARIZONA_EQ4_16:
+	case ARIZONA_EQ4_17:
+	case ARIZONA_EQ4_18:
+	case ARIZONA_EQ4_19:
+	case ARIZONA_EQ4_20:
+	case ARIZONA_EQ4_21:
+	case ARIZONA_DRC1_CTRL1:
+	case ARIZONA_DRC1_CTRL2:
+	case ARIZONA_DRC1_CTRL3:
+	case ARIZONA_DRC1_CTRL4:
+	case ARIZONA_DRC1_CTRL5:
+	case ARIZONA_HPLPF1_1:
+	case ARIZONA_HPLPF1_2:
+	case ARIZONA_HPLPF2_1:
+	case ARIZONA_HPLPF2_2:
+	case ARIZONA_HPLPF3_1:
+	case ARIZONA_HPLPF3_2:
+	case ARIZONA_HPLPF4_1:
+	case ARIZONA_HPLPF4_2:
+	case ARIZONA_ASRC_ENABLE:
+	case ARIZONA_ASRC_STATUS:
+	case ARIZONA_ASRC_RATE1:
+	case ARIZONA_ASRC_RATE2:
+	case ARIZONA_ISRC_1_CTRL_1:
+	case ARIZONA_ISRC_1_CTRL_2:
+	case ARIZONA_ISRC_1_CTRL_3:
+	case ARIZONA_ISRC_2_CTRL_1:
+	case ARIZONA_ISRC_2_CTRL_2:
+	case ARIZONA_ISRC_2_CTRL_3:
+	case ARIZONA_FRF_COEFF_1:
+	case ARIZONA_FRF_COEFF_2:
+	case ARIZONA_FRF_COEFF_3:
+	case ARIZONA_FRF_COEFF_4:
+	case ARIZONA_V2_DAC_COMP_1:
+	case ARIZONA_V2_DAC_COMP_2:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool wm8998_volatile_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case ARIZONA_SOFTWARE_RESET:
+	case ARIZONA_DEVICE_REVISION:
+	case ARIZONA_WRITE_SEQUENCER_CTRL_0:
+	case ARIZONA_WRITE_SEQUENCER_CTRL_1:
+	case ARIZONA_WRITE_SEQUENCER_CTRL_2:
+	case ARIZONA_HAPTICS_STATUS:
+	case ARIZONA_SAMPLE_RATE_1_STATUS:
+	case ARIZONA_SAMPLE_RATE_2_STATUS:
+	case ARIZONA_SAMPLE_RATE_3_STATUS:
+	case ARIZONA_ASYNC_SAMPLE_RATE_1_STATUS:
+	case ARIZONA_ASYNC_SAMPLE_RATE_2_STATUS:
+	case ARIZONA_MIC_DETECT_3:
+	case ARIZONA_MIC_DETECT_4:
+	case ARIZONA_HEADPHONE_DETECT_2:
+	case ARIZONA_INPUT_ENABLES_STATUS:
+	case ARIZONA_OUTPUT_STATUS_1:
+	case ARIZONA_RAW_OUTPUT_STATUS_1:
+	case ARIZONA_SLIMBUS_RX_PORT_STATUS:
+	case ARIZONA_SLIMBUS_TX_PORT_STATUS:
+	case ARIZONA_INTERRUPT_STATUS_1:
+	case ARIZONA_INTERRUPT_STATUS_2:
+	case ARIZONA_INTERRUPT_STATUS_3:
+	case ARIZONA_INTERRUPT_STATUS_4:
+	case ARIZONA_INTERRUPT_STATUS_5:
+	case ARIZONA_IRQ2_STATUS_1:
+	case ARIZONA_IRQ2_STATUS_2:
+	case ARIZONA_IRQ2_STATUS_3:
+	case ARIZONA_IRQ2_STATUS_4:
+	case ARIZONA_IRQ2_STATUS_5:
+	case ARIZONA_INTERRUPT_RAW_STATUS_2:
+	case ARIZONA_INTERRUPT_RAW_STATUS_3:
+	case ARIZONA_INTERRUPT_RAW_STATUS_4:
+	case ARIZONA_INTERRUPT_RAW_STATUS_5:
+	case ARIZONA_INTERRUPT_RAW_STATUS_6:
+	case ARIZONA_INTERRUPT_RAW_STATUS_7:
+	case ARIZONA_INTERRUPT_RAW_STATUS_8:
+	case ARIZONA_IRQ_PIN_STATUS:
+	case ARIZONA_AOD_WKUP_AND_TRIG:
+	case ARIZONA_AOD_IRQ1:
+	case ARIZONA_AOD_IRQ2:
+	case ARIZONA_AOD_IRQ_RAW_STATUS:
+	case ARIZONA_FX_CTRL2:
+	case ARIZONA_ASRC_STATUS:
+		return true;
+	default:
+		return false;
+	}
+}
+
+#define WM8998_MAX_REGISTER 0x31ff
+
+const struct regmap_config wm8998_i2c_regmap = {
+	.reg_bits = 32,
+	.val_bits = 16,
+
+	.max_register = WM8998_MAX_REGISTER,
+	.readable_reg = wm8998_readable_register,
+	.volatile_reg = wm8998_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = wm8998_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(wm8998_reg_default),
+};
+EXPORT_SYMBOL_GPL(wm8998_i2c_regmap);
diff --git a/include/linux/mfd/arizona/core.h b/include/linux/mfd/arizona/core.h
index 2f434f4f79a1..79e607e2f081 100644
--- a/include/linux/mfd/arizona/core.h
+++ b/include/linux/mfd/arizona/core.h
@@ -25,6 +25,8 @@ enum arizona_type {
 	WM5110 = 2,
 	WM8997 = 3,
 	WM8280 = 4,
+	WM8998 = 5,
+	WM1814 = 6,
 };
 
 #define ARIZONA_IRQ_GP1                    0
@@ -165,6 +167,7 @@ static inline int wm5102_patch(struct arizona *arizona)
 
 int wm5110_patch(struct arizona *arizona);
 int wm8997_patch(struct arizona *arizona);
+int wm8998_patch(struct arizona *arizona);
 
 extern int arizona_of_get_named_gpio(struct arizona *arizona, const char *prop,
 				     bool mandatory);
diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h
index dc3c81252de7..1dc385850ba2 100644
--- a/include/linux/mfd/arizona/pdata.h
+++ b/include/linux/mfd/arizona/pdata.h
@@ -162,6 +162,8 @@ struct arizona_pdata {
 	/**
 	 * Mode of input structures
 	 * One of the ARIZONA_INMODE_xxx values
+	 * wm5102/wm5110/wm8280/wm8997: [0]=IN1 [1]=IN2 [2]=IN3 [3]=IN4
+	 * wm8998: [0]=IN1A [1]=IN2A [2]=IN1B [3]=IN2B
 	 */
 	int inmode[ARIZONA_MAX_INPUT];
 
diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h
index 3499d36e6067..5c39dc5d279e 100644
--- a/include/linux/mfd/arizona/registers.h
+++ b/include/linux/mfd/arizona/registers.h
@@ -139,6 +139,7 @@
 #define ARIZONA_MIC_DETECT_LEVEL_2		 0x2A7
 #define ARIZONA_MIC_DETECT_LEVEL_3		 0x2A8
 #define ARIZONA_MIC_DETECT_LEVEL_4		 0x2A9
+#define ARIZONA_MIC_DETECT_4                     0x2AB
 #define ARIZONA_MIC_NOISE_MIX_CONTROL_1          0x2C3
 #define ARIZONA_ISOLATION_CONTROL                0x2CB
 #define ARIZONA_JACK_DETECT_ANALOGUE             0x2D3
@@ -225,14 +226,18 @@
 #define ARIZONA_DAC_VOLUME_LIMIT_6R              0x43E
 #define ARIZONA_NOISE_GATE_SELECT_6R             0x43F
 #define ARIZONA_DRE_ENABLE                       0x440
+#define ARIZONA_DRE_CONTROL_1                    0x441
 #define ARIZONA_DRE_CONTROL_2                    0x442
 #define ARIZONA_DRE_CONTROL_3                    0x443
+#define ARIZONA_EDRE_ENABLE                      0x448
 #define ARIZONA_DAC_AEC_CONTROL_1                0x450
+#define ARIZONA_DAC_AEC_CONTROL_2                0x451
 #define ARIZONA_NOISE_GATE_CONTROL               0x458
 #define ARIZONA_PDM_SPK1_CTRL_1                  0x490
 #define ARIZONA_PDM_SPK1_CTRL_2                  0x491
 #define ARIZONA_PDM_SPK2_CTRL_1                  0x492
 #define ARIZONA_PDM_SPK2_CTRL_2                  0x493
+#define ARIZONA_HP_TEST_CTRL_13                  0x49A
 #define ARIZONA_HP1_SHORT_CIRCUIT_CTRL           0x4A0
 #define ARIZONA_HP2_SHORT_CIRCUIT_CTRL           0x4A1
 #define ARIZONA_HP3_SHORT_CIRCUIT_CTRL           0x4A2
@@ -310,6 +315,10 @@
 #define ARIZONA_AIF3_TX_ENABLES                  0x599
 #define ARIZONA_AIF3_RX_ENABLES                  0x59A
 #define ARIZONA_AIF3_FORCE_WRITE                 0x59B
+#define ARIZONA_SPD1_TX_CONTROL                  0x5C2
+#define ARIZONA_SPD1_TX_CHANNEL_STATUS_1         0x5C3
+#define ARIZONA_SPD1_TX_CHANNEL_STATUS_2         0x5C4
+#define ARIZONA_SPD1_TX_CHANNEL_STATUS_3         0x5C5
 #define ARIZONA_SLIMBUS_FRAMER_REF_GEAR          0x5E3
 #define ARIZONA_SLIMBUS_RATES_1                  0x5E5
 #define ARIZONA_SLIMBUS_RATES_2                  0x5E6
@@ -643,6 +652,10 @@
 #define ARIZONA_SLIMTX8MIX_INPUT_3_VOLUME        0x7FD
 #define ARIZONA_SLIMTX8MIX_INPUT_4_SOURCE        0x7FE
 #define ARIZONA_SLIMTX8MIX_INPUT_4_VOLUME        0x7FF
+#define ARIZONA_SPDIFTX1MIX_INPUT_1_SOURCE       0x800
+#define ARIZONA_SPDIFTX1MIX_INPUT_1_VOLUME       0x801
+#define ARIZONA_SPDIFTX2MIX_INPUT_1_SOURCE       0x808
+#define ARIZONA_SPDIFTX2MIX_INPUT_1_VOLUME       0x809
 #define ARIZONA_EQ1MIX_INPUT_1_SOURCE            0x880
 #define ARIZONA_EQ1MIX_INPUT_1_VOLUME            0x881
 #define ARIZONA_EQ1MIX_INPUT_2_SOURCE            0x882
@@ -868,6 +881,7 @@
 #define ARIZONA_GPIO5_CTRL                       0xC04
 #define ARIZONA_IRQ_CTRL_1                       0xC0F
 #define ARIZONA_GPIO_DEBOUNCE_CONFIG             0xC10
+#define ARIZONA_GP_SWITCH_1                      0xC18
 #define ARIZONA_MISC_PAD_CTRL_1                  0xC20
 #define ARIZONA_MISC_PAD_CTRL_2                  0xC21
 #define ARIZONA_MISC_PAD_CTRL_3                  0xC22
@@ -1169,6 +1183,13 @@
 #define ARIZONA_DSP4_SCRATCH_1                   0x1441
 #define ARIZONA_DSP4_SCRATCH_2                   0x1442
 #define ARIZONA_DSP4_SCRATCH_3                   0x1443
+#define ARIZONA_FRF_COEFF_1                      0x1700
+#define ARIZONA_FRF_COEFF_2                      0x1701
+#define ARIZONA_FRF_COEFF_3                      0x1702
+#define ARIZONA_FRF_COEFF_4                      0x1703
+#define ARIZONA_V2_DAC_COMP_1                    0x1704
+#define ARIZONA_V2_DAC_COMP_2                    0x1705
+
 
 /*
  * Field Definitions.
@@ -2325,6 +2346,9 @@
 #define ARIZONA_HP_IDAC_STEER_MASK               0x0004  /* HP_IDAC_STEER */
 #define ARIZONA_HP_IDAC_STEER_SHIFT                   2  /* HP_IDAC_STEER */
 #define ARIZONA_HP_IDAC_STEER_WIDTH                   1  /* HP_IDAC_STEER */
+#define WM8998_HP_RATE_MASK                      0x0006  /* HP_RATE - [2:1] */
+#define WM8998_HP_RATE_SHIFT                          1  /* HP_RATE - [2:1] */
+#define WM8998_HP_RATE_WIDTH                          2  /* HP_RATE - [2:1] */
 #define ARIZONA_HP_RATE                          0x0002  /* HP_RATE */
 #define ARIZONA_HP_RATE_MASK                     0x0002  /* HP_RATE */
 #define ARIZONA_HP_RATE_SHIFT                         1  /* HP_RATE */
@@ -2412,6 +2436,16 @@
 #define ARIZONA_MICD_STS_SHIFT                        0  /* MICD_STS */
 #define ARIZONA_MICD_STS_WIDTH                        1  /* MICD_STS */
 
+/*
+ * R683 (0x2AB) - Mic Detect 4
+ */
+#define ARIZONA_MICDET_ADCVAL_DIFF_MASK          0xFF00  /* MICDET_ADCVAL_DIFF - [15:8] */
+#define ARIZONA_MICDET_ADCVAL_DIFF_SHIFT              8  /* MICDET_ADCVAL_DIFF - [15:8] */
+#define ARIZONA_MICDET_ADCVAL_DIFF_WIDTH              8  /* MICDET_ADCVAL_DIFF - [15:8] */
+#define ARIZONA_MICDET_ADCVAL_MASK               0x007F  /* MICDET_ADCVAL - [15:8] */
+#define ARIZONA_MICDET_ADCVAL_SHIFT                   0  /* MICDET_ADCVAL - [15:8] */
+#define ARIZONA_MICDET_ADCVAL_WIDTH                   7  /* MICDET_ADCVAL - [15:8] */
+
 /*
  * R707 (0x2C3) - Mic noise mix control 1
  */
@@ -2528,6 +2562,12 @@
 /*
  * R785 (0x311) - ADC Digital Volume 1L
  */
+#define ARIZONA_IN1L_SRC_MASK                    0x4000  /* IN1L_SRC - [14] */
+#define ARIZONA_IN1L_SRC_SHIFT                       14  /* IN1L_SRC - [14] */
+#define ARIZONA_IN1L_SRC_WIDTH                        1  /* IN1L_SRC - [14] */
+#define ARIZONA_IN1L_SRC_SE_MASK                 0x2000  /* IN1L_SRC - [13] */
+#define ARIZONA_IN1L_SRC_SE_SHIFT                    13  /* IN1L_SRC - [13] */
+#define ARIZONA_IN1L_SRC_SE_WIDTH                     1  /* IN1L_SRC - [13] */
 #define ARIZONA_IN_VU                            0x0200  /* IN_VU */
 #define ARIZONA_IN_VU_MASK                       0x0200  /* IN_VU */
 #define ARIZONA_IN_VU_SHIFT                           9  /* IN_VU */
@@ -2560,6 +2600,12 @@
 /*
  * R789 (0x315) - ADC Digital Volume 1R
  */
+#define ARIZONA_IN1R_SRC_MASK                    0x4000  /* IN1R_SRC - [14] */
+#define ARIZONA_IN1R_SRC_SHIFT                       14  /* IN1R_SRC - [14] */
+#define ARIZONA_IN1R_SRC_WIDTH                        1  /* IN1R_SRC - [14] */
+#define ARIZONA_IN1R_SRC_SE_MASK                 0x2000  /* IN1R_SRC - [13] */
+#define ARIZONA_IN1R_SRC_SE_SHIFT                    13  /* IN1R_SRC - [13] */
+#define ARIZONA_IN1R_SRC_SE_WIDTH                     1  /* IN1R_SRC - [13] */
 #define ARIZONA_IN_VU                            0x0200  /* IN_VU */
 #define ARIZONA_IN_VU_MASK                       0x0200  /* IN_VU */
 #define ARIZONA_IN_VU_SHIFT                           9  /* IN_VU */
@@ -2604,6 +2650,12 @@
 /*
  * R793 (0x319) - ADC Digital Volume 2L
  */
+#define ARIZONA_IN2L_SRC_MASK                    0x4000  /* IN2L_SRC - [14] */
+#define ARIZONA_IN2L_SRC_SHIFT                       14  /* IN2L_SRC - [14] */
+#define ARIZONA_IN2L_SRC_WIDTH                        1  /* IN2L_SRC - [14] */
+#define ARIZONA_IN2L_SRC_SE_MASK                 0x2000  /* IN2L_SRC - [13] */
+#define ARIZONA_IN2L_SRC_SE_SHIFT                    13  /* IN2L_SRC - [13] */
+#define ARIZONA_IN2L_SRC_SE_WIDTH                     1  /* IN2L_SRC - [13] */
 #define ARIZONA_IN_VU                            0x0200  /* IN_VU */
 #define ARIZONA_IN_VU_MASK                       0x0200  /* IN_VU */
 #define ARIZONA_IN_VU_SHIFT                           9  /* IN_VU */
@@ -3411,12 +3463,46 @@
 #define ARIZONA_DRE1L_ENA_SHIFT                       0  /* DRE1L_ENA */
 #define ARIZONA_DRE1L_ENA_WIDTH                       1  /* DRE1L_ENA */
 
+/*
+ * R1088 (0x440) - DRE Enable (WM8998)
+ */
+#define WM8998_DRE3L_ENA                          0x0020  /* DRE3L_ENA */
+#define WM8998_DRE3L_ENA_MASK                     0x0020  /* DRE3L_ENA */
+#define WM8998_DRE3L_ENA_SHIFT                         5  /* DRE3L_ENA */
+#define WM8998_DRE3L_ENA_WIDTH                         1  /* DRE3L_ENA */
+#define WM8998_DRE2L_ENA                          0x0008  /* DRE2L_ENA */
+#define WM8998_DRE2L_ENA_MASK                     0x0008  /* DRE2L_ENA */
+#define WM8998_DRE2L_ENA_SHIFT                         3  /* DRE2L_ENA */
+#define WM8998_DRE2L_ENA_WIDTH                         1  /* DRE2L_ENA */
+#define WM8998_DRE2R_ENA                          0x0004  /* DRE2R_ENA */
+#define WM8998_DRE2R_ENA_MASK                     0x0004  /* DRE2R_ENA */
+#define WM8998_DRE2R_ENA_SHIFT                         2  /* DRE2R_ENA */
+#define WM8998_DRE2R_ENA_WIDTH                         1  /* DRE2R_ENA */
+#define WM8998_DRE1L_ENA                          0x0002  /* DRE1L_ENA */
+#define WM8998_DRE1L_ENA_MASK                     0x0002  /* DRE1L_ENA */
+#define WM8998_DRE1L_ENA_SHIFT                         1  /* DRE1L_ENA */
+#define WM8998_DRE1L_ENA_WIDTH                         1  /* DRE1L_ENA */
+#define WM8998_DRE1R_ENA                          0x0001  /* DRE1R_ENA */
+#define WM8998_DRE1R_ENA_MASK                     0x0001  /* DRE1R_ENA */
+#define WM8998_DRE1R_ENA_SHIFT                         0  /* DRE1R_ENA */
+#define WM8998_DRE1R_ENA_WIDTH                         1  /* DRE1R_ENA */
+
+/*
+ * R1089 (0x441) - DRE Control 1
+ */
+#define ARIZONA_DRE_ENV_TC_FAST_MASK             0x0F00  /* DRE_ENV_TC_FAST - [11:8] */
+#define ARIZONA_DRE_ENV_TC_FAST_SHIFT                 8  /* DRE_ENV_TC_FAST - [11:8] */
+#define ARIZONA_DRE_ENV_TC_FAST_WIDTH                 4  /* DRE_ENV_TC_FAST - [11:8] */
+
 /*
  * R1090 (0x442) - DRE Control 2
  */
 #define ARIZONA_DRE_T_LOW_MASK                   0x3F00  /* DRE_T_LOW - [13:8] */
 #define ARIZONA_DRE_T_LOW_SHIFT                       8  /* DRE_T_LOW - [13:8] */
 #define ARIZONA_DRE_T_LOW_WIDTH                       6  /* DRE_T_LOW - [13:8] */
+#define ARIZONA_DRE_ALOG_VOL_DELAY_MASK          0x000F  /* DRE_ALOG_VOL_DELAY - [3:0] */
+#define ARIZONA_DRE_ALOG_VOL_DELAY_SHIFT              0  /* DRE_ALOG_VOL_DELAY - [3:0] */
+#define ARIZONA_DRE_ALOG_VOL_DELAY_WIDTH              4  /* DRE_ALOG_VOL_DELAY - [3:0] */
 
 /*
  * R1091 (0x443) - DRE Control 3
@@ -3428,6 +3514,49 @@
 #define ARIZONA_DRE_LOW_LEVEL_ABS_SHIFT               0  /* LOW_LEVEL_ABS - [3:0] */
 #define ARIZONA_DRE_LOW_LEVEL_ABS_WIDTH               4  /* LOW_LEVEL_ABS - [3:0] */
 
+/* R486 (0x448) - EDRE_Enable
+ */
+#define ARIZONA_EDRE_OUT4L_THR2_ENA              0x0200  /* EDRE_OUT4L_THR2_ENA */
+#define ARIZONA_EDRE_OUT4L_THR2_ENA_MASK         0x0200  /* EDRE_OUT4L_THR2_ENA */
+#define ARIZONA_EDRE_OUT4L_THR2_ENA_SHIFT             9  /* EDRE_OUT4L_THR2_ENA */
+#define ARIZONA_EDRE_OUT4L_THR2_ENA_WIDTH             1  /* EDRE_OUT4L_THR2_ENA */
+#define ARIZONA_EDRE_OUT4R_THR2_ENA              0x0100  /* EDRE_OUT4R_THR2_ENA */
+#define ARIZONA_EDRE_OUT4R_THR2_ENA_MASK         0x0100  /* EDRE_OUT4R_THR2_ENA */
+#define ARIZONA_EDRE_OUT4R_THR2_ENA_SHIFT             8  /* EDRE_OUT4R_THR2_ENA */
+#define ARIZONA_EDRE_OUT4R_THR2_ENA_WIDTH             1  /* EDRE_OUT4R_THR2_ENA */
+#define ARIZONA_EDRE_OUT4L_THR1_ENA              0x0080  /* EDRE_OUT4L_THR1_ENA */
+#define ARIZONA_EDRE_OUT4L_THR1_ENA_MASK         0x0080  /* EDRE_OUT4L_THR1_ENA */
+#define ARIZONA_EDRE_OUT4L_THR1_ENA_SHIFT             7  /* EDRE_OUT4L_THR1_ENA */
+#define ARIZONA_EDRE_OUT4L_THR1_ENA_WIDTH             1  /* EDRE_OUT4L_THR1_ENA */
+#define ARIZONA_EDRE_OUT4R_THR1_ENA              0x0040  /* EDRE_OUT4R_THR1_ENA */
+#define ARIZONA_EDRE_OUT4R_THR1_ENA_MASK         0x0040  /* EDRE_OUT4R_THR1_ENA */
+#define ARIZONA_EDRE_OUT4R_THR1_ENA_SHIFT             6  /* EDRE_OUT4R_THR1_ENA */
+#define ARIZONA_EDRE_OUT4R_THR1_ENA_WIDTH             1  /* EDRE_OUT4R_THR1_ENA */
+#define ARIZONA_EDRE_OUT3L_THR1_ENA              0x0020  /* EDRE_OUT3L_THR1_ENA */
+#define ARIZONA_EDRE_OUT3L_THR1_ENA_MASK         0x0020  /* EDRE_OUT3L_THR1_ENA */
+#define ARIZONA_EDRE_OUT3L_THR1_ENA_SHIFT             5  /* EDRE_OUT3L_THR1_ENA */
+#define ARIZONA_EDRE_OUT3L_THR1_ENA_WIDTH             1  /* EDRE_OUT3L_THR1_ENA */
+#define ARIZONA_EDRE_OUT3R_THR1_ENA              0x0010  /* EDRE_OUT3R_THR1_ENA */
+#define ARIZONA_EDRE_OUT3R_THR1_ENA_MASK         0x0010  /* EDRE_OUT3R_THR1_ENA */
+#define ARIZONA_EDRE_OUT3R_THR1_ENA_SHIFT             4  /* EDRE_OUT3R_THR1_ENA */
+#define ARIZONA_EDRE_OUT3R_THR1_ENA_WIDTH             1  /* EDRE_OUT3R_THR1_ENA */
+#define ARIZONA_EDRE_OUT2L_THR1_ENA              0x0008  /* EDRE_OUT2L_THR1_ENA */
+#define ARIZONA_EDRE_OUT2L_THR1_ENA_MASK         0x0008  /* EDRE_OUT2L_THR1_ENA */
+#define ARIZONA_EDRE_OUT2L_THR1_ENA_SHIFT             3  /* EDRE_OUT2L_THR1_ENA */
+#define ARIZONA_EDRE_OUT2L_THR1_ENA_WIDTH             1  /* EDRE_OUT2L_THR1_ENA */
+#define ARIZONA_EDRE_OUT2R_THR1_ENA              0x0004  /* EDRE_OUT2R_THR1_ENA */
+#define ARIZONA_EDRE_OUT2R_THR1_ENA_MASK         0x0004  /* EDRE_OUT2R_THR1_ENA */
+#define ARIZONA_EDRE_OUT2R_THR1_ENA_SHIFT             2  /* EDRE_OUT2R_THR1_ENA */
+#define ARIZONA_EDRE_OUT2R_THR1_ENA_WIDTH             1  /* EDRE_OUT2R_THR1_ENA */
+#define ARIZONA_EDRE_OUT1L_THR1_ENA              0x0002  /* EDRE_OUT1L_THR1_ENA */
+#define ARIZONA_EDRE_OUT1L_THR1_ENA_MASK         0x0002  /* EDRE_OUT1L_THR1_ENA */
+#define ARIZONA_EDRE_OUT1L_THR1_ENA_SHIFT             1  /* EDRE_OUT1L_THR1_ENA */
+#define ARIZONA_EDRE_OUT1L_THR1_ENA_WIDTH             1  /* EDRE_OUT1L_THR1_ENA */
+#define ARIZONA_EDRE_OUT1R_THR1_ENA              0x0001  /* EDRE_OUT1R_THR1_ENA */
+#define ARIZONA_EDRE_OUT1R_THR1_ENA_MASK         0x0001  /* EDRE_OUT1R_THR1_ENA */
+#define ARIZONA_EDRE_OUT1R_THR1_ENA_SHIFT             0  /* EDRE_OUT1R_THR1_ENA */
+#define ARIZONA_EDRE_OUT1R_THR1_ENA_WIDTH             1  /* EDRE_OUT1R_THR1_ENA */
+
 /*
  * R1104 (0x450) - DAC AEC Control 1
  */
@@ -4307,6 +4436,86 @@
 #define ARIZONA_AIF3_FRC_WR_SHIFT                     0  /* AIF3_FRC_WR */
 #define ARIZONA_AIF3_FRC_WR_WIDTH                     1  /* AIF3_FRC_WR */
 
+/*
+ * R1474 (0x5C2) - SPD1 TX Control
+ */
+#define ARIZONA_SPD1_VAL2                        0x2000  /* SPD1_VAL2 */
+#define ARIZONA_SPD1_VAL2_MASK                   0x2000  /* SPD1_VAL2 */
+#define ARIZONA_SPD1_VAL2_SHIFT                      13  /* SPD1_VAL2 */
+#define ARIZONA_SPD1_VAL2_WIDTH                       1  /* SPD1_VAL2 */
+#define ARIZONA_SPD1_VAL1                        0x1000  /* SPD1_VAL1 */
+#define ARIZONA_SPD1_VAL1_MASK                   0x1000  /* SPD1_VAL1 */
+#define ARIZONA_SPD1_VAL1_SHIFT                      12  /* SPD1_VAL1 */
+#define ARIZONA_SPD1_VAL1_WIDTH                       1  /* SPD1_VAL1 */
+#define ARIZONA_SPD1_RATE_MASK                   0x00F0  /* SPD1_RATE */
+#define ARIZONA_SPD1_RATE_SHIFT                       4  /* SPD1_RATE */
+#define ARIZONA_SPD1_RATE_WIDTH                       4  /* SPD1_RATE */
+#define ARIZONA_SPD1_ENA                         0x0001  /* SPD1_ENA */
+#define ARIZONA_SPD1_ENA_MASK                    0x0001  /* SPD1_ENA */
+#define ARIZONA_SPD1_ENA_SHIFT                        0  /* SPD1_ENA */
+#define ARIZONA_SPD1_ENA_WIDTH                        1  /* SPD1_ENA */
+
+/*
+ * R1475 (0x5C3) - SPD1 TX Channel Status 1
+ */
+#define ARIZONA_SPD1_CATCODE_MASK                0xFF00  /* SPD1_CATCODE */
+#define ARIZONA_SPD1_CATCODE_SHIFT                    8  /* SPD1_CATCODE */
+#define ARIZONA_SPD1_CATCODE_WIDTH                    8  /* SPD1_CATCODE */
+#define ARIZONA_SPD1_CHSTMODE_MASK               0x00C0  /* SPD1_CHSTMODE */
+#define ARIZONA_SPD1_CHSTMODE_SHIFT                   6  /* SPD1_CHSTMODE */
+#define ARIZONA_SPD1_CHSTMODE_WIDTH                   2  /* SPD1_CHSTMODE */
+#define ARIZONA_SPD1_PREEMPH_MASK                0x0038  /* SPD1_PREEMPH */
+#define ARIZONA_SPD1_PREEMPH_SHIFT                    3  /* SPD1_PREEMPH */
+#define ARIZONA_SPD1_PREEMPH_WIDTH                    3  /* SPD1_PREEMPH */
+#define ARIZONA_SPD1_NOCOPY                      0x0004  /* SPD1_NOCOPY */
+#define ARIZONA_SPD1_NOCOPY_MASK                 0x0004  /* SPD1_NOCOPY */
+#define ARIZONA_SPD1_NOCOPY_SHIFT                     2  /* SPD1_NOCOPY */
+#define ARIZONA_SPD1_NOCOPY_WIDTH                     1  /* SPD1_NOCOPY */
+#define ARIZONA_SPD1_NOAUDIO                     0x0002  /* SPD1_NOAUDIO */
+#define ARIZONA_SPD1_NOAUDIO_MASK                0x0002  /* SPD1_NOAUDIO */
+#define ARIZONA_SPD1_NOAUDIO_SHIFT                    1  /* SPD1_NOAUDIO */
+#define ARIZONA_SPD1_NOAUDIO_WIDTH                    1  /* SPD1_NOAUDIO */
+#define ARIZONA_SPD1_PRO                         0x0001  /* SPD1_PRO */
+#define ARIZONA_SPD1_PRO_MASK                    0x0001  /* SPD1_PRO */
+#define ARIZONA_SPD1_PRO_SHIFT                        0  /* SPD1_PRO */
+#define ARIZONA_SPD1_PRO_WIDTH                        1  /* SPD1_PRO */
+
+/*
+ * R1475 (0x5C4) - SPD1 TX Channel Status 2
+ */
+#define ARIZONA_SPD1_FREQ_MASK                   0xF000  /* SPD1_FREQ */
+#define ARIZONA_SPD1_FREQ_SHIFT                      12  /* SPD1_FREQ */
+#define ARIZONA_SPD1_FREQ_WIDTH                       4  /* SPD1_FREQ */
+#define ARIZONA_SPD1_CHNUM2_MASK                 0x0F00  /* SPD1_CHNUM2 */
+#define ARIZONA_SPD1_CHNUM2_SHIFT                     8  /* SPD1_CHNUM2 */
+#define ARIZONA_SPD1_CHNUM2_WIDTH                     4  /* SPD1_CHNUM2 */
+#define ARIZONA_SPD1_CHNUM1_MASK                 0x00F0  /* SPD1_CHNUM1 */
+#define ARIZONA_SPD1_CHNUM1_SHIFT                     4  /* SPD1_CHNUM1 */
+#define ARIZONA_SPD1_CHNUM1_WIDTH                     4  /* SPD1_CHNUM1 */
+#define ARIZONA_SPD1_SRCNUM_MASK                 0x000F  /* SPD1_SRCNUM */
+#define ARIZONA_SPD1_SRCNUM_SHIFT                     0  /* SPD1_SRCNUM */
+#define ARIZONA_SPD1_SRCNUM_WIDTH                     4  /* SPD1_SRCNUM */
+
+/*
+ * R1475 (0x5C5) - SPD1 TX Channel Status 3
+ */
+#define ARIZONA_SPD1_ORGSAMP_MASK                 0x0F00  /* SPD1_ORGSAMP */
+#define ARIZONA_SPD1_ORGSAMP_SHIFT                     8  /* SPD1_ORGSAMP */
+#define ARIZONA_SPD1_ORGSAMP_WIDTH                     4  /* SPD1_ORGSAMP */
+#define ARIZONA_SPD1_TXWL_MASK                    0x00E0  /* SPD1_TXWL */
+#define ARIZONA_SPD1_TXWL_SHIFT                        5  /* SPD1_TXWL */
+#define ARIZONA_SPD1_TXWL_WIDTH                        3  /* SPD1_TXWL */
+#define ARIZONA_SPD1_MAXWL                        0x0010  /* SPD1_MAXWL */
+#define ARIZONA_SPD1_MAXWL_MASK                   0x0010  /* SPD1_MAXWL */
+#define ARIZONA_SPD1_MAXWL_SHIFT                       4  /* SPD1_MAXWL */
+#define ARIZONA_SPD1_MAXWL_WIDTH                       1  /* SPD1_MAXWL */
+#define ARIZONA_SPD1_CS31_30_MASK                 0x000C  /* SPD1_CS31_30 */
+#define ARIZONA_SPD1_CS31_30_SHIFT                     2  /* SPD1_CS31_30 */
+#define ARIZONA_SPD1_CS31_30_WIDTH                     2  /* SPD1_CS31_30 */
+#define ARIZONA_SPD1_CLKACU_MASK                  0x0003  /* SPD1_CLKACU */
+#define ARIZONA_SPD1_CLKACU_SHIFT                      2  /* SPD1_CLKACU */
+#define ARIZONA_SPD1_CLKACU_WIDTH                      0  /* SPD1_CLKACU */
+
 /*
  * R1507 (0x5E3) - SLIMbus Framer Ref Gear
  */
@@ -4561,6 +4770,13 @@
 #define ARIZONA_GP_DBTIME_SHIFT                      12  /* GP_DBTIME - [15:12] */
 #define ARIZONA_GP_DBTIME_WIDTH                       4  /* GP_DBTIME - [15:12] */
 
+/*
+ * R3096 (0xC18) - GP Switch 1
+ */
+#define ARIZONA_SW1_MODE_MASK                    0x0003  /* SW1_MODE - [1:0] */
+#define ARIZONA_SW1_MODE_SHIFT                        0  /* SW1_MODE - [1:0] */
+#define ARIZONA_SW1_MODE_WIDTH                        2  /* SW1_MODE - [1:0] */
+
 /*
  * R3104 (0xC20) - Misc Pad Ctrl 1
  */
@@ -6301,6 +6517,10 @@
 /*
  * R3366 (0xD26) - Interrupt Raw Status 8
  */
+#define ARIZONA_SPDIF_OVERCLOCKED_STS            0x8000  /* SPDIF_OVERCLOCKED_STS */
+#define ARIZONA_SPDIF_OVERCLOCKED_STS_MASK       0x8000  /* SPDIF_OVERCLOCKED_STS */
+#define ARIZONA_SPDIF_OVERCLOCKED_STS_SHIFT          15  /* SPDIF_OVERCLOCKED_STS */
+#define ARIZONA_SPDIF_OVERCLOCKED_STS_WIDTH           1  /* SPDIF_OVERCLOCKED_STS */
 #define ARIZONA_AIF3_UNDERCLOCKED_STS            0x0400  /* AIF3_UNDERCLOCKED_STS */
 #define ARIZONA_AIF3_UNDERCLOCKED_STS_MASK       0x0400  /* AIF3_UNDERCLOCKED_STS */
 #define ARIZONA_AIF3_UNDERCLOCKED_STS_SHIFT          10  /* AIF3_UNDERCLOCKED_STS */
-- 
cgit v1.2.3-70-g09d2


From 9b40b030c4ad685732dd3ad5b57249db52a74e71 Mon Sep 17 00:00:00 2001
From: S Twiss <stwiss.opensource@diasemi.com>
Date: Wed, 1 Jul 2015 14:11:32 +0100
Subject: mfd: da9062: Supply core driver

Add MFD core driver support for DA9062

Signed-off-by: Steve Twiss <stwiss.opensource@diasemi.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig                  |   12 +
 drivers/mfd/Makefile                 |    3 +-
 drivers/mfd/da9062-core.c            |  512 ++++++++++++++++
 include/linux/mfd/da9062/core.h      |   50 ++
 include/linux/mfd/da9062/registers.h | 1108 ++++++++++++++++++++++++++++++++++
 5 files changed, 1684 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mfd/da9062-core.c
 create mode 100644 include/linux/mfd/da9062/core.h
 create mode 100644 include/linux/mfd/da9062/registers.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 5f746ef84418..0b442230c716 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -186,6 +186,18 @@ config MFD_DA9055
 	  This driver can be built as a module. If built as a module it will be
 	  called "da9055"
 
+config MFD_DA9062
+	tristate "Dialog Semiconductor DA9062 PMIC Support"
+	select MFD_CORE
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	depends on I2C=y
+	help
+	  Say yes here for support for the Dialog Semiconductor DA9062 PMIC.
+	  This includes the I2C driver and core APIs.
+	  Additional drivers must be enabled in order to use the functionality
+	  of the device.
+
 config MFD_DA9063
 	bool "Dialog Semiconductor DA9063 PMIC Support"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 530620aa2f70..a59e3fcc8626 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -113,10 +113,11 @@ obj-$(CONFIG_MFD_LP8788)	+= lp8788.o lp8788-irq.o
 
 da9055-objs			:= da9055-core.o da9055-i2c.o
 obj-$(CONFIG_MFD_DA9055)	+= da9055.o
-
+obj-$(CONFIG_MFD_DA9062)	+= da9062-core.o
 da9063-objs			:= da9063-core.o da9063-irq.o da9063-i2c.o
 obj-$(CONFIG_MFD_DA9063)	+= da9063.o
 obj-$(CONFIG_MFD_DA9150)	+= da9150-core.o
+
 obj-$(CONFIG_MFD_MAX14577)	+= max14577.o
 obj-$(CONFIG_MFD_MAX77686)	+= max77686.o
 obj-$(CONFIG_MFD_MAX77693)	+= max77693.o
diff --git a/drivers/mfd/da9062-core.c b/drivers/mfd/da9062-core.c
new file mode 100644
index 000000000000..4cf06431f256
--- /dev/null
+++ b/drivers/mfd/da9062-core.c
@@ -0,0 +1,512 @@
+/*
+ * Core, IRQ and I2C device driver for DA9062 PMIC
+ * Copyright (C) 2015  Dialog Semiconductor Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/regmap.h>
+#include <linux/irq.h>
+#include <linux/mfd/core.h>
+#include <linux/i2c.h>
+#include <linux/mfd/da9062/core.h>
+#include <linux/mfd/da9062/registers.h>
+#include <linux/regulator/of_regulator.h>
+
+#define	DA9062_REG_EVENT_A_OFFSET	0
+#define	DA9062_REG_EVENT_B_OFFSET	1
+#define	DA9062_REG_EVENT_C_OFFSET	2
+
+static struct regmap_irq da9062_irqs[] = {
+	/* EVENT A */
+	[DA9062_IRQ_ONKEY] = {
+		.reg_offset = DA9062_REG_EVENT_A_OFFSET,
+		.mask = DA9062AA_M_NONKEY_MASK,
+	},
+	[DA9062_IRQ_ALARM] = {
+		.reg_offset = DA9062_REG_EVENT_A_OFFSET,
+		.mask = DA9062AA_M_ALARM_MASK,
+	},
+	[DA9062_IRQ_TICK] = {
+		.reg_offset = DA9062_REG_EVENT_A_OFFSET,
+		.mask = DA9062AA_M_TICK_MASK,
+	},
+	[DA9062_IRQ_WDG_WARN] = {
+		.reg_offset = DA9062_REG_EVENT_A_OFFSET,
+		.mask = DA9062AA_M_WDG_WARN_MASK,
+	},
+	[DA9062_IRQ_SEQ_RDY] = {
+		.reg_offset = DA9062_REG_EVENT_A_OFFSET,
+		.mask = DA9062AA_M_SEQ_RDY_MASK,
+	},
+	/* EVENT B */
+	[DA9062_IRQ_TEMP] = {
+		.reg_offset = DA9062_REG_EVENT_B_OFFSET,
+		.mask = DA9062AA_M_TEMP_MASK,
+	},
+	[DA9062_IRQ_LDO_LIM] = {
+		.reg_offset = DA9062_REG_EVENT_B_OFFSET,
+		.mask = DA9062AA_M_LDO_LIM_MASK,
+	},
+	[DA9062_IRQ_DVC_RDY] = {
+		.reg_offset = DA9062_REG_EVENT_B_OFFSET,
+		.mask = DA9062AA_M_DVC_RDY_MASK,
+	},
+	[DA9062_IRQ_VDD_WARN] = {
+		.reg_offset = DA9062_REG_EVENT_B_OFFSET,
+		.mask = DA9062AA_M_VDD_WARN_MASK,
+	},
+	/* EVENT C */
+	[DA9062_IRQ_GPI0] = {
+		.reg_offset = DA9062_REG_EVENT_C_OFFSET,
+		.mask = DA9062AA_M_GPI0_MASK,
+	},
+	[DA9062_IRQ_GPI1] = {
+		.reg_offset = DA9062_REG_EVENT_C_OFFSET,
+		.mask = DA9062AA_M_GPI1_MASK,
+	},
+	[DA9062_IRQ_GPI2] = {
+		.reg_offset = DA9062_REG_EVENT_C_OFFSET,
+		.mask = DA9062AA_M_GPI2_MASK,
+	},
+	[DA9062_IRQ_GPI3] = {
+		.reg_offset = DA9062_REG_EVENT_C_OFFSET,
+		.mask = DA9062AA_M_GPI3_MASK,
+	},
+	[DA9062_IRQ_GPI4] = {
+		.reg_offset = DA9062_REG_EVENT_C_OFFSET,
+		.mask = DA9062AA_M_GPI4_MASK,
+	},
+};
+
+static struct regmap_irq_chip da9062_irq_chip = {
+	.name = "da9062-irq",
+	.irqs = da9062_irqs,
+	.num_irqs = DA9062_NUM_IRQ,
+	.num_regs = 3,
+	.status_base = DA9062AA_EVENT_A,
+	.mask_base = DA9062AA_IRQ_MASK_A,
+	.ack_base = DA9062AA_EVENT_A,
+};
+
+static struct resource da9062_core_resources[] = {
+	DEFINE_RES_NAMED(DA9062_IRQ_VDD_WARN, 1, "VDD_WARN", IORESOURCE_IRQ),
+};
+
+static struct resource da9062_regulators_resources[] = {
+	DEFINE_RES_NAMED(DA9062_IRQ_LDO_LIM, 1, "LDO_LIM", IORESOURCE_IRQ),
+};
+
+static struct resource da9062_thermal_resources[] = {
+	DEFINE_RES_NAMED(DA9062_IRQ_TEMP, 1, "THERMAL", IORESOURCE_IRQ),
+};
+
+static struct resource da9062_wdt_resources[] = {
+	DEFINE_RES_NAMED(DA9062_IRQ_WDG_WARN, 1, "WD_WARN", IORESOURCE_IRQ),
+};
+
+static const struct mfd_cell da9062_devs[] = {
+	{
+		.name		= "da9062-core",
+		.num_resources	= ARRAY_SIZE(da9062_core_resources),
+		.resources	= da9062_core_resources,
+	},
+	{
+		.name		= "da9062-regulators",
+		.num_resources	= ARRAY_SIZE(da9062_regulators_resources),
+		.resources	= da9062_regulators_resources,
+	},
+	{
+		.name		= "da9062-watchdog",
+		.num_resources	= ARRAY_SIZE(da9062_wdt_resources),
+		.resources	= da9062_wdt_resources,
+		.of_compatible  = "dlg,da9062-wdt",
+	},
+	{
+		.name		= "da9062-thermal",
+		.num_resources	= ARRAY_SIZE(da9062_thermal_resources),
+		.resources	= da9062_thermal_resources,
+		.of_compatible  = "dlg,da9062-thermal",
+	},
+};
+
+static int da9062_clear_fault_log(struct da9062 *chip)
+{
+	int ret;
+	int fault_log;
+
+	ret = regmap_read(chip->regmap, DA9062AA_FAULT_LOG, &fault_log);
+	if (ret < 0)
+		return ret;
+
+	if (fault_log) {
+		if (fault_log & DA9062AA_TWD_ERROR_MASK)
+			dev_dbg(chip->dev, "Fault log entry detected: TWD_ERROR\n");
+		if (fault_log & DA9062AA_POR_MASK)
+			dev_dbg(chip->dev, "Fault log entry detected: POR\n");
+		if (fault_log & DA9062AA_VDD_FAULT_MASK)
+			dev_dbg(chip->dev, "Fault log entry detected: VDD_FAULT\n");
+		if (fault_log & DA9062AA_VDD_START_MASK)
+			dev_dbg(chip->dev, "Fault log entry detected: VDD_START\n");
+		if (fault_log & DA9062AA_TEMP_CRIT_MASK)
+			dev_dbg(chip->dev, "Fault log entry detected: TEMP_CRIT\n");
+		if (fault_log & DA9062AA_KEY_RESET_MASK)
+			dev_dbg(chip->dev, "Fault log entry detected: KEY_RESET\n");
+		if (fault_log & DA9062AA_NSHUTDOWN_MASK)
+			dev_dbg(chip->dev, "Fault log entry detected: NSHUTDOWN\n");
+		if (fault_log & DA9062AA_WAIT_SHUT_MASK)
+			dev_dbg(chip->dev, "Fault log entry detected: WAIT_SHUT\n");
+
+		ret = regmap_write(chip->regmap, DA9062AA_FAULT_LOG,
+				   fault_log);
+	}
+
+	return ret;
+}
+
+int get_device_type(struct da9062 *chip)
+{
+	int device_id, variant_id, variant_mrc;
+	int ret;
+
+	ret = regmap_read(chip->regmap, DA9062AA_DEVICE_ID, &device_id);
+	if (ret < 0) {
+		dev_err(chip->dev, "Cannot read chip ID.\n");
+		return -EIO;
+	}
+	if (device_id != DA9062_PMIC_DEVICE_ID) {
+		dev_err(chip->dev, "Invalid device ID: 0x%02x\n", device_id);
+		return -ENODEV;
+	}
+
+	ret = regmap_read(chip->regmap, DA9062AA_VARIANT_ID, &variant_id);
+	if (ret < 0) {
+		dev_err(chip->dev, "Cannot read chip variant id.\n");
+		return -EIO;
+	}
+
+	dev_info(chip->dev,
+		 "Device detected (device-ID: 0x%02X, var-ID: 0x%02X)\n",
+		 device_id, variant_id);
+
+	variant_mrc = (variant_id & DA9062AA_MRC_MASK) >> DA9062AA_MRC_SHIFT;
+
+	if (variant_mrc < DA9062_PMIC_VARIANT_MRC_AA) {
+		dev_err(chip->dev,
+			"Cannot support variant MRC: 0x%02X\n", variant_mrc);
+		return -ENODEV;
+	}
+
+	return ret;
+}
+
+static const struct regmap_range da9062_aa_readable_ranges[] = {
+	{
+		.range_min = DA9062AA_PAGE_CON,
+		.range_max = DA9062AA_STATUS_B,
+	}, {
+		.range_min = DA9062AA_STATUS_D,
+		.range_max = DA9062AA_EVENT_C,
+	}, {
+		.range_min = DA9062AA_IRQ_MASK_A,
+		.range_max = DA9062AA_IRQ_MASK_C,
+	}, {
+		.range_min = DA9062AA_CONTROL_A,
+		.range_max = DA9062AA_GPIO_4,
+	}, {
+		.range_min = DA9062AA_GPIO_WKUP_MODE,
+		.range_max = DA9062AA_BUCK4_CONT,
+	}, {
+		.range_min = DA9062AA_BUCK3_CONT,
+		.range_max = DA9062AA_BUCK3_CONT,
+	}, {
+		.range_min = DA9062AA_LDO1_CONT,
+		.range_max = DA9062AA_LDO4_CONT,
+	}, {
+		.range_min = DA9062AA_DVC_1,
+		.range_max = DA9062AA_DVC_1,
+	}, {
+		.range_min = DA9062AA_COUNT_S,
+		.range_max = DA9062AA_SECOND_D,
+	}, {
+		.range_min = DA9062AA_SEQ,
+		.range_max = DA9062AA_ID_4_3,
+	}, {
+		.range_min = DA9062AA_ID_12_11,
+		.range_max = DA9062AA_ID_16_15,
+	}, {
+		.range_min = DA9062AA_ID_22_21,
+		.range_max = DA9062AA_ID_32_31,
+	}, {
+		.range_min = DA9062AA_SEQ_A,
+		.range_max = DA9062AA_BUCK3_CFG,
+	}, {
+		.range_min = DA9062AA_VBUCK2_A,
+		.range_max = DA9062AA_VBUCK4_A,
+	}, {
+		.range_min = DA9062AA_VBUCK3_A,
+		.range_max = DA9062AA_VBUCK3_A,
+	}, {
+		.range_min = DA9062AA_VLDO1_A,
+		.range_max = DA9062AA_VLDO4_A,
+	}, {
+		.range_min = DA9062AA_VBUCK2_B,
+		.range_max = DA9062AA_VBUCK4_B,
+	}, {
+		.range_min = DA9062AA_VBUCK3_B,
+		.range_max = DA9062AA_VBUCK3_B,
+	}, {
+		.range_min = DA9062AA_VLDO1_B,
+		.range_max = DA9062AA_VLDO4_B,
+	}, {
+		.range_min = DA9062AA_BBAT_CONT,
+		.range_max = DA9062AA_BBAT_CONT,
+	}, {
+		.range_min = DA9062AA_INTERFACE,
+		.range_max = DA9062AA_CONFIG_E,
+	}, {
+		.range_min = DA9062AA_CONFIG_G,
+		.range_max = DA9062AA_CONFIG_K,
+	}, {
+		.range_min = DA9062AA_CONFIG_M,
+		.range_max = DA9062AA_CONFIG_M,
+	}, {
+		.range_min = DA9062AA_TRIM_CLDR,
+		.range_max = DA9062AA_GP_ID_19,
+	}, {
+		.range_min = DA9062AA_DEVICE_ID,
+		.range_max = DA9062AA_CONFIG_ID,
+	},
+};
+
+static const struct regmap_range da9062_aa_writeable_ranges[] = {
+	{
+		.range_min = DA9062AA_PAGE_CON,
+		.range_max = DA9062AA_PAGE_CON,
+	}, {
+		.range_min = DA9062AA_FAULT_LOG,
+		.range_max = DA9062AA_EVENT_C,
+	}, {
+		.range_min = DA9062AA_IRQ_MASK_A,
+		.range_max = DA9062AA_IRQ_MASK_C,
+	}, {
+		.range_min = DA9062AA_CONTROL_A,
+		.range_max = DA9062AA_GPIO_4,
+	}, {
+		.range_min = DA9062AA_GPIO_WKUP_MODE,
+		.range_max = DA9062AA_BUCK4_CONT,
+	}, {
+		.range_min = DA9062AA_BUCK3_CONT,
+		.range_max = DA9062AA_BUCK3_CONT,
+	}, {
+		.range_min = DA9062AA_LDO1_CONT,
+		.range_max = DA9062AA_LDO4_CONT,
+	}, {
+		.range_min = DA9062AA_DVC_1,
+		.range_max = DA9062AA_DVC_1,
+	}, {
+		.range_min = DA9062AA_COUNT_S,
+		.range_max = DA9062AA_ALARM_Y,
+	}, {
+		.range_min = DA9062AA_SEQ,
+		.range_max = DA9062AA_ID_4_3,
+	}, {
+		.range_min = DA9062AA_ID_12_11,
+		.range_max = DA9062AA_ID_16_15,
+	}, {
+		.range_min = DA9062AA_ID_22_21,
+		.range_max = DA9062AA_ID_32_31,
+	}, {
+		.range_min = DA9062AA_SEQ_A,
+		.range_max = DA9062AA_BUCK3_CFG,
+	}, {
+		.range_min = DA9062AA_VBUCK2_A,
+		.range_max = DA9062AA_VBUCK4_A,
+	}, {
+		.range_min = DA9062AA_VBUCK3_A,
+		.range_max = DA9062AA_VBUCK3_A,
+	}, {
+		.range_min = DA9062AA_VLDO1_A,
+		.range_max = DA9062AA_VLDO4_A,
+	}, {
+		.range_min = DA9062AA_VBUCK2_B,
+		.range_max = DA9062AA_VBUCK4_B,
+	}, {
+		.range_min = DA9062AA_VBUCK3_B,
+		.range_max = DA9062AA_VBUCK3_B,
+	}, {
+		.range_min = DA9062AA_VLDO1_B,
+		.range_max = DA9062AA_VLDO4_B,
+	}, {
+		.range_min = DA9062AA_BBAT_CONT,
+		.range_max = DA9062AA_BBAT_CONT,
+	}, {
+		.range_min = DA9062AA_GP_ID_0,
+		.range_max = DA9062AA_GP_ID_19,
+	},
+};
+
+static const struct regmap_range da9062_aa_volatile_ranges[] = {
+	{
+		.range_min = DA9062AA_PAGE_CON,
+		.range_max = DA9062AA_STATUS_B,
+	}, {
+		.range_min = DA9062AA_STATUS_D,
+		.range_max = DA9062AA_EVENT_C,
+	}, {
+		.range_min = DA9062AA_CONTROL_F,
+		.range_max = DA9062AA_CONTROL_F,
+	}, {
+		.range_min = DA9062AA_COUNT_S,
+		.range_max = DA9062AA_SECOND_D,
+	},
+};
+
+static const struct regmap_access_table da9062_aa_readable_table = {
+	.yes_ranges = da9062_aa_readable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(da9062_aa_readable_ranges),
+};
+
+static const struct regmap_access_table da9062_aa_writeable_table = {
+	.yes_ranges = da9062_aa_writeable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(da9062_aa_writeable_ranges),
+};
+
+static const struct regmap_access_table da9062_aa_volatile_table = {
+	.yes_ranges = da9062_aa_volatile_ranges,
+	.n_yes_ranges = ARRAY_SIZE(da9062_aa_volatile_ranges),
+};
+
+static const struct regmap_range_cfg da9062_range_cfg[] = {
+	{
+		.range_min = DA9062AA_PAGE_CON,
+		.range_max = DA9062AA_CONFIG_ID,
+		.selector_reg = DA9062AA_PAGE_CON,
+		.selector_mask = 1 << DA9062_I2C_PAGE_SEL_SHIFT,
+		.selector_shift = DA9062_I2C_PAGE_SEL_SHIFT,
+		.window_start = 0,
+		.window_len = 256,
+	}
+};
+
+static struct regmap_config da9062_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.ranges = da9062_range_cfg,
+	.num_ranges = ARRAY_SIZE(da9062_range_cfg),
+	.max_register = DA9062AA_CONFIG_ID,
+	.cache_type = REGCACHE_RBTREE,
+	.rd_table = &da9062_aa_readable_table,
+	.wr_table = &da9062_aa_writeable_table,
+	.volatile_table = &da9062_aa_volatile_table,
+};
+
+static int da9062_i2c_probe(struct i2c_client *i2c,
+	const struct i2c_device_id *id)
+{
+	struct da9062 *chip;
+	unsigned int irq_base;
+	int ret;
+
+	chip = devm_kzalloc(&i2c->dev, sizeof(*chip), GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	i2c_set_clientdata(i2c, chip);
+	chip->dev = &i2c->dev;
+
+	if (!i2c->irq) {
+		dev_err(chip->dev, "No IRQ configured\n");
+		return -EINVAL;
+	}
+
+	chip->regmap = devm_regmap_init_i2c(i2c, &da9062_regmap_config);
+	if (IS_ERR(chip->regmap)) {
+		ret = PTR_ERR(chip->regmap);
+		dev_err(chip->dev, "Failed to allocate register map: %d\n",
+			ret);
+		return ret;
+	}
+
+	ret = da9062_clear_fault_log(chip);
+	if (ret < 0)
+		dev_warn(chip->dev, "Cannot clear fault log\n");
+
+	ret = get_device_type(chip);
+	if (ret)
+		return ret;
+
+	ret = regmap_add_irq_chip(chip->regmap, i2c->irq,
+			IRQF_TRIGGER_LOW | IRQF_ONESHOT | IRQF_SHARED,
+			-1, &da9062_irq_chip,
+			&chip->regmap_irq);
+	if (ret) {
+		dev_err(chip->dev, "Failed to request IRQ %d: %d\n",
+			i2c->irq, ret);
+		return ret;
+	}
+
+	irq_base = regmap_irq_chip_get_base(chip->regmap_irq);
+
+	ret = mfd_add_devices(chip->dev, PLATFORM_DEVID_NONE, da9062_devs,
+			      ARRAY_SIZE(da9062_devs), NULL, irq_base,
+			      NULL);
+	if (ret) {
+		dev_err(chip->dev, "Cannot register child devices\n");
+		regmap_del_irq_chip(i2c->irq, chip->regmap_irq);
+		return ret;
+	}
+
+	return ret;
+}
+
+static int da9062_i2c_remove(struct i2c_client *i2c)
+{
+	struct da9062 *chip = i2c_get_clientdata(i2c);
+
+	mfd_remove_devices(chip->dev);
+	regmap_del_irq_chip(i2c->irq, chip->regmap_irq);
+
+	return 0;
+}
+
+static const struct i2c_device_id da9062_i2c_id[] = {
+	{ "da9062", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, da9062_i2c_id);
+
+static const struct of_device_id da9062_dt_ids[] = {
+	{ .compatible = "dlg,da9062", },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, da9062_dt_ids);
+
+static struct i2c_driver da9062_i2c_driver = {
+	.driver = {
+		.name = "da9062",
+		.of_match_table = of_match_ptr(da9062_dt_ids),
+	},
+	.probe    = da9062_i2c_probe,
+	.remove   = da9062_i2c_remove,
+	.id_table = da9062_i2c_id,
+};
+
+module_i2c_driver(da9062_i2c_driver);
+
+MODULE_DESCRIPTION("Core device driver for Dialog DA9062");
+MODULE_AUTHOR("Steve Twiss <stwiss.opensource@diasemi.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/da9062/core.h b/include/linux/mfd/da9062/core.h
new file mode 100644
index 000000000000..376ba84366a0
--- /dev/null
+++ b/include/linux/mfd/da9062/core.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2015  Dialog Semiconductor Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __MFD_DA9062_CORE_H__
+#define __MFD_DA9062_CORE_H__
+
+#include <linux/interrupt.h>
+#include <linux/mfd/da9062/registers.h>
+
+/* Interrupts */
+enum da9062_irqs {
+	/* IRQ A */
+	DA9062_IRQ_ONKEY,
+	DA9062_IRQ_ALARM,
+	DA9062_IRQ_TICK,
+	DA9062_IRQ_WDG_WARN,
+	DA9062_IRQ_SEQ_RDY,
+	/* IRQ B*/
+	DA9062_IRQ_TEMP,
+	DA9062_IRQ_LDO_LIM,
+	DA9062_IRQ_DVC_RDY,
+	DA9062_IRQ_VDD_WARN,
+	/* IRQ C */
+	DA9062_IRQ_GPI0,
+	DA9062_IRQ_GPI1,
+	DA9062_IRQ_GPI2,
+	DA9062_IRQ_GPI3,
+	DA9062_IRQ_GPI4,
+
+	DA9062_NUM_IRQ,
+};
+
+struct da9062 {
+	struct device *dev;
+	struct regmap *regmap;
+	struct regmap_irq_chip_data *regmap_irq;
+};
+
+#endif /* __MFD_DA9062_CORE_H__ */
diff --git a/include/linux/mfd/da9062/registers.h b/include/linux/mfd/da9062/registers.h
new file mode 100644
index 000000000000..97790d1b02c5
--- /dev/null
+++ b/include/linux/mfd/da9062/registers.h
@@ -0,0 +1,1108 @@
+/*
+ * registers.h - REGISTERS H for DA9062
+ * Copyright (C) 2015  Dialog Semiconductor Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __DA9062_H__
+#define __DA9062_H__
+
+#define DA9062_PMIC_DEVICE_ID		0x62
+#define DA9062_PMIC_VARIANT_MRC_AA	0x01
+
+#define DA9062_I2C_PAGE_SEL_SHIFT	1
+
+/*
+ * Registers
+ */
+
+#define DA9062AA_PAGE_CON		0x000
+#define DA9062AA_STATUS_A		0x001
+#define DA9062AA_STATUS_B		0x002
+#define DA9062AA_STATUS_D		0x004
+#define DA9062AA_FAULT_LOG		0x005
+#define DA9062AA_EVENT_A		0x006
+#define DA9062AA_EVENT_B		0x007
+#define DA9062AA_EVENT_C		0x008
+#define DA9062AA_IRQ_MASK_A		0x00A
+#define DA9062AA_IRQ_MASK_B		0x00B
+#define DA9062AA_IRQ_MASK_C		0x00C
+#define DA9062AA_CONTROL_A		0x00E
+#define DA9062AA_CONTROL_B		0x00F
+#define DA9062AA_CONTROL_C		0x010
+#define DA9062AA_CONTROL_D		0x011
+#define DA9062AA_CONTROL_E		0x012
+#define DA9062AA_CONTROL_F		0x013
+#define DA9062AA_PD_DIS			0x014
+#define DA9062AA_GPIO_0_1		0x015
+#define DA9062AA_GPIO_2_3		0x016
+#define DA9062AA_GPIO_4			0x017
+#define DA9062AA_GPIO_WKUP_MODE		0x01C
+#define DA9062AA_GPIO_MODE0_4		0x01D
+#define DA9062AA_GPIO_OUT0_2		0x01E
+#define DA9062AA_GPIO_OUT3_4		0x01F
+#define DA9062AA_BUCK2_CONT		0x020
+#define DA9062AA_BUCK1_CONT		0x021
+#define DA9062AA_BUCK4_CONT		0x022
+#define DA9062AA_BUCK3_CONT		0x024
+#define DA9062AA_LDO1_CONT		0x026
+#define DA9062AA_LDO2_CONT		0x027
+#define DA9062AA_LDO3_CONT		0x028
+#define DA9062AA_LDO4_CONT		0x029
+#define DA9062AA_DVC_1			0x032
+#define DA9062AA_COUNT_S		0x040
+#define DA9062AA_COUNT_MI		0x041
+#define DA9062AA_COUNT_H		0x042
+#define DA9062AA_COUNT_D		0x043
+#define DA9062AA_COUNT_MO		0x044
+#define DA9062AA_COUNT_Y		0x045
+#define DA9062AA_ALARM_S		0x046
+#define DA9062AA_ALARM_MI		0x047
+#define DA9062AA_ALARM_H		0x048
+#define DA9062AA_ALARM_D		0x049
+#define DA9062AA_ALARM_MO		0x04A
+#define DA9062AA_ALARM_Y		0x04B
+#define DA9062AA_SECOND_A		0x04C
+#define DA9062AA_SECOND_B		0x04D
+#define DA9062AA_SECOND_C		0x04E
+#define DA9062AA_SECOND_D		0x04F
+#define DA9062AA_SEQ			0x081
+#define DA9062AA_SEQ_TIMER		0x082
+#define DA9062AA_ID_2_1			0x083
+#define DA9062AA_ID_4_3			0x084
+#define DA9062AA_ID_12_11		0x088
+#define DA9062AA_ID_14_13		0x089
+#define DA9062AA_ID_16_15		0x08A
+#define DA9062AA_ID_22_21		0x08D
+#define DA9062AA_ID_24_23		0x08E
+#define DA9062AA_ID_26_25		0x08F
+#define DA9062AA_ID_28_27		0x090
+#define DA9062AA_ID_30_29		0x091
+#define DA9062AA_ID_32_31		0x092
+#define DA9062AA_SEQ_A			0x095
+#define DA9062AA_SEQ_B			0x096
+#define DA9062AA_WAIT			0x097
+#define DA9062AA_EN_32K			0x098
+#define DA9062AA_RESET			0x099
+#define DA9062AA_BUCK_ILIM_A		0x09A
+#define DA9062AA_BUCK_ILIM_B		0x09B
+#define DA9062AA_BUCK_ILIM_C		0x09C
+#define DA9062AA_BUCK2_CFG		0x09D
+#define DA9062AA_BUCK1_CFG		0x09E
+#define DA9062AA_BUCK4_CFG		0x09F
+#define DA9062AA_BUCK3_CFG		0x0A0
+#define DA9062AA_VBUCK2_A		0x0A3
+#define DA9062AA_VBUCK1_A		0x0A4
+#define DA9062AA_VBUCK4_A		0x0A5
+#define DA9062AA_VBUCK3_A		0x0A7
+#define DA9062AA_VLDO1_A		0x0A9
+#define DA9062AA_VLDO2_A		0x0AA
+#define DA9062AA_VLDO3_A		0x0AB
+#define DA9062AA_VLDO4_A		0x0AC
+#define DA9062AA_VBUCK2_B		0x0B4
+#define DA9062AA_VBUCK1_B		0x0B5
+#define DA9062AA_VBUCK4_B		0x0B6
+#define DA9062AA_VBUCK3_B		0x0B8
+#define DA9062AA_VLDO1_B		0x0BA
+#define DA9062AA_VLDO2_B		0x0BB
+#define DA9062AA_VLDO3_B		0x0BC
+#define DA9062AA_VLDO4_B		0x0BD
+#define DA9062AA_BBAT_CONT		0x0C5
+#define DA9062AA_INTERFACE		0x105
+#define DA9062AA_CONFIG_A		0x106
+#define DA9062AA_CONFIG_B		0x107
+#define DA9062AA_CONFIG_C		0x108
+#define DA9062AA_CONFIG_D		0x109
+#define DA9062AA_CONFIG_E		0x10A
+#define DA9062AA_CONFIG_G		0x10C
+#define DA9062AA_CONFIG_H		0x10D
+#define DA9062AA_CONFIG_I		0x10E
+#define DA9062AA_CONFIG_J		0x10F
+#define DA9062AA_CONFIG_K		0x110
+#define DA9062AA_CONFIG_M		0x112
+#define DA9062AA_TRIM_CLDR		0x120
+#define DA9062AA_GP_ID_0		0x121
+#define DA9062AA_GP_ID_1		0x122
+#define DA9062AA_GP_ID_2		0x123
+#define DA9062AA_GP_ID_3		0x124
+#define DA9062AA_GP_ID_4		0x125
+#define DA9062AA_GP_ID_5		0x126
+#define DA9062AA_GP_ID_6		0x127
+#define DA9062AA_GP_ID_7		0x128
+#define DA9062AA_GP_ID_8		0x129
+#define DA9062AA_GP_ID_9		0x12A
+#define DA9062AA_GP_ID_10		0x12B
+#define DA9062AA_GP_ID_11		0x12C
+#define DA9062AA_GP_ID_12		0x12D
+#define DA9062AA_GP_ID_13		0x12E
+#define DA9062AA_GP_ID_14		0x12F
+#define DA9062AA_GP_ID_15		0x130
+#define DA9062AA_GP_ID_16		0x131
+#define DA9062AA_GP_ID_17		0x132
+#define DA9062AA_GP_ID_18		0x133
+#define DA9062AA_GP_ID_19		0x134
+#define DA9062AA_DEVICE_ID		0x181
+#define DA9062AA_VARIANT_ID		0x182
+#define DA9062AA_CUSTOMER_ID		0x183
+#define DA9062AA_CONFIG_ID		0x184
+
+/*
+ * Bit fields
+ */
+
+/* DA9062AA_PAGE_CON = 0x000 */
+#define DA9062AA_PAGE_SHIFT		0
+#define DA9062AA_PAGE_MASK		0x3f
+#define DA9062AA_WRITE_MODE_SHIFT	6
+#define DA9062AA_WRITE_MODE_MASK	BIT(6)
+#define DA9062AA_REVERT_SHIFT		7
+#define DA9062AA_REVERT_MASK		BIT(7)
+
+/* DA9062AA_STATUS_A = 0x001 */
+#define DA9062AA_NONKEY_SHIFT		0
+#define DA9062AA_NONKEY_MASK		0x01
+#define DA9062AA_DVC_BUSY_SHIFT		2
+#define DA9062AA_DVC_BUSY_MASK		BIT(2)
+
+/* DA9062AA_STATUS_B = 0x002 */
+#define DA9062AA_GPI0_SHIFT		0
+#define DA9062AA_GPI0_MASK		0x01
+#define DA9062AA_GPI1_SHIFT		1
+#define DA9062AA_GPI1_MASK		BIT(1)
+#define DA9062AA_GPI2_SHIFT		2
+#define DA9062AA_GPI2_MASK		BIT(2)
+#define DA9062AA_GPI3_SHIFT		3
+#define DA9062AA_GPI3_MASK		BIT(3)
+#define DA9062AA_GPI4_SHIFT		4
+#define DA9062AA_GPI4_MASK		BIT(4)
+
+/* DA9062AA_STATUS_D = 0x004 */
+#define DA9062AA_LDO1_ILIM_SHIFT	0
+#define DA9062AA_LDO1_ILIM_MASK		0x01
+#define DA9062AA_LDO2_ILIM_SHIFT	1
+#define DA9062AA_LDO2_ILIM_MASK		BIT(1)
+#define DA9062AA_LDO3_ILIM_SHIFT	2
+#define DA9062AA_LDO3_ILIM_MASK		BIT(2)
+#define DA9062AA_LDO4_ILIM_SHIFT	3
+#define DA9062AA_LDO4_ILIM_MASK		BIT(3)
+
+/* DA9062AA_FAULT_LOG = 0x005 */
+#define DA9062AA_TWD_ERROR_SHIFT	0
+#define DA9062AA_TWD_ERROR_MASK		0x01
+#define DA9062AA_POR_SHIFT		1
+#define DA9062AA_POR_MASK		BIT(1)
+#define DA9062AA_VDD_FAULT_SHIFT	2
+#define DA9062AA_VDD_FAULT_MASK		BIT(2)
+#define DA9062AA_VDD_START_SHIFT	3
+#define DA9062AA_VDD_START_MASK		BIT(3)
+#define DA9062AA_TEMP_CRIT_SHIFT	4
+#define DA9062AA_TEMP_CRIT_MASK		BIT(4)
+#define DA9062AA_KEY_RESET_SHIFT	5
+#define DA9062AA_KEY_RESET_MASK		BIT(5)
+#define DA9062AA_NSHUTDOWN_SHIFT	6
+#define DA9062AA_NSHUTDOWN_MASK		BIT(6)
+#define DA9062AA_WAIT_SHUT_SHIFT	7
+#define DA9062AA_WAIT_SHUT_MASK		BIT(7)
+
+/* DA9062AA_EVENT_A = 0x006 */
+#define DA9062AA_E_NONKEY_SHIFT		0
+#define DA9062AA_E_NONKEY_MASK		0x01
+#define DA9062AA_E_ALARM_SHIFT		1
+#define DA9062AA_E_ALARM_MASK		BIT(1)
+#define DA9062AA_E_TICK_SHIFT		2
+#define DA9062AA_E_TICK_MASK		BIT(2)
+#define DA9062AA_E_WDG_WARN_SHIFT	3
+#define DA9062AA_E_WDG_WARN_MASK	BIT(3)
+#define DA9062AA_E_SEQ_RDY_SHIFT	4
+#define DA9062AA_E_SEQ_RDY_MASK		BIT(4)
+#define DA9062AA_EVENTS_B_SHIFT		5
+#define DA9062AA_EVENTS_B_MASK		BIT(5)
+#define DA9062AA_EVENTS_C_SHIFT		6
+#define DA9062AA_EVENTS_C_MASK		BIT(6)
+
+/* DA9062AA_EVENT_B = 0x007 */
+#define DA9062AA_E_TEMP_SHIFT		1
+#define DA9062AA_E_TEMP_MASK		BIT(1)
+#define DA9062AA_E_LDO_LIM_SHIFT	3
+#define DA9062AA_E_LDO_LIM_MASK		BIT(3)
+#define DA9062AA_E_DVC_RDY_SHIFT	5
+#define DA9062AA_E_DVC_RDY_MASK		BIT(5)
+#define DA9062AA_E_VDD_WARN_SHIFT	7
+#define DA9062AA_E_VDD_WARN_MASK	BIT(7)
+
+/* DA9062AA_EVENT_C = 0x008 */
+#define DA9062AA_E_GPI0_SHIFT		0
+#define DA9062AA_E_GPI0_MASK		0x01
+#define DA9062AA_E_GPI1_SHIFT		1
+#define DA9062AA_E_GPI1_MASK		BIT(1)
+#define DA9062AA_E_GPI2_SHIFT		2
+#define DA9062AA_E_GPI2_MASK		BIT(2)
+#define DA9062AA_E_GPI3_SHIFT		3
+#define DA9062AA_E_GPI3_MASK		BIT(3)
+#define DA9062AA_E_GPI4_SHIFT		4
+#define DA9062AA_E_GPI4_MASK		BIT(4)
+
+/* DA9062AA_IRQ_MASK_A = 0x00A */
+#define DA9062AA_M_NONKEY_SHIFT		0
+#define DA9062AA_M_NONKEY_MASK		0x01
+#define DA9062AA_M_ALARM_SHIFT		1
+#define DA9062AA_M_ALARM_MASK		BIT(1)
+#define DA9062AA_M_TICK_SHIFT		2
+#define DA9062AA_M_TICK_MASK		BIT(2)
+#define DA9062AA_M_WDG_WARN_SHIFT	3
+#define DA9062AA_M_WDG_WARN_MASK	BIT(3)
+#define DA9062AA_M_SEQ_RDY_SHIFT	4
+#define DA9062AA_M_SEQ_RDY_MASK		BIT(4)
+
+/* DA9062AA_IRQ_MASK_B = 0x00B */
+#define DA9062AA_M_TEMP_SHIFT		1
+#define DA9062AA_M_TEMP_MASK		BIT(1)
+#define DA9062AA_M_LDO_LIM_SHIFT	3
+#define DA9062AA_M_LDO_LIM_MASK		BIT(3)
+#define DA9062AA_M_DVC_RDY_SHIFT	5
+#define DA9062AA_M_DVC_RDY_MASK		BIT(5)
+#define DA9062AA_M_VDD_WARN_SHIFT	7
+#define DA9062AA_M_VDD_WARN_MASK	BIT(7)
+
+/* DA9062AA_IRQ_MASK_C = 0x00C */
+#define DA9062AA_M_GPI0_SHIFT		0
+#define DA9062AA_M_GPI0_MASK		0x01
+#define DA9062AA_M_GPI1_SHIFT		1
+#define DA9062AA_M_GPI1_MASK		BIT(1)
+#define DA9062AA_M_GPI2_SHIFT		2
+#define DA9062AA_M_GPI2_MASK		BIT(2)
+#define DA9062AA_M_GPI3_SHIFT		3
+#define DA9062AA_M_GPI3_MASK		BIT(3)
+#define DA9062AA_M_GPI4_SHIFT		4
+#define DA9062AA_M_GPI4_MASK		BIT(4)
+
+/* DA9062AA_CONTROL_A = 0x00E */
+#define DA9062AA_SYSTEM_EN_SHIFT	0
+#define DA9062AA_SYSTEM_EN_MASK		0x01
+#define DA9062AA_POWER_EN_SHIFT		1
+#define DA9062AA_POWER_EN_MASK		BIT(1)
+#define DA9062AA_POWER1_EN_SHIFT	2
+#define DA9062AA_POWER1_EN_MASK		BIT(2)
+#define DA9062AA_STANDBY_SHIFT		3
+#define DA9062AA_STANDBY_MASK		BIT(3)
+#define DA9062AA_M_SYSTEM_EN_SHIFT	4
+#define DA9062AA_M_SYSTEM_EN_MASK	BIT(4)
+#define DA9062AA_M_POWER_EN_SHIFT	5
+#define DA9062AA_M_POWER_EN_MASK	BIT(5)
+#define DA9062AA_M_POWER1_EN_SHIFT	6
+#define DA9062AA_M_POWER1_EN_MASK	BIT(6)
+
+/* DA9062AA_CONTROL_B = 0x00F */
+#define DA9062AA_WATCHDOG_PD_SHIFT	1
+#define DA9062AA_WATCHDOG_PD_MASK	BIT(1)
+#define DA9062AA_FREEZE_EN_SHIFT	2
+#define DA9062AA_FREEZE_EN_MASK		BIT(2)
+#define DA9062AA_NRES_MODE_SHIFT	3
+#define DA9062AA_NRES_MODE_MASK		BIT(3)
+#define DA9062AA_NONKEY_LOCK_SHIFT	4
+#define DA9062AA_NONKEY_LOCK_MASK	BIT(4)
+#define DA9062AA_NFREEZE_SHIFT		5
+#define DA9062AA_NFREEZE_MASK		(0x03 << 5)
+#define DA9062AA_BUCK_SLOWSTART_SHIFT	7
+#define DA9062AA_BUCK_SLOWSTART_MASK	BIT(7)
+
+/* DA9062AA_CONTROL_C = 0x010 */
+#define DA9062AA_DEBOUNCING_SHIFT	0
+#define DA9062AA_DEBOUNCING_MASK	0x07
+#define DA9062AA_AUTO_BOOT_SHIFT	3
+#define DA9062AA_AUTO_BOOT_MASK		BIT(3)
+#define DA9062AA_OTPREAD_EN_SHIFT	4
+#define DA9062AA_OTPREAD_EN_MASK	BIT(4)
+#define DA9062AA_SLEW_RATE_SHIFT	5
+#define DA9062AA_SLEW_RATE_MASK		(0x03 << 5)
+#define DA9062AA_DEF_SUPPLY_SHIFT	7
+#define DA9062AA_DEF_SUPPLY_MASK	BIT(7)
+
+/* DA9062AA_CONTROL_D = 0x011 */
+#define DA9062AA_TWDSCALE_SHIFT		0
+#define DA9062AA_TWDSCALE_MASK		0x07
+
+/* DA9062AA_CONTROL_E = 0x012 */
+#define DA9062AA_RTC_MODE_PD_SHIFT	0
+#define DA9062AA_RTC_MODE_PD_MASK	0x01
+#define DA9062AA_RTC_MODE_SD_SHIFT	1
+#define DA9062AA_RTC_MODE_SD_MASK	BIT(1)
+#define DA9062AA_RTC_EN_SHIFT		2
+#define DA9062AA_RTC_EN_MASK		BIT(2)
+#define DA9062AA_V_LOCK_SHIFT		7
+#define DA9062AA_V_LOCK_MASK		BIT(7)
+
+/* DA9062AA_CONTROL_F = 0x013 */
+#define DA9062AA_WATCHDOG_SHIFT		0
+#define DA9062AA_WATCHDOG_MASK		0x01
+#define DA9062AA_SHUTDOWN_SHIFT		1
+#define DA9062AA_SHUTDOWN_MASK		BIT(1)
+#define DA9062AA_WAKE_UP_SHIFT		2
+#define DA9062AA_WAKE_UP_MASK		BIT(2)
+
+/* DA9062AA_PD_DIS = 0x014 */
+#define DA9062AA_GPI_DIS_SHIFT		0
+#define DA9062AA_GPI_DIS_MASK		0x01
+#define DA9062AA_PMIF_DIS_SHIFT		2
+#define DA9062AA_PMIF_DIS_MASK		BIT(2)
+#define DA9062AA_CLDR_PAUSE_SHIFT	4
+#define DA9062AA_CLDR_PAUSE_MASK	BIT(4)
+#define DA9062AA_BBAT_DIS_SHIFT		5
+#define DA9062AA_BBAT_DIS_MASK		BIT(5)
+#define DA9062AA_OUT32K_PAUSE_SHIFT	6
+#define DA9062AA_OUT32K_PAUSE_MASK	BIT(6)
+#define DA9062AA_PMCONT_DIS_SHIFT	7
+#define DA9062AA_PMCONT_DIS_MASK	BIT(7)
+
+/* DA9062AA_GPIO_0_1 = 0x015 */
+#define DA9062AA_GPIO0_PIN_SHIFT	0
+#define DA9062AA_GPIO0_PIN_MASK		0x03
+#define DA9062AA_GPIO0_TYPE_SHIFT	2
+#define DA9062AA_GPIO0_TYPE_MASK	BIT(2)
+#define DA9062AA_GPIO0_WEN_SHIFT	3
+#define DA9062AA_GPIO0_WEN_MASK		BIT(3)
+#define DA9062AA_GPIO1_PIN_SHIFT	4
+#define DA9062AA_GPIO1_PIN_MASK		(0x03 << 4)
+#define DA9062AA_GPIO1_TYPE_SHIFT	6
+#define DA9062AA_GPIO1_TYPE_MASK	BIT(6)
+#define DA9062AA_GPIO1_WEN_SHIFT	7
+#define DA9062AA_GPIO1_WEN_MASK		BIT(7)
+
+/* DA9062AA_GPIO_2_3 = 0x016 */
+#define DA9062AA_GPIO2_PIN_SHIFT	0
+#define DA9062AA_GPIO2_PIN_MASK		0x03
+#define DA9062AA_GPIO2_TYPE_SHIFT	2
+#define DA9062AA_GPIO2_TYPE_MASK	BIT(2)
+#define DA9062AA_GPIO2_WEN_SHIFT	3
+#define DA9062AA_GPIO2_WEN_MASK		BIT(3)
+#define DA9062AA_GPIO3_PIN_SHIFT	4
+#define DA9062AA_GPIO3_PIN_MASK		(0x03 << 4)
+#define DA9062AA_GPIO3_TYPE_SHIFT	6
+#define DA9062AA_GPIO3_TYPE_MASK	BIT(6)
+#define DA9062AA_GPIO3_WEN_SHIFT	7
+#define DA9062AA_GPIO3_WEN_MASK		BIT(7)
+
+/* DA9062AA_GPIO_4 = 0x017 */
+#define DA9062AA_GPIO4_PIN_SHIFT	0
+#define DA9062AA_GPIO4_PIN_MASK		0x03
+#define DA9062AA_GPIO4_TYPE_SHIFT	2
+#define DA9062AA_GPIO4_TYPE_MASK	BIT(2)
+#define DA9062AA_GPIO4_WEN_SHIFT	3
+#define DA9062AA_GPIO4_WEN_MASK		BIT(3)
+
+/* DA9062AA_GPIO_WKUP_MODE = 0x01C */
+#define DA9062AA_GPIO0_WKUP_MODE_SHIFT	0
+#define DA9062AA_GPIO0_WKUP_MODE_MASK	0x01
+#define DA9062AA_GPIO1_WKUP_MODE_SHIFT	1
+#define DA9062AA_GPIO1_WKUP_MODE_MASK	BIT(1)
+#define DA9062AA_GPIO2_WKUP_MODE_SHIFT	2
+#define DA9062AA_GPIO2_WKUP_MODE_MASK	BIT(2)
+#define DA9062AA_GPIO3_WKUP_MODE_SHIFT	3
+#define DA9062AA_GPIO3_WKUP_MODE_MASK	BIT(3)
+#define DA9062AA_GPIO4_WKUP_MODE_SHIFT	4
+#define DA9062AA_GPIO4_WKUP_MODE_MASK	BIT(4)
+
+/* DA9062AA_GPIO_MODE0_4 = 0x01D */
+#define DA9062AA_GPIO0_MODE_SHIFT	0
+#define DA9062AA_GPIO0_MODE_MASK	0x01
+#define DA9062AA_GPIO1_MODE_SHIFT	1
+#define DA9062AA_GPIO1_MODE_MASK	BIT(1)
+#define DA9062AA_GPIO2_MODE_SHIFT	2
+#define DA9062AA_GPIO2_MODE_MASK	BIT(2)
+#define DA9062AA_GPIO3_MODE_SHIFT	3
+#define DA9062AA_GPIO3_MODE_MASK	BIT(3)
+#define DA9062AA_GPIO4_MODE_SHIFT	4
+#define DA9062AA_GPIO4_MODE_MASK	BIT(4)
+
+/* DA9062AA_GPIO_OUT0_2 = 0x01E */
+#define DA9062AA_GPIO0_OUT_SHIFT	0
+#define DA9062AA_GPIO0_OUT_MASK		0x07
+#define DA9062AA_GPIO1_OUT_SHIFT	3
+#define DA9062AA_GPIO1_OUT_MASK		(0x07 << 3)
+#define DA9062AA_GPIO2_OUT_SHIFT	6
+#define DA9062AA_GPIO2_OUT_MASK		(0x03 << 6)
+
+/* DA9062AA_GPIO_OUT3_4 = 0x01F */
+#define DA9062AA_GPIO3_OUT_SHIFT	0
+#define DA9062AA_GPIO3_OUT_MASK		0x07
+#define DA9062AA_GPIO4_OUT_SHIFT	3
+#define DA9062AA_GPIO4_OUT_MASK		(0x03 << 3)
+
+/* DA9062AA_BUCK2_CONT = 0x020 */
+#define DA9062AA_BUCK2_EN_SHIFT		0
+#define DA9062AA_BUCK2_EN_MASK		0x01
+#define DA9062AA_BUCK2_GPI_SHIFT	1
+#define DA9062AA_BUCK2_GPI_MASK		(0x03 << 1)
+#define DA9062AA_BUCK2_CONF_SHIFT	3
+#define DA9062AA_BUCK2_CONF_MASK	BIT(3)
+#define DA9062AA_VBUCK2_GPI_SHIFT	5
+#define DA9062AA_VBUCK2_GPI_MASK	(0x03 << 5)
+
+/* DA9062AA_BUCK1_CONT = 0x021 */
+#define DA9062AA_BUCK1_EN_SHIFT		0
+#define DA9062AA_BUCK1_EN_MASK		0x01
+#define DA9062AA_BUCK1_GPI_SHIFT	1
+#define DA9062AA_BUCK1_GPI_MASK		(0x03 << 1)
+#define DA9062AA_BUCK1_CONF_SHIFT	3
+#define DA9062AA_BUCK1_CONF_MASK	BIT(3)
+#define DA9062AA_VBUCK1_GPI_SHIFT	5
+#define DA9062AA_VBUCK1_GPI_MASK	(0x03 << 5)
+
+/* DA9062AA_BUCK4_CONT = 0x022 */
+#define DA9062AA_BUCK4_EN_SHIFT		0
+#define DA9062AA_BUCK4_EN_MASK		0x01
+#define DA9062AA_BUCK4_GPI_SHIFT	1
+#define DA9062AA_BUCK4_GPI_MASK		(0x03 << 1)
+#define DA9062AA_BUCK4_CONF_SHIFT	3
+#define DA9062AA_BUCK4_CONF_MASK	BIT(3)
+#define DA9062AA_VBUCK4_GPI_SHIFT	5
+#define DA9062AA_VBUCK4_GPI_MASK	(0x03 << 5)
+
+/* DA9062AA_BUCK3_CONT = 0x024 */
+#define DA9062AA_BUCK3_EN_SHIFT		0
+#define DA9062AA_BUCK3_EN_MASK		0x01
+#define DA9062AA_BUCK3_GPI_SHIFT	1
+#define DA9062AA_BUCK3_GPI_MASK		(0x03 << 1)
+#define DA9062AA_BUCK3_CONF_SHIFT	3
+#define DA9062AA_BUCK3_CONF_MASK	BIT(3)
+#define DA9062AA_VBUCK3_GPI_SHIFT	5
+#define DA9062AA_VBUCK3_GPI_MASK	(0x03 << 5)
+
+/* DA9062AA_LDO1_CONT = 0x026 */
+#define DA9062AA_LDO1_EN_SHIFT		0
+#define DA9062AA_LDO1_EN_MASK		0x01
+#define DA9062AA_LDO1_GPI_SHIFT		1
+#define DA9062AA_LDO1_GPI_MASK		(0x03 << 1)
+#define DA9062AA_LDO1_PD_DIS_SHIFT	3
+#define DA9062AA_LDO1_PD_DIS_MASK	BIT(3)
+#define DA9062AA_VLDO1_GPI_SHIFT	5
+#define DA9062AA_VLDO1_GPI_MASK		(0x03 << 5)
+#define DA9062AA_LDO1_CONF_SHIFT	7
+#define DA9062AA_LDO1_CONF_MASK		BIT(7)
+
+/* DA9062AA_LDO2_CONT = 0x027 */
+#define DA9062AA_LDO2_EN_SHIFT		0
+#define DA9062AA_LDO2_EN_MASK		0x01
+#define DA9062AA_LDO2_GPI_SHIFT		1
+#define DA9062AA_LDO2_GPI_MASK		(0x03 << 1)
+#define DA9062AA_LDO2_PD_DIS_SHIFT	3
+#define DA9062AA_LDO2_PD_DIS_MASK	BIT(3)
+#define DA9062AA_VLDO2_GPI_SHIFT	5
+#define DA9062AA_VLDO2_GPI_MASK		(0x03 << 5)
+#define DA9062AA_LDO2_CONF_SHIFT	7
+#define DA9062AA_LDO2_CONF_MASK		BIT(7)
+
+/* DA9062AA_LDO3_CONT = 0x028 */
+#define DA9062AA_LDO3_EN_SHIFT		0
+#define DA9062AA_LDO3_EN_MASK		0x01
+#define DA9062AA_LDO3_GPI_SHIFT		1
+#define DA9062AA_LDO3_GPI_MASK		(0x03 << 1)
+#define DA9062AA_LDO3_PD_DIS_SHIFT	3
+#define DA9062AA_LDO3_PD_DIS_MASK	BIT(3)
+#define DA9062AA_VLDO3_GPI_SHIFT	5
+#define DA9062AA_VLDO3_GPI_MASK		(0x03 << 5)
+#define DA9062AA_LDO3_CONF_SHIFT	7
+#define DA9062AA_LDO3_CONF_MASK		BIT(7)
+
+/* DA9062AA_LDO4_CONT = 0x029 */
+#define DA9062AA_LDO4_EN_SHIFT		0
+#define DA9062AA_LDO4_EN_MASK		0x01
+#define DA9062AA_LDO4_GPI_SHIFT		1
+#define DA9062AA_LDO4_GPI_MASK		(0x03 << 1)
+#define DA9062AA_LDO4_PD_DIS_SHIFT	3
+#define DA9062AA_LDO4_PD_DIS_MASK	BIT(3)
+#define DA9062AA_VLDO4_GPI_SHIFT	5
+#define DA9062AA_VLDO4_GPI_MASK		(0x03 << 5)
+#define DA9062AA_LDO4_CONF_SHIFT	7
+#define DA9062AA_LDO4_CONF_MASK		BIT(7)
+
+/* DA9062AA_DVC_1 = 0x032 */
+#define DA9062AA_VBUCK1_SEL_SHIFT	0
+#define DA9062AA_VBUCK1_SEL_MASK	0x01
+#define DA9062AA_VBUCK2_SEL_SHIFT	1
+#define DA9062AA_VBUCK2_SEL_MASK	BIT(1)
+#define DA9062AA_VBUCK4_SEL_SHIFT	2
+#define DA9062AA_VBUCK4_SEL_MASK	BIT(2)
+#define DA9062AA_VBUCK3_SEL_SHIFT	3
+#define DA9062AA_VBUCK3_SEL_MASK	BIT(3)
+#define DA9062AA_VLDO1_SEL_SHIFT	4
+#define DA9062AA_VLDO1_SEL_MASK		BIT(4)
+#define DA9062AA_VLDO2_SEL_SHIFT	5
+#define DA9062AA_VLDO2_SEL_MASK		BIT(5)
+#define DA9062AA_VLDO3_SEL_SHIFT	6
+#define DA9062AA_VLDO3_SEL_MASK		BIT(6)
+#define DA9062AA_VLDO4_SEL_SHIFT	7
+#define DA9062AA_VLDO4_SEL_MASK		BIT(7)
+
+/* DA9062AA_COUNT_S = 0x040 */
+#define DA9062AA_COUNT_SEC_SHIFT	0
+#define DA9062AA_COUNT_SEC_MASK		0x3f
+#define DA9062AA_RTC_READ_SHIFT		7
+#define DA9062AA_RTC_READ_MASK		BIT(7)
+
+/* DA9062AA_COUNT_MI = 0x041 */
+#define DA9062AA_COUNT_MIN_SHIFT	0
+#define DA9062AA_COUNT_MIN_MASK		0x3f
+
+/* DA9062AA_COUNT_H = 0x042 */
+#define DA9062AA_COUNT_HOUR_SHIFT	0
+#define DA9062AA_COUNT_HOUR_MASK	0x1f
+
+/* DA9062AA_COUNT_D = 0x043 */
+#define DA9062AA_COUNT_DAY_SHIFT	0
+#define DA9062AA_COUNT_DAY_MASK		0x1f
+
+/* DA9062AA_COUNT_MO = 0x044 */
+#define DA9062AA_COUNT_MONTH_SHIFT	0
+#define DA9062AA_COUNT_MONTH_MASK	0x0f
+
+/* DA9062AA_COUNT_Y = 0x045 */
+#define DA9062AA_COUNT_YEAR_SHIFT	0
+#define DA9062AA_COUNT_YEAR_MASK	0x3f
+#define DA9062AA_MONITOR_SHIFT		6
+#define DA9062AA_MONITOR_MASK		BIT(6)
+
+/* DA9062AA_ALARM_S = 0x046 */
+#define DA9062AA_ALARM_SEC_SHIFT	0
+#define DA9062AA_ALARM_SEC_MASK		0x3f
+#define DA9062AA_ALARM_STATUS_SHIFT	6
+#define DA9062AA_ALARM_STATUS_MASK	(0x03 << 6)
+
+/* DA9062AA_ALARM_MI = 0x047 */
+#define DA9062AA_ALARM_MIN_SHIFT	0
+#define DA9062AA_ALARM_MIN_MASK		0x3f
+
+/* DA9062AA_ALARM_H = 0x048 */
+#define DA9062AA_ALARM_HOUR_SHIFT	0
+#define DA9062AA_ALARM_HOUR_MASK	0x1f
+
+/* DA9062AA_ALARM_D = 0x049 */
+#define DA9062AA_ALARM_DAY_SHIFT	0
+#define DA9062AA_ALARM_DAY_MASK		0x1f
+
+/* DA9062AA_ALARM_MO = 0x04A */
+#define DA9062AA_ALARM_MONTH_SHIFT	0
+#define DA9062AA_ALARM_MONTH_MASK	0x0f
+#define DA9062AA_TICK_TYPE_SHIFT	4
+#define DA9062AA_TICK_TYPE_MASK		BIT(4)
+#define DA9062AA_TICK_WAKE_SHIFT	5
+#define DA9062AA_TICK_WAKE_MASK		BIT(5)
+
+/* DA9062AA_ALARM_Y = 0x04B */
+#define DA9062AA_ALARM_YEAR_SHIFT	0
+#define DA9062AA_ALARM_YEAR_MASK	0x3f
+#define DA9062AA_ALARM_ON_SHIFT		6
+#define DA9062AA_ALARM_ON_MASK		BIT(6)
+#define DA9062AA_TICK_ON_SHIFT		7
+#define DA9062AA_TICK_ON_MASK		BIT(7)
+
+/* DA9062AA_SECOND_A = 0x04C */
+#define DA9062AA_SECONDS_A_SHIFT	0
+#define DA9062AA_SECONDS_A_MASK		0xff
+
+/* DA9062AA_SECOND_B = 0x04D */
+#define DA9062AA_SECONDS_B_SHIFT	0
+#define DA9062AA_SECONDS_B_MASK		0xff
+
+/* DA9062AA_SECOND_C = 0x04E */
+#define DA9062AA_SECONDS_C_SHIFT	0
+#define DA9062AA_SECONDS_C_MASK		0xff
+
+/* DA9062AA_SECOND_D = 0x04F */
+#define DA9062AA_SECONDS_D_SHIFT	0
+#define DA9062AA_SECONDS_D_MASK		0xff
+
+/* DA9062AA_SEQ = 0x081 */
+#define DA9062AA_SEQ_POINTER_SHIFT	0
+#define DA9062AA_SEQ_POINTER_MASK	0x0f
+#define DA9062AA_NXT_SEQ_START_SHIFT	4
+#define DA9062AA_NXT_SEQ_START_MASK	(0x0f << 4)
+
+/* DA9062AA_SEQ_TIMER = 0x082 */
+#define DA9062AA_SEQ_TIME_SHIFT		0
+#define DA9062AA_SEQ_TIME_MASK		0x0f
+#define DA9062AA_SEQ_DUMMY_SHIFT	4
+#define DA9062AA_SEQ_DUMMY_MASK		(0x0f << 4)
+
+/* DA9062AA_ID_2_1 = 0x083 */
+#define DA9062AA_LDO1_STEP_SHIFT	0
+#define DA9062AA_LDO1_STEP_MASK		0x0f
+#define DA9062AA_LDO2_STEP_SHIFT	4
+#define DA9062AA_LDO2_STEP_MASK		(0x0f << 4)
+
+/* DA9062AA_ID_4_3 = 0x084 */
+#define DA9062AA_LDO3_STEP_SHIFT	0
+#define DA9062AA_LDO3_STEP_MASK		0x0f
+#define DA9062AA_LDO4_STEP_SHIFT	4
+#define DA9062AA_LDO4_STEP_MASK		(0x0f << 4)
+
+/* DA9062AA_ID_12_11 = 0x088 */
+#define DA9062AA_PD_DIS_STEP_SHIFT	4
+#define DA9062AA_PD_DIS_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_ID_14_13 = 0x089 */
+#define DA9062AA_BUCK1_STEP_SHIFT	0
+#define DA9062AA_BUCK1_STEP_MASK	0x0f
+#define DA9062AA_BUCK2_STEP_SHIFT	4
+#define DA9062AA_BUCK2_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_ID_16_15 = 0x08A */
+#define DA9062AA_BUCK4_STEP_SHIFT	0
+#define DA9062AA_BUCK4_STEP_MASK	0x0f
+#define DA9062AA_BUCK3_STEP_SHIFT	4
+#define DA9062AA_BUCK3_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_ID_22_21 = 0x08D */
+#define DA9062AA_GP_RISE1_STEP_SHIFT	0
+#define DA9062AA_GP_RISE1_STEP_MASK	0x0f
+#define DA9062AA_GP_FALL1_STEP_SHIFT	4
+#define DA9062AA_GP_FALL1_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_ID_24_23 = 0x08E */
+#define DA9062AA_GP_RISE2_STEP_SHIFT	0
+#define DA9062AA_GP_RISE2_STEP_MASK	0x0f
+#define DA9062AA_GP_FALL2_STEP_SHIFT	4
+#define DA9062AA_GP_FALL2_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_ID_26_25 = 0x08F */
+#define DA9062AA_GP_RISE3_STEP_SHIFT	0
+#define DA9062AA_GP_RISE3_STEP_MASK	0x0f
+#define DA9062AA_GP_FALL3_STEP_SHIFT	4
+#define DA9062AA_GP_FALL3_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_ID_28_27 = 0x090 */
+#define DA9062AA_GP_RISE4_STEP_SHIFT	0
+#define DA9062AA_GP_RISE4_STEP_MASK	0x0f
+#define DA9062AA_GP_FALL4_STEP_SHIFT	4
+#define DA9062AA_GP_FALL4_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_ID_30_29 = 0x091 */
+#define DA9062AA_GP_RISE5_STEP_SHIFT	0
+#define DA9062AA_GP_RISE5_STEP_MASK	0x0f
+#define DA9062AA_GP_FALL5_STEP_SHIFT	4
+#define DA9062AA_GP_FALL5_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_ID_32_31 = 0x092 */
+#define DA9062AA_WAIT_STEP_SHIFT	0
+#define DA9062AA_WAIT_STEP_MASK		0x0f
+#define DA9062AA_EN32K_STEP_SHIFT	4
+#define DA9062AA_EN32K_STEP_MASK	(0x0f << 4)
+
+/* DA9062AA_SEQ_A = 0x095 */
+#define DA9062AA_SYSTEM_END_SHIFT	0
+#define DA9062AA_SYSTEM_END_MASK	0x0f
+#define DA9062AA_POWER_END_SHIFT	4
+#define DA9062AA_POWER_END_MASK		(0x0f << 4)
+
+/* DA9062AA_SEQ_B = 0x096 */
+#define DA9062AA_MAX_COUNT_SHIFT	0
+#define DA9062AA_MAX_COUNT_MASK		0x0f
+#define DA9062AA_PART_DOWN_SHIFT	4
+#define DA9062AA_PART_DOWN_MASK		(0x0f << 4)
+
+/* DA9062AA_WAIT = 0x097 */
+#define DA9062AA_WAIT_TIME_SHIFT	0
+#define DA9062AA_WAIT_TIME_MASK		0x0f
+#define DA9062AA_WAIT_MODE_SHIFT	4
+#define DA9062AA_WAIT_MODE_MASK		BIT(4)
+#define DA9062AA_TIME_OUT_SHIFT		5
+#define DA9062AA_TIME_OUT_MASK		BIT(5)
+#define DA9062AA_WAIT_DIR_SHIFT		6
+#define DA9062AA_WAIT_DIR_MASK		(0x03 << 6)
+
+/* DA9062AA_EN_32K = 0x098 */
+#define DA9062AA_STABILISATION_TIME_SHIFT	0
+#define DA9062AA_STABILISATION_TIME_MASK	0x07
+#define DA9062AA_CRYSTAL_SHIFT			3
+#define DA9062AA_CRYSTAL_MASK			BIT(3)
+#define DA9062AA_DELAY_MODE_SHIFT		4
+#define DA9062AA_DELAY_MODE_MASK		BIT(4)
+#define DA9062AA_OUT_CLOCK_SHIFT		5
+#define DA9062AA_OUT_CLOCK_MASK			BIT(5)
+#define DA9062AA_RTC_CLOCK_SHIFT		6
+#define DA9062AA_RTC_CLOCK_MASK			BIT(6)
+#define DA9062AA_EN_32KOUT_SHIFT		7
+#define DA9062AA_EN_32KOUT_MASK			BIT(7)
+
+/* DA9062AA_RESET = 0x099 */
+#define DA9062AA_RESET_TIMER_SHIFT	0
+#define DA9062AA_RESET_TIMER_MASK	0x3f
+#define DA9062AA_RESET_EVENT_SHIFT	6
+#define DA9062AA_RESET_EVENT_MASK	(0x03 << 6)
+
+/* DA9062AA_BUCK_ILIM_A = 0x09A */
+#define DA9062AA_BUCK3_ILIM_SHIFT	0
+#define DA9062AA_BUCK3_ILIM_MASK	0x0f
+
+/* DA9062AA_BUCK_ILIM_B = 0x09B */
+#define DA9062AA_BUCK4_ILIM_SHIFT	0
+#define DA9062AA_BUCK4_ILIM_MASK	0x0f
+
+/* DA9062AA_BUCK_ILIM_C = 0x09C */
+#define DA9062AA_BUCK1_ILIM_SHIFT	0
+#define DA9062AA_BUCK1_ILIM_MASK	0x0f
+#define DA9062AA_BUCK2_ILIM_SHIFT	4
+#define DA9062AA_BUCK2_ILIM_MASK	(0x0f << 4)
+
+/* DA9062AA_BUCK2_CFG = 0x09D */
+#define DA9062AA_BUCK2_PD_DIS_SHIFT	5
+#define DA9062AA_BUCK2_PD_DIS_MASK	BIT(5)
+#define DA9062AA_BUCK2_MODE_SHIFT	6
+#define DA9062AA_BUCK2_MODE_MASK	(0x03 << 6)
+
+/* DA9062AA_BUCK1_CFG = 0x09E */
+#define DA9062AA_BUCK1_PD_DIS_SHIFT	5
+#define DA9062AA_BUCK1_PD_DIS_MASK	BIT(5)
+#define DA9062AA_BUCK1_MODE_SHIFT	6
+#define DA9062AA_BUCK1_MODE_MASK	(0x03 << 6)
+
+/* DA9062AA_BUCK4_CFG = 0x09F */
+#define DA9062AA_BUCK4_VTTR_EN_SHIFT	3
+#define DA9062AA_BUCK4_VTTR_EN_MASK	BIT(3)
+#define DA9062AA_BUCK4_VTT_EN_SHIFT	4
+#define DA9062AA_BUCK4_VTT_EN_MASK	BIT(4)
+#define DA9062AA_BUCK4_PD_DIS_SHIFT	5
+#define DA9062AA_BUCK4_PD_DIS_MASK	BIT(5)
+#define DA9062AA_BUCK4_MODE_SHIFT	6
+#define DA9062AA_BUCK4_MODE_MASK	(0x03 << 6)
+
+/* DA9062AA_BUCK3_CFG = 0x0A0 */
+#define DA9062AA_BUCK3_PD_DIS_SHIFT	5
+#define DA9062AA_BUCK3_PD_DIS_MASK	BIT(5)
+#define DA9062AA_BUCK3_MODE_SHIFT	6
+#define DA9062AA_BUCK3_MODE_MASK	(0x03 << 6)
+
+/* DA9062AA_VBUCK2_A = 0x0A3 */
+#define DA9062AA_VBUCK2_A_SHIFT		0
+#define DA9062AA_VBUCK2_A_MASK		0x7f
+#define DA9062AA_BUCK2_SL_A_SHIFT	7
+#define DA9062AA_BUCK2_SL_A_MASK	BIT(7)
+
+/* DA9062AA_VBUCK1_A = 0x0A4 */
+#define DA9062AA_VBUCK1_A_SHIFT		0
+#define DA9062AA_VBUCK1_A_MASK		0x7f
+#define DA9062AA_BUCK1_SL_A_SHIFT	7
+#define DA9062AA_BUCK1_SL_A_MASK	BIT(7)
+
+/* DA9062AA_VBUCK4_A = 0x0A5 */
+#define DA9062AA_VBUCK4_A_SHIFT		0
+#define DA9062AA_VBUCK4_A_MASK		0x7f
+#define DA9062AA_BUCK4_SL_A_SHIFT	7
+#define DA9062AA_BUCK4_SL_A_MASK	BIT(7)
+
+/* DA9062AA_VBUCK3_A = 0x0A7 */
+#define DA9062AA_VBUCK3_A_SHIFT		0
+#define DA9062AA_VBUCK3_A_MASK		0x7f
+#define DA9062AA_BUCK3_SL_A_SHIFT	7
+#define DA9062AA_BUCK3_SL_A_MASK	BIT(7)
+
+/* DA9062AA_VLDO1_A = 0x0A9 */
+#define DA9062AA_VLDO1_A_SHIFT		0
+#define DA9062AA_VLDO1_A_MASK		0x3f
+#define DA9062AA_LDO1_SL_A_SHIFT	7
+#define DA9062AA_LDO1_SL_A_MASK		BIT(7)
+
+/* DA9062AA_VLDO2_A = 0x0AA */
+#define DA9062AA_VLDO2_A_SHIFT		0
+#define DA9062AA_VLDO2_A_MASK		0x3f
+#define DA9062AA_LDO2_SL_A_SHIFT	7
+#define DA9062AA_LDO2_SL_A_MASK		BIT(7)
+
+/* DA9062AA_VLDO3_A = 0x0AB */
+#define DA9062AA_VLDO3_A_SHIFT		0
+#define DA9062AA_VLDO3_A_MASK		0x3f
+#define DA9062AA_LDO3_SL_A_SHIFT	7
+#define DA9062AA_LDO3_SL_A_MASK		BIT(7)
+
+/* DA9062AA_VLDO4_A = 0x0AC */
+#define DA9062AA_VLDO4_A_SHIFT		0
+#define DA9062AA_VLDO4_A_MASK		0x3f
+#define DA9062AA_LDO4_SL_A_SHIFT	7
+#define DA9062AA_LDO4_SL_A_MASK		BIT(7)
+
+/* DA9062AA_VBUCK2_B = 0x0B4 */
+#define DA9062AA_VBUCK2_B_SHIFT		0
+#define DA9062AA_VBUCK2_B_MASK		0x7f
+#define DA9062AA_BUCK2_SL_B_SHIFT	7
+#define DA9062AA_BUCK2_SL_B_MASK	BIT(7)
+
+/* DA9062AA_VBUCK1_B = 0x0B5 */
+#define DA9062AA_VBUCK1_B_SHIFT		0
+#define DA9062AA_VBUCK1_B_MASK		0x7f
+#define DA9062AA_BUCK1_SL_B_SHIFT	7
+#define DA9062AA_BUCK1_SL_B_MASK	BIT(7)
+
+/* DA9062AA_VBUCK4_B = 0x0B6 */
+#define DA9062AA_VBUCK4_B_SHIFT		0
+#define DA9062AA_VBUCK4_B_MASK		0x7f
+#define DA9062AA_BUCK4_SL_B_SHIFT	7
+#define DA9062AA_BUCK4_SL_B_MASK	BIT(7)
+
+/* DA9062AA_VBUCK3_B = 0x0B8 */
+#define DA9062AA_VBUCK3_B_SHIFT		0
+#define DA9062AA_VBUCK3_B_MASK		0x7f
+#define DA9062AA_BUCK3_SL_B_SHIFT	7
+#define DA9062AA_BUCK3_SL_B_MASK	BIT(7)
+
+/* DA9062AA_VLDO1_B = 0x0BA */
+#define DA9062AA_VLDO1_B_SHIFT		0
+#define DA9062AA_VLDO1_B_MASK		0x3f
+#define DA9062AA_LDO1_SL_B_SHIFT	7
+#define DA9062AA_LDO1_SL_B_MASK		BIT(7)
+
+/* DA9062AA_VLDO2_B = 0x0BB */
+#define DA9062AA_VLDO2_B_SHIFT		0
+#define DA9062AA_VLDO2_B_MASK		0x3f
+#define DA9062AA_LDO2_SL_B_SHIFT	7
+#define DA9062AA_LDO2_SL_B_MASK		BIT(7)
+
+/* DA9062AA_VLDO3_B = 0x0BC */
+#define DA9062AA_VLDO3_B_SHIFT		0
+#define DA9062AA_VLDO3_B_MASK		0x3f
+#define DA9062AA_LDO3_SL_B_SHIFT	7
+#define DA9062AA_LDO3_SL_B_MASK		BIT(7)
+
+/* DA9062AA_VLDO4_B = 0x0BD */
+#define DA9062AA_VLDO4_B_SHIFT		0
+#define DA9062AA_VLDO4_B_MASK		0x3f
+#define DA9062AA_LDO4_SL_B_SHIFT	7
+#define DA9062AA_LDO4_SL_B_MASK		BIT(7)
+
+/* DA9062AA_BBAT_CONT = 0x0C5 */
+#define DA9062AA_BCHG_VSET_SHIFT	0
+#define DA9062AA_BCHG_VSET_MASK		0x0f
+#define DA9062AA_BCHG_ISET_SHIFT	4
+#define DA9062AA_BCHG_ISET_MASK		(0x0f << 4)
+
+/* DA9062AA_INTERFACE = 0x105 */
+#define DA9062AA_IF_BASE_ADDR_SHIFT	4
+#define DA9062AA_IF_BASE_ADDR_MASK	(0x0f << 4)
+
+/* DA9062AA_CONFIG_A = 0x106 */
+#define DA9062AA_PM_I_V_SHIFT		0
+#define DA9062AA_PM_I_V_MASK		0x01
+#define DA9062AA_PM_O_TYPE_SHIFT	2
+#define DA9062AA_PM_O_TYPE_MASK		BIT(2)
+#define DA9062AA_IRQ_TYPE_SHIFT		3
+#define DA9062AA_IRQ_TYPE_MASK		BIT(3)
+#define DA9062AA_PM_IF_V_SHIFT		4
+#define DA9062AA_PM_IF_V_MASK		BIT(4)
+#define DA9062AA_PM_IF_FMP_SHIFT	5
+#define DA9062AA_PM_IF_FMP_MASK		BIT(5)
+#define DA9062AA_PM_IF_HSM_SHIFT	6
+#define DA9062AA_PM_IF_HSM_MASK		BIT(6)
+
+/* DA9062AA_CONFIG_B = 0x107 */
+#define DA9062AA_VDD_FAULT_ADJ_SHIFT	0
+#define DA9062AA_VDD_FAULT_ADJ_MASK	0x0f
+#define DA9062AA_VDD_HYST_ADJ_SHIFT	4
+#define DA9062AA_VDD_HYST_ADJ_MASK	(0x07 << 4)
+
+/* DA9062AA_CONFIG_C = 0x108 */
+#define DA9062AA_BUCK_ACTV_DISCHRG_SHIFT	2
+#define DA9062AA_BUCK_ACTV_DISCHRG_MASK		BIT(2)
+#define DA9062AA_BUCK1_CLK_INV_SHIFT		3
+#define DA9062AA_BUCK1_CLK_INV_MASK		BIT(3)
+#define DA9062AA_BUCK4_CLK_INV_SHIFT		4
+#define DA9062AA_BUCK4_CLK_INV_MASK		BIT(4)
+#define DA9062AA_BUCK3_CLK_INV_SHIFT		6
+#define DA9062AA_BUCK3_CLK_INV_MASK		BIT(6)
+
+/* DA9062AA_CONFIG_D = 0x109 */
+#define DA9062AA_GPI_V_SHIFT		0
+#define DA9062AA_GPI_V_MASK		0x01
+#define DA9062AA_NIRQ_MODE_SHIFT	1
+#define DA9062AA_NIRQ_MODE_MASK		BIT(1)
+#define DA9062AA_SYSTEM_EN_RD_SHIFT	2
+#define DA9062AA_SYSTEM_EN_RD_MASK	BIT(2)
+#define DA9062AA_FORCE_RESET_SHIFT	5
+#define DA9062AA_FORCE_RESET_MASK	BIT(5)
+
+/* DA9062AA_CONFIG_E = 0x10A */
+#define DA9062AA_BUCK1_AUTO_SHIFT	0
+#define DA9062AA_BUCK1_AUTO_MASK	0x01
+#define DA9062AA_BUCK2_AUTO_SHIFT	1
+#define DA9062AA_BUCK2_AUTO_MASK	BIT(1)
+#define DA9062AA_BUCK4_AUTO_SHIFT	2
+#define DA9062AA_BUCK4_AUTO_MASK	BIT(2)
+#define DA9062AA_BUCK3_AUTO_SHIFT	4
+#define DA9062AA_BUCK3_AUTO_MASK	BIT(4)
+
+/* DA9062AA_CONFIG_G = 0x10C */
+#define DA9062AA_LDO1_AUTO_SHIFT	0
+#define DA9062AA_LDO1_AUTO_MASK		0x01
+#define DA9062AA_LDO2_AUTO_SHIFT	1
+#define DA9062AA_LDO2_AUTO_MASK		BIT(1)
+#define DA9062AA_LDO3_AUTO_SHIFT	2
+#define DA9062AA_LDO3_AUTO_MASK		BIT(2)
+#define DA9062AA_LDO4_AUTO_SHIFT	3
+#define DA9062AA_LDO4_AUTO_MASK		BIT(3)
+
+/* DA9062AA_CONFIG_H = 0x10D */
+#define DA9062AA_BUCK1_2_MERGE_SHIFT	3
+#define DA9062AA_BUCK1_2_MERGE_MASK	BIT(3)
+#define DA9062AA_BUCK2_OD_SHIFT		5
+#define DA9062AA_BUCK2_OD_MASK		BIT(5)
+#define DA9062AA_BUCK1_OD_SHIFT		6
+#define DA9062AA_BUCK1_OD_MASK		BIT(6)
+
+/* DA9062AA_CONFIG_I = 0x10E */
+#define DA9062AA_NONKEY_PIN_SHIFT	0
+#define DA9062AA_NONKEY_PIN_MASK	0x03
+#define DA9062AA_nONKEY_SD_SHIFT	2
+#define DA9062AA_nONKEY_SD_MASK		BIT(2)
+#define DA9062AA_WATCHDOG_SD_SHIFT	3
+#define DA9062AA_WATCHDOG_SD_MASK	BIT(3)
+#define DA9062AA_KEY_SD_MODE_SHIFT	4
+#define DA9062AA_KEY_SD_MODE_MASK	BIT(4)
+#define DA9062AA_HOST_SD_MODE_SHIFT	5
+#define DA9062AA_HOST_SD_MODE_MASK	BIT(5)
+#define DA9062AA_INT_SD_MODE_SHIFT	6
+#define DA9062AA_INT_SD_MODE_MASK	BIT(6)
+#define DA9062AA_LDO_SD_SHIFT		7
+#define DA9062AA_LDO_SD_MASK		BIT(7)
+
+/* DA9062AA_CONFIG_J = 0x10F */
+#define DA9062AA_KEY_DELAY_SHIFT	0
+#define DA9062AA_KEY_DELAY_MASK		0x03
+#define DA9062AA_SHUT_DELAY_SHIFT	2
+#define DA9062AA_SHUT_DELAY_MASK	(0x03 << 2)
+#define DA9062AA_RESET_DURATION_SHIFT	4
+#define DA9062AA_RESET_DURATION_MASK	(0x03 << 4)
+#define DA9062AA_TWOWIRE_TO_SHIFT	6
+#define DA9062AA_TWOWIRE_TO_MASK	BIT(6)
+#define DA9062AA_IF_RESET_SHIFT		7
+#define DA9062AA_IF_RESET_MASK		BIT(7)
+
+/* DA9062AA_CONFIG_K = 0x110 */
+#define DA9062AA_GPIO0_PUPD_SHIFT	0
+#define DA9062AA_GPIO0_PUPD_MASK	0x01
+#define DA9062AA_GPIO1_PUPD_SHIFT	1
+#define DA9062AA_GPIO1_PUPD_MASK	BIT(1)
+#define DA9062AA_GPIO2_PUPD_SHIFT	2
+#define DA9062AA_GPIO2_PUPD_MASK	BIT(2)
+#define DA9062AA_GPIO3_PUPD_SHIFT	3
+#define DA9062AA_GPIO3_PUPD_MASK	BIT(3)
+#define DA9062AA_GPIO4_PUPD_SHIFT	4
+#define DA9062AA_GPIO4_PUPD_MASK	BIT(4)
+
+/* DA9062AA_CONFIG_M = 0x112 */
+#define DA9062AA_NSHUTDOWN_PU_SHIFT	1
+#define DA9062AA_NSHUTDOWN_PU_MASK	BIT(1)
+#define DA9062AA_WDG_MODE_SHIFT		3
+#define DA9062AA_WDG_MODE_MASK		BIT(3)
+#define DA9062AA_OSC_FRQ_SHIFT		4
+#define DA9062AA_OSC_FRQ_MASK		(0x0f << 4)
+
+/* DA9062AA_TRIM_CLDR = 0x120 */
+#define DA9062AA_TRIM_CLDR_SHIFT	0
+#define DA9062AA_TRIM_CLDR_MASK		0xff
+
+/* DA9062AA_GP_ID_0 = 0x121 */
+#define DA9062AA_GP_0_SHIFT		0
+#define DA9062AA_GP_0_MASK		0xff
+
+/* DA9062AA_GP_ID_1 = 0x122 */
+#define DA9062AA_GP_1_SHIFT		0
+#define DA9062AA_GP_1_MASK		0xff
+
+/* DA9062AA_GP_ID_2 = 0x123 */
+#define DA9062AA_GP_2_SHIFT		0
+#define DA9062AA_GP_2_MASK		0xff
+
+/* DA9062AA_GP_ID_3 = 0x124 */
+#define DA9062AA_GP_3_SHIFT		0
+#define DA9062AA_GP_3_MASK		0xff
+
+/* DA9062AA_GP_ID_4 = 0x125 */
+#define DA9062AA_GP_4_SHIFT		0
+#define DA9062AA_GP_4_MASK		0xff
+
+/* DA9062AA_GP_ID_5 = 0x126 */
+#define DA9062AA_GP_5_SHIFT		0
+#define DA9062AA_GP_5_MASK		0xff
+
+/* DA9062AA_GP_ID_6 = 0x127 */
+#define DA9062AA_GP_6_SHIFT		0
+#define DA9062AA_GP_6_MASK		0xff
+
+/* DA9062AA_GP_ID_7 = 0x128 */
+#define DA9062AA_GP_7_SHIFT		0
+#define DA9062AA_GP_7_MASK		0xff
+
+/* DA9062AA_GP_ID_8 = 0x129 */
+#define DA9062AA_GP_8_SHIFT		0
+#define DA9062AA_GP_8_MASK		0xff
+
+/* DA9062AA_GP_ID_9 = 0x12A */
+#define DA9062AA_GP_9_SHIFT		0
+#define DA9062AA_GP_9_MASK		0xff
+
+/* DA9062AA_GP_ID_10 = 0x12B */
+#define DA9062AA_GP_10_SHIFT		0
+#define DA9062AA_GP_10_MASK		0xff
+
+/* DA9062AA_GP_ID_11 = 0x12C */
+#define DA9062AA_GP_11_SHIFT		0
+#define DA9062AA_GP_11_MASK		0xff
+
+/* DA9062AA_GP_ID_12 = 0x12D */
+#define DA9062AA_GP_12_SHIFT		0
+#define DA9062AA_GP_12_MASK		0xff
+
+/* DA9062AA_GP_ID_13 = 0x12E */
+#define DA9062AA_GP_13_SHIFT		0
+#define DA9062AA_GP_13_MASK		0xff
+
+/* DA9062AA_GP_ID_14 = 0x12F */
+#define DA9062AA_GP_14_SHIFT		0
+#define DA9062AA_GP_14_MASK		0xff
+
+/* DA9062AA_GP_ID_15 = 0x130 */
+#define DA9062AA_GP_15_SHIFT		0
+#define DA9062AA_GP_15_MASK		0xff
+
+/* DA9062AA_GP_ID_16 = 0x131 */
+#define DA9062AA_GP_16_SHIFT		0
+#define DA9062AA_GP_16_MASK		0xff
+
+/* DA9062AA_GP_ID_17 = 0x132 */
+#define DA9062AA_GP_17_SHIFT		0
+#define DA9062AA_GP_17_MASK		0xff
+
+/* DA9062AA_GP_ID_18 = 0x133 */
+#define DA9062AA_GP_18_SHIFT		0
+#define DA9062AA_GP_18_MASK		0xff
+
+/* DA9062AA_GP_ID_19 = 0x134 */
+#define DA9062AA_GP_19_SHIFT		0
+#define DA9062AA_GP_19_MASK		0xff
+
+/* DA9062AA_DEVICE_ID = 0x181 */
+#define DA9062AA_DEV_ID_SHIFT		0
+#define DA9062AA_DEV_ID_MASK		0xff
+
+/* DA9062AA_VARIANT_ID = 0x182 */
+#define DA9062AA_VRC_SHIFT		0
+#define DA9062AA_VRC_MASK		0x0f
+#define DA9062AA_MRC_SHIFT		4
+#define DA9062AA_MRC_MASK		(0x0f << 4)
+
+/* DA9062AA_CUSTOMER_ID = 0x183 */
+#define DA9062AA_CUST_ID_SHIFT		0
+#define DA9062AA_CUST_ID_MASK		0xff
+
+/* DA9062AA_CONFIG_ID = 0x184 */
+#define DA9062AA_CONFIG_REV_SHIFT	0
+#define DA9062AA_CONFIG_REV_MASK	0xff
+
+#endif /* __DA9062_H__ */
-- 
cgit v1.2.3-70-g09d2


From d8d79f8f60c4363a0fa490ff1626be4bd5e003a3 Mon Sep 17 00:00:00 2001
From: Michal Suchanek <hramrach@gmail.com>
Date: Sat, 11 Jul 2015 14:59:56 +0200
Subject: mfd: axp20x: Add axp152 support

The axp152 is a stripped down version of the axp202 pmic with the battery
charging function removed as it is intended for top-set boxes.

Signed-off-by: Michal Suchanek <hramrach@gmail.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/axp20x.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h | 61 +++++++++++++++++++++++++++++++++-
 2 files changed, 143 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index 8c2c3c4c9a56..b369cfce866c 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -30,12 +30,34 @@
 #define AXP20X_OFF	0x80
 
 static const char * const axp20x_model_names[] = {
+	"AXP152",
 	"AXP202",
 	"AXP209",
 	"AXP221",
 	"AXP288",
 };
 
+static const struct regmap_range axp152_writeable_ranges[] = {
+	regmap_reg_range(AXP152_LDO3456_DC1234_CTRL, AXP152_IRQ3_STATE),
+	regmap_reg_range(AXP152_DCDC_MODE, AXP152_PWM1_DUTY_CYCLE),
+};
+
+static const struct regmap_range axp152_volatile_ranges[] = {
+	regmap_reg_range(AXP152_PWR_OP_MODE, AXP152_PWR_OP_MODE),
+	regmap_reg_range(AXP152_IRQ1_EN, AXP152_IRQ3_STATE),
+	regmap_reg_range(AXP152_GPIO_INPUT, AXP152_GPIO_INPUT),
+};
+
+static const struct regmap_access_table axp152_writeable_table = {
+	.yes_ranges	= axp152_writeable_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(axp152_writeable_ranges),
+};
+
+static const struct regmap_access_table axp152_volatile_table = {
+	.yes_ranges	= axp152_volatile_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(axp152_volatile_ranges),
+};
+
 static const struct regmap_range axp20x_writeable_ranges[] = {
 	regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_IRQ5_STATE),
 	regmap_reg_range(AXP20X_DCDC_MODE, AXP20X_FG_RES),
@@ -93,6 +115,11 @@ static const struct regmap_access_table axp288_volatile_table = {
 	.n_yes_ranges	= ARRAY_SIZE(axp288_volatile_ranges),
 };
 
+static struct resource axp152_pek_resources[] = {
+	DEFINE_RES_IRQ_NAMED(AXP152_IRQ_PEK_RIS_EDGE, "PEK_DBR"),
+	DEFINE_RES_IRQ_NAMED(AXP152_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
+};
+
 static struct resource axp20x_pek_resources[] = {
 	{
 		.name	= "PEK_DBR",
@@ -154,6 +181,15 @@ static struct resource axp288_fuel_gauge_resources[] = {
 	},
 };
 
+static const struct regmap_config axp152_regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.wr_table	= &axp152_writeable_table,
+	.volatile_table	= &axp152_volatile_table,
+	.max_register	= AXP152_PWM1_DUTY_CYCLE,
+	.cache_type	= REGCACHE_RBTREE,
+};
+
 static const struct regmap_config axp20x_regmap_config = {
 	.reg_bits	= 8,
 	.val_bits	= 8,
@@ -184,6 +220,26 @@ static const struct regmap_config axp288_regmap_config = {
 #define INIT_REGMAP_IRQ(_variant, _irq, _off, _mask)			\
 	[_variant##_IRQ_##_irq] = { .reg_offset = (_off), .mask = BIT(_mask) }
 
+static const struct regmap_irq axp152_regmap_irqs[] = {
+	INIT_REGMAP_IRQ(AXP152, LDO0IN_CONNECT,		0, 6),
+	INIT_REGMAP_IRQ(AXP152, LDO0IN_REMOVAL,		0, 5),
+	INIT_REGMAP_IRQ(AXP152, ALDO0IN_CONNECT,	0, 3),
+	INIT_REGMAP_IRQ(AXP152, ALDO0IN_REMOVAL,	0, 2),
+	INIT_REGMAP_IRQ(AXP152, DCDC1_V_LOW,		1, 5),
+	INIT_REGMAP_IRQ(AXP152, DCDC2_V_LOW,		1, 4),
+	INIT_REGMAP_IRQ(AXP152, DCDC3_V_LOW,		1, 3),
+	INIT_REGMAP_IRQ(AXP152, DCDC4_V_LOW,		1, 2),
+	INIT_REGMAP_IRQ(AXP152, PEK_SHORT,		1, 1),
+	INIT_REGMAP_IRQ(AXP152, PEK_LONG,		1, 0),
+	INIT_REGMAP_IRQ(AXP152, TIMER,			2, 7),
+	INIT_REGMAP_IRQ(AXP152, PEK_RIS_EDGE,		2, 6),
+	INIT_REGMAP_IRQ(AXP152, PEK_FAL_EDGE,		2, 5),
+	INIT_REGMAP_IRQ(AXP152, GPIO3_INPUT,		2, 3),
+	INIT_REGMAP_IRQ(AXP152, GPIO2_INPUT,		2, 2),
+	INIT_REGMAP_IRQ(AXP152, GPIO1_INPUT,		2, 1),
+	INIT_REGMAP_IRQ(AXP152, GPIO0_INPUT,		2, 0),
+};
+
 static const struct regmap_irq axp20x_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP20X, ACIN_OVER_V,		0, 7),
 	INIT_REGMAP_IRQ(AXP20X, ACIN_PLUGIN,		0, 6),
@@ -293,6 +349,7 @@ static const struct regmap_irq axp288_regmap_irqs[] = {
 };
 
 static const struct of_device_id axp20x_of_match[] = {
+	{ .compatible = "x-powers,axp152", .data = (void *) AXP152_ID },
 	{ .compatible = "x-powers,axp202", .data = (void *) AXP202_ID },
 	{ .compatible = "x-powers,axp209", .data = (void *) AXP209_ID },
 	{ .compatible = "x-powers,axp221", .data = (void *) AXP221_ID },
@@ -317,6 +374,18 @@ static const struct acpi_device_id axp20x_acpi_match[] = {
 };
 MODULE_DEVICE_TABLE(acpi, axp20x_acpi_match);
 
+static const struct regmap_irq_chip axp152_regmap_irq_chip = {
+	.name			= "axp152_irq_chip",
+	.status_base		= AXP152_IRQ1_STATE,
+	.ack_base		= AXP152_IRQ1_STATE,
+	.mask_base		= AXP152_IRQ1_EN,
+	.mask_invert		= true,
+	.init_ack_masked	= true,
+	.irqs			= axp152_regmap_irqs,
+	.num_irqs		= ARRAY_SIZE(axp152_regmap_irqs),
+	.num_regs		= 3,
+};
+
 static const struct regmap_irq_chip axp20x_regmap_irq_chip = {
 	.name			= "axp20x_irq_chip",
 	.status_base		= AXP20X_IRQ1_STATE,
@@ -375,6 +444,14 @@ static struct mfd_cell axp22x_cells[] = {
 	},
 };
 
+static struct mfd_cell axp152_cells[] = {
+	{
+		.name			= "axp20x-pek",
+		.num_resources		= ARRAY_SIZE(axp152_pek_resources),
+		.resources		= axp152_pek_resources,
+	},
+};
+
 static struct resource axp288_adc_resources[] = {
 	{
 		.name  = "GPADC",
@@ -513,6 +590,12 @@ static int axp20x_match_device(struct axp20x_dev *axp20x, struct device *dev)
 	}
 
 	switch (axp20x->variant) {
+	case AXP152_ID:
+		axp20x->nr_cells = ARRAY_SIZE(axp152_cells);
+		axp20x->cells = axp152_cells;
+		axp20x->regmap_cfg = &axp152_regmap_config;
+		axp20x->regmap_irq_chip = &axp152_regmap_irq_chip;
+		break;
 	case AXP202_ID:
 	case AXP209_ID:
 		axp20x->nr_cells = ARRAY_SIZE(axp20x_cells);
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index c2aa853fb412..52203d5f7984 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -12,7 +12,8 @@
 #define __LINUX_MFD_AXP20X_H
 
 enum {
-	AXP202_ID = 0,
+	AXP152_ID = 0,
+	AXP202_ID,
 	AXP209_ID,
 	AXP221_ID,
 	AXP288_ID,
@@ -22,6 +23,24 @@ enum {
 #define AXP20X_DATACACHE(m)		(0x04 + (m))
 
 /* Power supply */
+#define AXP152_PWR_OP_MODE		0x01
+#define AXP152_LDO3456_DC1234_CTRL	0x12
+#define AXP152_ALDO_OP_MODE		0x13
+#define AXP152_LDO0_CTRL		0x15
+#define AXP152_DCDC2_V_OUT		0x23
+#define AXP152_DCDC2_V_SCAL		0x25
+#define AXP152_DCDC1_V_OUT		0x26
+#define AXP152_DCDC3_V_OUT		0x27
+#define AXP152_ALDO12_V_OUT		0x28
+#define AXP152_DLDO1_V_OUT		0x29
+#define AXP152_DLDO2_V_OUT		0x2a
+#define AXP152_DCDC4_V_OUT		0x2b
+#define AXP152_V_OFF			0x31
+#define AXP152_OFF_CTRL			0x32
+#define AXP152_PEK_KEY			0x36
+#define AXP152_DCDC_FREQ		0x37
+#define AXP152_DCDC_MODE		0x80
+
 #define AXP20X_PWR_INPUT_STATUS		0x00
 #define AXP20X_PWR_OP_MODE		0x01
 #define AXP20X_USB_OTG_STATUS		0x02
@@ -69,6 +88,13 @@ enum {
 #define AXP22X_CHRG_CTRL3		0x35
 
 /* Interrupt */
+#define AXP152_IRQ1_EN			0x40
+#define AXP152_IRQ2_EN			0x41
+#define AXP152_IRQ3_EN			0x42
+#define AXP152_IRQ1_STATE		0x48
+#define AXP152_IRQ2_STATE		0x49
+#define AXP152_IRQ3_STATE		0x4a
+
 #define AXP20X_IRQ1_EN			0x40
 #define AXP20X_IRQ2_EN			0x41
 #define AXP20X_IRQ3_EN			0x42
@@ -127,6 +153,19 @@ enum {
 #define AXP22X_PWREN_CTRL2		0x8d
 
 /* GPIO */
+#define AXP152_GPIO0_CTRL		0x90
+#define AXP152_GPIO1_CTRL		0x91
+#define AXP152_GPIO2_CTRL		0x92
+#define AXP152_GPIO3_CTRL		0x93
+#define AXP152_LDOGPIO2_V_OUT		0x96
+#define AXP152_GPIO_INPUT		0x97
+#define AXP152_PWM0_FREQ_X		0x98
+#define AXP152_PWM0_FREQ_Y		0x99
+#define AXP152_PWM0_DUTY_CYCLE		0x9a
+#define AXP152_PWM1_FREQ_X		0x9b
+#define AXP152_PWM1_FREQ_Y		0x9c
+#define AXP152_PWM1_DUTY_CYCLE		0x9d
+
 #define AXP20X_GPIO0_CTRL		0x90
 #define AXP20X_LDO5_V_OUT		0x91
 #define AXP20X_GPIO1_CTRL		0x92
@@ -217,6 +256,26 @@ enum {
 };
 
 /* IRQs */
+enum {
+	AXP152_IRQ_LDO0IN_CONNECT = 1,
+	AXP152_IRQ_LDO0IN_REMOVAL,
+	AXP152_IRQ_ALDO0IN_CONNECT,
+	AXP152_IRQ_ALDO0IN_REMOVAL,
+	AXP152_IRQ_DCDC1_V_LOW,
+	AXP152_IRQ_DCDC2_V_LOW,
+	AXP152_IRQ_DCDC3_V_LOW,
+	AXP152_IRQ_DCDC4_V_LOW,
+	AXP152_IRQ_PEK_SHORT,
+	AXP152_IRQ_PEK_LONG,
+	AXP152_IRQ_TIMER,
+	AXP152_IRQ_PEK_RIS_EDGE,
+	AXP152_IRQ_PEK_FAL_EDGE,
+	AXP152_IRQ_GPIO3_INPUT,
+	AXP152_IRQ_GPIO2_INPUT,
+	AXP152_IRQ_GPIO1_INPUT,
+	AXP152_IRQ_GPIO0_INPUT,
+};
+
 enum {
 	AXP20X_IRQ_ACIN_OVER_V = 1,
 	AXP20X_IRQ_ACIN_PLUGIN,
-- 
cgit v1.2.3-70-g09d2


From 03b42710420f2db5cc1e3506625e1311becb286b Mon Sep 17 00:00:00 2001
From: Steve Twiss <stwiss.opensource@diasemi.com>
Date: Fri, 17 Jul 2015 10:55:10 +0100
Subject: mfd: da9063: Fix missing DA9063_M_DVC_RDY mask bit

Fix a missing DVC_RDY interrupt mask in struct regmap_irq definition.

The original submission of this driver did not contain all interrupt
masking definitions in the struct regmap_irq contained in the file
da9063-irq.c

The solution is to add a DA9063_IRQ_DVC_RDY entry to enum da9063_irqs
list and to add the corresponding values to compensate for the missing
mask bit in the static const struct regmap_irq da9063_irqs[] table.

Signed-off-by: Steve Twiss <stwiss.opensource@diasemi.com>
Signed-off-by: Adam Ward <adam.ward.opensource@diasemi.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/da9063-irq.c        | 4 ++++
 include/linux/mfd/da9063/core.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/da9063-irq.c b/drivers/mfd/da9063-irq.c
index eaf1ec9208b2..26302634633c 100644
--- a/drivers/mfd/da9063-irq.c
+++ b/drivers/mfd/da9063-irq.c
@@ -77,6 +77,10 @@ static const struct regmap_irq da9063_irqs[] = {
 		.reg_offset = DA9063_REG_EVENT_B_OFFSET,
 		.mask = DA9063_M_UVOV,
 	},
+	[DA9063_IRQ_DVC_RDY] = {
+		.reg_offset = DA9063_REG_EVENT_B_OFFSET,
+		.mask = DA9063_M_DVC_RDY,
+	},
 	[DA9063_IRQ_VDD_MON] = {
 		.reg_offset = DA9063_REG_EVENT_B_OFFSET,
 		.mask = DA9063_M_VDD_MON,
diff --git a/include/linux/mfd/da9063/core.h b/include/linux/mfd/da9063/core.h
index 79f4d822ba13..621af82123c6 100644
--- a/include/linux/mfd/da9063/core.h
+++ b/include/linux/mfd/da9063/core.h
@@ -51,6 +51,7 @@ enum da9063_irqs {
 	DA9063_IRQ_COMP_1V2,
 	DA9063_IRQ_LDO_LIM,
 	DA9063_IRQ_REG_UVOV,
+	DA9063_IRQ_DVC_RDY,
 	DA9063_IRQ_VDD_MON,
 	DA9063_IRQ_WARN,
 	DA9063_IRQ_GPI0,
-- 
cgit v1.2.3-70-g09d2


From f3151ab4a7320ce3f48955fe796bd90dd00881ab Mon Sep 17 00:00:00 2001
From: Henry Chen <henryc.chen@mediatek.com>
Date: Mon, 10 Aug 2015 21:10:45 +0800
Subject: mfd: mt6397: Implement wake handler and suspend/resume to handle wake
 up event

Implement .irq_set_wake() to get who is wakeup source and setup on suspend/reumse. Enable
mt6393_irq as wake up source properly to pinctrl by enable_irq_wake()/enable_irq_wake().

Signed-off-by: Henry Chen <henryc.chen@mediatek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mt6397-core.c       | 49 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/mt6397/core.h |  1 +
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index 9ca290fec003..1749c1c9f405 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -93,12 +93,31 @@ static void mt6397_irq_enable(struct irq_data *data)
 	mt6397->irq_masks_cur[reg] |= BIT(shift);
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data);
+	int shift = irq_data->hwirq & 0xf;
+	int reg = irq_data->hwirq >> 4;
+
+	if (on)
+		mt6397->wake_mask[reg] |= BIT(shift);
+	else
+		mt6397->wake_mask[reg] &= ~BIT(shift);
+
+	return 0;
+}
+#else
+#define mt6397_irq_set_wake NULL
+#endif
+
 static struct irq_chip mt6397_irq_chip = {
 	.name = "mt6397-irq",
 	.irq_bus_lock = mt6397_irq_lock,
 	.irq_bus_sync_unlock = mt6397_irq_sync_unlock,
 	.irq_enable = mt6397_irq_enable,
 	.irq_disable = mt6397_irq_disable,
+	.irq_set_wake = mt6397_irq_set_wake,
 };
 
 static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg,
@@ -179,6 +198,35 @@ static int mt6397_irq_init(struct mt6397_chip *mt6397)
 	return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int mt6397_irq_suspend(struct device *dev)
+{
+	struct mt6397_chip *chip = dev_get_drvdata(dev);
+
+	regmap_write(chip->regmap, MT6397_INT_CON0, chip->wake_mask[0]);
+	regmap_write(chip->regmap, MT6397_INT_CON1, chip->wake_mask[1]);
+
+	enable_irq_wake(chip->irq);
+
+	return 0;
+}
+
+static int mt6397_irq_resume(struct device *dev)
+{
+	struct mt6397_chip *chip = dev_get_drvdata(dev);
+
+	regmap_write(chip->regmap, MT6397_INT_CON0, chip->irq_masks_cur[0]);
+	regmap_write(chip->regmap, MT6397_INT_CON1, chip->irq_masks_cur[1]);
+
+	disable_irq_wake(chip->irq);
+
+	return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(mt6397_pm_ops, mt6397_irq_suspend,
+			mt6397_irq_resume);
+
 static int mt6397_probe(struct platform_device *pdev)
 {
 	int ret;
@@ -233,6 +281,7 @@ static struct platform_driver mt6397_driver = {
 	.driver = {
 		.name = "mt6397",
 		.of_match_table = of_match_ptr(mt6397_of_match),
+		.pm = &mt6397_pm_ops,
 	},
 };
 
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h
index cf5265b0d1c1..45b8e8aa1fbf 100644
--- a/include/linux/mfd/mt6397/core.h
+++ b/include/linux/mfd/mt6397/core.h
@@ -57,6 +57,7 @@ struct mt6397_chip {
 	int irq;
 	struct irq_domain *irq_domain;
 	struct mutex irqlock;
+	u16 wake_mask[2];
 	u16 irq_masks_cur[2];
 	u16 irq_masks_cache[2];
 };
-- 
cgit v1.2.3-70-g09d2


From 1ed8111443ae8caa455e7107031da36d1a6d351a Mon Sep 17 00:00:00 2001
From: Nicolas Boichat <drinkcat@chromium.org>
Date: Tue, 11 Aug 2015 18:04:21 +0800
Subject: regmap: Move documentation to regmap.h

Init functions defined in regmap*.c files are now prefixed with
__, take lockdep key and class parameters, and should not be
called directly: move the documentation to regmap.h, where the
macros are defined.

Signed-off-by: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-ac97.c |  19 -----
 drivers/base/regmap/regmap-i2c.c  |  19 -----
 drivers/base/regmap/regmap-mmio.c |  23 -----
 drivers/base/regmap/regmap-spi.c  |  19 -----
 drivers/base/regmap/regmap-spmi.c |  34 --------
 drivers/base/regmap/regmap.c      |  25 ------
 include/linux/regmap.h            | 173 +++++++++++++++++++++++++++++++++++---
 7 files changed, 162 insertions(+), 150 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap-ac97.c b/drivers/base/regmap/regmap-ac97.c
index aa631be8b821..c03ebfd4c731 100644
--- a/drivers/base/regmap/regmap-ac97.c
+++ b/drivers/base/regmap/regmap-ac97.c
@@ -78,15 +78,6 @@ static const struct regmap_bus ac97_regmap_bus = {
 	.reg_read = regmap_ac97_reg_read,
 };
 
-/**
- * regmap_init_ac97(): Initialise AC'97 register map
- *
- * @ac97: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
 struct regmap *__regmap_init_ac97(struct snd_ac97 *ac97,
 				  const struct regmap_config *config,
 				  struct lock_class_key *lock_key,
@@ -97,16 +88,6 @@ struct regmap *__regmap_init_ac97(struct snd_ac97 *ac97,
 }
 EXPORT_SYMBOL_GPL(__regmap_init_ac97);
 
-/**
- * devm_regmap_init_ac97(): Initialise AC'97 register map
- *
- * @ac97: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
 struct regmap *__devm_regmap_init_ac97(struct snd_ac97 *ac97,
 				       const struct regmap_config *config,
 				       struct lock_class_key *lock_key,
diff --git a/drivers/base/regmap/regmap-i2c.c b/drivers/base/regmap/regmap-i2c.c
index 3163b22e2baf..7007d6ea333c 100644
--- a/drivers/base/regmap/regmap-i2c.c
+++ b/drivers/base/regmap/regmap-i2c.c
@@ -233,15 +233,6 @@ static const struct regmap_bus *regmap_get_i2c_bus(struct i2c_client *i2c,
 	return ERR_PTR(-ENOTSUPP);
 }
 
-/**
- * regmap_init_i2c(): Initialise register map
- *
- * @i2c: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
 struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
@@ -257,16 +248,6 @@ struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
 }
 EXPORT_SYMBOL_GPL(__regmap_init_i2c);
 
-/**
- * devm_regmap_init_i2c(): Initialise managed register map
- *
- * @i2c: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
 struct regmap *__devm_regmap_init_i2c(struct i2c_client *i2c,
 				      const struct regmap_config *config,
 				      struct lock_class_key *lock_key,
diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c
index a1b2b270e4bc..426a57e41ac7 100644
--- a/drivers/base/regmap/regmap-mmio.c
+++ b/drivers/base/regmap/regmap-mmio.c
@@ -296,17 +296,6 @@ err_free:
 	return ERR_PTR(ret);
 }
 
-/**
- * regmap_init_mmio_clk(): Initialise register map with register clock
- *
- * @dev: Device that will be interacted with
- * @clk_id: register clock consumer ID
- * @regs: Pointer to memory-mapped IO region
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
 struct regmap *__regmap_init_mmio_clk(struct device *dev, const char *clk_id,
 				      void __iomem *regs,
 				      const struct regmap_config *config,
@@ -324,18 +313,6 @@ struct regmap *__regmap_init_mmio_clk(struct device *dev, const char *clk_id,
 }
 EXPORT_SYMBOL_GPL(__regmap_init_mmio_clk);
 
-/**
- * devm_regmap_init_mmio_clk(): Initialise managed register map with clock
- *
- * @dev: Device that will be interacted with
- * @clk_id: register clock consumer ID
- * @regs: Pointer to memory-mapped IO region
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
 struct regmap *__devm_regmap_init_mmio_clk(struct device *dev,
 					   const char *clk_id,
 					   void __iomem *regs,
diff --git a/drivers/base/regmap/regmap-spi.c b/drivers/base/regmap/regmap-spi.c
index 4c7850d660d1..edd9a839d004 100644
--- a/drivers/base/regmap/regmap-spi.c
+++ b/drivers/base/regmap/regmap-spi.c
@@ -113,15 +113,6 @@ static struct regmap_bus regmap_spi = {
 	.val_format_endian_default = REGMAP_ENDIAN_BIG,
 };
 
-/**
- * regmap_init_spi(): Initialise register map
- *
- * @spi: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
 struct regmap *__regmap_init_spi(struct spi_device *spi,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
@@ -132,16 +123,6 @@ struct regmap *__regmap_init_spi(struct spi_device *spi,
 }
 EXPORT_SYMBOL_GPL(__regmap_init_spi);
 
-/**
- * devm_regmap_init_spi(): Initialise register map
- *
- * @spi: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The map will be automatically freed by the
- * device management code.
- */
 struct regmap *__devm_regmap_init_spi(struct spi_device *spi,
 				      const struct regmap_config *config,
 				      struct lock_class_key *lock_key,
diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c
index 7f50f5862d39..7e58f6560399 100644
--- a/drivers/base/regmap/regmap-spmi.c
+++ b/drivers/base/regmap/regmap-spmi.c
@@ -91,14 +91,6 @@ static struct regmap_bus regmap_spmi_base = {
 	.val_format_endian_default	= REGMAP_ENDIAN_NATIVE,
 };
 
-/**
- * regmap_init_spmi_base(): Create regmap for the Base register space
- * @sdev:	SPMI device that will be interacted with
- * @config:	Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
 struct regmap *__regmap_init_spmi_base(struct spmi_device *sdev,
 				       const struct regmap_config *config,
 				       struct lock_class_key *lock_key,
@@ -109,15 +101,6 @@ struct regmap *__regmap_init_spmi_base(struct spmi_device *sdev,
 }
 EXPORT_SYMBOL_GPL(__regmap_init_spmi_base);
 
-/**
- * devm_regmap_init_spmi_base(): Create managed regmap for Base register space
- * @sdev:	SPMI device that will be interacted with
- * @config:	Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
 struct regmap *__devm_regmap_init_spmi_base(struct spmi_device *sdev,
 					    const struct regmap_config *config,
 					    struct lock_class_key *lock_key,
@@ -228,14 +211,6 @@ static struct regmap_bus regmap_spmi_ext = {
 	.val_format_endian_default	= REGMAP_ENDIAN_NATIVE,
 };
 
-/**
- * regmap_init_spmi_ext(): Create regmap for Ext register space
- * @sdev:	Device that will be interacted with
- * @config:	Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
 struct regmap *__regmap_init_spmi_ext(struct spmi_device *sdev,
 				      const struct regmap_config *config,
 				      struct lock_class_key *lock_key,
@@ -246,15 +221,6 @@ struct regmap *__regmap_init_spmi_ext(struct spmi_device *sdev,
 }
 EXPORT_SYMBOL_GPL(__regmap_init_spmi_ext);
 
-/**
- * devm_regmap_init_spmi_ext(): Create managed regmap for Ext register space
- * @sdev:	SPMI device that will be interacted with
- * @config:	Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
 struct regmap *__devm_regmap_init_spmi_ext(struct spmi_device *sdev,
 					   const struct regmap_config *config,
 					   struct lock_class_key *lock_key,
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index b9fddccd6e06..53ba9d9e17d1 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -515,18 +515,6 @@ enum regmap_endian regmap_get_val_endian(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(regmap_get_val_endian);
 
-/**
- * regmap_init(): Initialise register map
- *
- * @dev: Device that will be interacted with
- * @bus: Bus-specific callbacks to use with device
- * @bus_context: Data passed to bus-specific callbacks
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.  This function should generally not be called
- * directly, it should be called by bus-specific init functions.
- */
 struct regmap *__regmap_init(struct device *dev,
 			     const struct regmap_bus *bus,
 			     void *bus_context,
@@ -912,19 +900,6 @@ static void devm_regmap_release(struct device *dev, void *res)
 	regmap_exit(*(struct regmap **)res);
 }
 
-/**
- * devm_regmap_init(): Initialise managed register map
- *
- * @dev: Device that will be interacted with
- * @bus: Bus-specific callbacks to use with device
- * @bus_context: Data passed to bus-specific callbacks
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  This function should generally not be called
- * directly, it should be called by bus-specific init functions.  The
- * map will be automatically freed by the device management code.
- */
 struct regmap *__devm_regmap_init(struct device *dev,
 				  const struct regmap_bus *bus,
 				  void *bus_context,
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 5d7027286032..ee032eca630d 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -419,65 +419,202 @@ struct regmap *__devm_regmap_init_ac97(struct snd_ac97 *ac97,
 #define __regmap_lockdep_wrapper(fn, name, ...) fn(__VA_ARGS__, NULL, NULL)
 #endif
 
+/**
+ * regmap_init(): Initialise register map
+ *
+ * @dev: Device that will be interacted with
+ * @bus: Bus-specific callbacks to use with device
+ * @bus_context: Data passed to bus-specific callbacks
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.  This function should generally not be called
+ * directly, it should be called by bus-specific init functions.
+ */
 #define regmap_init(dev, bus, bus_context, config)			\
 	__regmap_lockdep_wrapper(__regmap_init, #config,		\
 				dev, bus, bus_context, config)
 int regmap_attach_dev(struct device *dev, struct regmap *map,
 		      const struct regmap_config *config);
+
+/**
+ * regmap_init_i2c(): Initialise register map
+ *
+ * @i2c: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
 #define regmap_init_i2c(i2c, config)					\
 	__regmap_lockdep_wrapper(__regmap_init_i2c, #config,		\
 				i2c, config)
+
+/**
+ * regmap_init_spi(): Initialise register map
+ *
+ * @spi: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
 #define regmap_init_spi(dev, config)					\
 	__regmap_lockdep_wrapper(__regmap_init_spi, #config,		\
 				dev, config)
+
+/**
+ * regmap_init_spmi_base(): Create regmap for the Base register space
+ * @sdev:	SPMI device that will be interacted with
+ * @config:	Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
 #define regmap_init_spmi_base(dev, config)				\
 	__regmap_lockdep_wrapper(__regmap_init_spmi_base, #config,	\
 				dev, config)
+
+/**
+ * regmap_init_spmi_ext(): Create regmap for Ext register space
+ * @sdev:	Device that will be interacted with
+ * @config:	Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
 #define regmap_init_spmi_ext(dev, config)				\
 	__regmap_lockdep_wrapper(__regmap_init_spmi_ext, #config,	\
 				dev, config)
+
+/**
+ * regmap_init_mmio_clk(): Initialise register map with register clock
+ *
+ * @dev: Device that will be interacted with
+ * @clk_id: register clock consumer ID
+ * @regs: Pointer to memory-mapped IO region
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
 #define regmap_init_mmio_clk(dev, clk_id, regs, config)			\
 	__regmap_lockdep_wrapper(__regmap_init_mmio_clk, #config,	\
 				dev, clk_id, regs, config)
+
+/**
+ * regmap_init_mmio(): Initialise register map
+ *
+ * @dev: Device that will be interacted with
+ * @regs: Pointer to memory-mapped IO region
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_mmio(dev, regs, config)		\
+	regmap_init_mmio_clk(dev, NULL, regs, config)
+
+/**
+ * regmap_init_ac97(): Initialise AC'97 register map
+ *
+ * @ac97: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
 #define regmap_init_ac97(ac97, config)					\
 	__regmap_lockdep_wrapper(__regmap_init_ac97, #config,		\
 				ac97, config)
 bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 
+/**
+ * devm_regmap_init(): Initialise managed register map
+ *
+ * @dev: Device that will be interacted with
+ * @bus: Bus-specific callbacks to use with device
+ * @bus_context: Data passed to bus-specific callbacks
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  This function should generally not be called
+ * directly, it should be called by bus-specific init functions.  The
+ * map will be automatically freed by the device management code.
+ */
 #define devm_regmap_init(dev, bus, bus_context, config)			\
 	__regmap_lockdep_wrapper(__devm_regmap_init, #config,		\
 				dev, bus, bus_context, config)
+
+/**
+ * devm_regmap_init_i2c(): Initialise managed register map
+ *
+ * @i2c: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
 #define devm_regmap_init_i2c(i2c, config)				\
 	__regmap_lockdep_wrapper(__devm_regmap_init_i2c, #config,	\
 				i2c, config)
+
+/**
+ * devm_regmap_init_spi(): Initialise register map
+ *
+ * @spi: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The map will be automatically freed by the
+ * device management code.
+ */
 #define devm_regmap_init_spi(dev, config)				\
 	__regmap_lockdep_wrapper(__devm_regmap_init_spi, #config,	\
 				dev, config)
+
+/**
+ * devm_regmap_init_spmi_base(): Create managed regmap for Base register space
+ * @sdev:	SPMI device that will be interacted with
+ * @config:	Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
 #define devm_regmap_init_spmi_base(dev, config)				\
 	__regmap_lockdep_wrapper(__devm_regmap_init_spmi_base, #config,	\
 				dev, config)
+
+/**
+ * devm_regmap_init_spmi_ext(): Create managed regmap for Ext register space
+ * @sdev:	SPMI device that will be interacted with
+ * @config:	Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
 #define devm_regmap_init_spmi_ext(dev, config)				\
 	__regmap_lockdep_wrapper(__devm_regmap_init_spmi_ext, #config,	\
 				dev, config)
-#define devm_regmap_init_mmio_clk(dev, clk_id, regs, config)		\
-	__regmap_lockdep_wrapper(__devm_regmap_init_mmio_clk, #config,	\
-				dev, clk_id, regs, config)
-#define devm_regmap_init_ac97(ac97, config)				\
-	__regmap_lockdep_wrapper(__devm_regmap_init_ac97, #config,	\
-				ac97, config)
 
 /**
- * regmap_init_mmio(): Initialise register map
+ * devm_regmap_init_mmio_clk(): Initialise managed register map with clock
  *
  * @dev: Device that will be interacted with
+ * @clk_id: register clock consumer ID
  * @regs: Pointer to memory-mapped IO region
  * @config: Configuration for register map
  *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
  */
-#define regmap_init_mmio(dev, regs, config)		\
-	regmap_init_mmio_clk(dev, NULL, regs, config)
+#define devm_regmap_init_mmio_clk(dev, clk_id, regs, config)		\
+	__regmap_lockdep_wrapper(__devm_regmap_init_mmio_clk, #config,	\
+				dev, clk_id, regs, config)
 
 /**
  * devm_regmap_init_mmio(): Initialise managed register map
@@ -493,6 +630,20 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 #define devm_regmap_init_mmio(dev, regs, config)		\
 	devm_regmap_init_mmio_clk(dev, NULL, regs, config)
 
+/**
+ * devm_regmap_init_ac97(): Initialise AC'97 register map
+ *
+ * @ac97: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_ac97(ac97, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_ac97, #config,	\
+				ac97, config)
+
 void regmap_exit(struct regmap *map);
 int regmap_reinit_cache(struct regmap *map,
 			const struct regmap_config *config);
-- 
cgit v1.2.3-70-g09d2


From 731a981e1059dd68c97212ccc9c0e1f169c1a77b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Aug 2015 13:35:42 -0400
Subject: cgroup: make cftype->private a unsigned long

It's pretty unusual to have an int as a private data field and it
makes it impossible to carray a pointer value through it.  Let's make
it an unsigned long.  AFAICS, this shouldn't break anything.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/cgroup-defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 93755a629299..8f5770a7d85a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -318,7 +318,7 @@ struct cftype {
 	 * end of cftype array.
 	 */
 	char name[MAX_CFTYPE_NAME];
-	int private;
+	unsigned long private;
 	/*
 	 * If not 0, file mode is set to this value, otherwise it will
 	 * be figured out automatically
-- 
cgit v1.2.3-70-g09d2


From 752b5ed5f6998e118626feea7375782c4cf5aad6 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 4 Aug 2015 14:28:02 +0200
Subject: clk: shmobile: Add CPG/MSTP Clock Domain support

Add Clock Domain support to the Clock Pulse Generator (CPG) Module Stop
(MSTP) Clocks driver using the generic PM Domain.  This allows to
power-manage the module clocks of SoC devices that are part of the
CPG/MSTP Clock Domain using Runtime PM, or for system suspend/resume.

SoC devices that are part of the CPG/MSTP Clock Domain and can be
power-managed through an MSTP clock should be tagged in DT with a
proper "power-domains" property.

The CPG/MSTP Clock Domain code will scan such devices for clocks that
are suitable for power-managing the device, by looking for a clock that
is compatible with "renesas,cpg-mstp-clocks".

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 drivers/clk/shmobile/clk-mstp.c | 87 +++++++++++++++++++++++++++++++++++++++++
 include/linux/clk/shmobile.h    | 12 ++++++
 2 files changed, 99 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/shmobile/clk-mstp.c b/drivers/clk/shmobile/clk-mstp.c
index 2d2fe773ac81..b1df7b2f1e97 100644
--- a/drivers/clk/shmobile/clk-mstp.c
+++ b/drivers/clk/shmobile/clk-mstp.c
@@ -2,6 +2,7 @@
  * R-Car MSTP clocks
  *
  * Copyright (C) 2013 Ideas On Board SPRL
+ * Copyright (C) 2015 Glider bvba
  *
  * Contact: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
  *
@@ -10,11 +11,16 @@
  * the Free Software Foundation; version 2 of the License.
  */
 
+#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/clkdev.h>
+#include <linux/clk/shmobile.h>
+#include <linux/device.h>
 #include <linux/io.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/pm_clock.h>
+#include <linux/pm_domain.h>
 #include <linux/spinlock.h>
 
 /*
@@ -236,3 +242,84 @@ static void __init cpg_mstp_clocks_init(struct device_node *np)
 	of_clk_add_provider(np, of_clk_src_onecell_get, &group->data);
 }
 CLK_OF_DECLARE(cpg_mstp_clks, "renesas,cpg-mstp-clocks", cpg_mstp_clocks_init);
+
+
+#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
+int cpg_mstp_attach_dev(struct generic_pm_domain *domain, struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+	struct of_phandle_args clkspec;
+	struct clk *clk;
+	int i = 0;
+	int error;
+
+	while (!of_parse_phandle_with_args(np, "clocks", "#clock-cells", i,
+					   &clkspec)) {
+		if (of_device_is_compatible(clkspec.np,
+					    "renesas,cpg-mstp-clocks"))
+			goto found;
+
+		of_node_put(clkspec.np);
+		i++;
+	}
+
+	return 0;
+
+found:
+	clk = of_clk_get_from_provider(&clkspec);
+	of_node_put(clkspec.np);
+
+	if (IS_ERR(clk))
+		return PTR_ERR(clk);
+
+	error = pm_clk_create(dev);
+	if (error) {
+		dev_err(dev, "pm_clk_create failed %d\n", error);
+		goto fail_put;
+	}
+
+	error = pm_clk_add_clk(dev, clk);
+	if (error) {
+		dev_err(dev, "pm_clk_add_clk %pC failed %d\n", clk, error);
+		goto fail_destroy;
+	}
+
+	return 0;
+
+fail_destroy:
+	pm_clk_destroy(dev);
+fail_put:
+	clk_put(clk);
+	return error;
+}
+
+void cpg_mstp_detach_dev(struct generic_pm_domain *domain, struct device *dev)
+{
+	if (!list_empty(&dev->power.subsys_data->clock_list))
+		pm_clk_destroy(dev);
+}
+
+void __init cpg_mstp_add_clk_domain(struct device_node *np)
+{
+	struct generic_pm_domain *pd;
+	u32 ncells;
+
+	if (of_property_read_u32(np, "#power-domain-cells", &ncells)) {
+		pr_warn("%s lacks #power-domain-cells\n", np->full_name);
+		return;
+	}
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return;
+
+	pd->name = np->name;
+
+	pd->flags = GENPD_FLAG_PM_CLK;
+	pm_genpd_init(pd, &simple_qos_governor, false);
+	pd->attach_dev = cpg_mstp_attach_dev;
+	pd->detach_dev = cpg_mstp_detach_dev;
+
+	of_genpd_add_provider_simple(np, pd);
+}
+#endif /* !CONFIG_PM_GENERIC_DOMAINS_OF */
diff --git a/include/linux/clk/shmobile.h b/include/linux/clk/shmobile.h
index 63a8159c4e64..cb19cc1865ca 100644
--- a/include/linux/clk/shmobile.h
+++ b/include/linux/clk/shmobile.h
@@ -16,8 +16,20 @@
 
 #include <linux/types.h>
 
+struct device;
+struct device_node;
+struct generic_pm_domain;
+
 void r8a7778_clocks_init(u32 mode);
 void r8a7779_clocks_init(u32 mode);
 void rcar_gen2_clocks_init(u32 mode);
 
+#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
+void cpg_mstp_add_clk_domain(struct device_node *np);
+int cpg_mstp_attach_dev(struct generic_pm_domain *domain, struct device *dev);
+void cpg_mstp_detach_dev(struct generic_pm_domain *domain, struct device *dev);
+#else
+static inline void cpg_mstp_add_clk_domain(struct device_node *np) {}
+#endif
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From 553ed4b5dff66dbb10c20599e493b72ec6af72ab Mon Sep 17 00:00:00 2001
From: Bruno Prémont <bonbons@linux-vserver.org>
Date: Sat, 8 Aug 2015 17:58:40 +0200
Subject: mfd: axp20x: Add missing registers, and mark more registers volatile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an extra set of registers which is necessary tu support the PMICs
battery charger function, and mark registers which contain status bits,
gpio status, and adc readings as volatile.

Signed-off-by: Bruno Prémont <bonbons@linux-vserver.org>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/axp20x.c       | 8 +++++++-
 include/linux/mfd/axp20x.h | 6 ++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index b369cfce866c..d9f2f967d653 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -61,10 +61,16 @@ static const struct regmap_access_table axp152_volatile_table = {
 static const struct regmap_range axp20x_writeable_ranges[] = {
 	regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_IRQ5_STATE),
 	regmap_reg_range(AXP20X_DCDC_MODE, AXP20X_FG_RES),
+	regmap_reg_range(AXP20X_RDC_H, AXP20X_OCV(AXP20X_OCV_MAX)),
 };
 
 static const struct regmap_range axp20x_volatile_ranges[] = {
+	regmap_reg_range(AXP20X_PWR_INPUT_STATUS, AXP20X_USB_OTG_STATUS),
+	regmap_reg_range(AXP20X_CHRG_CTRL1, AXP20X_CHRG_CTRL2),
 	regmap_reg_range(AXP20X_IRQ1_EN, AXP20X_IRQ5_STATE),
+	regmap_reg_range(AXP20X_ACIN_V_ADC_H, AXP20X_IPSOUT_V_HIGH_L),
+	regmap_reg_range(AXP20X_GPIO20_SS, AXP20X_GPIO3_CTRL),
+	regmap_reg_range(AXP20X_FG_RES, AXP20X_RDC_L),
 };
 
 static const struct regmap_access_table axp20x_writeable_table = {
@@ -195,7 +201,7 @@ static const struct regmap_config axp20x_regmap_config = {
 	.val_bits	= 8,
 	.wr_table	= &axp20x_writeable_table,
 	.volatile_table	= &axp20x_volatile_table,
-	.max_register	= AXP20X_FG_RES,
+	.max_register	= AXP20X_OCV(AXP20X_OCV_MAX),
 	.cache_type	= REGCACHE_RBTREE,
 };
 
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 52203d5f7984..cc8ad1e1a307 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -190,6 +190,12 @@ enum {
 #define AXP20X_CC_CTRL			0xb8
 #define AXP20X_FG_RES			0xb9
 
+/* OCV */
+#define AXP20X_RDC_H			0xba
+#define AXP20X_RDC_L			0xbb
+#define AXP20X_OCV(m)			(0xc0 + (m))
+#define AXP20X_OCV_MAX			0xf
+
 /* AXP22X specific registers */
 #define AXP22X_BATLOW_THRES1		0xe6
 
-- 
cgit v1.2.3-70-g09d2


From ba33034fffc1189d95301bd865f1c799256e72a2 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 4 Aug 2015 09:55:48 +0200
Subject: locking, compiler.h: Cast away attributes in the WRITE_ONCE() magic

The kernel build bot showed a new warning triggered by commit:

  76695af20c01 ("locking, arch: use WRITE_ONCE()/READ_ONCE() in smp_store_release()/smp_load_acquire()")

because Sparse does not like WRITE_ONCE() accessing elements
from the (sparse) RCU address space:

  fs/afs/inode.c:448:9: sparse: incorrect type in initializer (different address spaces)
  fs/afs/inode.c:448:9:    expected struct afs_permits *__val
  fs/afs/inode.c:448:9:    got void [noderef] <asn:4>*<noident>

Solution is to force cast away the sparse attributes for the initializer
of the union in WRITE_ONCE().

(And as this now gets too long, also split the macro into multiple lines.)

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438674948-38310-2-git-send-email-borntraeger@de.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index e08a6ae7c0a4..c836eb2dc44d 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -252,7 +252,12 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 
 #define WRITE_ONCE(x, val) \
-	({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
+({							\
+	union { typeof(x) __val; char __c[1]; } __u =	\
+		{ .__val = (__force typeof(x)) (val) }; \
+	__write_once_size(&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
 
 /**
  * READ_ONCE_CTRL - Read a value heading a control dependency
-- 
cgit v1.2.3-70-g09d2


From 654672d4ba1a6001c365833be895f9477c4d5eab Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 6 Aug 2015 17:54:37 +0100
Subject: locking/atomics: Add _{acquire|release|relaxed}() variants of some
 atomic operations

Whilst porting the generic qrwlock code over to arm64, it became
apparent that any portable locking code needs finer-grained control of
the memory-ordering guarantees provided by our atomic routines.

In particular: xchg, cmpxchg, {add,sub}_return are often used in
situations where full barrier semantics (currently the only option
available) are not required. For example, when a reader increments a
reader count to obtain a lock, checking the old value to see if a writer
was present, only acquire semantics are strictly needed.

This patch introduces three new ordering semantics for these operations:

  - *_relaxed: No ordering guarantees. This is similar to what we have
               already for the non-return atomics (e.g. atomic_add).

  - *_acquire: ACQUIRE semantics, similar to smp_load_acquire.

  - *_release: RELEASE semantics, similar to smp_store_release.

In memory-ordering speak, this means that the acquire/release semantics
are RCpc as opposed to RCsc. Consequently a RELEASE followed by an
ACQUIRE does not imply a full barrier, as already documented in
memory-barriers.txt.

Currently, all the new macros are conditionally mapped to the full-mb
variants, however if the *_relaxed version is provided by the
architecture, then the acquire/release variants are constructed by
supplementing the relaxed routine with an explicit barrier.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman.Long@hp.com
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/1438880084-18856-2-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/atomic.h | 323 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 323 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 8b98b423388f..00a5763e850e 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -2,6 +2,329 @@
 #ifndef _LINUX_ATOMIC_H
 #define _LINUX_ATOMIC_H
 #include <asm/atomic.h>
+#include <asm/barrier.h>
+
+/*
+ * Relaxed variants of xchg, cmpxchg and some atomic operations.
+ *
+ * We support four variants:
+ *
+ * - Fully ordered: The default implementation, no suffix required.
+ * - Acquire: Provides ACQUIRE semantics, _acquire suffix.
+ * - Release: Provides RELEASE semantics, _release suffix.
+ * - Relaxed: No ordering guarantees, _relaxed suffix.
+ *
+ * For compound atomics performing both a load and a store, ACQUIRE
+ * semantics apply only to the load and RELEASE semantics only to the
+ * store portion of the operation. Note that a failed cmpxchg_acquire
+ * does -not- imply any memory ordering constraints.
+ *
+ * See Documentation/memory-barriers.txt for ACQUIRE/RELEASE definitions.
+ */
+
+#ifndef atomic_read_acquire
+#define  atomic_read_acquire(v)		smp_load_acquire(&(v)->counter)
+#endif
+
+#ifndef atomic_set_release
+#define  atomic_set_release(v, i)	smp_store_release(&(v)->counter, (i))
+#endif
+
+/*
+ * The idea here is to build acquire/release variants by adding explicit
+ * barriers on top of the relaxed variant. In the case where the relaxed
+ * variant is already fully ordered, no additional barriers are needed.
+ */
+#define __atomic_op_acquire(op, args...)				\
+({									\
+	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
+	smp_mb__after_atomic();						\
+	__ret;								\
+})
+
+#define __atomic_op_release(op, args...)				\
+({									\
+	smp_mb__before_atomic();					\
+	op##_relaxed(args);						\
+})
+
+#define __atomic_op_fence(op, args...)					\
+({									\
+	typeof(op##_relaxed(args)) __ret;				\
+	smp_mb__before_atomic();					\
+	__ret = op##_relaxed(args);					\
+	smp_mb__after_atomic();						\
+	__ret;								\
+})
+
+/* atomic_add_return_relaxed */
+#ifndef atomic_add_return_relaxed
+#define  atomic_add_return_relaxed	atomic_add_return
+#define  atomic_add_return_acquire	atomic_add_return
+#define  atomic_add_return_release	atomic_add_return
+
+#else /* atomic_add_return_relaxed */
+
+#ifndef atomic_add_return_acquire
+#define  atomic_add_return_acquire(...)					\
+	__atomic_op_acquire(atomic_add_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_add_return_release
+#define  atomic_add_return_release(...)					\
+	__atomic_op_release(atomic_add_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_add_return
+#define  atomic_add_return(...)						\
+	__atomic_op_fence(atomic_add_return, __VA_ARGS__)
+#endif
+#endif /* atomic_add_return_relaxed */
+
+/* atomic_sub_return_relaxed */
+#ifndef atomic_sub_return_relaxed
+#define  atomic_sub_return_relaxed	atomic_sub_return
+#define  atomic_sub_return_acquire	atomic_sub_return
+#define  atomic_sub_return_release	atomic_sub_return
+
+#else /* atomic_sub_return_relaxed */
+
+#ifndef atomic_sub_return_acquire
+#define  atomic_sub_return_acquire(...)					\
+	__atomic_op_acquire(atomic_sub_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_sub_return_release
+#define  atomic_sub_return_release(...)					\
+	__atomic_op_release(atomic_sub_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_sub_return
+#define  atomic_sub_return(...)						\
+	__atomic_op_fence(atomic_sub_return, __VA_ARGS__)
+#endif
+#endif /* atomic_sub_return_relaxed */
+
+/* atomic_xchg_relaxed */
+#ifndef atomic_xchg_relaxed
+#define  atomic_xchg_relaxed		atomic_xchg
+#define  atomic_xchg_acquire		atomic_xchg
+#define  atomic_xchg_release		atomic_xchg
+
+#else /* atomic_xchg_relaxed */
+
+#ifndef atomic_xchg_acquire
+#define  atomic_xchg_acquire(...)					\
+	__atomic_op_acquire(atomic_xchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic_xchg_release
+#define  atomic_xchg_release(...)					\
+	__atomic_op_release(atomic_xchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic_xchg
+#define  atomic_xchg(...)						\
+	__atomic_op_fence(atomic_xchg, __VA_ARGS__)
+#endif
+#endif /* atomic_xchg_relaxed */
+
+/* atomic_cmpxchg_relaxed */
+#ifndef atomic_cmpxchg_relaxed
+#define  atomic_cmpxchg_relaxed		atomic_cmpxchg
+#define  atomic_cmpxchg_acquire		atomic_cmpxchg
+#define  atomic_cmpxchg_release		atomic_cmpxchg
+
+#else /* atomic_cmpxchg_relaxed */
+
+#ifndef atomic_cmpxchg_acquire
+#define  atomic_cmpxchg_acquire(...)					\
+	__atomic_op_acquire(atomic_cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic_cmpxchg_release
+#define  atomic_cmpxchg_release(...)					\
+	__atomic_op_release(atomic_cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic_cmpxchg
+#define  atomic_cmpxchg(...)						\
+	__atomic_op_fence(atomic_cmpxchg, __VA_ARGS__)
+#endif
+#endif /* atomic_cmpxchg_relaxed */
+
+#ifndef atomic64_read_acquire
+#define  atomic64_read_acquire(v)	smp_load_acquire(&(v)->counter)
+#endif
+
+#ifndef atomic64_set_release
+#define  atomic64_set_release(v, i)	smp_store_release(&(v)->counter, (i))
+#endif
+
+/* atomic64_add_return_relaxed */
+#ifndef atomic64_add_return_relaxed
+#define  atomic64_add_return_relaxed	atomic64_add_return
+#define  atomic64_add_return_acquire	atomic64_add_return
+#define  atomic64_add_return_release	atomic64_add_return
+
+#else /* atomic64_add_return_relaxed */
+
+#ifndef atomic64_add_return_acquire
+#define  atomic64_add_return_acquire(...)				\
+	__atomic_op_acquire(atomic64_add_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_add_return_release
+#define  atomic64_add_return_release(...)				\
+	__atomic_op_release(atomic64_add_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_add_return
+#define  atomic64_add_return(...)					\
+	__atomic_op_fence(atomic64_add_return, __VA_ARGS__)
+#endif
+#endif /* atomic64_add_return_relaxed */
+
+/* atomic64_sub_return_relaxed */
+#ifndef atomic64_sub_return_relaxed
+#define  atomic64_sub_return_relaxed	atomic64_sub_return
+#define  atomic64_sub_return_acquire	atomic64_sub_return
+#define  atomic64_sub_return_release	atomic64_sub_return
+
+#else /* atomic64_sub_return_relaxed */
+
+#ifndef atomic64_sub_return_acquire
+#define  atomic64_sub_return_acquire(...)				\
+	__atomic_op_acquire(atomic64_sub_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_sub_return_release
+#define  atomic64_sub_return_release(...)				\
+	__atomic_op_release(atomic64_sub_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_sub_return
+#define  atomic64_sub_return(...)					\
+	__atomic_op_fence(atomic64_sub_return, __VA_ARGS__)
+#endif
+#endif /* atomic64_sub_return_relaxed */
+
+/* atomic64_xchg_relaxed */
+#ifndef atomic64_xchg_relaxed
+#define  atomic64_xchg_relaxed		atomic64_xchg
+#define  atomic64_xchg_acquire		atomic64_xchg
+#define  atomic64_xchg_release		atomic64_xchg
+
+#else /* atomic64_xchg_relaxed */
+
+#ifndef atomic64_xchg_acquire
+#define  atomic64_xchg_acquire(...)					\
+	__atomic_op_acquire(atomic64_xchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_xchg_release
+#define  atomic64_xchg_release(...)					\
+	__atomic_op_release(atomic64_xchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_xchg
+#define  atomic64_xchg(...)						\
+	__atomic_op_fence(atomic64_xchg, __VA_ARGS__)
+#endif
+#endif /* atomic64_xchg_relaxed */
+
+/* atomic64_cmpxchg_relaxed */
+#ifndef atomic64_cmpxchg_relaxed
+#define  atomic64_cmpxchg_relaxed	atomic64_cmpxchg
+#define  atomic64_cmpxchg_acquire	atomic64_cmpxchg
+#define  atomic64_cmpxchg_release	atomic64_cmpxchg
+
+#else /* atomic64_cmpxchg_relaxed */
+
+#ifndef atomic64_cmpxchg_acquire
+#define  atomic64_cmpxchg_acquire(...)					\
+	__atomic_op_acquire(atomic64_cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_cmpxchg_release
+#define  atomic64_cmpxchg_release(...)					\
+	__atomic_op_release(atomic64_cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_cmpxchg
+#define  atomic64_cmpxchg(...)						\
+	__atomic_op_fence(atomic64_cmpxchg, __VA_ARGS__)
+#endif
+#endif /* atomic64_cmpxchg_relaxed */
+
+/* cmpxchg_relaxed */
+#ifndef cmpxchg_relaxed
+#define  cmpxchg_relaxed		cmpxchg
+#define  cmpxchg_acquire		cmpxchg
+#define  cmpxchg_release		cmpxchg
+
+#else /* cmpxchg_relaxed */
+
+#ifndef cmpxchg_acquire
+#define  cmpxchg_acquire(...)						\
+	__atomic_op_acquire(cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef cmpxchg_release
+#define  cmpxchg_release(...)						\
+	__atomic_op_release(cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef cmpxchg
+#define  cmpxchg(...)							\
+	__atomic_op_fence(cmpxchg, __VA_ARGS__)
+#endif
+#endif /* cmpxchg_relaxed */
+
+/* cmpxchg64_relaxed */
+#ifndef cmpxchg64_relaxed
+#define  cmpxchg64_relaxed		cmpxchg64
+#define  cmpxchg64_acquire		cmpxchg64
+#define  cmpxchg64_release		cmpxchg64
+
+#else /* cmpxchg64_relaxed */
+
+#ifndef cmpxchg64_acquire
+#define  cmpxchg64_acquire(...)						\
+	__atomic_op_acquire(cmpxchg64, __VA_ARGS__)
+#endif
+
+#ifndef cmpxchg64_release
+#define  cmpxchg64_release(...)						\
+	__atomic_op_release(cmpxchg64, __VA_ARGS__)
+#endif
+
+#ifndef cmpxchg64
+#define  cmpxchg64(...)							\
+	__atomic_op_fence(cmpxchg64, __VA_ARGS__)
+#endif
+#endif /* cmpxchg64_relaxed */
+
+/* xchg_relaxed */
+#ifndef xchg_relaxed
+#define  xchg_relaxed			xchg
+#define  xchg_acquire			xchg
+#define  xchg_release			xchg
+
+#else /* xchg_relaxed */
+
+#ifndef xchg_acquire
+#define  xchg_acquire(...)		__atomic_op_acquire(xchg, __VA_ARGS__)
+#endif
+
+#ifndef xchg_release
+#define  xchg_release(...)		__atomic_op_release(xchg, __VA_ARGS__)
+#endif
+
+#ifndef xchg
+#define  xchg(...)			__atomic_op_fence(xchg, __VA_ARGS__)
+#endif
+#endif /* xchg_relaxed */
 
 /**
  * atomic_add_unless - add unless the number is already a given value
-- 
cgit v1.2.3-70-g09d2


From cd074aea9261784e44f292e1132830ec221802c6 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 6 Aug 2015 17:54:43 +0100
Subject: locking, include/llist: Use linux/atomic.h instead of asm/cmpxchg.h

Including an asm/ header directly is best avoided, so use linux/atomic.h
instead of asm/cmpxchg.h in linux/llist.h.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman.Long@hp.com
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/1438880084-18856-8-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/llist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index fbf10a0bc095..fd4ca0b4fe0f 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -55,8 +55,8 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/atomic.h>
 #include <linux/kernel.h>
-#include <asm/cmpxchg.h>
 
 struct llist_head {
 	struct llist_node *first;
-- 
cgit v1.2.3-70-g09d2


From 25834c73f93af7f0712c98ca4593691592e6b360 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 May 2015 17:43:34 +0200
Subject: sched: Fix a race between __kthread_bind() and sched_setaffinity()

Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it:

	__kthread_bind()
	  do_set_cpus_allowed()
						<SYSCALL>
						  sched_setaffinity()
						    if (p->flags & PF_NO_SETAFFINITIY)
						    set_cpus_allowed_ptr()
	  p->flags |= PF_NO_SETAFFINITY

Fix the bug by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dedekind1@gmail.com
Cc: juri.lelli@arm.com
Cc: mgorman@suse.de
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kthread.h |  1 +
 include/linux/sched.h   |  7 -------
 kernel/kthread.c        | 20 +++++++++++++++++---
 kernel/sched/core.c     | 36 ++++++++++++++++++++++++++++++++----
 kernel/workqueue.c      |  6 ++----
 5 files changed, 52 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 13d55206ccf6..869b21dcf503 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 })
 
 void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 44dca5b35de6..81bb4577274b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2203,13 +2203,6 @@ static inline void calc_load_enter_idle(void) { }
 static inline void calc_load_exit_idle(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
-	return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
 /*
  * Do not use outside of architecture code which knows its limitations.
  *
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c448fe..7c40a189becc 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
-	/* Must have done schedule() in kthread() before we set_task_cpu */
+	unsigned long flags;
+
 	if (!wait_task_inactive(p, state)) {
 		WARN_ON(1);
 		return;
 	}
+
 	/* It's safe because the task is inactive. */
-	do_set_cpus_allowed(p, cpumask_of(cpu));
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	do_set_cpus_allowed(p, mask);
 	p->flags |= PF_NO_SETAFFINITY;
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+	__kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ea6d74345e60..2e3b983da836 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1153,6 +1153,8 @@ static int migration_cpu_stop(void *data)
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
@@ -1169,7 +1171,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
 {
 	unsigned long flags;
 	struct rq *rq;
@@ -1178,6 +1181,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
 	rq = task_rq_lock(p, &flags);
 
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
@@ -1214,6 +1226,11 @@ out:
 
 	return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1595,6 +1612,15 @@ static void update_avg(u64 *avg, u64 sample)
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+					 const struct cpumask *new_mask, bool check)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+
 #endif /* CONFIG_SMP */
 
 static void
@@ -4340,7 +4366,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	}
 #endif
 again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -4865,7 +4891,8 @@ void init_idle(struct task_struct *idle, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	raw_spin_lock(&rq->lock);
 
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
@@ -4891,7 +4918,8 @@ void init_idle(struct task_struct *idle, int cpu)
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c4f06176f74..f5782d5fd196 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 		goto fail;
 
 	set_user_nice(worker->task, pool->attrs->nice);
-
-	/* prevent userland from meddling with cpumask of workqueue workers */
-	worker->task->flags |= PF_NO_SETAFFINITY;
+	kthread_bind_mask(worker->task, pool->attrs->cpumask);
 
 	/* successful, attach the worker to the pool */
 	worker_attach_to_pool(worker, pool);
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 		}
 
 		wq->rescuer = rescuer;
-		rescuer->task->flags |= PF_NO_SETAFFINITY;
+		kthread_bind_mask(rescuer->task, cpu_possible_mask);
 		wake_up_process(rescuer->task);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From fb182cf84568cc33ab41121bc8cc999f7aacbd47 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 8 Jun 2015 15:37:26 +0100
Subject: KVM: arm/arm64: vgic: Allow HW irq to be encoded in LR

Now that struct vgic_lr supports the LR_HW bit and carries a hwirq
field, we can encode that information into the list registers.

This patch provides implementations for both GICv2 and GICv3.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h |  3 +++
 include/linux/irqchip/arm-gic.h    |  3 ++-
 virt/kvm/arm/vgic-v2.c             | 16 +++++++++++++++-
 virt/kvm/arm/vgic-v3.c             | 21 ++++++++++++++++++---
 4 files changed, 38 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index ffbc034c8810..cf637d65b589 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -268,9 +268,12 @@
 
 #define ICH_LR_EOI			(1UL << 41)
 #define ICH_LR_GROUP			(1UL << 60)
+#define ICH_LR_HW			(1UL << 61)
 #define ICH_LR_STATE			(3UL << 62)
 #define ICH_LR_PENDING_BIT		(1UL << 62)
 #define ICH_LR_ACTIVE_BIT		(1UL << 63)
+#define ICH_LR_PHYS_ID_SHIFT		32
+#define ICH_LR_PHYS_ID_MASK		(0x3ffUL << ICH_LR_PHYS_ID_SHIFT)
 
 #define ICH_MISR_EOI			(1 << 0)
 #define ICH_MISR_U			(1 << 1)
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 9de976b4f9a7..ca88dad65260 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -71,11 +71,12 @@
 
 #define GICH_LR_VIRTUALID		(0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT	(10)
-#define GICH_LR_PHYSID_CPUID		(7 << GICH_LR_PHYSID_CPUID_SHIFT)
+#define GICH_LR_PHYSID_CPUID		(0x3ff << GICH_LR_PHYSID_CPUID_SHIFT)
 #define GICH_LR_STATE			(3 << 28)
 #define GICH_LR_PENDING_BIT		(1 << 28)
 #define GICH_LR_ACTIVE_BIT		(1 << 29)
 #define GICH_LR_EOI			(1 << 19)
+#define GICH_LR_HW			(1 << 31)
 
 #define GICH_VMCR_CTRL_SHIFT		0
 #define GICH_VMCR_CTRL_MASK		(0x21f << GICH_VMCR_CTRL_SHIFT)
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index f9b9c7c51372..8d7b04db8471 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -48,6 +48,10 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
 		lr_desc.state |= LR_STATE_ACTIVE;
 	if (val & GICH_LR_EOI)
 		lr_desc.state |= LR_EOI_INT;
+	if (val & GICH_LR_HW) {
+		lr_desc.state |= LR_HW;
+		lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
+	}
 
 	return lr_desc;
 }
@@ -55,7 +59,9 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
 static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
 			   struct vgic_lr lr_desc)
 {
-	u32 lr_val = (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | lr_desc.irq;
+	u32 lr_val;
+
+	lr_val = lr_desc.irq;
 
 	if (lr_desc.state & LR_STATE_PENDING)
 		lr_val |= GICH_LR_PENDING_BIT;
@@ -64,6 +70,14 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
 	if (lr_desc.state & LR_EOI_INT)
 		lr_val |= GICH_LR_EOI;
 
+	if (lr_desc.state & LR_HW) {
+		lr_val |= GICH_LR_HW;
+		lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
+	}
+
+	if (lr_desc.irq < VGIC_NR_SGIS)
+		lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
+
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
 }
 
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index dff06021e748..afbf925b00f4 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -67,6 +67,10 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
 		lr_desc.state |= LR_STATE_ACTIVE;
 	if (val & ICH_LR_EOI)
 		lr_desc.state |= LR_EOI_INT;
+	if (val & ICH_LR_HW) {
+		lr_desc.state |= LR_HW;
+		lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
+	}
 
 	return lr_desc;
 }
@@ -84,10 +88,17 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
 	 * Eventually we want to make this configurable, so we may revisit
 	 * this in the future.
 	 */
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+	switch (vcpu->kvm->arch.vgic.vgic_model) {
+	case KVM_DEV_TYPE_ARM_VGIC_V3:
 		lr_val |= ICH_LR_GROUP;
-	else
-		lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+		break;
+	case  KVM_DEV_TYPE_ARM_VGIC_V2:
+		if (lr_desc.irq < VGIC_NR_SGIS)
+			lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+		break;
+	default:
+		BUG();
+	}
 
 	if (lr_desc.state & LR_STATE_PENDING)
 		lr_val |= ICH_LR_PENDING_BIT;
@@ -95,6 +106,10 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
 		lr_val |= ICH_LR_ACTIVE_BIT;
 	if (lr_desc.state & LR_EOI_INT)
 		lr_val |= ICH_LR_EOI;
+	if (lr_desc.state & LR_HW) {
+		lr_val |= ICH_LR_HW;
+		lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
+	}
 
 	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
 }
-- 
cgit v1.2.3-70-g09d2


From 8bf478163e69e42973c7070179a11815139e5bf0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 21 Jul 2015 10:41:21 +0200
Subject: iommu/vt-d: Split up iommu->domains array

This array is indexed by the domain-id and contains the
pointers to the domains attached to this iommu. Modern
systems support 65536 domain ids, so that this array has a
size of 512kb, per iommu.

This is a huge waste of space, as the array is usually
sparsely populated. This patch makes the array
two-dimensional and allocates the memory for the domain
pointers on-demand.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-iommu.c | 54 ++++++++++++++++++++++++++++++++++++---------
 include/linux/intel-iommu.h |  2 +-
 2 files changed, 44 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 1c2d6126e5fd..90ab4b0d975c 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -571,13 +571,32 @@ static struct kmem_cache *iommu_devinfo_cache;
 
 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 {
-	return iommu->domains[did];
+	struct dmar_domain **domains;
+	int idx = did >> 8;
+
+	domains = iommu->domains[idx];
+	if (!domains)
+		return NULL;
+
+	return domains[did & 0xff];
 }
 
 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 			     struct dmar_domain *domain)
 {
-	iommu->domains[did] = domain;
+	struct dmar_domain **domains;
+	int idx = did >> 8;
+
+	if (!iommu->domains[idx]) {
+		size_t size = 256 * sizeof(struct dmar_domain *);
+		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
+	}
+
+	domains = iommu->domains[idx];
+	if (WARN_ON(!domains))
+		return;
+	else
+		domains[did & 0xff] = domain;
 }
 
 static inline void *alloc_pgtable_page(int node)
@@ -1530,35 +1549,43 @@ static void iommu_disable_translation(struct intel_iommu *iommu)
 
 static int iommu_init_domains(struct intel_iommu *iommu)
 {
-	unsigned long ndomains;
-	unsigned long nlongs;
+	u32 ndomains, nlongs;
+	size_t size;
 
 	ndomains = cap_ndoms(iommu->cap);
-	pr_debug("%s: Number of Domains supported <%ld>\n",
+	pr_debug("%s: Number of Domains supported <%d>\n",
 		 iommu->name, ndomains);
 	nlongs = BITS_TO_LONGS(ndomains);
 
 	spin_lock_init(&iommu->lock);
 
-	/* TBD: there might be 64K domains,
-	 * consider other allocation for future chip
-	 */
 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
 	if (!iommu->domain_ids) {
 		pr_err("%s: Allocating domain id array failed\n",
 		       iommu->name);
 		return -ENOMEM;
 	}
-	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
-			GFP_KERNEL);
-	if (!iommu->domains) {
+
+	size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
+	iommu->domains = kzalloc(size, GFP_KERNEL);
+
+	if (iommu->domains) {
+		size = 256 * sizeof(struct dmar_domain *);
+		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
+	}
+
+	if (!iommu->domains || !iommu->domains[0]) {
 		pr_err("%s: Allocating domain array failed\n",
 		       iommu->name);
 		kfree(iommu->domain_ids);
+		kfree(iommu->domains);
 		iommu->domain_ids = NULL;
+		iommu->domains    = NULL;
 		return -ENOMEM;
 	}
 
+
+
 	/*
 	 * If Caching mode is set, then invalid translations are tagged
 	 * with domain-id 0, hence we need to pre-allocate it. We also
@@ -1600,6 +1627,11 @@ static void disable_dmar_iommu(struct intel_iommu *iommu)
 static void free_dmar_iommu(struct intel_iommu *iommu)
 {
 	if ((iommu->domains) && (iommu->domain_ids)) {
+		int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
+		int i;
+
+		for (i = 0; i < elems; i++)
+			kfree(iommu->domains[i]);
 		kfree(iommu->domains);
 		kfree(iommu->domain_ids);
 		iommu->domains = NULL;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index d9a366d24e3b..6240063bdcac 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -344,7 +344,7 @@ struct intel_iommu {
 
 #ifdef CONFIG_INTEL_IOMMU
 	unsigned long 	*domain_ids; /* bitmap of domains */
-	struct dmar_domain **domains; /* ptr to domains */
+	struct dmar_domain ***domains; /* ptr to domains */
 	spinlock_t	lock; /* protect context, domain ids */
 	struct root_entry *root_entry; /* virtual address */
 
-- 
cgit v1.2.3-70-g09d2


From 99db44350672c8a5ee9a7b0a6f4cd6ff10136065 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 5 Aug 2015 15:22:27 +0100
Subject: PKCS#7: Appropriately restrict authenticated attributes and content
 type

A PKCS#7 or CMS message can have per-signature authenticated attributes
that are digested as a lump and signed by the authorising key for that
signature.  If such attributes exist, the content digest isn't itself
signed, but rather it is included in a special authattr which then
contributes to the signature.

Further, we already require the master message content type to be
pkcs7_signedData - but there's also a separate content type for the data
itself within the SignedData object and this must be repeated inside the
authattrs for each signer [RFC2315 9.2, RFC5652 11.1].

We should really validate the authattrs if they exist or forbid them
entirely as appropriate.  To this end:

 (1) Alter the PKCS#7 parser to reject any message that has more than one
     signature where at least one signature has authattrs and at least one
     that does not.

 (2) Validate authattrs if they are present and strongly restrict them.
     Only the following authattrs are permitted and all others are
     rejected:

     (a) contentType.  This is checked to be an OID that matches the
     	 content type in the SignedData object.

     (b) messageDigest.  This must match the crypto digest of the data.

     (c) signingTime.  If present, we check that this is a valid, parseable
     	 UTCTime or GeneralTime and that the date it encodes fits within
     	 the validity window of the matching X.509 cert.

     (d) S/MIME capabilities.  We don't check the contents.

     (e) Authenticode SP Opus Info.  We don't check the contents.

     (f) Authenticode Statement Type.  We don't check the contents.

     The message is rejected if (a) or (b) are missing.  If the message is
     an Authenticode type, the message is rejected if (e) is missing; if
     not Authenticode, the message is rejected if (d) - (f) are present.

     The S/MIME capabilities authattr (d) unfortunately has to be allowed
     to support kernels already signed by the pesign program.  This only
     affects kexec.  sign-file suppresses them (CMS_NOSMIMECAP).

     The message is also rejected if an authattr is given more than once or
     if it contains more than one element in its set of values.

 (3) Add a parameter to pkcs7_verify() to select one of the following
     restrictions and pass in the appropriate option from the callers:

     (*) VERIFYING_MODULE_SIGNATURE

	 This requires that the SignedData content type be pkcs7-data and
	 forbids authattrs.  sign-file sets CMS_NOATTR.  We could be more
	 flexible and permit authattrs optionally, but only permit minimal
	 content.

     (*) VERIFYING_FIRMWARE_SIGNATURE

	 This requires that the SignedData content type be pkcs7-data and
	 requires authattrs.  In future, this will require an attribute
	 holding the target firmware name in addition to the minimal set.

     (*) VERIFYING_UNSPECIFIED_SIGNATURE

	 This requires that the SignedData content type be pkcs7-data but
	 allows either no authattrs or only permits the minimal set.

     (*) VERIFYING_KEXEC_PE_SIGNATURE

	 This only supports the Authenticode SPC_INDIRECT_DATA content type
	 and requires at least an SpcSpOpusInfo authattr in addition to the
	 minimal set.  It also permits an SPC_STATEMENT_TYPE authattr (and
	 an S/MIME capabilities authattr because the pesign program doesn't
	 remove these).

     (*) VERIFYING_KEY_SIGNATURE
     (*) VERIFYING_KEY_SELF_SIGNATURE

	 These are invalid in this context but are included for later use
	 when limiting the use of X.509 certs.

 (4) The pkcs7_test key type is given a module parameter to select between
     the above options for testing purposes.  For example:

	echo 1 >/sys/module/pkcs7_test_key/parameters/usage
	keyctl padd pkcs7_test foo @s </tmp/stuff.pkcs7

     will attempt to check the signature on stuff.pkcs7 as if it contains a
     firmware blob (1 being VERIFYING_FIRMWARE_SIGNATURE).

Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marcel Holtmann <marcel@holtmann.org>
Reviewed-by: David Woodhouse <David.Woodhouse@intel.com>
---
 arch/x86/kernel/kexec-bzimage64.c        |   4 +-
 crypto/asymmetric_keys/asymmetric_type.c |  11 +++
 crypto/asymmetric_keys/pkcs7.asn1        |   6 +-
 crypto/asymmetric_keys/pkcs7_key_type.c  |  14 +++-
 crypto/asymmetric_keys/pkcs7_parser.c    | 138 +++++++++++++++++++++++++++++--
 crypto/asymmetric_keys/pkcs7_parser.h    |  15 +++-
 crypto/asymmetric_keys/pkcs7_verify.c    |  65 ++++++++++++++-
 crypto/asymmetric_keys/verify_pefile.c   |   7 +-
 include/crypto/pkcs7.h                   |  10 ++-
 include/crypto/public_key.h              |  14 ++++
 include/keys/system_keyring.h            |   4 +-
 include/linux/oid_registry.h             |   4 +-
 include/linux/verify_pefile.h            |   6 +-
 kernel/module_signing.c                  |   3 +-
 kernel/system_keyring.c                  |   6 +-
 scripts/sign-file.c                      |   5 +-
 16 files changed, 285 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ca83f7ac388b..fab22e72808c 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -536,7 +536,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
 	int ret;
 
 	ret = verify_pefile_signature(kernel, kernel_len,
-				      system_trusted_keyring, &trusted);
+				      system_trusted_keyring,
+				      VERIFYING_KEXEC_PE_SIGNATURE,
+				      &trusted);
 	if (ret < 0)
 		return ret;
 	if (!trusted)
diff --git a/crypto/asymmetric_keys/asymmetric_type.c b/crypto/asymmetric_keys/asymmetric_type.c
index b0e4ed23d668..1916680ad81b 100644
--- a/crypto/asymmetric_keys/asymmetric_type.c
+++ b/crypto/asymmetric_keys/asymmetric_type.c
@@ -12,6 +12,7 @@
  */
 #include <keys/asymmetric-subtype.h>
 #include <keys/asymmetric-parser.h>
+#include <crypto/public_key.h>
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -20,6 +21,16 @@
 
 MODULE_LICENSE("GPL");
 
+const char *const key_being_used_for[NR__KEY_BEING_USED_FOR] = {
+	[VERIFYING_MODULE_SIGNATURE]		= "mod sig",
+	[VERIFYING_FIRMWARE_SIGNATURE]		= "firmware sig",
+	[VERIFYING_KEXEC_PE_SIGNATURE]		= "kexec PE sig",
+	[VERIFYING_KEY_SIGNATURE]		= "key sig",
+	[VERIFYING_KEY_SELF_SIGNATURE]		= "key self sig",
+	[VERIFYING_UNSPECIFIED_SIGNATURE]	= "unspec sig",
+};
+EXPORT_SYMBOL_GPL(key_being_used_for);
+
 static LIST_HEAD(asymmetric_key_parsers);
 static DECLARE_RWSEM(asymmetric_key_parsers_sem);
 
diff --git a/crypto/asymmetric_keys/pkcs7.asn1 b/crypto/asymmetric_keys/pkcs7.asn1
index 6bf8ff4f7414..1eca740b816a 100644
--- a/crypto/asymmetric_keys/pkcs7.asn1
+++ b/crypto/asymmetric_keys/pkcs7.asn1
@@ -8,7 +8,7 @@ ContentType ::= OBJECT IDENTIFIER ({ pkcs7_note_OID })
 SignedData ::= SEQUENCE {
 	version			INTEGER ({ pkcs7_note_signeddata_version }),
 	digestAlgorithms	DigestAlgorithmIdentifiers,
-	contentInfo		ContentInfo,
+	contentInfo		ContentInfo ({ pkcs7_note_content }),
 	certificates		CHOICE {
 		certSet		[0] IMPLICIT ExtendedCertificatesAndCertificates,
 		certSequence	[2] IMPLICIT Certificates
@@ -21,7 +21,7 @@ SignedData ::= SEQUENCE {
 }
 
 ContentInfo ::= SEQUENCE {
-	contentType	ContentType,
+	contentType	ContentType ({ pkcs7_note_OID }),
 	content		[0] EXPLICIT Data OPTIONAL
 }
 
@@ -111,7 +111,7 @@ AuthenticatedAttribute ::= SEQUENCE {
 }
 
 UnauthenticatedAttribute ::= SEQUENCE {
-	type			OBJECT IDENTIFIER ({ pkcs7_note_OID }),
+	type			OBJECT IDENTIFIER,
 	values			SET OF ANY
 }
 
diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c
index 3d13b042da73..10d34dbd00b9 100644
--- a/crypto/asymmetric_keys/pkcs7_key_type.c
+++ b/crypto/asymmetric_keys/pkcs7_key_type.c
@@ -14,16 +14,23 @@
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/key-type.h>
+#include <keys/asymmetric-type.h>
 #include <crypto/pkcs7.h>
 #include <keys/user-type.h>
 #include <keys/system_keyring.h>
 #include "pkcs7_parser.h"
 
+static unsigned pkcs7_usage;
+module_param_named(usage, pkcs7_usage, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(pkcs7_usage,
+		 "Usage to specify when verifying the PKCS#7 message");
+
 /*
  * Preparse a PKCS#7 wrapped and validated data blob.
  */
 static int pkcs7_preparse(struct key_preparsed_payload *prep)
 {
+	enum key_being_used_for usage = pkcs7_usage;
 	struct pkcs7_message *pkcs7;
 	const void *data, *saved_prep_data;
 	size_t datalen, saved_prep_datalen;
@@ -32,6 +39,11 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
 
 	kenter("");
 
+	if (usage >= NR__KEY_BEING_USED_FOR) {
+		pr_err("Invalid usage type %d\n", usage);
+		return -EINVAL;
+	}
+
 	saved_prep_data = prep->data;
 	saved_prep_datalen = prep->datalen;
 	pkcs7 = pkcs7_parse_message(saved_prep_data, saved_prep_datalen);
@@ -40,7 +52,7 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
 		goto error;
 	}
 
-	ret = pkcs7_verify(pkcs7);
+	ret = pkcs7_verify(pkcs7, usage);
 	if (ret < 0)
 		goto error_free;
 
diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c
index 826e2f3f507b..e6298b7a945a 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.c
+++ b/crypto/asymmetric_keys/pkcs7_parser.c
@@ -81,6 +81,30 @@ void pkcs7_free_message(struct pkcs7_message *pkcs7)
 }
 EXPORT_SYMBOL_GPL(pkcs7_free_message);
 
+/*
+ * Check authenticatedAttributes are provided or not provided consistently.
+ */
+static int pkcs7_check_authattrs(struct pkcs7_message *msg)
+{
+	struct pkcs7_signed_info *sinfo;
+	bool want;
+
+	sinfo = msg->signed_infos;
+	if (sinfo->authattrs) {
+		want = true;
+		msg->have_authattrs = true;
+	}
+
+	for (sinfo = sinfo->next; sinfo; sinfo = sinfo->next)
+		if (!!sinfo->authattrs != want)
+			goto inconsistent;
+	return 0;
+
+inconsistent:
+	pr_warn("Inconsistently supplied authAttrs\n");
+	return -EINVAL;
+}
+
 /**
  * pkcs7_parse_message - Parse a PKCS#7 message
  * @data: The raw binary ASN.1 encoded message to be parsed
@@ -113,6 +137,10 @@ struct pkcs7_message *pkcs7_parse_message(const void *data, size_t datalen)
 		goto out;
 	}
 
+	ret = pkcs7_check_authattrs(ctx->msg);
+	if (ret < 0)
+		goto out;
+
 	msg = ctx->msg;
 	ctx->msg = NULL;
 
@@ -380,6 +408,25 @@ int pkcs7_note_certificate_list(void *context, size_t hdrlen,
 	return 0;
 }
 
+/*
+ * Note the content type.
+ */
+int pkcs7_note_content(void *context, size_t hdrlen,
+		       unsigned char tag,
+		       const void *value, size_t vlen)
+{
+	struct pkcs7_parse_context *ctx = context;
+
+	if (ctx->last_oid != OID_data &&
+	    ctx->last_oid != OID_msIndirectData) {
+		pr_warn("Unsupported data type %d\n", ctx->last_oid);
+		return -EINVAL;
+	}
+
+	ctx->msg->data_type = ctx->last_oid;
+	return 0;
+}
+
 /*
  * Extract the data from the message and store that and its content type OID in
  * the context.
@@ -395,31 +442,90 @@ int pkcs7_note_data(void *context, size_t hdrlen,
 	ctx->msg->data = value;
 	ctx->msg->data_len = vlen;
 	ctx->msg->data_hdrlen = hdrlen;
-	ctx->msg->data_type = ctx->last_oid;
 	return 0;
 }
 
 /*
- * Parse authenticated attributes
+ * Parse authenticated attributes.
  */
 int pkcs7_sig_note_authenticated_attr(void *context, size_t hdrlen,
 				      unsigned char tag,
 				      const void *value, size_t vlen)
 {
 	struct pkcs7_parse_context *ctx = context;
+	struct pkcs7_signed_info *sinfo = ctx->sinfo;
+	enum OID content_type;
 
 	pr_devel("AuthAttr: %02x %zu [%*ph]\n", tag, vlen, (unsigned)vlen, value);
 
 	switch (ctx->last_oid) {
+	case OID_contentType:
+		if (__test_and_set_bit(sinfo_has_content_type, &sinfo->aa_set))
+			goto repeated;
+		content_type = look_up_OID(value, vlen);
+		if (content_type != ctx->msg->data_type) {
+			pr_warn("Mismatch between global data type (%d) and sinfo %u (%d)\n",
+				ctx->msg->data_type, sinfo->index,
+				content_type);
+			return -EBADMSG;
+		}
+		return 0;
+
+	case OID_signingTime:
+		if (__test_and_set_bit(sinfo_has_signing_time, &sinfo->aa_set))
+			goto repeated;
+		/* Should we check that the signing time is consistent
+		 * with the signer's X.509 cert?
+		 */
+		return x509_decode_time(&sinfo->signing_time,
+					hdrlen, tag, value, vlen);
+
 	case OID_messageDigest:
+		if (__test_and_set_bit(sinfo_has_message_digest, &sinfo->aa_set))
+			goto repeated;
 		if (tag != ASN1_OTS)
 			return -EBADMSG;
-		ctx->sinfo->msgdigest = value;
-		ctx->sinfo->msgdigest_len = vlen;
+		sinfo->msgdigest = value;
+		sinfo->msgdigest_len = vlen;
+		return 0;
+
+	case OID_smimeCapabilites:
+		if (__test_and_set_bit(sinfo_has_smime_caps, &sinfo->aa_set))
+			goto repeated;
+		if (ctx->msg->data_type != OID_msIndirectData) {
+			pr_warn("S/MIME Caps only allowed with Authenticode\n");
+			return -EKEYREJECTED;
+		}
+		return 0;
+
+		/* Microsoft SpOpusInfo seems to be contain cont[0] 16-bit BE
+		 * char URLs and cont[1] 8-bit char URLs.
+		 *
+		 * Microsoft StatementType seems to contain a list of OIDs that
+		 * are also used as extendedKeyUsage types in X.509 certs.
+		 */
+	case OID_msSpOpusInfo:
+		if (__test_and_set_bit(sinfo_has_ms_opus_info, &sinfo->aa_set))
+			goto repeated;
+		goto authenticode_check;
+	case OID_msStatementType:
+		if (__test_and_set_bit(sinfo_has_ms_statement_type, &sinfo->aa_set))
+			goto repeated;
+	authenticode_check:
+		if (ctx->msg->data_type != OID_msIndirectData) {
+			pr_warn("Authenticode AuthAttrs only allowed with Authenticode\n");
+			return -EKEYREJECTED;
+		}
+		/* I'm not sure how to validate these */
 		return 0;
 	default:
 		return 0;
 	}
+
+repeated:
+	/* We permit max one item per AuthenticatedAttribute and no repeats */
+	pr_warn("Repeated/multivalue AuthAttrs not permitted\n");
+	return -EKEYREJECTED;
 }
 
 /*
@@ -430,10 +536,25 @@ int pkcs7_sig_note_set_of_authattrs(void *context, size_t hdrlen,
 				    const void *value, size_t vlen)
 {
 	struct pkcs7_parse_context *ctx = context;
+	struct pkcs7_signed_info *sinfo = ctx->sinfo;
+
+	if (!test_bit(sinfo_has_content_type, &sinfo->aa_set) ||
+	    !test_bit(sinfo_has_message_digest, &sinfo->aa_set) ||
+	    (ctx->msg->data_type == OID_msIndirectData &&
+	     !test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set))) {
+		pr_warn("Missing required AuthAttr\n");
+		return -EBADMSG;
+	}
+
+	if (ctx->msg->data_type != OID_msIndirectData &&
+	    test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set)) {
+		pr_warn("Unexpected Authenticode AuthAttr\n");
+		return -EBADMSG;
+	}
 
 	/* We need to switch the 'CONT 0' to a 'SET OF' when we digest */
-	ctx->sinfo->authattrs = value - (hdrlen - 1);
-	ctx->sinfo->authattrs_len = vlen + (hdrlen - 1);
+	sinfo->authattrs = value - (hdrlen - 1);
+	sinfo->authattrs_len = vlen + (hdrlen - 1);
 	return 0;
 }
 
@@ -511,6 +632,11 @@ int pkcs7_note_signed_info(void *context, size_t hdrlen,
 	struct pkcs7_signed_info *sinfo = ctx->sinfo;
 	struct asymmetric_key_id *kid;
 
+	if (ctx->msg->data_type == OID_msIndirectData && !sinfo->authattrs) {
+		pr_warn("Authenticode requires AuthAttrs\n");
+		return -EBADMSG;
+	}
+
 	/* Generate cert issuer + serial number key ID */
 	if (!ctx->expect_skid) {
 		kid = asymmetric_key_generate_id(ctx->raw_serial,
diff --git a/crypto/asymmetric_keys/pkcs7_parser.h b/crypto/asymmetric_keys/pkcs7_parser.h
index 790dd7cec82c..a66b19ebcf47 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.h
+++ b/crypto/asymmetric_keys/pkcs7_parser.h
@@ -21,9 +21,9 @@
 struct pkcs7_signed_info {
 	struct pkcs7_signed_info *next;
 	struct x509_certificate *signer; /* Signing certificate (in msg->certs) */
-	unsigned index;
-	bool trusted;
-	bool unsupported_crypto;	/* T if not usable due to missing crypto */
+	unsigned	index;
+	bool		trusted;
+	bool		unsupported_crypto;	/* T if not usable due to missing crypto */
 
 	/* Message digest - the digest of the Content Data (or NULL) */
 	const void	*msgdigest;
@@ -32,6 +32,14 @@ struct pkcs7_signed_info {
 	/* Authenticated Attribute data (or NULL) */
 	unsigned	authattrs_len;
 	const void	*authattrs;
+	unsigned long	aa_set;
+#define	sinfo_has_content_type		0
+#define	sinfo_has_signing_time		1
+#define	sinfo_has_message_digest	2
+#define sinfo_has_smime_caps		3
+#define	sinfo_has_ms_opus_info		4
+#define	sinfo_has_ms_statement_type	5
+	time64_t	signing_time;
 
 	/* Issuing cert serial number and issuer's name [PKCS#7 or CMS ver 1]
 	 * or issuing cert's SKID [CMS ver 3].
@@ -53,6 +61,7 @@ struct pkcs7_message {
 	struct x509_certificate *crl;	/* Revocation list */
 	struct pkcs7_signed_info *signed_infos;
 	u8		version;	/* Version of cert (1 -> PKCS#7 or CMS; 3 -> CMS) */
+	bool		have_authattrs;	/* T if have authattrs */
 
 	/* Content Data (or NULL) */
 	enum OID	data_type;	/* Type of Data */
diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c
index 404f89a0f852..d20c0b4b880e 100644
--- a/crypto/asymmetric_keys/pkcs7_verify.c
+++ b/crypto/asymmetric_keys/pkcs7_verify.c
@@ -70,9 +70,15 @@ static int pkcs7_digest(struct pkcs7_message *pkcs7,
 	 * message digest attribute amongst them which corresponds to the
 	 * digest we just calculated.
 	 */
-	if (sinfo->msgdigest) {
+	if (sinfo->authattrs) {
 		u8 tag;
 
+		if (!sinfo->msgdigest) {
+			pr_warn("Sig %u: No messageDigest\n", sinfo->index);
+			ret = -EKEYREJECTED;
+			goto error;
+		}
+
 		if (sinfo->msgdigest_len != sinfo->sig.digest_size) {
 			pr_debug("Sig %u: Invalid digest size (%u)\n",
 				 sinfo->index, sinfo->msgdigest_len);
@@ -314,6 +320,18 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
 	pr_devel("Using X.509[%u] for sig %u\n",
 		 sinfo->signer->index, sinfo->index);
 
+	/* Check that the PKCS#7 signing time is valid according to the X.509
+	 * certificate.  We can't, however, check against the system clock
+	 * since that may not have been set yet and may be wrong.
+	 */
+	if (test_bit(sinfo_has_signing_time, &sinfo->aa_set)) {
+		if (sinfo->signing_time < sinfo->signer->valid_from ||
+		    sinfo->signing_time > sinfo->signer->valid_to) {
+			pr_warn("Message signed outside of X.509 validity window\n");
+			return -EKEYREJECTED;
+		}
+	}
+
 	/* Verify the PKCS#7 binary against the key */
 	ret = public_key_verify_signature(sinfo->signer->pub, &sinfo->sig);
 	if (ret < 0)
@@ -328,6 +346,7 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
 /**
  * pkcs7_verify - Verify a PKCS#7 message
  * @pkcs7: The PKCS#7 message to be verified
+ * @usage: The use to which the key is being put
  *
  * Verify a PKCS#7 message is internally consistent - that is, the data digest
  * matches the digest in the AuthAttrs and any signature in the message or one
@@ -339,6 +358,9 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
  *
  * Returns, in order of descending priority:
  *
+ *  (*) -EKEYREJECTED if a key was selected that had a usage restriction at
+ *      odds with the specified usage, or:
+ *
  *  (*) -EKEYREJECTED if a signature failed to match for which we found an
  *	appropriate X.509 certificate, or:
  *
@@ -350,7 +372,8 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
  *  (*) 0 if all the signature chains that don't incur -ENOPKG can be verified
  *	(note that a signature chain may be of zero length), or:
  */
-int pkcs7_verify(struct pkcs7_message *pkcs7)
+int pkcs7_verify(struct pkcs7_message *pkcs7,
+		 enum key_being_used_for usage)
 {
 	struct pkcs7_signed_info *sinfo;
 	struct x509_certificate *x509;
@@ -359,6 +382,44 @@ int pkcs7_verify(struct pkcs7_message *pkcs7)
 
 	kenter("");
 
+	switch (usage) {
+	case VERIFYING_MODULE_SIGNATURE:
+		if (pkcs7->data_type != OID_data) {
+			pr_warn("Invalid module sig (not pkcs7-data)\n");
+			return -EKEYREJECTED;
+		}
+		if (pkcs7->have_authattrs) {
+			pr_warn("Invalid module sig (has authattrs)\n");
+			return -EKEYREJECTED;
+		}
+		break;
+	case VERIFYING_FIRMWARE_SIGNATURE:
+		if (pkcs7->data_type != OID_data) {
+			pr_warn("Invalid firmware sig (not pkcs7-data)\n");
+			return -EKEYREJECTED;
+		}
+		if (!pkcs7->have_authattrs) {
+			pr_warn("Invalid firmware sig (missing authattrs)\n");
+			return -EKEYREJECTED;
+		}
+		break;
+	case VERIFYING_KEXEC_PE_SIGNATURE:
+		if (pkcs7->data_type != OID_msIndirectData) {
+			pr_warn("Invalid kexec sig (not Authenticode)\n");
+			return -EKEYREJECTED;
+		}
+		/* Authattr presence checked in parser */
+		break;
+	case VERIFYING_UNSPECIFIED_SIGNATURE:
+		if (pkcs7->data_type != OID_data) {
+			pr_warn("Invalid unspecified sig (not pkcs7-data)\n");
+			return -EKEYREJECTED;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	for (n = 0, x509 = pkcs7->certs; x509; x509 = x509->next, n++) {
 		ret = x509_get_sig_params(x509);
 		if (ret < 0)
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 2421f46184ce..897b734dabf9 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -393,6 +393,7 @@ error_no_desc:
  * @pebuf: Buffer containing the PE binary image
  * @pelen: Length of the binary image
  * @trust_keyring: Signing certificates to use as starting points
+ * @usage: The use to which the key is being put.
  * @_trusted: Set to true if trustworth, false otherwise
  *
  * Validate that the certificate chain inside the PKCS#7 message inside the PE
@@ -417,7 +418,9 @@ error_no_desc:
  * May also return -ENOMEM.
  */
 int verify_pefile_signature(const void *pebuf, unsigned pelen,
-			    struct key *trusted_keyring, bool *_trusted)
+			    struct key *trusted_keyring,
+			    enum key_being_used_for usage,
+			    bool *_trusted)
 {
 	struct pkcs7_message *pkcs7;
 	struct pefile_context ctx;
@@ -462,7 +465,7 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
 	if (ret < 0)
 		goto error;
 
-	ret = pkcs7_verify(pkcs7);
+	ret = pkcs7_verify(pkcs7, usage);
 	if (ret < 0)
 		goto error;
 
diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h
index e235ab4957ee..441aff9b5aa7 100644
--- a/include/crypto/pkcs7.h
+++ b/include/crypto/pkcs7.h
@@ -9,6 +9,11 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
+#ifndef _CRYPTO_PKCS7_H
+#define _CRYPTO_PKCS7_H
+
+#include <crypto/public_key.h>
+
 struct key;
 struct pkcs7_message;
 
@@ -33,7 +38,10 @@ extern int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
 /*
  * pkcs7_verify.c
  */
-extern int pkcs7_verify(struct pkcs7_message *pkcs7);
+extern int pkcs7_verify(struct pkcs7_message *pkcs7,
+			enum key_being_used_for usage);
 
 extern int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7,
 				      const void *data, size_t datalen);
+
+#endif /* _CRYPTO_PKCS7_H */
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index fda097e079a4..067c242b1e15 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -39,6 +39,20 @@ enum pkey_id_type {
 
 extern const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST];
 
+/*
+ * The use to which an asymmetric key is being put.
+ */
+enum key_being_used_for {
+	VERIFYING_MODULE_SIGNATURE,
+	VERIFYING_FIRMWARE_SIGNATURE,
+	VERIFYING_KEXEC_PE_SIGNATURE,
+	VERIFYING_KEY_SIGNATURE,
+	VERIFYING_KEY_SELF_SIGNATURE,
+	VERIFYING_UNSPECIFIED_SIGNATURE,
+	NR__KEY_BEING_USED_FOR
+};
+extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR];
+
 /*
  * Cryptographic data for the public-key subtype of the asymmetric key type.
  *
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index 9791c907cdb7..b20cd885c1fd 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -15,6 +15,7 @@
 #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING
 
 #include <linux/key.h>
+#include <crypto/public_key.h>
 
 extern struct key *system_trusted_keyring;
 static inline struct key *get_system_trusted_keyring(void)
@@ -30,7 +31,8 @@ static inline struct key *get_system_trusted_keyring(void)
 
 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
 extern int system_verify_data(const void *data, unsigned long len,
-			      const void *raw_pkcs7, size_t pkcs7_len);
+			      const void *raw_pkcs7, size_t pkcs7_len,
+			      enum key_being_used_for usage);
 #endif
 
 #endif /* _KEYS_SYSTEM_KEYRING_H */
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index c2bbf672b84e..93e0ff92fb9b 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -41,7 +41,7 @@ enum OID {
 	OID_signed_data,		/* 1.2.840.113549.1.7.2 */
 	/* PKCS#9 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-9(9)} */
 	OID_email_address,		/* 1.2.840.113549.1.9.1 */
-	OID_content_type,		/* 1.2.840.113549.1.9.3 */
+	OID_contentType,		/* 1.2.840.113549.1.9.3 */
 	OID_messageDigest,		/* 1.2.840.113549.1.9.4 */
 	OID_signingTime,		/* 1.2.840.113549.1.9.5 */
 	OID_smimeCapabilites,		/* 1.2.840.113549.1.9.15 */
@@ -54,6 +54,8 @@ enum OID {
 
 	/* Microsoft Authenticode & Software Publishing */
 	OID_msIndirectData,		/* 1.3.6.1.4.1.311.2.1.4 */
+	OID_msStatementType,		/* 1.3.6.1.4.1.311.2.1.11 */
+	OID_msSpOpusInfo,		/* 1.3.6.1.4.1.311.2.1.12 */
 	OID_msPeImageDataObjId,		/* 1.3.6.1.4.1.311.2.1.15 */
 	OID_msIndividualSPKeyPurpose,	/* 1.3.6.1.4.1.311.2.1.21 */
 	OID_msOutlookExpress,		/* 1.3.6.1.4.1.311.16.4 */
diff --git a/include/linux/verify_pefile.h b/include/linux/verify_pefile.h
index ac34819214f9..da2049b5161c 100644
--- a/include/linux/verify_pefile.h
+++ b/include/linux/verify_pefile.h
@@ -12,7 +12,11 @@
 #ifndef _LINUX_VERIFY_PEFILE_H
 #define _LINUX_VERIFY_PEFILE_H
 
+#include <crypto/public_key.h>
+
 extern int verify_pefile_signature(const void *pebuf, unsigned pelen,
-				   struct key *trusted_keyring, bool *_trusted);
+				   struct key *trusted_keyring,
+				   enum key_being_used_for usage,
+				   bool *_trusted);
 
 #endif /* _LINUX_VERIFY_PEFILE_H */
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 70ad463f6df0..bd62f5cda746 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -72,5 +72,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 		return -EBADMSG;
 	}
 
-	return system_verify_data(mod, modlen, mod + modlen, sig_len);
+	return system_verify_data(mod, modlen, mod + modlen, sig_len,
+				  VERIFYING_MODULE_SIGNATURE);
 }
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 95f2dcbc7616..2570598b784d 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -113,9 +113,11 @@ late_initcall(load_system_certificate_list);
  * @len: Size of @data.
  * @raw_pkcs7: The PKCS#7 message that is the signature.
  * @pkcs7_len: The size of @raw_pkcs7.
+ * @usage: The use to which the key is being put.
  */
 int system_verify_data(const void *data, unsigned long len,
-		       const void *raw_pkcs7, size_t pkcs7_len)
+		       const void *raw_pkcs7, size_t pkcs7_len,
+		       enum key_being_used_for usage)
 {
 	struct pkcs7_message *pkcs7;
 	bool trusted;
@@ -132,7 +134,7 @@ int system_verify_data(const void *data, unsigned long len,
 		goto error;
 	}
 
-	ret = pkcs7_verify(pkcs7);
+	ret = pkcs7_verify(pkcs7, usage);
 	if (ret < 0)
 		goto error;
 
diff --git a/scripts/sign-file.c b/scripts/sign-file.c
index de213e5c0cd3..e9741e879bbd 100755
--- a/scripts/sign-file.c
+++ b/scripts/sign-file.c
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 	bool sign_only = false;
 	unsigned char buf[4096];
 	unsigned long module_size, cms_size;
-	unsigned int use_keyid = 0;
+	unsigned int use_keyid = 0, use_signed_attrs = CMS_NOATTR;
 	const EVP_MD *digest_algo;
 	EVP_PKEY *private_key;
 	CMS_ContentInfo *cms;
@@ -216,7 +216,8 @@ int main(int argc, char **argv)
 	ERR(!cms, "CMS_sign");
 
 	ERR(!CMS_add1_signer(cms, x509, private_key, digest_algo,
-			     CMS_NOCERTS | CMS_BINARY | CMS_NOSMIMECAP | use_keyid),
+			     CMS_NOCERTS | CMS_BINARY | CMS_NOSMIMECAP |
+			     use_keyid | use_signed_attrs),
 	    "CMS_sign_add_signer");
 	ERR(CMS_final(cms, bm, NULL, CMS_NOCERTS | CMS_BINARY) < 0,
 	    "CMS_final");
-- 
cgit v1.2.3-70-g09d2


From 24a9a9610ce3ba36fd87c1d2f2c9106de6b7e832 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Mon, 3 Aug 2015 07:44:53 -0400
Subject: sunrpc: increase UNX_MAXNODENAME from 32 to __NEW_UTS_LEN bytes

The current limit of 32 bytes artificially limits the name string that
we end up stuffing into NFSv4.x client ID blobs. If you have multiple
hosts with long hostnames that only differ near the end, then this can
cause NFSv4 client ID collisions.

Linux nodenames are actually limited to __NEW_UTS_LEN bytes (64), so use
that as the limit instead. Also, use XDR_QUADLEN to specify the slack
length, just for clarity and in case someone in the future changes this
to something not evenly divisible by 4.

Reported-by: Michael Skralivetsky <michael.skralivetsky@primarydata.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/auth.h | 8 ++++++--
 net/sunrpc/auth_unix.c      | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index a7cbb570cc5c..1ecf13e148b8 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -18,9 +18,13 @@
 #include <linux/atomic.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
+#include <linux/utsname.h>
 
-/* size of the nodename buffer */
-#define UNX_MAXNODENAME	32
+/*
+ * Size of the nodename buffer. RFC1831 specifies a hard limit of 255 bytes,
+ * but Linux hostnames are actually limited to __NEW_UTS_LEN bytes.
+ */
+#define UNX_MAXNODENAME	__NEW_UTS_LEN
 
 struct rpcsec_gss_info;
 
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 4feda2d0a833..548240dd15fc 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -23,7 +23,7 @@ struct unx_cred {
 };
 #define uc_uid			uc_base.cr_uid
 
-#define UNX_WRITESLACK		(21 + (UNX_MAXNODENAME >> 2))
+#define UNX_WRITESLACK		(21 + XDR_QUADLEN(UNX_MAXNODENAME))
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
-- 
cgit v1.2.3-70-g09d2


From 16478b61f0d88a08163b92af27f0e77f471576ce Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 11 Aug 2015 17:17:18 +0200
Subject: ARM/fb: ep93xx: switch framebuffer to use modedb only

All the EP93xx boards exclusively use modedb to look up video
modes from the command line. Root out the parametrization of
custom video modes from the platform data and board files
and simplify the driver.

Cc: Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
Reviewed-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Acked-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 arch/arm/mach-ep93xx/edb93xx.c             |  2 --
 arch/arm/mach-ep93xx/simone.c              |  2 --
 arch/arm/mach-ep93xx/snappercl15.c         |  2 --
 arch/arm/mach-ep93xx/vision_ep9307.c       |  2 --
 drivers/video/fbdev/ep93xx-fb.c            | 30 ++++--------------------------
 include/linux/platform_data/video-ep93xx.h |  8 --------
 6 files changed, 4 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-ep93xx/edb93xx.c b/arch/arm/mach-ep93xx/edb93xx.c
index 27b14ae92c7e..ad92d9f7e4df 100644
--- a/arch/arm/mach-ep93xx/edb93xx.c
+++ b/arch/arm/mach-ep93xx/edb93xx.c
@@ -205,8 +205,6 @@ static void __init edb93xx_register_pwm(void)
  * EDB93xx framebuffer
  *************************************************************************/
 static struct ep93xxfb_mach_info __initdata edb93xxfb_info = {
-	.num_modes	= EP93XXFB_USE_MODEDB,
-	.bpp		= 16,
 	.flags		= 0,
 };
 
diff --git a/arch/arm/mach-ep93xx/simone.c b/arch/arm/mach-ep93xx/simone.c
index d0938a219443..7bb540c421ee 100644
--- a/arch/arm/mach-ep93xx/simone.c
+++ b/arch/arm/mach-ep93xx/simone.c
@@ -40,8 +40,6 @@ static struct ep93xx_eth_data __initdata simone_eth_data = {
 };
 
 static struct ep93xxfb_mach_info __initdata simone_fb_info = {
-	.num_modes	= EP93XXFB_USE_MODEDB,
-	.bpp		= 16,
 	.flags		= EP93XXFB_USE_SDCSN0 | EP93XXFB_PCLK_FALLING,
 };
 
diff --git a/arch/arm/mach-ep93xx/snappercl15.c b/arch/arm/mach-ep93xx/snappercl15.c
index aa86f86638dd..c4904264256a 100644
--- a/arch/arm/mach-ep93xx/snappercl15.c
+++ b/arch/arm/mach-ep93xx/snappercl15.c
@@ -144,8 +144,6 @@ static struct i2c_board_info __initdata snappercl15_i2c_data[] = {
 };
 
 static struct ep93xxfb_mach_info __initdata snappercl15_fb_info = {
-	.num_modes		= EP93XXFB_USE_MODEDB,
-	.bpp			= 16,
 };
 
 static struct platform_device snappercl15_audio_device = {
diff --git a/arch/arm/mach-ep93xx/vision_ep9307.c b/arch/arm/mach-ep93xx/vision_ep9307.c
index ff22a6a6e2bf..5cced5988498 100644
--- a/arch/arm/mach-ep93xx/vision_ep9307.c
+++ b/arch/arm/mach-ep93xx/vision_ep9307.c
@@ -106,8 +106,6 @@ static void vision_lcd_blank(int blank_mode, struct fb_info *info)
 }
 
 static struct ep93xxfb_mach_info ep93xxfb_info __initdata = {
-	.num_modes	= EP93XXFB_USE_MODEDB,
-	.bpp		= 16,
 	.flags		= EP93XXFB_USE_SDCSN0 | EP93XXFB_PCLK_FALLING,
 	.setup		= vision_lcd_setup,
 	.teardown	= vision_lcd_teardown,
diff --git a/drivers/video/fbdev/ep93xx-fb.c b/drivers/video/fbdev/ep93xx-fb.c
index 7ec251cc9c03..5b1081030cbb 100644
--- a/drivers/video/fbdev/ep93xx-fb.c
+++ b/drivers/video/fbdev/ep93xx-fb.c
@@ -419,36 +419,15 @@ static struct fb_ops ep93xxfb_ops = {
 	.fb_mmap	= ep93xxfb_mmap,
 };
 
-static int ep93xxfb_calc_fbsize(struct ep93xxfb_mach_info *mach_info)
-{
-	int i, fb_size = 0;
-
-	if (mach_info->num_modes == EP93XXFB_USE_MODEDB) {
-		fb_size = EP93XXFB_MAX_XRES * EP93XXFB_MAX_YRES *
-			mach_info->bpp / 8;
-	} else {
-		for (i = 0; i < mach_info->num_modes; i++) {
-			const struct fb_videomode *mode;
-			int size;
-
-			mode = &mach_info->modes[i];
-			size = mode->xres * mode->yres * mach_info->bpp / 8;
-			if (size > fb_size)
-				fb_size = size;
-		}
-	}
-
-	return fb_size;
-}
-
 static int ep93xxfb_alloc_videomem(struct fb_info *info)
 {
-	struct ep93xx_fbi *fbi = info->par;
 	char __iomem *virt_addr;
 	dma_addr_t phys_addr;
 	unsigned int fb_size;
 
-	fb_size = ep93xxfb_calc_fbsize(fbi->mach_info);
+	/* Maximum 16bpp -> used memory is maximum x*y*2 bytes */
+	fb_size = EP93XXFB_MAX_XRES * EP93XXFB_MAX_YRES * 2;
+
 	virt_addr = dma_alloc_writecombine(info->dev, fb_size,
 					   &phys_addr, GFP_KERNEL);
 	if (!virt_addr)
@@ -550,8 +529,7 @@ static int ep93xxfb_probe(struct platform_device *pdev)
 
 	fb_get_options("ep93xx-fb", &video_mode);
 	err = fb_find_mode(&info->var, info, video_mode,
-			   fbi->mach_info->modes, fbi->mach_info->num_modes,
-			   fbi->mach_info->default_mode, fbi->mach_info->bpp);
+			   NULL, 0, NULL, 16);
 	if (err == 0) {
 		dev_err(info->dev, "No suitable video mode found\n");
 		err = -EINVAL;
diff --git a/include/linux/platform_data/video-ep93xx.h b/include/linux/platform_data/video-ep93xx.h
index 92fc2b2232e7..699ac4109366 100644
--- a/include/linux/platform_data/video-ep93xx.h
+++ b/include/linux/platform_data/video-ep93xx.h
@@ -2,11 +2,8 @@
 #define __VIDEO_EP93XX_H
 
 struct platform_device;
-struct fb_videomode;
 struct fb_info;
 
-#define EP93XXFB_USE_MODEDB		0
-
 /* VideoAttributes flags */
 #define EP93XXFB_STATE_MACHINE_ENABLE	(1 << 0)
 #define EP93XXFB_PIXEL_CLOCK_ENABLE	(1 << 1)
@@ -38,12 +35,7 @@ struct fb_info;
 					 EP93XXFB_PIXEL_DATA_ENABLE)
 
 struct ep93xxfb_mach_info {
-	unsigned int			num_modes;
-	const struct fb_videomode	*modes;
-	const struct fb_videomode	*default_mode;
-	int				bpp;
 	unsigned int			flags;
-
 	int	(*setup)(struct platform_device *pdev);
 	void	(*teardown)(struct platform_device *pdev);
 	void	(*blank)(int blank_mode, struct fb_info *info);
-- 
cgit v1.2.3-70-g09d2


From c8c081b70cb563cc4d41ab9933fa3323c6f6ffca Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Mon, 27 Jul 2015 11:09:42 +0800
Subject: sunrpc/nfsd: Remove redundant code by exports seq_operations
 functions

Nfsd has implement a site of seq_operations functions as sunrpc's cache.
Just exports sunrpc's codes, and remove nfsd's redundant codes.

v8, same as v6

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c             | 73 ++------------------------------------------
 include/linux/sunrpc/cache.h |  5 +++
 net/sunrpc/cache.c           | 15 +++++----
 3 files changed, 17 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index f79521a59747..b4d84b579f20 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1075,73 +1075,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	return rv;
 }
 
-/* Iterator */
-
-static void *e_start(struct seq_file *m, loff_t *pos)
-	__acquires(((struct cache_detail *)m->private)->hash_lock)
-{
-	loff_t n = *pos;
-	unsigned hash, export;
-	struct cache_head *ch;
-	struct cache_detail *cd = m->private;
-	struct cache_head **export_table = cd->hash_table;
-
-	read_lock(&cd->hash_lock);
-	if (!n--)
-		return SEQ_START_TOKEN;
-	hash = n >> 32;
-	export = n & ((1LL<<32) - 1);
-
-	
-	for (ch=export_table[hash]; ch; ch=ch->next)
-		if (!export--)
-			return ch;
-	n &= ~((1LL<<32) - 1);
-	do {
-		hash++;
-		n += 1LL<<32;
-	} while(hash < EXPORT_HASHMAX && export_table[hash]==NULL);
-	if (hash >= EXPORT_HASHMAX)
-		return NULL;
-	*pos = n+1;
-	return export_table[hash];
-}
-
-static void *e_next(struct seq_file *m, void *p, loff_t *pos)
-{
-	struct cache_head *ch = p;
-	int hash = (*pos >> 32);
-	struct cache_detail *cd = m->private;
-	struct cache_head **export_table = cd->hash_table;
-
-	if (p == SEQ_START_TOKEN)
-		hash = 0;
-	else if (ch->next == NULL) {
-		hash++;
-		*pos += 1LL<<32;
-	} else {
-		++*pos;
-		return ch->next;
-	}
-	*pos &= ~((1LL<<32) - 1);
-	while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) {
-		hash++;
-		*pos += 1LL<<32;
-	}
-	if (hash >= EXPORT_HASHMAX)
-		return NULL;
-	++*pos;
-	return export_table[hash];
-}
-
-static void e_stop(struct seq_file *m, void *p)
-	__releases(((struct cache_detail *)m->private)->hash_lock)
-{
-	struct cache_detail *cd = m->private;
-
-	read_unlock(&cd->hash_lock);
-}
-
 static struct flags {
 	int flag;
 	char *name[2];
@@ -1270,9 +1203,9 @@ static int e_show(struct seq_file *m, void *p)
 }
 
 const struct seq_operations nfs_exports_op = {
-	.start	= e_start,
-	.next	= e_next,
-	.stop	= e_stop,
+	.start	= cache_seq_start,
+	.next	= cache_seq_next,
+	.stop	= cache_seq_stop,
 	.show	= e_show,
 };
 
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 437ddb6c4aef..04ee5a284aac 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -224,6 +224,11 @@ extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *,
 					umode_t, struct cache_detail *);
 extern void sunrpc_cache_unregister_pipefs(struct cache_detail *);
 
+/* Must store cache_detail in seq_file->private if using next three functions */
+extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
+extern void *cache_seq_next(struct seq_file *file, void *p, loff_t *pos);
+extern void cache_seq_stop(struct seq_file *file, void *p);
+
 extern void qword_add(char **bpp, int *lp, char *str);
 extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
 extern int qword_get(char **bpp, char *dest, int bufsize);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index edec603abc17..673c2fa3c6c2 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1270,7 +1270,7 @@ EXPORT_SYMBOL_GPL(qword_get);
  * get a header, then pass each real item in the cache
  */
 
-static void *c_start(struct seq_file *m, loff_t *pos)
+void *cache_seq_start(struct seq_file *m, loff_t *pos)
 	__acquires(cd->hash_lock)
 {
 	loff_t n = *pos;
@@ -1298,8 +1298,9 @@ static void *c_start(struct seq_file *m, loff_t *pos)
 	*pos = n+1;
 	return cd->hash_table[hash];
 }
+EXPORT_SYMBOL_GPL(cache_seq_start);
 
-static void *c_next(struct seq_file *m, void *p, loff_t *pos)
+void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
 {
 	struct cache_head *ch = p;
 	int hash = (*pos >> 32);
@@ -1325,13 +1326,15 @@ static void *c_next(struct seq_file *m, void *p, loff_t *pos)
 	++*pos;
 	return cd->hash_table[hash];
 }
+EXPORT_SYMBOL_GPL(cache_seq_next);
 
-static void c_stop(struct seq_file *m, void *p)
+void cache_seq_stop(struct seq_file *m, void *p)
 	__releases(cd->hash_lock)
 {
 	struct cache_detail *cd = m->private;
 	read_unlock(&cd->hash_lock);
 }
+EXPORT_SYMBOL_GPL(cache_seq_stop);
 
 static int c_show(struct seq_file *m, void *p)
 {
@@ -1359,9 +1362,9 @@ static int c_show(struct seq_file *m, void *p)
 }
 
 static const struct seq_operations cache_content_op = {
-	.start	= c_start,
-	.next	= c_next,
-	.stop	= c_stop,
+	.start	= cache_seq_start,
+	.next	= cache_seq_next,
+	.stop	= cache_seq_stop,
 	.show	= c_show,
 };
 
-- 
cgit v1.2.3-70-g09d2


From 129e5824cd96d9289679973f0ff7c48e88d569bb Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Mon, 27 Jul 2015 11:10:15 +0800
Subject: sunrpc: Switch to using hash list instead single list

Switch using list_head for cache_head in cache_detail,
it is useful of remove an cache_head entry directly from cache_detail.

v8, using hash list, not head list

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/cache.h |  4 +--
 net/sunrpc/cache.c           | 60 +++++++++++++++++++++++---------------------
 2 files changed, 33 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 04ee5a284aac..03d3b4c92d9f 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -46,7 +46,7 @@
  * 
  */
 struct cache_head {
-	struct cache_head * next;
+	struct hlist_node	cache_list;
 	time_t		expiry_time;	/* After time time, don't use the data */
 	time_t		last_refresh;   /* If CACHE_PENDING, this is when upcall 
 					 * was sent, else this is when update was received
@@ -73,7 +73,7 @@ struct cache_detail_pipefs {
 struct cache_detail {
 	struct module *		owner;
 	int			hash_size;
-	struct cache_head **	hash_table;
+	struct hlist_head *	hash_table;
 	rwlock_t		hash_lock;
 
 	atomic_t		inuse; /* active user-space update or lookup */
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 673c2fa3c6c2..4a2340a54401 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -44,7 +44,7 @@ static void cache_revisit_request(struct cache_head *item);
 static void cache_init(struct cache_head *h)
 {
 	time_t now = seconds_since_boot();
-	h->next = NULL;
+	INIT_HLIST_NODE(&h->cache_list);
 	h->flags = 0;
 	kref_init(&h->ref);
 	h->expiry_time = now + CACHE_NEW_EXPIRY;
@@ -54,15 +54,14 @@ static void cache_init(struct cache_head *h)
 struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 				       struct cache_head *key, int hash)
 {
-	struct cache_head **head,  **hp;
-	struct cache_head *new = NULL, *freeme = NULL;
+	struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL;
+	struct hlist_head *head;
 
 	head = &detail->hash_table[hash];
 
 	read_lock(&detail->hash_lock);
 
-	for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
-		struct cache_head *tmp = *hp;
+	hlist_for_each_entry(tmp, head, cache_list) {
 		if (detail->match(tmp, key)) {
 			if (cache_is_expired(detail, tmp))
 				/* This entry is expired, we will discard it. */
@@ -88,12 +87,10 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 	write_lock(&detail->hash_lock);
 
 	/* check if entry appeared while we slept */
-	for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
-		struct cache_head *tmp = *hp;
+	hlist_for_each_entry(tmp, head, cache_list) {
 		if (detail->match(tmp, key)) {
 			if (cache_is_expired(detail, tmp)) {
-				*hp = tmp->next;
-				tmp->next = NULL;
+				hlist_del_init(&tmp->cache_list);
 				detail->entries --;
 				freeme = tmp;
 				break;
@@ -104,8 +101,8 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 			return tmp;
 		}
 	}
-	new->next = *head;
-	*head = new;
+
+	hlist_add_head(&new->cache_list, head);
 	detail->entries++;
 	cache_get(new);
 	write_unlock(&detail->hash_lock);
@@ -143,7 +140,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 	 * If 'old' is not VALID, we update it directly,
 	 * otherwise we need to replace it
 	 */
-	struct cache_head **head;
 	struct cache_head *tmp;
 
 	if (!test_bit(CACHE_VALID, &old->flags)) {
@@ -168,15 +164,13 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 	}
 	cache_init(tmp);
 	detail->init(tmp, old);
-	head = &detail->hash_table[hash];
 
 	write_lock(&detail->hash_lock);
 	if (test_bit(CACHE_NEGATIVE, &new->flags))
 		set_bit(CACHE_NEGATIVE, &tmp->flags);
 	else
 		detail->update(tmp, new);
-	tmp->next = *head;
-	*head = tmp;
+	hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]);
 	detail->entries++;
 	cache_get(tmp);
 	cache_fresh_locked(tmp, new->expiry_time);
@@ -416,28 +410,29 @@ static int cache_clean(void)
 	/* find a non-empty bucket in the table */
 	while (current_detail &&
 	       current_index < current_detail->hash_size &&
-	       current_detail->hash_table[current_index] == NULL)
+	       hlist_empty(&current_detail->hash_table[current_index]))
 		current_index++;
 
 	/* find a cleanable entry in the bucket and clean it, or set to next bucket */
 
 	if (current_detail && current_index < current_detail->hash_size) {
-		struct cache_head *ch, **cp;
+		struct cache_head *ch = NULL;
 		struct cache_detail *d;
+		struct hlist_head *head;
+		struct hlist_node *tmp;
 
 		write_lock(&current_detail->hash_lock);
 
 		/* Ok, now to clean this strand */
 
-		cp = & current_detail->hash_table[current_index];
-		for (ch = *cp ; ch ; cp = & ch->next, ch = *cp) {
+		head = &current_detail->hash_table[current_index];
+		hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
 			if (current_detail->nextcheck > ch->expiry_time)
 				current_detail->nextcheck = ch->expiry_time+1;
 			if (!cache_is_expired(current_detail, ch))
 				continue;
 
-			*cp = ch->next;
-			ch->next = NULL;
+			hlist_del_init(&ch->cache_list);
 			current_detail->entries--;
 			rv = 1;
 			break;
@@ -1284,7 +1279,7 @@ void *cache_seq_start(struct seq_file *m, loff_t *pos)
 	hash = n >> 32;
 	entry = n & ((1LL<<32) - 1);
 
-	for (ch=cd->hash_table[hash]; ch; ch=ch->next)
+	hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list)
 		if (!entry--)
 			return ch;
 	n &= ~((1LL<<32) - 1);
@@ -1292,11 +1287,12 @@ void *cache_seq_start(struct seq_file *m, loff_t *pos)
 		hash++;
 		n += 1LL<<32;
 	} while(hash < cd->hash_size &&
-		cd->hash_table[hash]==NULL);
+		hlist_empty(&cd->hash_table[hash]));
 	if (hash >= cd->hash_size)
 		return NULL;
 	*pos = n+1;
-	return cd->hash_table[hash];
+	return hlist_entry_safe(cd->hash_table[hash].first,
+				struct cache_head, cache_list);
 }
 EXPORT_SYMBOL_GPL(cache_seq_start);
 
@@ -1308,23 +1304,25 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
 
 	if (p == SEQ_START_TOKEN)
 		hash = 0;
-	else if (ch->next == NULL) {
+	else if (ch->cache_list.next == NULL) {
 		hash++;
 		*pos += 1LL<<32;
 	} else {
 		++*pos;
-		return ch->next;
+		return hlist_entry_safe(ch->cache_list.next,
+					struct cache_head, cache_list);
 	}
 	*pos &= ~((1LL<<32) - 1);
 	while (hash < cd->hash_size &&
-	       cd->hash_table[hash] == NULL) {
+	       hlist_empty(&cd->hash_table[hash])) {
 		hash++;
 		*pos += 1LL<<32;
 	}
 	if (hash >= cd->hash_size)
 		return NULL;
 	++*pos;
-	return cd->hash_table[hash];
+	return hlist_entry_safe(cd->hash_table[hash].first,
+				struct cache_head, cache_list);
 }
 EXPORT_SYMBOL_GPL(cache_seq_next);
 
@@ -1666,17 +1664,21 @@ EXPORT_SYMBOL_GPL(cache_unregister_net);
 struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net)
 {
 	struct cache_detail *cd;
+	int i;
 
 	cd = kmemdup(tmpl, sizeof(struct cache_detail), GFP_KERNEL);
 	if (cd == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	cd->hash_table = kzalloc(cd->hash_size * sizeof(struct cache_head *),
+	cd->hash_table = kzalloc(cd->hash_size * sizeof(struct hlist_head),
 				 GFP_KERNEL);
 	if (cd->hash_table == NULL) {
 		kfree(cd);
 		return ERR_PTR(-ENOMEM);
 	}
+
+	for (i = 0; i < cd->hash_size; i++)
+		INIT_HLIST_HEAD(&cd->hash_table[i]);
 	cd->net = net;
 	return cd;
 }
-- 
cgit v1.2.3-70-g09d2


From c87fb4a378f93f114b9906e180d83877cee4e7f4 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 6 Aug 2015 12:47:02 -0400
Subject: lockd: NLM grace period shouldn't block NFSv4 opens

NLM locks don't conflict with NFSv4 share reservations, so we're not
going to learn anything new by watiting for them.

They do conflict with NFSv4 locks and with delegations.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc.c        |  1 +
 fs/nfs_common/grace.c | 23 +++++++++++++++++++++--
 fs/nfsd/nfs4proc.c    |  8 ++++----
 fs/nfsd/nfs4state.c   |  8 +++++---
 include/linux/fs.h    |  6 ++++++
 5 files changed, 37 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 530914b5c455..d678bcc3cbcb 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -591,6 +591,7 @@ static int lockd_init_net(struct net *net)
 
 	INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
 	INIT_LIST_HEAD(&ln->lockd_manager.list);
+	ln->lockd_manager.block_opens = false;
 	spin_lock_init(&ln->nsm_clnt_lock);
 	return 0;
 }
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index ae6e58ea4de5..fd8c9a5bcac4 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -63,14 +63,33 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
  * lock reclaims.
  */
 int
-locks_in_grace(struct net *net)
+__state_in_grace(struct net *net, bool open)
 {
 	struct list_head *grace_list = net_generic(net, grace_net_id);
+	struct lock_manager *lm;
 
-	return !list_empty(grace_list);
+	if (!open)
+		return !list_empty(grace_list);
+
+	list_for_each_entry(lm, grace_list, list) {
+		if (lm->block_opens)
+			return true;
+	}
+	return false;
+}
+
+int locks_in_grace(struct net *net)
+{
+	return __state_in_grace(net, 0);
 }
 EXPORT_SYMBOL_GPL(locks_in_grace);
 
+int opens_in_grace(struct net *net)
+{
+	return __state_in_grace(net, 1);
+}
+EXPORT_SYMBOL_GPL(opens_in_grace);
+
 static int __net_init
 grace_init_net(struct net *net)
 {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e779d7db24b0..b9681ee0ed19 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -415,10 +415,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/* Openowner is now set, so sequence id will get bumped.  Now we need
 	 * these checks before we do any creates: */
 	status = nfserr_grace;
-	if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+	if (opens_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
 		goto out;
 	status = nfserr_no_grace;
-	if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+	if (!opens_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
 		goto out;
 
 	switch (open->op_claim_type) {
@@ -827,7 +827,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	__be32 status;
 
-	if (locks_in_grace(SVC_NET(rqstp)))
+	if (opens_in_grace(SVC_NET(rqstp)))
 		return nfserr_grace;
 	status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
 			     remove->rm_name, remove->rm_namelen);
@@ -846,7 +846,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (!cstate->save_fh.fh_dentry)
 		return status;
-	if (locks_in_grace(SVC_NET(rqstp)) &&
+	if (opens_in_grace(SVC_NET(rqstp)) &&
 		!(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK))
 		return nfserr_grace;
 	status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 43903abd6189..c0c47a878cc6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4065,7 +4065,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
 		case NFS4_OPEN_CLAIM_FH:
 			/*
 			 * Let's not give out any delegations till everyone's
-			 * had the chance to reclaim theirs....
+			 * had the chance to reclaim theirs, *and* until
+			 * NLM locks have all been reclaimed:
 			 */
 			if (locks_in_grace(clp->net))
 				goto out_no_deleg;
@@ -4440,7 +4441,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid,
 {
 	if (ONE_STATEID(stateid) && (flags & RD_STATE))
 		return nfs_ok;
-	else if (locks_in_grace(net)) {
+	else if (opens_in_grace(net)) {
 		/* Answer in remaining cases depends on existence of
 		 * conflicting state; so we must wait out the grace period. */
 		return nfserr_grace;
@@ -4459,7 +4460,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid,
 static inline int
 grace_disallows_io(struct net *net, struct inode *inode)
 {
-	return locks_in_grace(net) && mandatory_lock(inode);
+	return opens_in_grace(net) && mandatory_lock(inode);
 }
 
 /* Returns true iff a is later than b: */
@@ -6578,6 +6579,7 @@ nfs4_state_start_net(struct net *net)
 		return ret;
 	nn->boot_time = get_seconds();
 	nn->grace_ended = false;
+	nn->nfsd4_manager.block_opens = true;
 	locks_start_grace(net, &nn->nfsd4_manager);
 	nfsd4_client_tracking_init(net);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc008c338f5a..9a9d314f7b27 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -942,12 +942,18 @@ struct lock_manager_operations {
 
 struct lock_manager {
 	struct list_head list;
+	/*
+	 * NFSv4 and up also want opens blocked during the grace period;
+	 * NLM doesn't care:
+	 */
+	bool block_opens;
 };
 
 struct net;
 void locks_start_grace(struct net *, struct lock_manager *);
 void locks_end_grace(struct lock_manager *);
 int locks_in_grace(struct net *);
+int opens_in_grace(struct net *);
 
 /* that will die - we need it for nfs_lock_info */
 #include <linux/nfs_fs_i.h>
-- 
cgit v1.2.3-70-g09d2


From 41609892701e26724b8617201f43254cadf2e7ae Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 12 Aug 2015 15:59:45 +0530
Subject: blk-cgroup: Drop unlikely before IS_ERR(_OR_NULL)

IS_ERR(_OR_NULL) already contain an 'unlikely' compiler flag and there
is no need to do that again from its callers. Drop it.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 1b62d768c7df..a4cd1641e9e2 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -374,7 +374,7 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
 	 * root_rl in such cases.
 	 */
 	blkg = blkg_lookup_create(blkcg, q);
-	if (unlikely(IS_ERR(blkg)))
+	if (IS_ERR(blkg))
 		goto root_rl;
 
 	blkg_get(blkg);
-- 
cgit v1.2.3-70-g09d2


From 54efd50bfd873e2dbf784e0b21a8027ba4299a3e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Thu, 23 Apr 2015 22:37:18 -0700
Subject: block: make generic_make_request handle arbitrarily sized bios

The way the block layer is currently written, it goes to great lengths
to avoid having to split bios; upper layer code (such as bio_add_page())
checks what the underlying device can handle and tries to always create
bios that don't need to be split.

But this approach becomes unwieldy and eventually breaks down with
stacked devices and devices with dynamic limits, and it adds a lot of
complexity. If the block layer could split bios as needed, we could
eliminate a lot of complexity elsewhere - particularly in stacked
drivers. Code that creates bios can then create whatever size bios are
convenient, and more importantly stacked drivers don't have to deal with
both their own bio size limitations and the limitations of the
(potentially multiple) devices underneath them.  In the future this will
let us delete merge_bvec_fn and a bunch of other code.

We do this by adding calls to blk_queue_split() to the various
make_request functions that need it - a few can already handle arbitrary
size bios. Note that we add the call _after_ any call to
blk_queue_bounce(); this means that blk_queue_split() and
blk_recalc_rq_segments() don't need to be concerned with bouncing
affecting segment merging.

Some make_request_fn() callbacks were simple enough to audit and verify
they don't need blk_queue_split() calls. The skipped ones are:

 * nfhd_make_request (arch/m68k/emu/nfblock.c)
 * axon_ram_make_request (arch/powerpc/sysdev/axonram.c)
 * simdisk_make_request (arch/xtensa/platforms/iss/simdisk.c)
 * brd_make_request (ramdisk - drivers/block/brd.c)
 * mtip_submit_request (drivers/block/mtip32xx/mtip32xx.c)
 * loop_make_request
 * null_queue_bio
 * bcache's make_request fns

Some others are almost certainly safe to remove now, but will be left
for future patches.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ming Lei <ming.lei@canonical.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: dm-devel@redhat.com
Cc: Lars Ellenberg <drbd-dev@lists.linbit.com>
Cc: drbd-user@lists.linbit.com
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Jim Paris <jim@jtan.com>
Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Oleg Drokin <oleg.drokin@intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Acked-by: NeilBrown <neilb@suse.de> (for the 'md/md.c' bits)
Acked-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
[dpark: skip more mq-based drivers, resolve merge conflicts, etc.]
Signed-off-by: Dongsu Park <dpark@posteo.net>
Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c                            |  19 ++--
 block/blk-merge.c                           | 159 ++++++++++++++++++++++++++--
 block/blk-mq.c                              |   4 +
 block/blk-sysfs.c                           |   3 +
 drivers/block/drbd/drbd_req.c               |   2 +
 drivers/block/pktcdvd.c                     |   6 +-
 drivers/block/ps3vram.c                     |   2 +
 drivers/block/rsxx/dev.c                    |   2 +
 drivers/block/umem.c                        |   2 +
 drivers/block/zram/zram_drv.c               |   2 +
 drivers/md/dm.c                             |   2 +
 drivers/md/md.c                             |   2 +
 drivers/s390/block/dcssblk.c                |   2 +
 drivers/s390/block/xpram.c                  |   2 +
 drivers/staging/lustre/lustre/llite/lloop.c |   2 +
 include/linux/blkdev.h                      |   3 +
 16 files changed, 192 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index d1796b54e97a..60912e983f16 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -643,6 +643,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
+	q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!q->bio_split)
+		goto fail_id;
+
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
@@ -651,7 +655,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	err = bdi_init(&q->backing_dev_info);
 	if (err)
-		goto fail_id;
+		goto fail_split;
 
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
@@ -693,6 +697,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 fail_bdi:
 	bdi_destroy(&q->backing_dev_info);
+fail_split:
+	bioset_free(q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
 fail_q:
@@ -1610,6 +1616,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	struct request *req;
 	unsigned int request_count = 0;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	/*
 	 * low level driver can indicate that it wants pages above a
 	 * certain limit bounced to low memory (ie for highmem, or even
@@ -1832,15 +1840,6 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
-	if (likely(bio_is_rw(bio) &&
-		   nr_sectors > queue_max_hw_sectors(q))) {
-		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-		       bdevname(bio->bi_bdev, b),
-		       bio_sectors(bio),
-		       queue_max_hw_sectors(q));
-		goto end_io;
-	}
-
 	part = bio->bi_bdev->bd_part;
 	if (should_fail_request(part, bio->bi_iter.bi_size) ||
 	    should_fail_request(&part_to_disk(part)->part0,
diff --git a/block/blk-merge.c b/block/blk-merge.c
index a455b9860143..d9c3a75e4a60 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,12 +9,158 @@
 
 #include "blk.h"
 
+static struct bio *blk_bio_discard_split(struct request_queue *q,
+					 struct bio *bio,
+					 struct bio_set *bs)
+{
+	unsigned int max_discard_sectors, granularity;
+	int alignment;
+	sector_t tmp;
+	unsigned split_sectors;
+
+	/* Zero-sector (unknown) and one-sector granularities are the same.  */
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
+
+	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	max_discard_sectors -= max_discard_sectors % granularity;
+
+	if (unlikely(!max_discard_sectors)) {
+		/* XXX: warn */
+		return NULL;
+	}
+
+	if (bio_sectors(bio) <= max_discard_sectors)
+		return NULL;
+
+	split_sectors = max_discard_sectors;
+
+	/*
+	 * If the next starting sector would be misaligned, stop the discard at
+	 * the previous aligned sector.
+	 */
+	alignment = (q->limits.discard_alignment >> 9) % granularity;
+
+	tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
+	tmp = sector_div(tmp, granularity);
+
+	if (split_sectors > tmp)
+		split_sectors -= tmp;
+
+	return bio_split(bio, split_sectors, GFP_NOIO, bs);
+}
+
+static struct bio *blk_bio_write_same_split(struct request_queue *q,
+					    struct bio *bio,
+					    struct bio_set *bs)
+{
+	if (!q->limits.max_write_same_sectors)
+		return NULL;
+
+	if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
+		return NULL;
+
+	return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
+}
+
+static struct bio *blk_bio_segment_split(struct request_queue *q,
+					 struct bio *bio,
+					 struct bio_set *bs)
+{
+	struct bio *split;
+	struct bio_vec bv, bvprv;
+	struct bvec_iter iter;
+	unsigned seg_size = 0, nsegs = 0;
+	int prev = 0;
+
+	struct bvec_merge_data bvm = {
+		.bi_bdev	= bio->bi_bdev,
+		.bi_sector	= bio->bi_iter.bi_sector,
+		.bi_size	= 0,
+		.bi_rw		= bio->bi_rw,
+	};
+
+	bio_for_each_segment(bv, bio, iter) {
+		if (q->merge_bvec_fn &&
+		    q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
+			goto split;
+
+		bvm.bi_size += bv.bv_len;
+
+		if (bvm.bi_size >> 9 > queue_max_sectors(q))
+			goto split;
+
+		/*
+		 * If the queue doesn't support SG gaps and adding this
+		 * offset would create a gap, disallow it.
+		 */
+		if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) &&
+		    prev && bvec_gap_to_prev(&bvprv, bv.bv_offset))
+			goto split;
+
+		if (prev && blk_queue_cluster(q)) {
+			if (seg_size + bv.bv_len > queue_max_segment_size(q))
+				goto new_segment;
+			if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
+				goto new_segment;
+			if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
+				goto new_segment;
+
+			seg_size += bv.bv_len;
+			bvprv = bv;
+			prev = 1;
+			continue;
+		}
+new_segment:
+		if (nsegs == queue_max_segments(q))
+			goto split;
+
+		nsegs++;
+		bvprv = bv;
+		prev = 1;
+		seg_size = bv.bv_len;
+	}
+
+	return NULL;
+split:
+	split = bio_clone_bioset(bio, GFP_NOIO, bs);
+
+	split->bi_iter.bi_size -= iter.bi_size;
+	bio->bi_iter = iter;
+
+	if (bio_integrity(bio)) {
+		bio_integrity_advance(bio, split->bi_iter.bi_size);
+		bio_integrity_trim(split, 0, bio_sectors(split));
+	}
+
+	return split;
+}
+
+void blk_queue_split(struct request_queue *q, struct bio **bio,
+		     struct bio_set *bs)
+{
+	struct bio *split;
+
+	if ((*bio)->bi_rw & REQ_DISCARD)
+		split = blk_bio_discard_split(q, *bio, bs);
+	else if ((*bio)->bi_rw & REQ_WRITE_SAME)
+		split = blk_bio_write_same_split(q, *bio, bs);
+	else
+		split = blk_bio_segment_split(q, *bio, q->bio_split);
+
+	if (split) {
+		bio_chain(split, *bio);
+		generic_make_request(*bio);
+		*bio = split;
+	}
+}
+EXPORT_SYMBOL(blk_queue_split);
+
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 					     struct bio *bio,
 					     bool no_sg_merge)
 {
 	struct bio_vec bv, bvprv = { NULL };
-	int cluster, high, highprv = 1;
+	int cluster, prev = 0;
 	unsigned int seg_size, nr_phys_segs;
 	struct bio *fbio, *bbio;
 	struct bvec_iter iter;
@@ -36,7 +182,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	cluster = blk_queue_cluster(q);
 	seg_size = 0;
 	nr_phys_segs = 0;
-	high = 0;
 	for_each_bio(bio) {
 		bio_for_each_segment(bv, bio, iter) {
 			/*
@@ -46,13 +191,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 			if (no_sg_merge)
 				goto new_segment;
 
-			/*
-			 * the trick here is making sure that a high page is
-			 * never considered part of another segment, since
-			 * that might change with the bounce page.
-			 */
-			high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
-			if (!high && !highprv && cluster) {
+			if (prev && cluster) {
 				if (seg_size + bv.bv_len
 				    > queue_max_segment_size(q))
 					goto new_segment;
@@ -72,8 +211,8 @@ new_segment:
 
 			nr_phys_segs++;
 			bvprv = bv;
+			prev = 1;
 			seg_size = bv.bv_len;
-			highprv = high;
 		}
 		bbio = bio;
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 94559025c5e6..81edbd95bda8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1287,6 +1287,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return;
@@ -1372,6 +1374,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b1f34e463c0f..3e44a9da2a13 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -561,6 +561,9 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_trace_shutdown(q);
 
+	if (q->bio_split)
+		bioset_free(q->bio_split);
+
 	ida_simple_remove(&blk_queue_ida, q->id);
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 9cb41166366e..923c857b395b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1499,6 +1499,8 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	struct drbd_device *device = (struct drbd_device *) q->queuedata;
 	unsigned long start_jif;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	start_jif = jiffies;
 
 	/*
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index a7a259e031da..ee7ad5e44632 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2447,6 +2447,10 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
 	char b[BDEVNAME_SIZE];
 	struct bio *split;
 
+	blk_queue_bounce(q, &bio);
+
+	blk_queue_split(q, &bio, q->bio_split);
+
 	pd = q->queuedata;
 	if (!pd) {
 		pr_err("%s incorrect request queue\n",
@@ -2477,8 +2481,6 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
 		goto end_io;
 	}
 
-	blk_queue_bounce(q, &bio);
-
 	do {
 		sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
 		sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 49b4706b162c..d89fcac59515 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -606,6 +606,8 @@ static void ps3vram_make_request(struct request_queue *q, struct bio *bio)
 
 	dev_dbg(&dev->core, "%s\n", __func__);
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	spin_lock_irq(&priv->lock);
 	busy = !bio_list_empty(&priv->list);
 	bio_list_add(&priv->list, bio);
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 63b9d2ffa8ee..3163e4cdc2cc 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -151,6 +151,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
 	struct rsxx_bio_meta *bio_meta;
 	int st = -EINVAL;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	might_sleep();
 
 	if (!card)
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 3b3afd2ec5d6..04d65790a886 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -531,6 +531,8 @@ static void mm_make_request(struct request_queue *q, struct bio *bio)
 		 (unsigned long long)bio->bi_iter.bi_sector,
 		 bio->bi_iter.bi_size);
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	spin_lock_irq(&card->lock);
 	*card->biotail = bio;
 	bio->bi_next = NULL;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 68c3d4800464..aec781acee9d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -900,6 +900,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
 	if (unlikely(!zram_meta_get(zram)))
 		goto error;
 
+	blk_queue_split(queue, &bio, queue->bio_split);
+
 	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
 					bio->bi_iter.bi_size)) {
 		atomic64_inc(&zram->stats.invalid_io);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7f367fcace03..069f8d7e890e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1799,6 +1799,8 @@ static void dm_make_request(struct request_queue *q, struct bio *bio)
 
 	map = dm_get_live_table(md, &srcu_idx);
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
 
 	/* if we're suspended, we have to queue this io for later */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ac4381a6625c..e1d8723720cc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -257,6 +257,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 	unsigned int sectors;
 	int cpu;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if (mddev == NULL || mddev->pers == NULL
 	    || !mddev->ready) {
 		bio_io_error(bio);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 8bcb822b0bac..29ea2394c896 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -826,6 +826,8 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)
 	unsigned long source_addr;
 	unsigned long bytes_done;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	bytes_done = 0;
 	dev_info = bio->bi_bdev->bd_disk->private_data;
 	if (dev_info == NULL)
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index 93856b9b6214..02871f1db562 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -190,6 +190,8 @@ static void xpram_make_request(struct request_queue *q, struct bio *bio)
 	unsigned long page_addr;
 	unsigned long bytes;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if ((bio->bi_iter.bi_sector & 7) != 0 ||
 	    (bio->bi_iter.bi_size & 4095) != 0)
 		/* Request is not page-aligned. */
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
index cc00fd10fbcf..1e33d540b223 100644
--- a/drivers/staging/lustre/lustre/llite/lloop.c
+++ b/drivers/staging/lustre/lustre/llite/lloop.c
@@ -340,6 +340,8 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio)
 	int rw = bio_rw(old_bio);
 	int inactive;
 
+	blk_queue_split(q, &old_bio, q->bio_split);
+
 	if (!lo)
 		goto err;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 243f29e779ec..ca778d9c7d81 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -463,6 +463,7 @@ struct request_queue {
 
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
+	struct bio_set		*bio_split;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
@@ -783,6 +784,8 @@ extern void blk_rq_unprep_clone(struct request *rq);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
+extern void blk_queue_split(struct request_queue *, struct bio **,
+			    struct bio_set *);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
 extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,
-- 
cgit v1.2.3-70-g09d2


From 8ae126660fddbeebb9251a174e6fa45b6ad8f932 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 27 Apr 2015 23:48:34 -0700
Subject: block: kill merge_bvec_fn() completely

As generic_make_request() is now able to handle arbitrarily sized bios,
it's no longer necessary for each individual block driver to define its
own ->merge_bvec_fn() callback. Remove every invocation completely.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Lars Ellenberg <drbd-dev@lists.linbit.com>
Cc: drbd-user@lists.linbit.com
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Yehuda Sadeh <yehuda@inktank.com>
Cc: Sage Weil <sage@inktank.com>
Cc: Alex Elder <elder@kernel.org>
Cc: ceph-devel@vger.kernel.org
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: dm-devel@redhat.com
Cc: Neil Brown <neilb@suse.de>
Cc: linux-raid@vger.kernel.org
Cc: Christoph Hellwig <hch@infradead.org>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Acked-by: NeilBrown <neilb@suse.de> (for the 'md' bits)
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
[dpark: also remove ->merge_bvec_fn() in dm-thin as well as
 dm-era-target, and resolve merge conflicts]
Signed-off-by: Dongsu Park <dpark@posteo.net>
Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c              |  17 +-----
 block/blk-settings.c           |  22 -------
 drivers/block/drbd/drbd_int.h  |   1 -
 drivers/block/drbd/drbd_main.c |   1 -
 drivers/block/drbd/drbd_req.c  |  35 ------------
 drivers/block/pktcdvd.c        |  21 -------
 drivers/block/rbd.c            |  47 ---------------
 drivers/md/dm-cache-target.c   |  21 -------
 drivers/md/dm-crypt.c          |  16 ------
 drivers/md/dm-era-target.c     |  15 -----
 drivers/md/dm-flakey.c         |  16 ------
 drivers/md/dm-linear.c         |  16 ------
 drivers/md/dm-log-writes.c     |  16 ------
 drivers/md/dm-raid.c           |  19 ------
 drivers/md/dm-snap.c           |  15 -----
 drivers/md/dm-stripe.c         |  21 -------
 drivers/md/dm-table.c          |   8 ---
 drivers/md/dm-thin.c           |  31 ----------
 drivers/md/dm-verity.c         |  16 ------
 drivers/md/dm.c                | 127 +----------------------------------------
 drivers/md/dm.h                |   2 -
 drivers/md/linear.c            |  43 --------------
 drivers/md/md.c                |  26 ---------
 drivers/md/md.h                |  12 ----
 drivers/md/multipath.c         |  21 -------
 drivers/md/raid0.c             |  56 ------------------
 drivers/md/raid0.h             |   2 -
 drivers/md/raid1.c             |  58 +------------------
 drivers/md/raid10.c            | 121 +--------------------------------------
 drivers/md/raid5.c             |  32 -----------
 include/linux/blkdev.h         |  10 ----
 include/linux/device-mapper.h  |   4 --
 32 files changed, 9 insertions(+), 859 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index d9c3a75e4a60..0027def35f5a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -69,24 +69,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	struct bio *split;
 	struct bio_vec bv, bvprv;
 	struct bvec_iter iter;
-	unsigned seg_size = 0, nsegs = 0;
+	unsigned seg_size = 0, nsegs = 0, sectors = 0;
 	int prev = 0;
 
-	struct bvec_merge_data bvm = {
-		.bi_bdev	= bio->bi_bdev,
-		.bi_sector	= bio->bi_iter.bi_sector,
-		.bi_size	= 0,
-		.bi_rw		= bio->bi_rw,
-	};
-
 	bio_for_each_segment(bv, bio, iter) {
-		if (q->merge_bvec_fn &&
-		    q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
-			goto split;
-
-		bvm.bi_size += bv.bv_len;
+		sectors += bv.bv_len >> 9;
 
-		if (bvm.bi_size >> 9 > queue_max_sectors(q))
+		if (sectors > queue_max_sectors(q))
 			goto split;
 
 		/*
diff --git a/block/blk-settings.c b/block/blk-settings.c
index b38d8d723276..9df73991b231 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -53,28 +53,6 @@ void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
 }
 EXPORT_SYMBOL(blk_queue_unprep_rq);
 
-/**
- * blk_queue_merge_bvec - set a merge_bvec function for queue
- * @q:		queue
- * @mbfn:	merge_bvec_fn
- *
- * Usually queues have static limitations on the max sectors or segments that
- * we can put in a request. Stacking drivers may have some settings that
- * are dynamic, and thus we have to query the queue whether it is ok to
- * add a new bio_vec to a bio at a given offset or not. If the block device
- * has such limitations, it needs to register a merge_bvec_fn to control
- * the size of bio's sent to it. Note that a block device *must* allow a
- * single page to be added to an empty bio. The block device driver may want
- * to use the bio_split() function to deal with these bio's. By default
- * no merge_bvec_fn is defined for a queue, and only the fixed limits are
- * honored.
- */
-void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
-{
-	q->merge_bvec_fn = mbfn;
-}
-EXPORT_SYMBOL(blk_queue_merge_bvec);
-
 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
 {
 	q->softirq_done_fn = fn;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a08c4a9179f1..015c6e91b756 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1450,7 +1450,6 @@ extern void do_submit(struct work_struct *ws);
 extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long);
 extern void drbd_make_request(struct request_queue *q, struct bio *bio);
 extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
-extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
 extern int is_valid_ar_handle(struct drbd_request *, sector_t);
 
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a1518539b858..74d97f4bac34 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2774,7 +2774,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 	   This triggers a max_bio_size message upon first attach or connect */
 	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
-	blk_queue_merge_bvec(q, drbd_merge_bvec);
 	q->queue_lock = &resource->req_lock;
 
 	device->md_io.page = alloc_page(GFP_KERNEL);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 923c857b395b..211592682169 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1512,41 +1512,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	__drbd_make_request(device, bio, start_jif);
 }
 
-/* This is called by bio_add_page().
- *
- * q->max_hw_sectors and other global limits are already enforced there.
- *
- * We need to call down to our lower level device,
- * in case it has special restrictions.
- *
- * We also may need to enforce configured max-bio-bvecs limits.
- *
- * As long as the BIO is empty we have to allow at least one bvec,
- * regardless of size and offset, so no need to ask lower levels.
- */
-int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
-{
-	struct drbd_device *device = (struct drbd_device *) q->queuedata;
-	unsigned int bio_size = bvm->bi_size;
-	int limit = DRBD_MAX_BIO_SIZE;
-	int backing_limit;
-
-	if (bio_size && get_ldev(device)) {
-		unsigned int max_hw_sectors = queue_max_hw_sectors(q);
-		struct request_queue * const b =
-			device->ldev->backing_bdev->bd_disk->queue;
-		if (b->merge_bvec_fn) {
-			bvm->bi_bdev = device->ldev->backing_bdev;
-			backing_limit = b->merge_bvec_fn(b, bvm, bvec);
-			limit = min(limit, backing_limit);
-		}
-		put_ldev(device);
-		if ((limit >> 9) > max_hw_sectors)
-			limit = max_hw_sectors << 9;
-	}
-	return limit;
-}
-
 void request_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index ee7ad5e44632..7be2375db7f2 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2506,26 +2506,6 @@ end_io:
 
 
-static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
-			  struct bio_vec *bvec)
-{
-	struct pktcdvd_device *pd = q->queuedata;
-	sector_t zone = get_zone(bmd->bi_sector, pd);
-	int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size;
-	int remaining = (pd->settings.size << 9) - used;
-	int remaining2;
-
-	/*
-	 * A bio <= PAGE_SIZE must be allowed. If it crosses a packet
-	 * boundary, pkt_make_request() will split the bio.
-	 */
-	remaining2 = PAGE_SIZE - bmd->bi_size;
-	remaining = max(remaining, remaining2);
-
-	BUG_ON(remaining < 0);
-	return remaining;
-}
-
 static void pkt_init_queue(struct pktcdvd_device *pd)
 {
 	struct request_queue *q = pd->disk->queue;
@@ -2533,7 +2513,6 @@ static void pkt_init_queue(struct pktcdvd_device *pd)
 	blk_queue_make_request(q, pkt_make_request);
 	blk_queue_logical_block_size(q, CD_FRAMESIZE);
 	blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS);
-	blk_queue_merge_bvec(q, pkt_merge_bvec);
 	q->queuedata = pd;
 }
 
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index dcc86937f55c..71dd061a7e11 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3462,52 +3462,6 @@ static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-/*
- * a queue callback. Makes sure that we don't create a bio that spans across
- * multiple osd objects. One exception would be with a single page bios,
- * which we handle later at bio_chain_clone_range()
- */
-static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
-			  struct bio_vec *bvec)
-{
-	struct rbd_device *rbd_dev = q->queuedata;
-	sector_t sector_offset;
-	sector_t sectors_per_obj;
-	sector_t obj_sector_offset;
-	int ret;
-
-	/*
-	 * Find how far into its rbd object the partition-relative
-	 * bio start sector is to offset relative to the enclosing
-	 * device.
-	 */
-	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
-	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
-	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
-
-	/*
-	 * Compute the number of bytes from that offset to the end
-	 * of the object.  Account for what's already used by the bio.
-	 */
-	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
-	if (ret > bmd->bi_size)
-		ret -= bmd->bi_size;
-	else
-		ret = 0;
-
-	/*
-	 * Don't send back more than was asked for.  And if the bio
-	 * was empty, let the whole thing through because:  "Note
-	 * that a block device *must* allow a single page to be
-	 * added to an empty bio."
-	 */
-	rbd_assert(bvec->bv_len <= PAGE_SIZE);
-	if (ret > (int) bvec->bv_len || !bmd->bi_size)
-		ret = (int) bvec->bv_len;
-
-	return ret;
-}
-
 static void rbd_free_disk(struct rbd_device *rbd_dev)
 {
 	struct gendisk *disk = rbd_dev->disk;
@@ -3806,7 +3760,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
 	q->limits.discard_zeroes_data = 1;
 
-	blk_queue_merge_bvec(q, rbd_merge_bvec);
 	disk->queue = q;
 
 	q->queuedata = rbd_dev;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 04d0dadc48b1..d2b5dfbb30cf 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -3771,26 +3771,6 @@ static int cache_iterate_devices(struct dm_target *ti,
 	return r;
 }
 
-/*
- * We assume I/O is going to the origin (which is the volume
- * more likely to have restrictions e.g. by being striped).
- * (Looking up the exact location of the data would be expensive
- * and could always be out of date by the time the bio is submitted.)
- */
-static int cache_bvec_merge(struct dm_target *ti,
-			    struct bvec_merge_data *bvm,
-			    struct bio_vec *biovec, int max_size)
-{
-	struct cache *cache = ti->private;
-	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = cache->origin_dev->bdev;
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
 {
 	/*
@@ -3834,7 +3814,6 @@ static struct target_type cache_target = {
 	.status = cache_status,
 	.message = cache_message,
 	.iterate_devices = cache_iterate_devices,
-	.merge = cache_bvec_merge,
 	.io_hints = cache_io_hints,
 };
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 59da573cf994..ba5c2105f4e6 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2035,21 +2035,6 @@ error:
 	return -EINVAL;
 }
 
-static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-		       struct bio_vec *biovec, int max_size)
-{
-	struct crypt_config *cc = ti->private;
-	struct request_queue *q = bdev_get_queue(cc->dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = cc->dev->bdev;
-	bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector);
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int crypt_iterate_devices(struct dm_target *ti,
 				 iterate_devices_callout_fn fn, void *data)
 {
@@ -2070,7 +2055,6 @@ static struct target_type crypt_target = {
 	.preresume = crypt_preresume,
 	.resume = crypt_resume,
 	.message = crypt_message,
-	.merge  = crypt_merge,
 	.iterate_devices = crypt_iterate_devices,
 };
 
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index ad913cd4aded..0119ebfb3d49 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1673,20 +1673,6 @@ static int era_iterate_devices(struct dm_target *ti,
 	return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data);
 }
 
-static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-		     struct bio_vec *biovec, int max_size)
-{
-	struct era *era = ti->private;
-	struct request_queue *q = bdev_get_queue(era->origin_dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = era->origin_dev->bdev;
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct era *era = ti->private;
@@ -1717,7 +1703,6 @@ static struct target_type era_target = {
 	.status = era_status,
 	.message = era_message,
 	.iterate_devices = era_iterate_devices,
-	.merge = era_merge,
 	.io_hints = era_io_hints
 };
 
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 04481247aab8..afab13bd683e 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -387,21 +387,6 @@ static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long ar
 	return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
 }
 
-static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-			struct bio_vec *biovec, int max_size)
-{
-	struct flakey_c *fc = ti->private;
-	struct request_queue *q = bdev_get_queue(fc->dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = fc->dev->bdev;
-	bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector);
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
 {
 	struct flakey_c *fc = ti->private;
@@ -419,7 +404,6 @@ static struct target_type flakey_target = {
 	.end_io = flakey_end_io,
 	.status = flakey_status,
 	.ioctl	= flakey_ioctl,
-	.merge	= flakey_merge,
 	.iterate_devices = flakey_iterate_devices,
 };
 
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 53e848c10939..7dd5fc8e3eea 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -130,21 +130,6 @@ static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
 	return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
 }
 
-static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-			struct bio_vec *biovec, int max_size)
-{
-	struct linear_c *lc = ti->private;
-	struct request_queue *q = bdev_get_queue(lc->dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = lc->dev->bdev;
-	bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector);
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int linear_iterate_devices(struct dm_target *ti,
 				  iterate_devices_callout_fn fn, void *data)
 {
@@ -162,7 +147,6 @@ static struct target_type linear_target = {
 	.map    = linear_map,
 	.status = linear_status,
 	.ioctl  = linear_ioctl,
-	.merge  = linear_merge,
 	.iterate_devices = linear_iterate_devices,
 };
 
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index e9d17488d5e3..316cc3fb741f 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -725,21 +725,6 @@ static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
 	return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
 }
 
-static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-			    struct bio_vec *biovec, int max_size)
-{
-	struct log_writes_c *lc = ti->private;
-	struct request_queue *q = bdev_get_queue(lc->dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = lc->dev->bdev;
-	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int log_writes_iterate_devices(struct dm_target *ti,
 				      iterate_devices_callout_fn fn,
 				      void *data)
@@ -793,7 +778,6 @@ static struct target_type log_writes_target = {
 	.end_io = normal_end_io,
 	.status = log_writes_status,
 	.ioctl	= log_writes_ioctl,
-	.merge	= log_writes_merge,
 	.message = log_writes_message,
 	.iterate_devices = log_writes_iterate_devices,
 	.io_hints = log_writes_io_hints,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2daa67793511..97e165183e79 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1717,24 +1717,6 @@ static void raid_resume(struct dm_target *ti)
 	mddev_resume(&rs->md);
 }
 
-static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-		      struct bio_vec *biovec, int max_size)
-{
-	struct raid_set *rs = ti->private;
-	struct md_personality *pers = rs->md.pers;
-
-	if (pers && pers->mergeable_bvec)
-		return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
-
-	/*
-	 * In case we can't request the personality because
-	 * the raid set is not running yet
-	 *
-	 * -> return safe minimum
-	 */
-	return rs->md.chunk_sectors;
-}
-
 static struct target_type raid_target = {
 	.name = "raid",
 	.version = {1, 7, 0},
@@ -1749,7 +1731,6 @@ static struct target_type raid_target = {
 	.presuspend = raid_presuspend,
 	.postsuspend = raid_postsuspend,
 	.resume = raid_resume,
-	.merge = raid_merge,
 };
 
 static int __init dm_raid_init(void)
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index dd8ca0bb0980..d10b6876018e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2330,20 +2330,6 @@ static void origin_status(struct dm_target *ti, status_type_t type,
 	}
 }
 
-static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-			struct bio_vec *biovec, int max_size)
-{
-	struct dm_origin *o = ti->private;
-	struct request_queue *q = bdev_get_queue(o->dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = o->dev->bdev;
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int origin_iterate_devices(struct dm_target *ti,
 				  iterate_devices_callout_fn fn, void *data)
 {
@@ -2362,7 +2348,6 @@ static struct target_type origin_target = {
 	.resume  = origin_resume,
 	.postsuspend = origin_postsuspend,
 	.status  = origin_status,
-	.merge	 = origin_merge,
 	.iterate_devices = origin_iterate_devices,
 };
 
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 4f94c7da82f6..484029db8cba 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -412,26 +412,6 @@ static void stripe_io_hints(struct dm_target *ti,
 	blk_limits_io_opt(limits, chunk_size * sc->stripes);
 }
 
-static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-			struct bio_vec *biovec, int max_size)
-{
-	struct stripe_c *sc = ti->private;
-	sector_t bvm_sector = bvm->bi_sector;
-	uint32_t stripe;
-	struct request_queue *q;
-
-	stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);
-
-	q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = sc->stripe[stripe].dev->bdev;
-	bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector;
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static struct target_type stripe_target = {
 	.name   = "striped",
 	.version = {1, 5, 1},
@@ -443,7 +423,6 @@ static struct target_type stripe_target = {
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
-	.merge  = stripe_merge,
 };
 
 int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 16ba55ad7089..afb4ad3dfeb3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -440,14 +440,6 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 		       q->limits.alignment_offset,
 		       (unsigned long long) start << SECTOR_SHIFT);
 
-	/*
-	 * Check if merge fn is supported.
-	 * If not we'll force DM to use PAGE_SIZE or
-	 * smaller I/O, just to be safe.
-	 */
-	if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
-		blk_limits_max_hw_sectors(limits,
-					  (unsigned int) (PAGE_SIZE >> 9));
 	return 0;
 }
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2ade2c46dca9..f352e4990998 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3845,20 +3845,6 @@ static int pool_iterate_devices(struct dm_target *ti,
 	return fn(ti, pt->data_dev, 0, ti->len, data);
 }
 
-static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-		      struct bio_vec *biovec, int max_size)
-{
-	struct pool_c *pt = ti->private;
-	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = pt->data_dev->bdev;
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct pool_c *pt = ti->private;
@@ -3935,7 +3921,6 @@ static struct target_type pool_target = {
 	.resume = pool_resume,
 	.message = pool_message,
 	.status = pool_status,
-	.merge = pool_merge,
 	.iterate_devices = pool_iterate_devices,
 	.io_hints = pool_io_hints,
 };
@@ -4262,21 +4247,6 @@ err:
 	DMEMIT("Error");
 }
 
-static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-		      struct bio_vec *biovec, int max_size)
-{
-	struct thin_c *tc = ti->private;
-	struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = tc->pool_dev->bdev;
-	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int thin_iterate_devices(struct dm_target *ti,
 				iterate_devices_callout_fn fn, void *data)
 {
@@ -4320,7 +4290,6 @@ static struct target_type thin_target = {
 	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
-	.merge = thin_merge,
 	.iterate_devices = thin_iterate_devices,
 	.io_hints = thin_io_hints,
 };
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 4b34df8fdb58..c137dcb147b8 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -649,21 +649,6 @@ static int verity_ioctl(struct dm_target *ti, unsigned cmd,
 				     cmd, arg);
 }
 
-static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-			struct bio_vec *biovec, int max_size)
-{
-	struct dm_verity *v = ti->private;
-	struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
-
-	if (!q->merge_bvec_fn)
-		return max_size;
-
-	bvm->bi_bdev = v->data_dev->bdev;
-	bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
-
-	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int verity_iterate_devices(struct dm_target *ti,
 				  iterate_devices_callout_fn fn, void *data)
 {
@@ -996,7 +981,6 @@ static struct target_type verity_target = {
 	.map		= verity_map,
 	.status		= verity_status,
 	.ioctl		= verity_ioctl,
-	.merge		= verity_merge,
 	.iterate_devices = verity_iterate_devices,
 	.io_hints	= verity_io_hints,
 };
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 069f8d7e890e..8bb1ebb6ca7b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -124,9 +124,8 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_MERGE_IS_OPTIONAL 6
-#define DMF_DEFERRED_REMOVE 7
-#define DMF_SUSPENDED_INTERNALLY 8
+#define DMF_DEFERRED_REMOVE 6
+#define DMF_SUSPENDED_INTERNALLY 7
 
 /*
  * A dummy definition to make RCU happy.
@@ -1725,67 +1724,6 @@ static void __split_and_process_bio(struct mapped_device *md,
  * CRUD END
  *---------------------------------------------------------------*/
 
-static int dm_merge_bvec(struct request_queue *q,
-			 struct bvec_merge_data *bvm,
-			 struct bio_vec *biovec)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table_fast(md);
-	struct dm_target *ti;
-	sector_t max_sectors, max_size = 0;
-
-	if (unlikely(!map))
-		goto out;
-
-	ti = dm_table_find_target(map, bvm->bi_sector);
-	if (!dm_target_is_valid(ti))
-		goto out;
-
-	/*
-	 * Find maximum amount of I/O that won't need splitting
-	 */
-	max_sectors = min(max_io_len(bvm->bi_sector, ti),
-			  (sector_t) queue_max_sectors(q));
-	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
-
-	/*
-	 * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t
-	 * to the targets' merge function since it holds sectors not bytes).
-	 * Just doing this as an interim fix for stable@ because the more
-	 * comprehensive cleanup of switching to sector_t will impact every
-	 * DM target that implements a ->merge hook.
-	 */
-	if (max_size > INT_MAX)
-		max_size = INT_MAX;
-
-	/*
-	 * merge_bvec_fn() returns number of bytes
-	 * it can accept at this offset
-	 * max is precomputed maximal io size
-	 */
-	if (max_size && ti->type->merge)
-		max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
-	/*
-	 * If the target doesn't support merge method and some of the devices
-	 * provided their merge_bvec method (we know this by looking for the
-	 * max_hw_sectors that dm_set_device_limits may set), then we can't
-	 * allow bios with multiple vector entries.  So always set max_size
-	 * to 0, and the code below allows just one page.
-	 */
-	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
-		max_size = 0;
-
-out:
-	dm_put_live_table_fast(md);
-	/*
-	 * Always allow an entire first page
-	 */
-	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
-		max_size = biovec->bv_len;
-
-	return max_size;
-}
-
 /*
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
@@ -2507,59 +2445,6 @@ static void __set_size(struct mapped_device *md, sector_t size)
 	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
 }
 
-/*
- * Return 1 if the queue has a compulsory merge_bvec_fn function.
- *
- * If this function returns 0, then the device is either a non-dm
- * device without a merge_bvec_fn, or it is a dm device that is
- * able to split any bios it receives that are too big.
- */
-int dm_queue_merge_is_compulsory(struct request_queue *q)
-{
-	struct mapped_device *dev_md;
-
-	if (!q->merge_bvec_fn)
-		return 0;
-
-	if (q->make_request_fn == dm_make_request) {
-		dev_md = q->queuedata;
-		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
-			return 0;
-	}
-
-	return 1;
-}
-
-static int dm_device_merge_is_compulsory(struct dm_target *ti,
-					 struct dm_dev *dev, sector_t start,
-					 sector_t len, void *data)
-{
-	struct block_device *bdev = dev->bdev;
-	struct request_queue *q = bdev_get_queue(bdev);
-
-	return dm_queue_merge_is_compulsory(q);
-}
-
-/*
- * Return 1 if it is acceptable to ignore merge_bvec_fn based
- * on the properties of the underlying devices.
- */
-static int dm_table_merge_is_optional(struct dm_table *table)
-{
-	unsigned i = 0;
-	struct dm_target *ti;
-
-	while (i < dm_table_get_num_targets(table)) {
-		ti = dm_table_get_target(table, i++);
-
-		if (ti->type->iterate_devices &&
-		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
-			return 0;
-	}
-
-	return 1;
-}
-
 /*
  * Returns old map, which caller must destroy.
  */
@@ -2569,7 +2454,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	struct dm_table *old_map;
 	struct request_queue *q = md->queue;
 	sector_t size;
-	int merge_is_optional;
 
 	size = dm_table_get_size(t);
 
@@ -2595,17 +2479,11 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
 	__bind_mempools(md, t);
 
-	merge_is_optional = dm_table_merge_is_optional(t);
-
 	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
 	rcu_assign_pointer(md->map, t);
 	md->immutable_target_type = dm_table_get_immutable_target_type(t);
 
 	dm_table_set_restrictions(t, q, limits);
-	if (merge_is_optional)
-		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
-	else
-		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
 	if (old_map)
 		dm_sync_table(md);
 
@@ -2886,7 +2764,6 @@ int dm_setup_md_queue(struct mapped_device *md)
 	case DM_TYPE_BIO_BASED:
 		dm_init_old_md_queue(md);
 		blk_queue_make_request(md->queue, dm_make_request);
-		blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 		break;
 	}
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 4e984993d40a..7edcf97dfa5a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -78,8 +78,6 @@ bool dm_table_mq_request_based(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
-int dm_queue_merge_is_compulsory(struct request_queue *q);
-
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
 void dm_set_md_type(struct mapped_device *md, unsigned type);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index aefd66142eef..b7fe7e9fc777 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -52,48 +52,6 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
 	return conf->disks + lo;
 }
 
-/**
- *	linear_mergeable_bvec -- tell bio layer if two requests can be merged
- *	@q: request queue
- *	@bvm: properties of new bio
- *	@biovec: the request that could be merged to it.
- *
- *	Return amount of bytes we can take at this offset
- */
-static int linear_mergeable_bvec(struct mddev *mddev,
-				 struct bvec_merge_data *bvm,
-				 struct bio_vec *biovec)
-{
-	struct dev_info *dev0;
-	unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
-	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
-	int maxbytes = biovec->bv_len;
-	struct request_queue *subq;
-
-	dev0 = which_dev(mddev, sector);
-	maxsectors = dev0->end_sector - sector;
-	subq = bdev_get_queue(dev0->rdev->bdev);
-	if (subq->merge_bvec_fn) {
-		bvm->bi_bdev = dev0->rdev->bdev;
-		bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
-		maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
-							     biovec));
-	}
-
-	if (maxsectors < bio_sectors)
-		maxsectors = 0;
-	else
-		maxsectors -= bio_sectors;
-
-	if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
-		return maxbytes;
-
-	if (maxsectors > (maxbytes >> 9))
-		return maxbytes;
-	else
-		return maxsectors << 9;
-}
-
 static int linear_congested(struct mddev *mddev, int bits)
 {
 	struct linear_conf *conf;
@@ -338,7 +296,6 @@ static struct md_personality linear_personality =
 	.size		= linear_size,
 	.quiesce	= linear_quiesce,
 	.congested	= linear_congested,
-	.mergeable_bvec	= linear_mergeable_bvec,
 };
 
 static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e1d8723720cc..d28bf5cea224 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -354,29 +354,6 @@ static int md_congested(void *data, int bits)
 	return mddev_congested(mddev, bits);
 }
 
-static int md_mergeable_bvec(struct request_queue *q,
-			     struct bvec_merge_data *bvm,
-			     struct bio_vec *biovec)
-{
-	struct mddev *mddev = q->queuedata;
-	int ret;
-	rcu_read_lock();
-	if (mddev->suspended) {
-		/* Must always allow one vec */
-		if (bvm->bi_size == 0)
-			ret = biovec->bv_len;
-		else
-			ret = 0;
-	} else {
-		struct md_personality *pers = mddev->pers;
-		if (pers && pers->mergeable_bvec)
-			ret = pers->mergeable_bvec(mddev, bvm, biovec);
-		else
-			ret = biovec->bv_len;
-	}
-	rcu_read_unlock();
-	return ret;
-}
 /*
  * Generic flush handling for md
  */
@@ -5188,7 +5165,6 @@ int md_run(struct mddev *mddev)
 	if (mddev->queue) {
 		mddev->queue->backing_dev_info.congested_data = mddev;
 		mddev->queue->backing_dev_info.congested_fn = md_congested;
-		blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
 	}
 	if (pers->sync_request) {
 		if (mddev->kobj.sd &&
@@ -5317,7 +5293,6 @@ static void md_clean(struct mddev *mddev)
 	mddev->degraded = 0;
 	mddev->safemode = 0;
 	mddev->private = NULL;
-	mddev->merge_check_needed = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
 	mddev->bitmap_info.default_space = 0;
@@ -5514,7 +5489,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
 
 		__md_stop_writes(mddev);
 		__md_stop(mddev);
-		mddev->queue->merge_bvec_fn = NULL;
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 
 		/* tell userspace to handle 'inactive' */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7da6e9c3cb53..ab339571e57f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -134,10 +134,6 @@ enum flag_bits {
 	Bitmap_sync,		/* ..actually, not quite In_sync.  Need a
 				 * bitmap-based recovery to get fully in sync
 				 */
-	Unmerged,		/* device is being added to array and should
-				 * be considerred for bvec_merge_fn but not
-				 * yet for actual IO
-				 */
 	WriteMostly,		/* Avoid reading if at all possible */
 	AutoDetected,		/* added by auto-detect */
 	Blocked,		/* An error occurred but has not yet
@@ -374,10 +370,6 @@ struct mddev {
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
-	int				merge_check_needed; /* at least one
-							     * member device
-							     * has a
-							     * merge_bvec_fn */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
@@ -532,10 +524,6 @@ struct md_personality
 	/* congested implements bdi.congested_fn().
 	 * Will not be called while array is 'suspended' */
 	int (*congested)(struct mddev *mddev, int bits);
-	/* mergeable_bvec is use to implement ->merge_bvec_fn */
-	int (*mergeable_bvec)(struct mddev *mddev,
-			      struct bvec_merge_data *bvm,
-			      struct bio_vec *biovec);
 };
 
 struct md_sysfs_entry {
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 082a489af9d3..d222522c52e0 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -257,18 +257,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
 
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_segments to one, lying
-		 * within a single page.
-		 * (Note: it is very unlikely that a device with
-		 * merge_bvec_fn will be involved in multipath.)
-		 */
-			if (q->merge_bvec_fn) {
-				blk_queue_max_segments(mddev->queue, 1);
-				blk_queue_segment_boundary(mddev->queue,
-							   PAGE_CACHE_SIZE - 1);
-			}
-
 			spin_lock_irq(&conf->device_lock);
 			mddev->degraded--;
 			rdev->raid_disk = path;
@@ -432,15 +420,6 @@ static int multipath_run (struct mddev *mddev)
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
 
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, not that we ever expect a device with
-		 * a merge_bvec_fn to be involved in multipath */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-			blk_queue_max_segments(mddev->queue, 1);
-			blk_queue_segment_boundary(mddev->queue,
-						   PAGE_CACHE_SIZE - 1);
-		}
-
 		if (!test_bit(Faulty, &rdev->flags))
 			working_disks++;
 	}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e6e0ae56f66b..59cda501a224 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -192,9 +192,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 			disk_stack_limits(mddev->gendisk, rdev1->bdev,
 					  rdev1->data_offset << 9);
 
-		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
-			conf->has_merge_bvec = 1;
-
 		if (!smallest || (rdev1->sectors < smallest->sectors))
 			smallest = rdev1;
 		cnt++;
@@ -351,58 +348,6 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
 			     + sector_div(sector, zone->nb_dev)];
 }
 
-/**
- *	raid0_mergeable_bvec -- tell bio layer if two requests can be merged
- *	@mddev: the md device
- *	@bvm: properties of new bio
- *	@biovec: the request that could be merged to it.
- *
- *	Return amount of bytes we can accept at this offset
- */
-static int raid0_mergeable_bvec(struct mddev *mddev,
-				struct bvec_merge_data *bvm,
-				struct bio_vec *biovec)
-{
-	struct r0conf *conf = mddev->private;
-	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
-	sector_t sector_offset = sector;
-	int max;
-	unsigned int chunk_sectors = mddev->chunk_sectors;
-	unsigned int bio_sectors = bvm->bi_size >> 9;
-	struct strip_zone *zone;
-	struct md_rdev *rdev;
-	struct request_queue *subq;
-
-	if (is_power_of_2(chunk_sectors))
-		max =  (chunk_sectors - ((sector & (chunk_sectors-1))
-						+ bio_sectors)) << 9;
-	else
-		max =  (chunk_sectors - (sector_div(sector, chunk_sectors)
-						+ bio_sectors)) << 9;
-	if (max < 0)
-		max = 0; /* bio_add cannot handle a negative return */
-	if (max <= biovec->bv_len && bio_sectors == 0)
-		return biovec->bv_len;
-	if (max < biovec->bv_len)
-		/* too small already, no need to check further */
-		return max;
-	if (!conf->has_merge_bvec)
-		return max;
-
-	/* May need to check subordinate device */
-	sector = sector_offset;
-	zone = find_zone(mddev->private, &sector_offset);
-	rdev = map_sector(mddev, zone, sector, &sector_offset);
-	subq = bdev_get_queue(rdev->bdev);
-	if (subq->merge_bvec_fn) {
-		bvm->bi_bdev = rdev->bdev;
-		bvm->bi_sector = sector_offset + zone->dev_start +
-			rdev->data_offset;
-		return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
-	} else
-		return max;
-}
-
 static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
 	sector_t array_sectors = 0;
@@ -727,7 +672,6 @@ static struct md_personality raid0_personality=
 	.takeover	= raid0_takeover,
 	.quiesce	= raid0_quiesce,
 	.congested	= raid0_congested,
-	.mergeable_bvec	= raid0_mergeable_bvec,
 };
 
 static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 05539d9c97f0..7127a623f5da 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -12,8 +12,6 @@ struct r0conf {
 	struct md_rdev		**devlist; /* lists of rdevs, pointed to
 					    * by strip_zone->dev */
 	int			nr_strip_zones;
-	int			has_merge_bvec;	/* at least one member has
-						 * a merge_bvec_fn */
 };
 
 #endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 60d0a8626e63..0ff06fdb83a9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -557,7 +557,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
 		    || rdev == NULL
-		    || test_bit(Unmerged, &rdev->flags)
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
@@ -708,38 +707,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	return best_disk;
 }
 
-static int raid1_mergeable_bvec(struct mddev *mddev,
-				struct bvec_merge_data *bvm,
-				struct bio_vec *biovec)
-{
-	struct r1conf *conf = mddev->private;
-	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
-	int max = biovec->bv_len;
-
-	if (mddev->merge_check_needed) {
-		int disk;
-		rcu_read_lock();
-		for (disk = 0; disk < conf->raid_disks * 2; disk++) {
-			struct md_rdev *rdev = rcu_dereference(
-				conf->mirrors[disk].rdev);
-			if (rdev && !test_bit(Faulty, &rdev->flags)) {
-				struct request_queue *q =
-					bdev_get_queue(rdev->bdev);
-				if (q->merge_bvec_fn) {
-					bvm->bi_sector = sector +
-						rdev->data_offset;
-					bvm->bi_bdev = rdev->bdev;
-					max = min(max, q->merge_bvec_fn(
-							  q, bvm, biovec));
-				}
-			}
-		}
-		rcu_read_unlock();
-	}
-	return max;
-
-}
-
 static int raid1_congested(struct mddev *mddev, int bits)
 {
 	struct r1conf *conf = mddev->private;
@@ -1268,8 +1235,7 @@ read_again:
 			break;
 		}
 		r1_bio->bios[i] = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags)
-		    || test_bit(Unmerged, &rdev->flags)) {
+		if (!rdev || test_bit(Faulty, &rdev->flags)) {
 			if (i < conf->raid_disks)
 				set_bit(R1BIO_Degraded, &r1_bio->state);
 			continue;
@@ -1614,7 +1580,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct raid1_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
-	struct request_queue *q = bdev_get_queue(rdev->bdev);
 
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
@@ -1622,11 +1587,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
-	if (q->merge_bvec_fn) {
-		set_bit(Unmerged, &rdev->flags);
-		mddev->merge_check_needed = 1;
-	}
-
 	for (mirror = first; mirror <= last; mirror++) {
 		p = conf->mirrors+mirror;
 		if (!p->rdev) {
@@ -1658,19 +1618,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			break;
 		}
 	}
-	if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
-		/* Some requests might not have seen this new
-		 * merge_bvec_fn.  We must wait for them to complete
-		 * before merging the device fully.
-		 * First we make sure any code which has tested
-		 * our function has submitted the request, then
-		 * we wait for all outstanding requests to complete.
-		 */
-		synchronize_sched();
-		freeze_array(conf, 0);
-		unfreeze_array(conf);
-		clear_bit(Unmerged, &rdev->flags);
-	}
 	md_integrity_add_rdev(rdev, mddev);
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
@@ -2806,8 +2753,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			goto abort;
 		disk->rdev = rdev;
 		q = bdev_get_queue(rdev->bdev);
-		if (q->merge_bvec_fn)
-			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
 		disk->seq_start = MaxSector;
@@ -3172,7 +3117,6 @@ static struct md_personality raid1_personality =
 	.quiesce	= raid1_quiesce,
 	.takeover	= raid1_takeover,
 	.congested	= raid1_congested,
-	.mergeable_bvec	= raid1_mergeable_bvec,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 316ff6f611e9..d92098f3e65b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -671,93 +671,6 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	return (vchunk << geo->chunk_shift) + offset;
 }
 
-/**
- *	raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
- *	@mddev: the md device
- *	@bvm: properties of new bio
- *	@biovec: the request that could be merged to it.
- *
- *	Return amount of bytes we can accept at this offset
- *	This requires checking for end-of-chunk if near_copies != raid_disks,
- *	and for subordinate merge_bvec_fns if merge_check_needed.
- */
-static int raid10_mergeable_bvec(struct mddev *mddev,
-				 struct bvec_merge_data *bvm,
-				 struct bio_vec *biovec)
-{
-	struct r10conf *conf = mddev->private;
-	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
-	int max;
-	unsigned int chunk_sectors;
-	unsigned int bio_sectors = bvm->bi_size >> 9;
-	struct geom *geo = &conf->geo;
-
-	chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
-	if (conf->reshape_progress != MaxSector &&
-	    ((sector >= conf->reshape_progress) !=
-	     conf->mddev->reshape_backwards))
-		geo = &conf->prev;
-
-	if (geo->near_copies < geo->raid_disks) {
-		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
-					+ bio_sectors)) << 9;
-		if (max < 0)
-			/* bio_add cannot handle a negative return */
-			max = 0;
-		if (max <= biovec->bv_len && bio_sectors == 0)
-			return biovec->bv_len;
-	} else
-		max = biovec->bv_len;
-
-	if (mddev->merge_check_needed) {
-		struct {
-			struct r10bio r10_bio;
-			struct r10dev devs[conf->copies];
-		} on_stack;
-		struct r10bio *r10_bio = &on_stack.r10_bio;
-		int s;
-		if (conf->reshape_progress != MaxSector) {
-			/* Cannot give any guidance during reshape */
-			if (max <= biovec->bv_len && bio_sectors == 0)
-				return biovec->bv_len;
-			return 0;
-		}
-		r10_bio->sector = sector;
-		raid10_find_phys(conf, r10_bio);
-		rcu_read_lock();
-		for (s = 0; s < conf->copies; s++) {
-			int disk = r10_bio->devs[s].devnum;
-			struct md_rdev *rdev = rcu_dereference(
-				conf->mirrors[disk].rdev);
-			if (rdev && !test_bit(Faulty, &rdev->flags)) {
-				struct request_queue *q =
-					bdev_get_queue(rdev->bdev);
-				if (q->merge_bvec_fn) {
-					bvm->bi_sector = r10_bio->devs[s].addr
-						+ rdev->data_offset;
-					bvm->bi_bdev = rdev->bdev;
-					max = min(max, q->merge_bvec_fn(
-							  q, bvm, biovec));
-				}
-			}
-			rdev = rcu_dereference(conf->mirrors[disk].replacement);
-			if (rdev && !test_bit(Faulty, &rdev->flags)) {
-				struct request_queue *q =
-					bdev_get_queue(rdev->bdev);
-				if (q->merge_bvec_fn) {
-					bvm->bi_sector = r10_bio->devs[s].addr
-						+ rdev->data_offset;
-					bvm->bi_bdev = rdev->bdev;
-					max = min(max, q->merge_bvec_fn(
-							  q, bvm, biovec));
-				}
-			}
-		}
-		rcu_read_unlock();
-	}
-	return max;
-}
-
 /*
  * This routine returns the disk from which the requested read should
  * be done. There is a per-array 'next expected sequential IO' sector
@@ -820,12 +733,10 @@ retry:
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].replacement);
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
-		    test_bit(Unmerged, &rdev->flags) ||
 		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 			rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (rdev == NULL ||
-		    test_bit(Faulty, &rdev->flags) ||
-		    test_bit(Unmerged, &rdev->flags))
+		    test_bit(Faulty, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
 		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -1325,11 +1236,9 @@ retry_write:
 			blocked_rdev = rrdev;
 			break;
 		}
-		if (rdev && (test_bit(Faulty, &rdev->flags)
-			     || test_bit(Unmerged, &rdev->flags)))
+		if (rdev && (test_bit(Faulty, &rdev->flags)))
 			rdev = NULL;
-		if (rrdev && (test_bit(Faulty, &rrdev->flags)
-			      || test_bit(Unmerged, &rrdev->flags)))
+		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
@@ -1776,7 +1685,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int mirror;
 	int first = 0;
 	int last = conf->geo.raid_disks - 1;
-	struct request_queue *q = bdev_get_queue(rdev->bdev);
 
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
@@ -1789,11 +1697,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
-	if (q->merge_bvec_fn) {
-		set_bit(Unmerged, &rdev->flags);
-		mddev->merge_check_needed = 1;
-	}
-
 	if (rdev->saved_raid_disk >= first &&
 	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
 		mirror = rdev->saved_raid_disk;
@@ -1832,19 +1735,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		rcu_assign_pointer(p->rdev, rdev);
 		break;
 	}
-	if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
-		/* Some requests might not have seen this new
-		 * merge_bvec_fn.  We must wait for them to complete
-		 * before merging the device fully.
-		 * First we make sure any code which has tested
-		 * our function has submitted the request, then
-		 * we wait for all outstanding requests to complete.
-		 */
-		synchronize_sched();
-		freeze_array(conf, 0);
-		unfreeze_array(conf);
-		clear_bit(Unmerged, &rdev->flags);
-	}
 	md_integrity_add_rdev(rdev, mddev);
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
@@ -2392,7 +2282,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
-			    !test_bit(Unmerged, &rdev->flags) &&
 			    test_bit(In_sync, &rdev->flags) &&
 			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
 					&first_bad, &bad_sectors) == 0) {
@@ -2446,7 +2335,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (!rdev ||
-			    test_bit(Unmerged, &rdev->flags) ||
 			    !test_bit(In_sync, &rdev->flags))
 				continue;
 
@@ -3638,8 +3526,6 @@ static int run(struct mddev *mddev)
 			disk->rdev = rdev;
 		}
 		q = bdev_get_queue(rdev->bdev);
-		if (q->merge_bvec_fn)
-			mddev->merge_check_needed = 1;
 		diff = (rdev->new_data_offset - rdev->data_offset);
 		if (!mddev->reshape_backwards)
 			diff = -diff;
@@ -4692,7 +4578,6 @@ static struct md_personality raid10_personality =
 	.start_reshape	= raid10_start_reshape,
 	.finish_reshape	= raid10_finish_reshape,
 	.congested	= raid10_congested,
-	.mergeable_bvec	= raid10_mergeable_bvec,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7823295332de..6d20692952d2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4663,35 +4663,6 @@ static int raid5_congested(struct mddev *mddev, int bits)
 	return 0;
 }
 
-/* We want read requests to align with chunks where possible,
- * but write requests don't need to.
- */
-static int raid5_mergeable_bvec(struct mddev *mddev,
-				struct bvec_merge_data *bvm,
-				struct bio_vec *biovec)
-{
-	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
-	int max;
-	unsigned int chunk_sectors = mddev->chunk_sectors;
-	unsigned int bio_sectors = bvm->bi_size >> 9;
-
-	/*
-	 * always allow writes to be mergeable, read as well if array
-	 * is degraded as we'll go through stripe cache anyway.
-	 */
-	if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
-		return biovec->bv_len;
-
-	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
-		chunk_sectors = mddev->new_chunk_sectors;
-	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
-	if (max < 0) max = 0;
-	if (max <= biovec->bv_len && bio_sectors == 0)
-		return biovec->bv_len;
-	else
-		return max;
-}
-
 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 {
 	sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
@@ -7764,7 +7735,6 @@ static struct md_personality raid6_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid6_takeover,
 	.congested	= raid5_congested,
-	.mergeable_bvec	= raid5_mergeable_bvec,
 };
 static struct md_personality raid5_personality =
 {
@@ -7788,7 +7758,6 @@ static struct md_personality raid5_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid5_takeover,
 	.congested	= raid5_congested,
-	.mergeable_bvec	= raid5_mergeable_bvec,
 };
 
 static struct md_personality raid4_personality =
@@ -7813,7 +7782,6 @@ static struct md_personality raid4_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid4_takeover,
 	.congested	= raid5_congested,
-	.mergeable_bvec	= raid5_mergeable_bvec,
 };
 
 static int __init raid5_init(void)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca778d9c7d81..a1feff54aeab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -213,14 +213,6 @@ typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
-struct bvec_merge_data {
-	struct block_device *bi_bdev;
-	sector_t bi_sector;
-	unsigned bi_size;
-	unsigned long bi_rw;
-};
-typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
-			     struct bio_vec *);
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 typedef int (lld_busy_fn) (struct request_queue *q);
@@ -306,7 +298,6 @@ struct request_queue {
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unprep_rq_fn		*unprep_rq_fn;
-	merge_bvec_fn		*merge_bvec_fn;
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
@@ -992,7 +983,6 @@ extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
 extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
-extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 51cc1deb7af3..76d23fa8c7d3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -82,9 +82,6 @@ typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv);
 typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd,
 			    unsigned long arg);
 
-typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
-			    struct bio_vec *biovec, int max_size);
-
 /*
  * These iteration functions are typically used to check (and combine)
  * properties of underlying devices.
@@ -160,7 +157,6 @@ struct target_type {
 	dm_status_fn status;
 	dm_message_fn message;
 	dm_ioctl_fn ioctl;
-	dm_merge_fn merge;
 	dm_busy_fn busy;
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
-- 
cgit v1.2.3-70-g09d2


From b54ffb73cadcdcff9cc1ae0e11f502407e3e2e4c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Tue, 19 May 2015 14:31:01 +0200
Subject: block: remove bio_get_nr_vecs()

We can always fill up the bio now, no need to estimate the possible
size based on queue parameters.

Acked-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
[hch: rebased and wrote a changelog]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c            | 23 -----------------------
 drivers/md/dm-io.c     |  2 +-
 fs/btrfs/compression.c |  5 +----
 fs/btrfs/extent_io.c   |  9 ++-------
 fs/btrfs/inode.c       |  3 +--
 fs/btrfs/scrub.c       | 18 ++----------------
 fs/direct-io.c         |  2 +-
 fs/ext4/page-io.c      |  3 +--
 fs/ext4/readpage.c     |  2 +-
 fs/f2fs/data.c         |  2 +-
 fs/gfs2/lops.c         |  9 +--------
 fs/logfs/dev_bdev.c    |  4 ++--
 fs/mpage.c             |  4 ++--
 fs/nilfs2/segbuf.c     |  2 +-
 fs/xfs/xfs_aops.c      |  3 +--
 include/linux/bio.h    |  1 -
 16 files changed, 18 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index ba9c4b0c0ff2..425d6d4a2f7a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -693,29 +693,6 @@ integrity_clone:
 }
 EXPORT_SYMBOL(bio_clone_bioset);
 
-/**
- *	bio_get_nr_vecs		- return approx number of vecs
- *	@bdev:  I/O target
- *
- *	Return the approximate number of pages we can send to this target.
- *	There's no guarantee that you will be able to fit this number of pages
- *	into a bio, it does not account for dynamic restrictions that vary
- *	on offset.
- */
-int bio_get_nr_vecs(struct block_device *bdev)
-{
-	struct request_queue *q = bdev_get_queue(bdev);
-	int nr_pages;
-
-	nr_pages = min_t(unsigned,
-		     queue_max_segments(q),
-		     queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
-
-	return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
-
-}
-EXPORT_SYMBOL(bio_get_nr_vecs);
-
 /**
  *	bio_add_pc_page	-	attempt to add page to bio
  *	@q: the target queue
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index c84714f70378..6f8e83b2a6f8 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -316,7 +316,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 		if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME))
 			num_bvecs = 1;
 		else
-			num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
+			num_bvecs = min_t(int, BIO_MAX_PAGES,
 					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
 
 		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 302266ec2cdb..57ee8ca29b06 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -97,10 +97,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
 					u64 first_byte, gfp_t gfp_flags)
 {
-	int nr_vecs;
-
-	nr_vecs = bio_get_nr_vecs(bdev);
-	return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
+	return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags);
 }
 
 static int check_compressed_csum(struct inode *inode,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c22f175ed024..68b12bbc709f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2795,9 +2795,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 {
 	int ret = 0;
 	struct bio *bio;
-	int nr;
 	int contig = 0;
-	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
 
@@ -2822,12 +2820,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			return 0;
 		}
 	}
-	if (this_compressed)
-		nr = BIO_MAX_PAGES;
-	else
-		nr = bio_get_nr_vecs(bdev);
 
-	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+	bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
+			GFP_NOFS | __GFP_HIGH);
 	if (!bio)
 		return -ENOMEM;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b8becfe2057..8635ef01a04a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7959,8 +7959,7 @@ out:
 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
 				       u64 first_sector, gfp_t gfp_flags)
 {
-	int nr_vecs = bio_get_nr_vecs(bdev);
-	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+	return btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
 }
 
 static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ebb8260186fe..9c146d8307b5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -454,27 +454,14 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 	struct scrub_ctx *sctx;
 	int		i;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
-	int pages_per_rd_bio;
 	int ret;
 
-	/*
-	 * the setting of pages_per_rd_bio is correct for scrub but might
-	 * be wrong for the dev_replace code where we might read from
-	 * different devices in the initial huge bios. However, that
-	 * code is able to correctly handle the case when adding a page
-	 * to a bio fails.
-	 */
-	if (dev->bdev)
-		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
-					 bio_get_nr_vecs(dev->bdev));
-	else
-		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 	if (!sctx)
 		goto nomem;
 	atomic_set(&sctx->refs, 1);
 	sctx->is_dev_replace = is_dev_replace;
-	sctx->pages_per_rd_bio = pages_per_rd_bio;
+	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 	sctx->curr = -1;
 	sctx->dev_root = dev->dev_root;
 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
@@ -3896,8 +3883,7 @@ static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 		return 0;
 
 	WARN_ON(!dev->bdev);
-	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
-					 bio_get_nr_vecs(dev->bdev));
+	wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
 	wr_ctx->tgtdev = dev;
 	atomic_set(&wr_ctx->flush_all_writes, 0);
 	return 0;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 818c647f36d3..11256291642e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -655,7 +655,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
 	if (ret)
 		goto out;
 	sector = start_sector << (sdio->blkbits - 9);
-	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
+	nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
 	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
 	sdio->boundary = 0;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index aa95566f14be..8a9d63a0c071 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -372,10 +372,9 @@ void ext4_io_submit_init(struct ext4_io_submit *io,
 static int io_submit_init_bio(struct ext4_io_submit *io,
 			      struct buffer_head *bh)
 {
-	int nvecs = bio_get_nr_vecs(bh->b_bdev);
 	struct bio *bio;
 
-	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+	bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
 	if (!bio)
 		return -ENOMEM;
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5de5b871c178..e26803fb210d 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -284,7 +284,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 					goto set_error_page;
 			}
 			bio = bio_alloc(GFP_KERNEL,
-				min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+				min_t(int, nr_pages, BIO_MAX_PAGES));
 			if (!bio) {
 				if (ctx)
 					ext4_release_crypto_ctx(ctx);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8f0baa7ffb50..b478accb24d9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1552,7 +1552,7 @@ submit_and_realloc:
 			}
 
 			bio = bio_alloc(GFP_KERNEL,
-				min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+				min_t(int, nr_pages, BIO_MAX_PAGES));
 			if (!bio) {
 				if (ctx)
 					f2fs_release_crypto_ctx(ctx);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index c0a1b967deba..92324ac58290 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -261,18 +261,11 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw)
 static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
 {
 	struct super_block *sb = sdp->sd_vfs;
-	unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev);
 	struct bio *bio;
 
 	BUG_ON(sdp->sd_log_bio);
 
-	while (1) {
-		bio = bio_alloc(GFP_NOIO, nrvecs);
-		if (likely(bio))
-			break;
-		nrvecs = max(nrvecs/2, 1U);
-	}
-
+	bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
 	bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
 	bio->bi_bdev = sb->s_bdev;
 	bio->bi_end_io = gfs2_end_log_write;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index cea0cc9878b7..a7fdbd868474 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 	unsigned int max_pages;
 	int i;
 
-	max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
+	max_pages = min(nr_pages, BIO_MAX_PAGES);
 
 	bio = bio_alloc(GFP_NOFS, max_pages);
 	BUG_ON(!bio);
@@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 	unsigned int max_pages;
 	int i;
 
-	max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
+	max_pages = min(nr_pages, BIO_MAX_PAGES);
 
 	bio = bio_alloc(GFP_NOFS, max_pages);
 	BUG_ON(!bio);
diff --git a/fs/mpage.c b/fs/mpage.c
index abac9361b3f1..778a4ddef77a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -277,7 +277,7 @@ alloc_new:
 				goto out;
 		}
 		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-			  	min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
+				min_t(int, nr_pages, BIO_MAX_PAGES),
 				GFP_KERNEL);
 		if (bio == NULL)
 			goto confused;
@@ -602,7 +602,7 @@ alloc_new:
 			}
 		}
 		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-				bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
+				BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
 		if (bio == NULL)
 			goto confused;
 
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 550b10efb14e..f63620ce3892 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -414,7 +414,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
 {
 	wi->bio = NULL;
 	wi->rest_blocks = segbuf->sb_sum.nblocks;
-	wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev);
+	wi->max_pages = BIO_MAX_PAGES;
 	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
 	wi->start = wi->end = 0;
 	wi->blocknr = segbuf->sb_pseg_start;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3714844a81d8..c77499bcbd7a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -381,8 +381,7 @@ STATIC struct bio *
 xfs_alloc_ioend_bio(
 	struct buffer_head	*bh)
 {
-	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
-	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);
+	struct bio		*bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
 
 	ASSERT(bio->bi_private == NULL);
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b7892a1906bd..ad7217458812 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -460,7 +460,6 @@ void bio_chain(struct bio *, struct bio *);
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
-extern int bio_get_nr_vecs(struct block_device *);
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
 				    const struct iov_iter *, gfp_t);
-- 
cgit v1.2.3-70-g09d2


From edc90fee916b4f0d14af9c6b5c08666747488ef8 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 17 Jul 2015 15:05:46 -0500
Subject: PCI: Allocate ATS struct during enumeration

Previously, we allocated pci_ats structures when an IOMMU driver called
pci_enable_ats().  An SR-IOV VF shares the STU setting with its PF, so when
enabling ATS on the VF, we allocated a pci_ats struct for the PF if it
didn't already have one.  We held the sriov->lock to serialize threads
concurrently enabling ATS on several VFS so only one would allocate the PF
pci_ats.

Gregor reported a deadlock here:

  pci_enable_sriov
    sriov_enable
      virtfn_add
        mutex_lock(dev->sriov->lock)      # acquire sriov->lock
        pci_device_add
          device_add
            BUS_NOTIFY_ADD_DEVICE notifier chain
            iommu_bus_notifier
              amd_iommu_add_device        # iommu_ops.add_device
                init_iommu_group
                  iommu_group_get_for_dev
                    iommu_group_add_device
                      __iommu_attach_device
                        amd_iommu_attach_device  # iommu_ops.attach_device
                          attach_device
                            pci_enable_ats
                              mutex_lock(dev->sriov->lock) # deadlock

There's no reason to delay allocating the pci_ats struct, and if we
allocate it for each device at enumeration-time, there's no need for
locking in pci_enable_ats().

Allocate pci_ats struct during enumeration, when we initialize other
capabilities.

Note that this implementation requires ATS to be enabled on the PF first,
before on any of the VFs because the PF controls the STU for all the VFs.

Link: http://permalink.gmane.org/gmane.linux.kernel.iommu/9433
Reported-by: Gregor Dick <gdick@solarflare.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/pci/ats.c       | 98 +++++++++++++++++++++----------------------------
 drivers/pci/probe.c     |  3 ++
 drivers/pci/remove.c    |  1 +
 include/linux/pci-ats.h |  2 +-
 include/linux/pci.h     |  9 +++++
 5 files changed, 56 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index a8099d4d0c9d..2026f5388796 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -17,7 +17,7 @@
 
 #include "pci.h"
 
-static int ats_alloc_one(struct pci_dev *dev, int ps)
+static void ats_alloc_one(struct pci_dev *dev)
 {
 	int pos;
 	u16 cap;
@@ -25,20 +25,19 @@ static int ats_alloc_one(struct pci_dev *dev, int ps)
 
 	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
 	if (!pos)
-		return -ENODEV;
+		return;
 
 	ats = kzalloc(sizeof(*ats), GFP_KERNEL);
-	if (!ats)
-		return -ENOMEM;
+	if (!ats) {
+		dev_warn(&dev->dev, "can't allocate space for ATS state\n");
+		return;
+	}
 
 	ats->pos = pos;
-	ats->stu = ps;
 	pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
 	ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
 					    PCI_ATS_MAX_QDEP;
 	dev->ats = ats;
-
-	return 0;
 }
 
 static void ats_free_one(struct pci_dev *dev)
@@ -47,6 +46,16 @@ static void ats_free_one(struct pci_dev *dev)
 	dev->ats = NULL;
 }
 
+void pci_ats_init(struct pci_dev *dev)
+{
+	ats_alloc_one(dev);
+}
+
+void pci_ats_free(struct pci_dev *dev)
+{
+	ats_free_one(dev);
+}
+
 /**
  * pci_enable_ats - enable the ATS capability
  * @dev: the PCI device
@@ -56,43 +65,35 @@ static void ats_free_one(struct pci_dev *dev)
  */
 int pci_enable_ats(struct pci_dev *dev, int ps)
 {
-	int rc;
 	u16 ctrl;
 
 	BUG_ON(dev->ats && dev->ats->is_enabled);
 
+	if (!dev->ats)
+		return -EINVAL;
+
 	if (ps < PCI_ATS_MIN_STU)
 		return -EINVAL;
 
-	if (dev->is_physfn || dev->is_virtfn) {
-		struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn;
+	/*
+	 * Note that enabling ATS on a VF fails unless it's already enabled
+	 * with the same STU on the PF.
+	 */
+	ctrl = PCI_ATS_CTRL_ENABLE;
+	if (dev->is_virtfn) {
+		struct pci_dev *pdev = dev->physfn;
 
-		mutex_lock(&pdev->sriov->lock);
-		if (pdev->ats)
-			rc = pdev->ats->stu == ps ? 0 : -EINVAL;
-		else
-			rc = ats_alloc_one(pdev, ps);
+		if (pdev->ats->stu != ps)
+			return -EINVAL;
 
-		if (!rc)
-			pdev->ats->ref_cnt++;
-		mutex_unlock(&pdev->sriov->lock);
-		if (rc)
-			return rc;
-	}
-
-	if (!dev->is_physfn) {
-		rc = ats_alloc_one(dev, ps);
-		if (rc)
-			return rc;
+		atomic_inc(&pdev->ats->ref_cnt);  /* count enabled VFs */
+	} else {
+		dev->ats->stu = ps;
+		ctrl |= PCI_ATS_CTRL_STU(dev->ats->stu - PCI_ATS_MIN_STU);
 	}
-
-	ctrl = PCI_ATS_CTRL_ENABLE;
-	if (!dev->is_virtfn)
-		ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU);
 	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
 
 	dev->ats->is_enabled = 1;
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pci_enable_ats);
@@ -107,24 +108,20 @@ void pci_disable_ats(struct pci_dev *dev)
 
 	BUG_ON(!dev->ats || !dev->ats->is_enabled);
 
+	if (atomic_read(&dev->ats->ref_cnt))
+		return;		/* VFs still enabled */
+
+	if (dev->is_virtfn) {
+		struct pci_dev *pdev = dev->physfn;
+
+		atomic_dec(&pdev->ats->ref_cnt);
+	}
+
 	pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl);
 	ctrl &= ~PCI_ATS_CTRL_ENABLE;
 	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
 
 	dev->ats->is_enabled = 0;
-
-	if (dev->is_physfn || dev->is_virtfn) {
-		struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn;
-
-		mutex_lock(&pdev->sriov->lock);
-		pdev->ats->ref_cnt--;
-		if (!pdev->ats->ref_cnt)
-			ats_free_one(pdev);
-		mutex_unlock(&pdev->sriov->lock);
-	}
-
-	if (!dev->is_physfn)
-		ats_free_one(dev);
 }
 EXPORT_SYMBOL_GPL(pci_disable_ats);
 
@@ -140,7 +137,6 @@ void pci_restore_ats_state(struct pci_dev *dev)
 	ctrl = PCI_ATS_CTRL_ENABLE;
 	if (!dev->is_virtfn)
 		ctrl |= PCI_ATS_CTRL_STU(dev->ats->stu - PCI_ATS_MIN_STU);
-
 	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
 }
 EXPORT_SYMBOL_GPL(pci_restore_ats_state);
@@ -159,23 +155,13 @@ EXPORT_SYMBOL_GPL(pci_restore_ats_state);
  */
 int pci_ats_queue_depth(struct pci_dev *dev)
 {
-	int pos;
-	u16 cap;
-
 	if (dev->is_virtfn)
 		return 0;
 
 	if (dev->ats)
 		return dev->ats->qdep;
 
-	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
-	if (!pos)
-		return -ENODEV;
-
-	pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
-
-	return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
-				       PCI_ATS_MAX_QDEP;
+	return -ENODEV;
 }
 EXPORT_SYMBOL_GPL(pci_ats_queue_depth);
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index cefd636681b6..c206398ca67e 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1540,6 +1540,9 @@ static void pci_init_capabilities(struct pci_dev *dev)
 	/* Single Root I/O Virtualization */
 	pci_iov_init(dev);
 
+	/* Address Translation Services */
+	pci_ats_init(dev);
+
 	/* Enable ACS P2P upstream forwarding */
 	pci_enable_acs(dev);
 }
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 8a280e9c2ad1..27617b862ca8 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -26,6 +26,7 @@ static void pci_stop_dev(struct pci_dev *dev)
 		dev->is_added = 0;
 	}
 
+	pci_ats_free(dev);
 	if (dev->bus->self)
 		pcie_aspm_exit_link_state(dev);
 }
diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index 72031785fe1d..e2dcc2ff3d0e 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -8,7 +8,7 @@ struct pci_ats {
 	int pos;        /* capability position */
 	int stu;        /* Smallest Translation Unit */
 	int qdep;       /* Invalidate Queue Depth */
-	int ref_cnt;    /* Physical Function reference count */
+	atomic_t ref_cnt; /* number of VFs with ATS enabled */
 	unsigned int is_enabled:1;      /* Enable bit is set */
 };
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a0321a8fb59..1817819ba57b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1294,6 +1294,15 @@ int  ht_create_irq(struct pci_dev *dev, int idx);
 void ht_destroy_irq(unsigned int irq);
 #endif /* CONFIG_HT_IRQ */
 
+#ifdef CONFIG_PCI_ATS
+/* Address Translation Service */
+void pci_ats_init(struct pci_dev *dev);
+void pci_ats_free(struct pci_dev *dev);
+#else
+static inline void pci_ats_init(struct pci_dev *dev) { }
+static inline void pci_ats_free(struct pci_dev *dev) { }
+#endif
+
 void pci_cfg_access_lock(struct pci_dev *dev);
 bool pci_cfg_access_trylock(struct pci_dev *dev);
 void pci_cfg_access_unlock(struct pci_dev *dev);
-- 
cgit v1.2.3-70-g09d2


From d544d75ac96aa1b0a8a378826626a0fbd8ce4380 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 17 Jul 2015 15:15:19 -0500
Subject: PCI: Embed ATS info directly into struct pci_dev

The pci_ats struct is small and will get smaller, so I don't think it's
worth allocating it separately from the pci_dev struct.

Embed the ATS fields directly into struct pci_dev.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/pci/ats.c       | 61 +++++++++++++++++--------------------------------
 drivers/pci/remove.c    |  1 -
 include/linux/pci-ats.h | 10 +-------
 include/linux/pci.h     |  8 ++++---
 4 files changed, 27 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 2026f5388796..690ae6e6786c 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -21,29 +21,15 @@ static void ats_alloc_one(struct pci_dev *dev)
 {
 	int pos;
 	u16 cap;
-	struct pci_ats *ats;
 
 	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
 	if (!pos)
 		return;
 
-	ats = kzalloc(sizeof(*ats), GFP_KERNEL);
-	if (!ats) {
-		dev_warn(&dev->dev, "can't allocate space for ATS state\n");
-		return;
-	}
-
-	ats->pos = pos;
-	pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
-	ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
+	dev->ats_cap = pos;
+	pci_read_config_word(dev, dev->ats_cap + PCI_ATS_CAP, &cap);
+	dev->ats_qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
 					    PCI_ATS_MAX_QDEP;
-	dev->ats = ats;
-}
-
-static void ats_free_one(struct pci_dev *dev)
-{
-	kfree(dev->ats);
-	dev->ats = NULL;
 }
 
 void pci_ats_init(struct pci_dev *dev)
@@ -51,11 +37,6 @@ void pci_ats_init(struct pci_dev *dev)
 	ats_alloc_one(dev);
 }
 
-void pci_ats_free(struct pci_dev *dev)
-{
-	ats_free_one(dev);
-}
-
 /**
  * pci_enable_ats - enable the ATS capability
  * @dev: the PCI device
@@ -67,9 +48,9 @@ int pci_enable_ats(struct pci_dev *dev, int ps)
 {
 	u16 ctrl;
 
-	BUG_ON(dev->ats && dev->ats->is_enabled);
+	BUG_ON(dev->ats_cap && dev->ats_enabled);
 
-	if (!dev->ats)
+	if (!dev->ats_cap)
 		return -EINVAL;
 
 	if (ps < PCI_ATS_MIN_STU)
@@ -83,17 +64,17 @@ int pci_enable_ats(struct pci_dev *dev, int ps)
 	if (dev->is_virtfn) {
 		struct pci_dev *pdev = dev->physfn;
 
-		if (pdev->ats->stu != ps)
+		if (pdev->ats_stu != ps)
 			return -EINVAL;
 
-		atomic_inc(&pdev->ats->ref_cnt);  /* count enabled VFs */
+		atomic_inc(&pdev->ats_ref_cnt);  /* count enabled VFs */
 	} else {
-		dev->ats->stu = ps;
-		ctrl |= PCI_ATS_CTRL_STU(dev->ats->stu - PCI_ATS_MIN_STU);
+		dev->ats_stu = ps;
+		ctrl |= PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU);
 	}
-	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
+	pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
 
-	dev->ats->is_enabled = 1;
+	dev->ats_enabled = 1;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pci_enable_ats);
@@ -106,22 +87,22 @@ void pci_disable_ats(struct pci_dev *dev)
 {
 	u16 ctrl;
 
-	BUG_ON(!dev->ats || !dev->ats->is_enabled);
+	BUG_ON(!dev->ats_cap || !dev->ats_enabled);
 
-	if (atomic_read(&dev->ats->ref_cnt))
+	if (atomic_read(&dev->ats_ref_cnt))
 		return;		/* VFs still enabled */
 
 	if (dev->is_virtfn) {
 		struct pci_dev *pdev = dev->physfn;
 
-		atomic_dec(&pdev->ats->ref_cnt);
+		atomic_dec(&pdev->ats_ref_cnt);
 	}
 
-	pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl);
+	pci_read_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, &ctrl);
 	ctrl &= ~PCI_ATS_CTRL_ENABLE;
-	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
+	pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
 
-	dev->ats->is_enabled = 0;
+	dev->ats_enabled = 0;
 }
 EXPORT_SYMBOL_GPL(pci_disable_ats);
 
@@ -136,8 +117,8 @@ void pci_restore_ats_state(struct pci_dev *dev)
 
 	ctrl = PCI_ATS_CTRL_ENABLE;
 	if (!dev->is_virtfn)
-		ctrl |= PCI_ATS_CTRL_STU(dev->ats->stu - PCI_ATS_MIN_STU);
-	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
+		ctrl |= PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU);
+	pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
 }
 EXPORT_SYMBOL_GPL(pci_restore_ats_state);
 
@@ -158,8 +139,8 @@ int pci_ats_queue_depth(struct pci_dev *dev)
 	if (dev->is_virtfn)
 		return 0;
 
-	if (dev->ats)
-		return dev->ats->qdep;
+	if (dev->ats_cap)
+		return dev->ats_qdep;
 
 	return -ENODEV;
 }
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 27617b862ca8..8a280e9c2ad1 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -26,7 +26,6 @@ static void pci_stop_dev(struct pci_dev *dev)
 		dev->is_added = 0;
 	}
 
-	pci_ats_free(dev);
 	if (dev->bus->self)
 		pcie_aspm_exit_link_state(dev);
 }
diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index e2dcc2ff3d0e..5d81d47b0a95 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -4,14 +4,6 @@
 #include <linux/pci.h>
 
 /* Address Translation Service */
-struct pci_ats {
-	int pos;        /* capability position */
-	int stu;        /* Smallest Translation Unit */
-	int qdep;       /* Invalidate Queue Depth */
-	atomic_t ref_cnt; /* number of VFs with ATS enabled */
-	unsigned int is_enabled:1;      /* Enable bit is set */
-};
-
 #ifdef CONFIG_PCI_ATS
 
 int pci_enable_ats(struct pci_dev *dev, int ps);
@@ -26,7 +18,7 @@ int pci_ats_queue_depth(struct pci_dev *dev);
  */
 static inline int pci_ats_enabled(struct pci_dev *dev)
 {
-	return dev->ats && dev->ats->is_enabled;
+	return dev->ats_cap && dev->ats_enabled;
 }
 
 #else /* CONFIG_PCI_ATS */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 1817819ba57b..8bc16b5e4747 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -343,6 +343,7 @@ struct pci_dev {
 	unsigned int	msi_enabled:1;
 	unsigned int	msix_enabled:1;
 	unsigned int	ari_enabled:1;	/* ARI forwarding */
+	unsigned int	ats_enabled:1;	/* Address Translation Service */
 	unsigned int	is_managed:1;
 	unsigned int    needs_freset:1; /* Dev requires fundamental reset */
 	unsigned int	state_saved:1;
@@ -375,7 +376,10 @@ struct pci_dev {
 		struct pci_sriov *sriov;	/* SR-IOV capability related */
 		struct pci_dev *physfn;	/* the PF this VF is associated with */
 	};
-	struct pci_ats	*ats;	/* Address Translation Service */
+	int		ats_cap;	/* ATS Capability offset */
+	int		ats_stu;	/* ATS Smallest Translation Unit */
+	int		ats_qdep;	/* ATS Invalidate Queue Depth */
+	atomic_t	ats_ref_cnt;	/* number of VFs with ATS enabled */
 #endif
 	phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */
 	size_t romlen; /* Length of ROM if it's not from the BAR */
@@ -1297,10 +1301,8 @@ void ht_destroy_irq(unsigned int irq);
 #ifdef CONFIG_PCI_ATS
 /* Address Translation Service */
 void pci_ats_init(struct pci_dev *dev);
-void pci_ats_free(struct pci_dev *dev);
 #else
 static inline void pci_ats_init(struct pci_dev *dev) { }
-static inline void pci_ats_free(struct pci_dev *dev) { }
 #endif
 
 void pci_cfg_access_lock(struct pci_dev *dev);
-- 
cgit v1.2.3-70-g09d2


From 67930995d7fb8ae7d2078822b563010b289ace2e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 17 Jul 2015 15:27:34 -0500
Subject: PCI: Reduce size of ATS structure elements

The extended capabilities list is linked with 12-bit pointers, and the ATS
Smallest Translation Unit and Invalidate Queue Depth fields are both 5
bits.

Use u16 and u8 to hold the extended capability address and the stu and qdep
values.  No functional change.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/pci.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8bc16b5e4747..238b77e8ca41 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -376,9 +376,9 @@ struct pci_dev {
 		struct pci_sriov *sriov;	/* SR-IOV capability related */
 		struct pci_dev *physfn;	/* the PF this VF is associated with */
 	};
-	int		ats_cap;	/* ATS Capability offset */
-	int		ats_stu;	/* ATS Smallest Translation Unit */
-	int		ats_qdep;	/* ATS Invalidate Queue Depth */
+	u16		ats_cap;	/* ATS Capability offset */
+	u8		ats_stu;	/* ATS Smallest Translation Unit */
+	u8		ats_qdep;	/* ATS Invalidate Queue Depth */
 	atomic_t	ats_ref_cnt;	/* number of VFs with ATS enabled */
 #endif
 	phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */
-- 
cgit v1.2.3-70-g09d2


From ff9bee895c4d11a519a6b2c49451376025a6af4e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 17 Jul 2015 15:55:48 -0500
Subject: PCI: Move ATS declarations to linux/pci.h so they're all together

Move ATS declarations to linux/pci.h so they're all in one place.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/pci-ats.h | 41 -----------------------------------------
 include/linux/pci.h     | 10 +++++++++-
 2 files changed, 9 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index 5d81d47b0a95..57e0b8250947 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -3,47 +3,6 @@
 
 #include <linux/pci.h>
 
-/* Address Translation Service */
-#ifdef CONFIG_PCI_ATS
-
-int pci_enable_ats(struct pci_dev *dev, int ps);
-void pci_disable_ats(struct pci_dev *dev);
-int pci_ats_queue_depth(struct pci_dev *dev);
-
-/**
- * pci_ats_enabled - query the ATS status
- * @dev: the PCI device
- *
- * Returns 1 if ATS capability is enabled, or 0 if not.
- */
-static inline int pci_ats_enabled(struct pci_dev *dev)
-{
-	return dev->ats_cap && dev->ats_enabled;
-}
-
-#else /* CONFIG_PCI_ATS */
-
-static inline int pci_enable_ats(struct pci_dev *dev, int ps)
-{
-	return -ENODEV;
-}
-
-static inline void pci_disable_ats(struct pci_dev *dev)
-{
-}
-
-static inline int pci_ats_queue_depth(struct pci_dev *dev)
-{
-	return -ENODEV;
-}
-
-static inline int pci_ats_enabled(struct pci_dev *dev)
-{
-	return 0;
-}
-
-#endif /* CONFIG_PCI_ATS */
-
 #ifdef CONFIG_PCI_PRI
 
 int pci_enable_pri(struct pci_dev *pdev, u32 reqs);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 238b77e8ca41..307f96a58e8b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1301,8 +1301,16 @@ void ht_destroy_irq(unsigned int irq);
 #ifdef CONFIG_PCI_ATS
 /* Address Translation Service */
 void pci_ats_init(struct pci_dev *dev);
+int pci_enable_ats(struct pci_dev *dev, int ps);
+void pci_disable_ats(struct pci_dev *dev);
+int pci_ats_queue_depth(struct pci_dev *dev);
+static inline int pci_ats_enabled(struct pci_dev *dev) { return dev->ats_cap && dev->ats_enabled; }
 #else
-static inline void pci_ats_init(struct pci_dev *dev) { }
+static inline void pci_ats_init(struct pci_dev *d) { }
+static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; }
+static inline void pci_disable_ats(struct pci_dev *d) { }
+static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; }
+static inline int pci_ats_enabled(struct pci_dev *d) { return 0; }
 #endif
 
 void pci_cfg_access_lock(struct pci_dev *dev);
-- 
cgit v1.2.3-70-g09d2


From a71f938f3a9a7bc879296cd34ecae9effe5edf3f Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Mon, 20 Jul 2015 09:24:32 -0500
Subject: PCI: Stop caching ATS Invalidate Queue Depth

Stop caching the Invalidate Queue Depth in struct pci_dev.
pci_ats_queue_depth() is typically called only once per device, and it
returns a fixed value per-device, so callers who need the value frequently
can cache it themselves.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/pci/ats.c   | 9 ++++-----
 include/linux/pci.h | 1 -
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 9355f754c7c2..ceda7dc556d4 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -20,16 +20,12 @@
 void pci_ats_init(struct pci_dev *dev)
 {
 	int pos;
-	u16 cap;
 
 	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
 	if (!pos)
 		return;
 
 	dev->ats_cap = pos;
-	pci_read_config_word(dev, dev->ats_cap + PCI_ATS_CAP, &cap);
-	dev->ats_qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
-					    PCI_ATS_MAX_QDEP;
 }
 
 /**
@@ -131,13 +127,16 @@ EXPORT_SYMBOL_GPL(pci_restore_ats_state);
  */
 int pci_ats_queue_depth(struct pci_dev *dev)
 {
+	u16 cap;
+
 	if (!dev->ats_cap)
 		return -EINVAL;
 
 	if (dev->is_virtfn)
 		return 0;
 
-	return dev->ats_qdep;
+	pci_read_config_word(dev, dev->ats_cap + PCI_ATS_CAP, &cap);
+	return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : PCI_ATS_MAX_QDEP;
 }
 EXPORT_SYMBOL_GPL(pci_ats_queue_depth);
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 307f96a58e8b..4b484fdfd66f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -378,7 +378,6 @@ struct pci_dev {
 	};
 	u16		ats_cap;	/* ATS Capability offset */
 	u8		ats_stu;	/* ATS Smallest Translation Unit */
-	u8		ats_qdep;	/* ATS Invalidate Queue Depth */
 	atomic_t	ats_ref_cnt;	/* number of VFs with ATS enabled */
 #endif
 	phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */
-- 
cgit v1.2.3-70-g09d2


From f7ef1340bb501717372a39f4807d0ad519ebd432 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Mon, 20 Jul 2015 09:23:37 -0500
Subject: PCI: Remove pci_ats_enabled()

Remove pci_ats_enabled().  There are no callers outside the ATS code
itself.  We don't need to check ats_cap, because if we don't find an ATS
capability, we'll never set ats_enabled.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/pci/ats.c   | 6 +++---
 include/linux/pci.h | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index ceda7dc556d4..eeb9fb2b47aa 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -43,7 +43,7 @@ int pci_enable_ats(struct pci_dev *dev, int ps)
 	if (!dev->ats_cap)
 		return -EINVAL;
 
-	if (WARN_ON(pci_ats_enabled(dev)))
+	if (WARN_ON(dev->ats_enabled))
 		return -EBUSY;
 
 	if (ps < PCI_ATS_MIN_STU)
@@ -80,7 +80,7 @@ void pci_disable_ats(struct pci_dev *dev)
 	struct pci_dev *pdev;
 	u16 ctrl;
 
-	if (WARN_ON(!pci_ats_enabled(dev)))
+	if (WARN_ON(!dev->ats_enabled))
 		return;
 
 	if (atomic_read(&dev->ats_ref_cnt))
@@ -103,7 +103,7 @@ void pci_restore_ats_state(struct pci_dev *dev)
 {
 	u16 ctrl;
 
-	if (!pci_ats_enabled(dev))
+	if (!dev->ats_enabled)
 		return;
 
 	ctrl = PCI_ATS_CTRL_ENABLE;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4b484fdfd66f..806da7634f91 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1303,13 +1303,11 @@ void pci_ats_init(struct pci_dev *dev);
 int pci_enable_ats(struct pci_dev *dev, int ps);
 void pci_disable_ats(struct pci_dev *dev);
 int pci_ats_queue_depth(struct pci_dev *dev);
-static inline int pci_ats_enabled(struct pci_dev *dev) { return dev->ats_cap && dev->ats_enabled; }
 #else
 static inline void pci_ats_init(struct pci_dev *d) { }
 static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; }
 static inline void pci_disable_ats(struct pci_dev *d) { }
 static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; }
-static inline int pci_ats_enabled(struct pci_dev *d) { return 0; }
 #endif
 
 void pci_cfg_access_lock(struct pci_dev *dev);
-- 
cgit v1.2.3-70-g09d2


From 4c96b7dc0d393f12c17e0d81db15aa4a820a6ab3 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 12 Aug 2015 17:06:26 -0500
Subject: Add a matching set of device_ functions for determining mac/phy

OF has some helper functions for parsing MAC and PHY settings.
In cases where the platform is providing this information rather
than the device itself, there needs to be similar functions for ACPI.

These functions are slightly modified versions of the ones in
of_net which can use information provided via DT or ACPI.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/base/property.c  | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/property.h |  4 +++
 2 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index f3f6d167f3f1..2e8cd147f02d 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -16,6 +16,8 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/property.h>
+#include <linux/etherdevice.h>
+#include <linux/phy.h>
 
 /**
  * device_add_property_set - Add a collection of properties to a device object.
@@ -533,3 +535,74 @@ bool device_dma_is_coherent(struct device *dev)
 	return coherent;
 }
 EXPORT_SYMBOL_GPL(device_dma_is_coherent);
+
+/**
+ * device_get_phy_mode - Get phy mode for given device_node
+ * @dev:	Pointer to the given device
+ *
+ * The function gets phy interface string from property 'phy-mode' or
+ * 'phy-connection-type', and return its index in phy_modes table, or errno in
+ * error case.
+ */
+int device_get_phy_mode(struct device *dev)
+{
+	const char *pm;
+	int err, i;
+
+	err = device_property_read_string(dev, "phy-mode", &pm);
+	if (err < 0)
+		err = device_property_read_string(dev,
+						  "phy-connection-type", &pm);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++)
+		if (!strcasecmp(pm, phy_modes(i)))
+			return i;
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(device_get_phy_mode);
+
+static void *device_get_mac_addr(struct device *dev,
+				 const char *name, char *addr,
+				 int alen)
+{
+	int ret = device_property_read_u8_array(dev, name, addr, alen);
+
+	if (ret == 0 && is_valid_ether_addr(addr))
+		return addr;
+	return NULL;
+}
+
+/**
+ * Search the device tree for the best MAC address to use.  'mac-address' is
+ * checked first, because that is supposed to contain to "most recent" MAC
+ * address. If that isn't set, then 'local-mac-address' is checked next,
+ * because that is the default address.  If that isn't set, then the obsolete
+ * 'address' is checked, just in case we're using an old device tree.
+ *
+ * Note that the 'address' property is supposed to contain a virtual address of
+ * the register set, but some DTS files have redefined that property to be the
+ * MAC address.
+ *
+ * All-zero MAC addresses are rejected, because those could be properties that
+ * exist in the device tree, but were not set by U-Boot.  For example, the
+ * DTS could define 'mac-address' and 'local-mac-address', with zero MAC
+ * addresses.  Some older U-Boots only initialized 'local-mac-address'.  In
+ * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists
+ * but is all zeros.
+*/
+void *device_get_mac_address(struct device *dev, char *addr, int alen)
+{
+	addr = device_get_mac_addr(dev, "mac-address", addr, alen);
+	if (addr)
+		return addr;
+
+	addr = device_get_mac_addr(dev, "local-mac-address", addr, alen);
+	if (addr)
+		return addr;
+
+	return device_get_mac_addr(dev, "address", addr, alen);
+}
+EXPORT_SYMBOL(device_get_mac_address);
diff --git a/include/linux/property.h b/include/linux/property.h
index 76ebde9c11d4..a59c6ee566c2 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -166,4 +166,8 @@ void device_add_property_set(struct device *dev, struct property_set *pset);
 
 bool device_dma_is_coherent(struct device *dev);
 
+int device_get_phy_mode(struct device *dev);
+
+void *device_get_mac_address(struct device *dev, char *addr, int alen);
+
 #endif /* _LINUX_PROPERTY_H_ */
-- 
cgit v1.2.3-70-g09d2


From 28362673129e7d4510a5a92a8b68ee47f282210b Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@freescale.com>
Date: Thu, 18 Jun 2015 11:51:53 +0800
Subject: usb: chipidea: add ttctrl.ttha control interface

The register of ttctrl.ttha describes like below:
- Internal TT Hub Address Representation
- RW
- Default = 0000000b
This field is used to match against the Hub Address field in QH & siTD
to determine if the packet is routed to the internal TT for directly
attached FS/LS devices. If the Hub Address in the QH or siTD does not
match this address then the packet will be broadcast on the High Speed
ports destined for a downstream High Speed hub with the address in the QH/siTD.

In silicon RTL, this entry only affects QH and siTD, and the hub.addr at
both QH and siTD are 0 in ehci core for chipidea (with hcd->has_tt = 1).

So, for QH, if the "usage_tt" flag at RTL is 0, set CI_HDRC_SET_NON_ZERO_TTHA
will not affect QH (with non-hs device); for siTD, set this flag
will change remaining space requirement for the last transaction from 1023
bytes to 188 bytes, it can increase the number of transactions within one
frame, ehci periodic schedule code will not queue the packet if the frame space
is full, so it is safe to set this flag for siTD.

With this flag, it can fix the problem Alan Stern reported below:
http://www.spinics.net/lists/linux-usb/msg123125.html
And may fix Michael Tessier's problem too.
http://www.spinics.net/lists/linux-usb/msg118679.html

CC: stern@rowland.harvard.edu
CC: michael.tessier@axiontech.ca
Signed-off-by: Peter Chen <peter.chen@freescale.com>
---
 drivers/usb/chipidea/bits.h  | 5 +++++
 drivers/usb/chipidea/ci.h    | 1 +
 drivers/usb/chipidea/core.c  | 2 ++
 drivers/usb/chipidea/host.c  | 3 +++
 include/linux/usb/chipidea.h | 1 +
 5 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/bits.h b/drivers/usb/chipidea/bits.h
index 3cb9bda51ddf..831a8f645ea5 100644
--- a/drivers/usb/chipidea/bits.h
+++ b/drivers/usb/chipidea/bits.h
@@ -53,6 +53,11 @@
 #define DEVICEADDR_USBADRA    BIT(24)
 #define DEVICEADDR_USBADR     (0x7FUL << 25)
 
+/* TTCTRL */
+#define TTCTRL_TTHA_MASK	(0x7fUL << 24)
+/* Set non-zero value for internal TT Hub address representation */
+#define TTCTRL_TTHA		(0x7fUL << 24)
+
 /* PORTSC */
 #define PORTSC_CCS            BIT(0)
 #define PORTSC_CSC            BIT(1)
diff --git a/drivers/usb/chipidea/ci.h b/drivers/usb/chipidea/ci.h
index 6d6200e37b71..df57e49ed4ef 100644
--- a/drivers/usb/chipidea/ci.h
+++ b/drivers/usb/chipidea/ci.h
@@ -50,6 +50,7 @@ enum ci_hw_regs {
 	OP_USBINTR,
 	OP_DEVICEADDR,
 	OP_ENDPTLISTADDR,
+	OP_TTCTRL,
 	OP_PORTSC,
 	OP_DEVLC,
 	OP_OTGSC,
diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index 3ad48e1c0c57..b0d01f26cd5e 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -84,6 +84,7 @@ static const u8 ci_regs_nolpm[] = {
 	[OP_USBINTR]		= 0x08U,
 	[OP_DEVICEADDR]		= 0x14U,
 	[OP_ENDPTLISTADDR]	= 0x18U,
+	[OP_TTCTRL]		= 0x1CU,
 	[OP_PORTSC]		= 0x44U,
 	[OP_DEVLC]		= 0x84U,
 	[OP_OTGSC]		= 0x64U,
@@ -106,6 +107,7 @@ static const u8 ci_regs_lpm[] = {
 	[OP_USBINTR]		= 0x08U,
 	[OP_DEVICEADDR]		= 0x14U,
 	[OP_ENDPTLISTADDR]	= 0x18U,
+	[OP_TTCTRL]		= 0x1CU,
 	[OP_PORTSC]		= 0x44U,
 	[OP_DEVLC]		= 0x84U,
 	[OP_OTGSC]		= 0xC4U,
diff --git a/drivers/usb/chipidea/host.c b/drivers/usb/chipidea/host.c
index 7161439def19..b01716c8c66b 100644
--- a/drivers/usb/chipidea/host.c
+++ b/drivers/usb/chipidea/host.c
@@ -159,6 +159,9 @@ static int host_start(struct ci_hdrc *ci)
 	if (ci->platdata->flags & CI_HDRC_FORCE_FULLSPEED)
 		hw_write(ci, OP_PORTSC, PORTSC_PFSC, PORTSC_PFSC);
 
+	if (ci->platdata->flags & CI_HDRC_SET_NON_ZERO_TTHA)
+		hw_write(ci, OP_TTCTRL, TTCTRL_TTHA_MASK, TTCTRL_TTHA);
+
 	return ret;
 
 disable_reg:
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index ab94f78c4dd1..d1e1285a971d 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -29,6 +29,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_IMX28_WRITE_FIX		BIT(5)
 #define CI_HDRC_FORCE_FULLSPEED		BIT(6)
 #define CI_HDRC_TURN_VBUS_EARLY_ON	BIT(7)
+#define CI_HDRC_SET_NON_ZERO_TTHA	BIT(8)
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
-- 
cgit v1.2.3-70-g09d2


From df96ed8dced21426c54c7f69cf7513e75280957a Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@freescale.com>
Date: Mon, 22 Sep 2014 16:45:39 +0800
Subject: usb: chipidea: introduce ITC tuning interface

ITC (Interrupt Threshold Control) is used to set the maximum rate at which
the host/device controller will issue interrupts. The default value is 8 (1ms)
for it. EHCI core will modify it to 1, but device mode keeps it as default
value.

In some use cases like Android ADB, it only has one usb request for each
direction, and maximum payload data is only 4KB, so the speed is 4MB/s
at most, it needs controller to trigger interrupt as fast as possible
to increase the speed. The USB performance will be better if the interrupt
can be triggered faster.

Reduce ITC value is benefit for USB performance, but the interrupt number
is increased at the same time, it may increase cpu utilization too.
Most of use case cares about performance, but some may care about
cpu utilization, so, we leave a platform interface for user.
We set ITC as 1 (1 micro-frame) as default value which is aligned
with ehci core default value.

Signed-off-by: Peter Chen <peter.chen@freescale.com>
---
 drivers/usb/chipidea/core.c  | 16 ++++++++++++++++
 include/linux/usb/chipidea.h |  2 ++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index 1b1dd80897f7..845d0d7d28e6 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -425,6 +425,9 @@ void ci_platform_configure(struct ci_hdrc *ci)
 
 	if (ci->platdata->flags & CI_HDRC_SET_NON_ZERO_TTHA)
 		hw_write(ci, OP_TTCTRL, TTCTRL_TTHA_MASK, TTCTRL_TTHA);
+
+	hw_write(ci, OP_USBCMD, 0xff0000, ci->platdata->itc_setting << 16);
+
 }
 
 /**
@@ -576,6 +579,8 @@ static irqreturn_t ci_irq(int irq, void *data)
 static int ci_get_platdata(struct device *dev,
 		struct ci_hdrc_platform_data *platdata)
 {
+	int ret;
+
 	if (!platdata->phy_mode)
 		platdata->phy_mode = of_usb_get_phy_mode(dev->of_node);
 
@@ -607,6 +612,17 @@ static int ci_get_platdata(struct device *dev,
 	if (of_usb_get_maximum_speed(dev->of_node) == USB_SPEED_FULL)
 		platdata->flags |= CI_HDRC_FORCE_FULLSPEED;
 
+	platdata->itc_setting = 1;
+	if (of_find_property(dev->of_node, "itc-setting", NULL)) {
+		ret = of_property_read_u32(dev->of_node, "itc-setting",
+			&platdata->itc_setting);
+		if (ret) {
+			dev_err(dev,
+				"failed to get itc-setting\n");
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index d1e1285a971d..1c5d7763990a 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -36,6 +36,8 @@ struct ci_hdrc_platform_data {
 	void	(*notify_event) (struct ci_hdrc *ci, unsigned event);
 	struct regulator	*reg_vbus;
 	bool			tpl_support;
+	/* interrupt threshold setting */
+	u32			itc_setting;
 };
 
 /* Default offset of capability registers */
-- 
cgit v1.2.3-70-g09d2


From 8022d3d51c6fb14736e26fd13caca91a38e07580 Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@freescale.com>
Date: Thu, 30 Oct 2014 09:15:15 +0800
Subject: usb: chipidea: define stream mode disable for both roles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The system bus and chipidea IP have different limitations for
both host and device mode.
For example, with below errata, we need to enable SDIS(Stream Disable
Mode) at host mode. But we don't want it for device mode at the
same system.

TAR 9000378958
Title: Non-Double Word Aligned Buffer Address Sometimes Causes Host to
Hang on OUT Retry
Impacted Configuration: Host mode, all transfer types
Description:
The host core operating in streaming mode may under run while sending
the data packet of an OUT transaction. This under run can occur if
there are unexpected system delays in fetching the remaining packet
data from memory. The host forces a bad CRC on the packet, the device
detects the error and discards the packet. The host then retries a Bulk,
Interrupt, or Control transfer if an under run occurs according to the
USB specification. During simulations, it was found that the host does
not issue the retry of the failed bulk OUT. It does not issue any other
transactions except SOF packets that have incorrect frame numbers.

The second failure mode occurs if the under run occurs on an ISO OUT
transaction and the next ISO transaction is a zero byte packet. The host
does not issue any transactions (including SOFs). The device detects a
Suspend condition, reverts to full speed, and waits for resume signaling.

A third failure mode occurs when the host under runs on an ISO OUT and
the next ISO in the schedule is an ISO OUT with two max packets of 1024
bytes each. The host should issue MDATA for the first OUT followed by
DATA1 for the second. However, it drops the MDATA transaction, and
issues the DATA1 transaction.

The system impact of this bug is the same regardless of the failure mode
observed. The host core hangs, the ehci_ctrl state machine waits for the
protocol engine to send the completion status for the corrupted
transaction, which never occurs. No indication is sent to the host
controller driver, no register bits change and no interrupts occur.
Eventually the requesting application times out.

Detailed internal behavior:
The EHCI control state machine (ehci_ctrl) in the DMA block is responsible
for parsing the schedules and initiating all transactions. The ehci_ctrl
state machine passes the transaction details to the protocol block by
writing the transaction information in to the TxFIFO. It then asserts
the pe_hst_run_pkt signal to inform the host protocol state machine
(pe_hst_state) that there is a packet in the TxFIFO.
A tag of 0x0 indicates a start of packet with the data providing the
following information:

35:32 Tag
31:30 Reserved
29:23 Endpoint (lowest 4 bits)
22:16 Address
15:10 Reserved
9:8 Endpoint speed
7:6 Endpoint type
5:6 Data Toggle
3:0 PID
The pe_hst_state reads the packet information and constructs the packet
and issues it to the PHY interface.
The ehci_ctrl state machine writes the start transaction information in
to the TxFIFO as 0x03002910c for the OUT packet that had the under run
error. However, it writes 0xC3002910C for the retry of the Out
transaction, which is incorrect.
The pe_hst_state enters a bus timeout state after sending the bad CRC
for the packet that under ran. It then purges any data that was back
filled in to the TxFIFO for the packet that under ran. The pe_hst_state
machine stops purging the TxFIFO when it is empty or if it reads a
location that has a tag of 0x0, indicating a start of packet command.

The pe_hst_state reads 0xC3002910C and discards it as it does not decode
to a start of packet command. It continues to purge the OUT data that
has been pre-buffered for the OUT retry . The pe_hst_state detects the
hst_packet_run signal and attempts to read the PID and address
information from the TxFIFO. This location has packet data and so does
not decode to a valid PID and so falls through to the PE_HST_SOF_LOAD
state where the frame_num_counter is updated. The frame_num_counter
is updated with the data in the TxFIFO. In this case, the data is
incorrect as the ehci_ctrl state machine did not initiate the load.
The hst_pe_state machine detects the SOF request signal and sends an
SOF with the bad frame number. Meanwhile, the ehci_ctrl state machine
waits indefinitely in the run_pkt state waiting for the completion
status from pe_hst_state machine, which will never happen.

The ISO failure case is similar except that there is no retry for ISO.
The ehci_ctrl state machine moves to the next transfer in the periodic
schedule. If the under run occurs on the last entry of the periodic
list then it moves to the Async schedule.

In the case of ISO OUT simulations, the next ISO is a zero byte OUT
and again the start of packet command gets corrupted. The TxFIFO is
empty when the hst_pe_state attempts to read the Address and PID
information as the transaction is a zero byte packet. This results
in the hst_pe_state machine staying in the GET_PID state, which means
that it does not issue any transactions (including SOFs). The device
detects a Suspend condition and reverts to full speed mode and waits
for a Resume or Reset signal.

The EHCI specification allows a Non-DoubleWord (32 bits) offset to
be used as a current offset for Buffer Pointer Page 0 of the qTD.
In Non-DoubleWord aligned cases, the core reads the packet data
from the AHB memory, performs the alignment operation before writing
it in to the TxFIFO as a 32 bit data word. An End Of Packet tag (EOP)
is written to the TxFIFO after all the packet data has been written
in to the TxFIFO. The alignment function is reset to Idle by the EOP
tag. The corruption of the start of packet command arises because the
packet buffer for the OUT transaction that under ran is not aligned
to a DoubleWord, and hence no EOP tag is written to the TxFIFO. The
alignment function is still active when the start packet information
is written in to the TxFIFO for the retry of the bulk packet or for
the next transaction in the case of an under run on an ISO. This
results in the corruption of the start tag and the transaction
information.

Click for waveform showing the command 0x 0000300291 being written in
to the TX FIFO for the Out that under ran.
Click for waveform showing the command 0xC3002910C written to the
TxFIFO instead of 0x 0000300291
Versions affected: Versions 2.10a and previous versions
How discovered: Customer simulation

Workaround:
1- The EHCI specification allows a non-DoubleWord offset to be used
as a current offset for Buffer Pointer Page 0 of the qTD. However,
if a DoubleWord offset is used then this issue does not arise.
2- Use non streaming mode to eliminate under runs.

Resolution:
The fix involves changes to the traffic state machine in the
vusb_hs_dma_traf block. The ehci_ctrl state machine updates the context
information by encoding the transaction results on the
hst_op_context_update signals at the end of a transaction. The signal
hst_op_context_update is added to the traffic state machine, and the
tx_fifo_under_ran_r signal is generated if the transaction results in
an under run error. Click for waveform

The traffic state machine then traverses to the do_eop states if the
tx_fifo_under_ran error is asserted. Thus an EOP tag is written in to
the TxFIFO as shown in this waveform .

The EOP tag resets the align state machine to the Idle state ensuring
that the next command written by the echi_ctrl state machine does not
get corrupted.

File(s) modified:
RTL code fixed: …..
Method of reproducing: This failure cannot be reproduced in the current
test bench.
Date Found: March 2010
Date Fixed: June 2010
Update information:
Added the RTL code fix

Signed-off-by: Peter Chen <peter.chen@freescale.com>
---
 drivers/usb/chipidea/core.c  | 13 ++++++++++++-
 include/linux/usb/chipidea.h |  5 ++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index f620e3546eaf..7a2c217fb150 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -64,6 +64,7 @@
 #include <linux/of.h>
 #include <linux/phy.h>
 #include <linux/regulator/consumer.h>
+#include <linux/usb/ehci_def.h>
 
 #include "ci.h"
 #include "udc.h"
@@ -412,7 +413,17 @@ static int ci_usb_phy_init(struct ci_hdrc *ci)
  */
 void ci_platform_configure(struct ci_hdrc *ci)
 {
-	if (ci->platdata->flags & CI_HDRC_DISABLE_STREAMING)
+	bool is_device_mode, is_host_mode;
+
+	is_device_mode = hw_read(ci, OP_USBMODE, USBMODE_CM) == USBMODE_CM_DC;
+	is_host_mode = hw_read(ci, OP_USBMODE, USBMODE_CM) == USBMODE_CM_HC;
+
+	if (is_device_mode &&
+		(ci->platdata->flags & CI_HDRC_DISABLE_DEVICE_STREAMING))
+		hw_write(ci, OP_USBMODE, USBMODE_CI_SDIS, USBMODE_CI_SDIS);
+
+	if (is_host_mode &&
+		(ci->platdata->flags & CI_HDRC_DISABLE_HOST_STREAMING))
 		hw_write(ci, OP_USBMODE, USBMODE_CI_SDIS, USBMODE_CI_SDIS);
 
 	if (ci->platdata->flags & CI_HDRC_FORCE_FULLSPEED) {
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 1c5d7763990a..4d34a8612e85 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -19,8 +19,11 @@ struct ci_hdrc_platform_data {
 	enum usb_phy_interface phy_mode;
 	unsigned long	 flags;
 #define CI_HDRC_REGS_SHARED		BIT(0)
+#define CI_HDRC_DISABLE_DEVICE_STREAMING	BIT(1)
 #define CI_HDRC_SUPPORTS_RUNTIME_PM	BIT(2)
-#define CI_HDRC_DISABLE_STREAMING	BIT(3)
+#define CI_HDRC_DISABLE_HOST_STREAMING	BIT(3)
+#define CI_HDRC_DISABLE_STREAMING (CI_HDRC_DISABLE_DEVICE_STREAMING |	\
+		CI_HDRC_DISABLE_HOST_STREAMING)
 	/*
 	 * Only set it when DCCPARAMS.DC==1 and DCCPARAMS.HC==1,
 	 * but otg is not supported (no register otgsc).
-- 
cgit v1.2.3-70-g09d2


From 65668718f2c5b76d5f4513564a3c56672bb07892 Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@freescale.com>
Date: Tue, 17 Mar 2015 14:21:00 +0800
Subject: usb: chipidea: add ahb burst configuration interface

The users can change it through dts or platform data if they
want to change the default value.

Signed-off-by: Peter Chen <peter.chen@freescale.com>
---
 drivers/usb/chipidea/bits.h  |  3 +++
 drivers/usb/chipidea/core.c  | 14 ++++++++++++++
 include/linux/usb/chipidea.h |  2 ++
 3 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/bits.h b/drivers/usb/chipidea/bits.h
index 831a8f645ea5..462ad02167b8 100644
--- a/drivers/usb/chipidea/bits.h
+++ b/drivers/usb/chipidea/bits.h
@@ -25,6 +25,9 @@
 #define VERSION		      (0xF << 25)
 #define CIVERSION	      (0x7 << 29)
 
+/* SBUSCFG */
+#define AHBBRST_MASK		0x7
+
 /* HCCPARAMS */
 #define HCCPARAMS_LEN         BIT(17)
 
diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index 7a2c217fb150..ce0489754fde 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -438,6 +438,9 @@ void ci_platform_configure(struct ci_hdrc *ci)
 
 	hw_write(ci, OP_USBCMD, 0xff0000, ci->platdata->itc_setting << 16);
 
+	if (ci->platdata->flags & CI_HDRC_OVERRIDE_AHB_BURST)
+		hw_write_id_reg(ci, ID_SBUSCFG, AHBBRST_MASK,
+			ci->platdata->ahb_burst_config);
 }
 
 /**
@@ -633,6 +636,17 @@ static int ci_get_platdata(struct device *dev,
 		}
 	}
 
+	if (of_find_property(dev->of_node, "ahb-burst-config", NULL)) {
+		ret = of_property_read_u32(dev->of_node, "ahb-burst-config",
+			&platdata->ahb_burst_config);
+		if (ret) {
+			dev_err(dev,
+				"failed to get ahb-burst-config\n");
+			return ret;
+		}
+		platdata->flags |= CI_HDRC_OVERRIDE_AHB_BURST;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 4d34a8612e85..cd7fcad49017 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -33,6 +33,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_FORCE_FULLSPEED		BIT(6)
 #define CI_HDRC_TURN_VBUS_EARLY_ON	BIT(7)
 #define CI_HDRC_SET_NON_ZERO_TTHA	BIT(8)
+#define CI_HDRC_OVERRIDE_AHB_BURST	BIT(9)
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
@@ -41,6 +42,7 @@ struct ci_hdrc_platform_data {
 	bool			tpl_support;
 	/* interrupt threshold setting */
 	u32			itc_setting;
+	u32			ahb_burst_config;
 };
 
 /* Default offset of capability registers */
-- 
cgit v1.2.3-70-g09d2


From 96625eadca1bb8832fb502f0899a543695f1ba35 Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@freescale.com>
Date: Tue, 17 Mar 2015 17:32:45 +0800
Subject: usb: chipidea: add tx/rx burst size configuration interface

The user can adjust it through dts or platform data

Signed-off-by: Peter Chen <peter.chen@freescale.com>
---
 drivers/usb/chipidea/bits.h  |  4 ++++
 drivers/usb/chipidea/ci.h    |  1 +
 drivers/usb/chipidea/core.c  | 35 +++++++++++++++++++++++++++++++++++
 include/linux/usb/chipidea.h |  4 ++++
 4 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/bits.h b/drivers/usb/chipidea/bits.h
index 462ad02167b8..e462f55c8b99 100644
--- a/drivers/usb/chipidea/bits.h
+++ b/drivers/usb/chipidea/bits.h
@@ -61,6 +61,10 @@
 /* Set non-zero value for internal TT Hub address representation */
 #define TTCTRL_TTHA		(0x7fUL << 24)
 
+/* BURSTSIZE */
+#define RX_BURST_MASK		0xff
+#define TX_BURST_MASK		0xff00
+
 /* PORTSC */
 #define PORTSC_CCS            BIT(0)
 #define PORTSC_CSC            BIT(1)
diff --git a/drivers/usb/chipidea/ci.h b/drivers/usb/chipidea/ci.h
index 1320a4dbbcd5..62f2a7be0697 100644
--- a/drivers/usb/chipidea/ci.h
+++ b/drivers/usb/chipidea/ci.h
@@ -51,6 +51,7 @@ enum ci_hw_regs {
 	OP_DEVICEADDR,
 	OP_ENDPTLISTADDR,
 	OP_TTCTRL,
+	OP_BURSTSIZE,
 	OP_PORTSC,
 	OP_DEVLC,
 	OP_OTGSC,
diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index ce0489754fde..50cd23b3b7f1 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -86,6 +86,7 @@ static const u8 ci_regs_nolpm[] = {
 	[OP_DEVICEADDR]		= 0x14U,
 	[OP_ENDPTLISTADDR]	= 0x18U,
 	[OP_TTCTRL]		= 0x1CU,
+	[OP_BURSTSIZE]		= 0x20U,
 	[OP_PORTSC]		= 0x44U,
 	[OP_DEVLC]		= 0x84U,
 	[OP_OTGSC]		= 0x64U,
@@ -109,6 +110,7 @@ static const u8 ci_regs_lpm[] = {
 	[OP_DEVICEADDR]		= 0x14U,
 	[OP_ENDPTLISTADDR]	= 0x18U,
 	[OP_TTCTRL]		= 0x1CU,
+	[OP_BURSTSIZE]		= 0x20U,
 	[OP_PORTSC]		= 0x44U,
 	[OP_DEVLC]		= 0x84U,
 	[OP_OTGSC]		= 0xC4U,
@@ -441,6 +443,17 @@ void ci_platform_configure(struct ci_hdrc *ci)
 	if (ci->platdata->flags & CI_HDRC_OVERRIDE_AHB_BURST)
 		hw_write_id_reg(ci, ID_SBUSCFG, AHBBRST_MASK,
 			ci->platdata->ahb_burst_config);
+
+	/* override burst size, take effect only when ahb_burst_config is 0 */
+	if (!hw_read_id_reg(ci, ID_SBUSCFG, AHBBRST_MASK)) {
+		if (ci->platdata->flags & CI_HDRC_OVERRIDE_TX_BURST)
+			hw_write(ci, OP_BURSTSIZE, TX_BURST_MASK,
+			ci->platdata->tx_burst_size << __ffs(TX_BURST_MASK));
+
+		if (ci->platdata->flags & CI_HDRC_OVERRIDE_RX_BURST)
+			hw_write(ci, OP_BURSTSIZE, RX_BURST_MASK,
+				ci->platdata->rx_burst_size);
+	}
 }
 
 /**
@@ -647,6 +660,28 @@ static int ci_get_platdata(struct device *dev,
 		platdata->flags |= CI_HDRC_OVERRIDE_AHB_BURST;
 	}
 
+	if (of_find_property(dev->of_node, "tx-burst-size-dword", NULL)) {
+		ret = of_property_read_u32(dev->of_node, "tx-burst-size-dword",
+			&platdata->tx_burst_size);
+		if (ret) {
+			dev_err(dev,
+				"failed to get tx-burst-size-dword\n");
+			return ret;
+		}
+		platdata->flags |= CI_HDRC_OVERRIDE_TX_BURST;
+	}
+
+	if (of_find_property(dev->of_node, "rx-burst-size-dword", NULL)) {
+		ret = of_property_read_u32(dev->of_node, "rx-burst-size-dword",
+			&platdata->rx_burst_size);
+		if (ret) {
+			dev_err(dev,
+				"failed to get rx-burst-size-dword\n");
+			return ret;
+		}
+		platdata->flags |= CI_HDRC_OVERRIDE_RX_BURST;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index cd7fcad49017..575eaf0ebac1 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -34,6 +34,8 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_TURN_VBUS_EARLY_ON	BIT(7)
 #define CI_HDRC_SET_NON_ZERO_TTHA	BIT(8)
 #define CI_HDRC_OVERRIDE_AHB_BURST	BIT(9)
+#define CI_HDRC_OVERRIDE_TX_BURST	BIT(10)
+#define CI_HDRC_OVERRIDE_RX_BURST	BIT(11)
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
@@ -43,6 +45,8 @@ struct ci_hdrc_platform_data {
 	/* interrupt threshold setting */
 	u32			itc_setting;
 	u32			ahb_burst_config;
+	u32			tx_burst_size;
+	u32			rx_burst_size;
 };
 
 /* Default offset of capability registers */
-- 
cgit v1.2.3-70-g09d2


From 35103d11173b8fea874183f8aa508ae71234d299 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 10:39:01 -0400
Subject: net: ipv6 sysctl option to ignore routes when nexthop link is down

Like the ipv4 patch with a similar title, this adds a sysctl to allow
the user to change routing behavior based on whether or not the
interface associated with the nexthop was an up or down link.  The
default setting preserves the current behavior, but anyone that enables
it will notice that nexthops on down interfaces will no longer be
selected:

net.ipv6.conf.all.ignore_routes_with_linkdown = 0
net.ipv6.conf.default.ignore_routes_with_linkdown = 0
net.ipv6.conf.lo.ignore_routes_with_linkdown = 0
...

When the above sysctls are set, not only will link status be reported to
userspace, but an indication that a nexthop is dead and will not be used
is also reported.

1000::/8 via 7000::2 dev p7p1  metric 1024 dead linkdown  pref medium
1000::/8 via 8000::2 dev p8p1  metric 1024  pref medium
7000::/8 dev p7p1  proto kernel  metric 256 dead linkdown  pref medium
8000::/8 dev p8p1  proto kernel  metric 256  pref medium
9000::/8 via 8000::2 dev p8p1  metric 2048  pref medium
9000::/8 via 7000::2 dev p7p1  metric 1024 dead linkdown  pref medium
fe80::/64 dev p7p1  proto kernel  metric 256 dead linkdown  pref medium
fe80::/64 dev p8p1  proto kernel  metric 256  pref medium

This also adds devconf support and notification when sysctl values
change.

v2: drop use of rt6i_nhflags since it is not needed right now

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h      |   1 +
 include/uapi/linux/ipv6.h |   1 +
 net/ipv6/addrconf.c       | 105 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/route.c          |  11 ++++-
 4 files changed, 116 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index cb9dcad72372..f1f32af6d9b9 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -31,6 +31,7 @@ struct ipv6_devconf {
 	__s32		accept_ra_defrtr;
 	__s32		accept_ra_min_hop_limit;
 	__s32		accept_ra_pinfo;
+	__s32		ignore_routes_with_linkdown;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	__s32		accept_ra_rtr_pref;
 	__s32		rtr_probe_interval;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 80f3b74446a1..38b4fef20219 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -173,6 +173,7 @@ enum {
 	DEVCONF_STABLE_SECRET,
 	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
+	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 53e3a9d756b0..5dfbac72f1ab 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -214,6 +214,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 		.initialized = false,
 	},
 	.use_oif_addrs_only	= 0,
+	.ignore_routes_with_linkdown = 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -257,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 		.initialized = false,
 	},
 	.use_oif_addrs_only	= 0,
+	.ignore_routes_with_linkdown = 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -472,6 +474,9 @@ static int inet6_netconf_msgsize_devconf(int type)
 	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 
+	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+		size += nla_total_size(4);
+
 	return size;
 }
 
@@ -508,6 +513,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
 		goto nla_put_failure;
 
+	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+			devconf->ignore_routes_with_linkdown) < 0)
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -544,6 +554,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
 	[NETCONFA_IFINDEX]	= { .len = sizeof(int) },
 	[NETCONFA_FORWARDING]	= { .len = sizeof(int) },
 	[NETCONFA_PROXY_NEIGH]	= { .len = sizeof(int) },
+	[NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN]	= { .len = sizeof(int) },
 };
 
 static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
@@ -766,6 +777,63 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
 		rt6_purge_dflt_routers(net);
 	return 1;
 }
+
+static void addrconf_linkdown_change(struct net *net, __s32 newf)
+{
+	struct net_device *dev;
+	struct inet6_dev *idev;
+
+	for_each_netdev(net, dev) {
+		idev = __in6_dev_get(dev);
+		if (idev) {
+			int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
+
+			idev->cnf.ignore_routes_with_linkdown = newf;
+			if (changed)
+				inet6_netconf_notify_devconf(dev_net(dev),
+							     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+							     dev->ifindex,
+							     &idev->cnf);
+		}
+	}
+}
+
+static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
+{
+	struct net *net;
+	int old;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	net = (struct net *)table->extra2;
+	old = *p;
+	*p = newf;
+
+	if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
+		if ((!newf) ^ (!old))
+			inet6_netconf_notify_devconf(net,
+						     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+						     NETCONFA_IFINDEX_DEFAULT,
+						     net->ipv6.devconf_dflt);
+		rtnl_unlock();
+		return 0;
+	}
+
+	if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
+		net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf;
+		addrconf_linkdown_change(net, newf);
+		if ((!newf) ^ (!old))
+			inet6_netconf_notify_devconf(net,
+						     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+						     NETCONFA_IFINDEX_ALL,
+						     net->ipv6.devconf_all);
+	}
+	rtnl_unlock();
+
+	return 1;
+}
+
 #endif
 
 /* Nobody refers to this ifaddr, destroy it */
@@ -4616,6 +4684,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
 	array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
 	array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
+	array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 }
@@ -5338,6 +5407,34 @@ out:
 	return err;
 }
 
+static
+int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
+						int write,
+						void __user *buffer,
+						size_t *lenp,
+						loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;
+	loff_t pos = *ppos;
+	struct ctl_table lctl;
+	int ret;
+
+	/* ctl->data points to idev->cnf.ignore_routes_when_linkdown
+	 * we should not modify it until we get the rtnl lock.
+	 */
+	lctl = *ctl;
+	lctl.data = &val;
+
+	ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+	if (write)
+		ret = addrconf_fixup_linkdown(ctl, valp, val);
+	if (ret)
+		*ppos = pos;
+	return ret;
+}
+
 static struct addrconf_sysctl_table
 {
 	struct ctl_table_header *sysctl_header;
@@ -5629,7 +5726,13 @@ static struct addrconf_sysctl_table
 			.maxlen         = sizeof(int),
 			.mode           = 0644,
 			.proc_handler   = proc_dointvec,
-
+		},
+		{
+			.procname	= "ignore_routes_with_linkdown",
+			.data		= &ipv6_devconf.ignore_routes_with_linkdown,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= addrconf_sysctl_ignore_routes_with_linkdown,
 		},
 		{
 			/* sentinel */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 370f72785385..1c0217e61357 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -665,6 +665,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 {
 	int m;
 	bool match_do_rr = false;
+	struct inet6_dev *idev = rt->rt6i_idev;
+	struct net_device *dev = rt->dst.dev;
+
+	if (dev && !netif_carrier_ok(dev) &&
+	    idev->cnf.ignore_routes_with_linkdown)
+		goto out;
 
 	if (rt6_check_expired(rt))
 		goto out;
@@ -2887,8 +2893,11 @@ static int rt6_fill_node(struct net *net,
 	else
 		rtm->rtm_type = RTN_UNICAST;
 	rtm->rtm_flags = 0;
-	if (!netif_carrier_ok(rt->dst.dev))
+	if (!netif_carrier_ok(rt->dst.dev)) {
 		rtm->rtm_flags |= RTNH_F_LINKDOWN;
+		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+			rtm->rtm_flags |= RTNH_F_DEAD;
+	}
 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
 	rtm->rtm_protocol = rt->rt6i_protocol;
 	if (rt->rt6i_flags & RTF_DYNAMIC)
-- 
cgit v1.2.3-70-g09d2


From 4e3c89920cd3a6cfce22c6f537690747c26128dd Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 14:59:00 -0600
Subject: net: Introduce VRF related flags and helpers

Add a VRF_MASTER flag for interfaces and helper functions for determining
if a device is a VRF_MASTER.

Add link attribute for passing VRF_TABLE id.

Add vrf_ptr to netdevice.

Add various macros for determining if a device is a VRF device, the index
of the master VRF device and table associated with VRF device.

Signed-off-by: Shrijeet Mukherjee <shm@cumulusnetworks.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  20 +++++++
 include/net/vrf.h            | 139 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/if_link.h |   9 +++
 3 files changed, 168 insertions(+)
 create mode 100644 include/net/vrf.h

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 607b5f41f46f..f7a6ef2fae3a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1289,6 +1289,7 @@ enum netdev_priv_flags {
 	IFF_XMIT_DST_RELEASE_PERM	= 1<<22,
 	IFF_IPVLAN_MASTER		= 1<<23,
 	IFF_IPVLAN_SLAVE		= 1<<24,
+	IFF_VRF_MASTER			= 1<<25,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1316,6 +1317,7 @@ enum netdev_priv_flags {
 #define IFF_XMIT_DST_RELEASE_PERM	IFF_XMIT_DST_RELEASE_PERM
 #define IFF_IPVLAN_MASTER		IFF_IPVLAN_MASTER
 #define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
+#define IFF_VRF_MASTER			IFF_VRF_MASTER
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -1432,6 +1434,7 @@ enum netdev_priv_flags {
  *	@dn_ptr:	DECnet specific data
  *	@ip6_ptr:	IPv6 specific data
  *	@ax25_ptr:	AX.25 specific data
+ *	@vrf_ptr:	VRF specific data
  *	@ieee80211_ptr:	IEEE 802.11 specific data, assign before registering
  *
  *	@last_rx:	Time of last Rx
@@ -1650,6 +1653,7 @@ struct net_device {
 	struct dn_dev __rcu     *dn_ptr;
 	struct inet6_dev __rcu	*ip6_ptr;
 	void			*ax25_ptr;
+	struct net_vrf_dev __rcu *vrf_ptr;
 	struct wireless_dev	*ieee80211_ptr;
 	struct wpan_dev		*ieee802154_ptr;
 #if IS_ENABLED(CONFIG_MPLS_ROUTING)
@@ -3808,6 +3812,22 @@ static inline bool netif_supports_nofcs(struct net_device *dev)
 	return dev->priv_flags & IFF_SUPP_NOFCS;
 }
 
+static inline bool netif_is_vrf(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_VRF_MASTER;
+}
+
+static inline bool netif_index_is_vrf(struct net *net, int ifindex)
+{
+	struct net_device *dev = dev_get_by_index_rcu(net, ifindex);
+	bool rc = false;
+
+	if (dev)
+		rc = netif_is_vrf(dev);
+
+	return rc;
+}
+
 /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
 static inline void netif_keep_dst(struct net_device *dev)
 {
diff --git a/include/net/vrf.h b/include/net/vrf.h
new file mode 100644
index 000000000000..0484d29d4589
--- /dev/null
+++ b/include/net/vrf.h
@@ -0,0 +1,139 @@
+/*
+ * include/net/net_vrf.h - adds vrf dev structure definitions
+ * Copyright (c) 2015 Cumulus Networks
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_NET_VRF_H
+#define __LINUX_NET_VRF_H
+
+struct net_vrf_dev {
+	struct rcu_head		rcu;
+	int                     ifindex; /* ifindex of master dev */
+	u32                     tb_id;   /* table id for VRF */
+};
+
+struct slave {
+	struct list_head	list;
+	struct net_device	*dev;
+};
+
+struct slave_queue {
+	struct list_head	all_slaves;
+	int			num_slaves;
+};
+
+struct net_vrf {
+	struct slave_queue	queue;
+	struct rtable           *rth;
+	u32			tb_id;
+};
+
+
+#if IS_ENABLED(CONFIG_NET_VRF)
+/* called with rcu_read_lock() */
+static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
+{
+	struct net_vrf_dev *vrf_ptr;
+	int ifindex = 0;
+
+	if (!dev)
+		return 0;
+
+	if (netif_is_vrf(dev))
+		ifindex = dev->ifindex;
+	else {
+		vrf_ptr = rcu_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			ifindex = vrf_ptr->ifindex;
+	}
+
+	return ifindex;
+}
+
+/* called with rcu_read_lock */
+static inline int vrf_dev_table_rcu(const struct net_device *dev)
+{
+	int tb_id = 0;
+
+	if (dev) {
+		struct net_vrf_dev *vrf_ptr;
+
+		vrf_ptr = rcu_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			tb_id = vrf_ptr->tb_id;
+	}
+	return tb_id;
+}
+
+static inline int vrf_dev_table(const struct net_device *dev)
+{
+	int tb_id;
+
+	rcu_read_lock();
+	tb_id = vrf_dev_table_rcu(dev);
+	rcu_read_unlock();
+
+	return tb_id;
+}
+
+/* called with rtnl */
+static inline int vrf_dev_table_rtnl(const struct net_device *dev)
+{
+	int tb_id = 0;
+
+	if (dev) {
+		struct net_vrf_dev *vrf_ptr;
+
+		vrf_ptr = rtnl_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			tb_id = vrf_ptr->tb_id;
+	}
+	return tb_id;
+}
+
+/* caller has already checked netif_is_vrf(dev) */
+static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev)
+{
+	struct rtable *rth = ERR_PTR(-ENETUNREACH);
+	struct net_vrf *vrf = netdev_priv(dev);
+
+	if (vrf) {
+		rth = vrf->rth;
+		atomic_inc(&rth->dst.__refcnt);
+	}
+	return rth;
+}
+
+#else
+static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table_rcu(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table_rtnl(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev)
+{
+	return ERR_PTR(-ENETUNREACH);
+}
+#endif
+
+#endif /* __LINUX_NET_VRF_H */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d450be36add2..313c305fd1ad 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -341,6 +341,15 @@ enum macvlan_macaddr_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VRF section */
+enum {
+	IFLA_VRF_UNSPEC,
+	IFLA_VRF_TABLE,
+	__IFLA_VRF_MAX
+};
+
+#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
-- 
cgit v1.2.3-70-g09d2


From 2377799c084d86d22074cd4acd20edc32024d669 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 13 Jul 2015 12:17:25 +0200
Subject: average: provide macro to create static EWMA

Having the EWMA parameters stored in the runtime struct imposes
memory requirements for the constant values that could just be
inlined in the code. This particularly makes sense if there are
a lot of such structs, for example in mac80211 in the station
table where each station has a number of these in an array, and
there can be many stations.

Provide a macro DECLARE_EWMA() that declares the necessary struct
and inline functions to access it with the parameters hard-coded;
using this also means the user no longer needs to 'select AVERAGE'
as it's entirely self-contained.

In the mac80211 case, on x86-64, this actually slightly *reduces*
code size, while also saving 80 bytes of runtime memory per sta.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/average.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/average.h b/include/linux/average.h
index c6028fd742c1..60f8226c5652 100644
--- a/include/linux/average.h
+++ b/include/linux/average.h
@@ -27,4 +27,43 @@ static inline unsigned long ewma_read(const struct ewma *avg)
 	return avg->internal >> avg->factor;
 }
 
+#define DECLARE_EWMA(name, _factor, _weight)				\
+	struct ewma_##name {						\
+		unsigned long internal;					\
+	};								\
+	static inline void ewma_##name##_init(struct ewma_##name *e)	\
+	{								\
+		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+		e->internal = 0;					\
+	}								\
+	static inline unsigned long					\
+	ewma_##name##_read(struct ewma_##name *e)			\
+	{								\
+		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+		return e->internal >> ilog2(_factor);			\
+	}								\
+	static inline void ewma_##name##_add(struct ewma_##name *e,	\
+					     unsigned long val)		\
+	{								\
+		unsigned long internal = ACCESS_ONCE(e->internal);	\
+		unsigned long weight = ilog2(_weight);			\
+		unsigned long factor = ilog2(_factor);			\
+									\
+		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+									\
+		ACCESS_ONCE(e->internal) = internal ?			\
+			(((internal << weight) - internal) +		\
+				(val << factor)) >> weight :		\
+			(val << factor);				\
+	}
+
 #endif /* _LINUX_AVERAGE_H */
-- 
cgit v1.2.3-70-g09d2


From 8f9c98df949333f08b74e5df1caacf7e2c5e8552 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Sun, 19 Jul 2015 16:09:12 +0300
Subject: mac80211: fix BIT position for TDLS WIDE extended cap

The bit was not according to ieee80211 specification.
Fix that.

Reviewed-by: Arik Nemtsov <arik@wizery.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index b9c7897dc566..cfa906f28b7a 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2074,8 +2074,8 @@ enum ieee80211_tdls_actioncode {
 #define WLAN_EXT_CAPA5_TDLS_PROHIBITED	BIT(6)
 #define WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED	BIT(7)
 
+#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED	BIT(5)
 #define WLAN_EXT_CAPA8_OPMODE_NOTIF	BIT(6)
-#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED	BIT(7)
 
 /* TDLS specific payload type in the LLC/SNAP header */
 #define WLAN_TDLS_SNAP_RFTYPE	0x2
-- 
cgit v1.2.3-70-g09d2


From 92281dee825f6d2eb07c441437e4196a44b0861c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 10 Aug 2015 23:07:06 -0400
Subject: arch: introduce memremap()

Existing users of ioremap_cache() are mapping memory that is known in
advance to not have i/o side effects.  These users are forced to cast
away the __iomem annotation, or otherwise neglect to fix the sparse
errors thrown when dereferencing pointers to this memory.  Provide
memremap() as a non __iomem annotated ioremap_*() in the case when
ioremap is otherwise a pointer to cacheable memory. Empirically,
ioremap_<cacheable-type>() call sites are seeking memory-like semantics
(e.g.  speculative reads, and prefetching permitted).

memremap() is a break from the ioremap implementation pattern of adding
a new memremap_<type>() for each mapping type and having silent
compatibility fall backs.  Instead, the implementation defines flags
that are passed to the central memremap() and if a mapping type is not
supported by an arch memremap returns NULL.

We introduce a memremap prototype as a trivial wrapper of
ioremap_cache() and ioremap_wt().  Later, once all ioremap_cache() and
ioremap_wt() usage has been removed from drivers we teach archs to
implement arch_memremap() with the ability to strictly enforce the
mapping type.

Cc: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/ia64/include/asm/io.h   |  1 +
 arch/sh/include/asm/io.h     |  1 +
 arch/xtensa/include/asm/io.h |  1 +
 include/linux/io.h           |  9 ++++
 kernel/Makefile              |  2 +
 kernel/memremap.c            | 98 ++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 112 insertions(+)
 create mode 100644 kernel/memremap.c

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h
index 80a7e34be009..9041bbe2b7b4 100644
--- a/arch/ia64/include/asm/io.h
+++ b/arch/ia64/include/asm/io.h
@@ -435,6 +435,7 @@ static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned lo
 {
 	return ioremap(phys_addr, size);
 }
+#define ioremap_cache ioremap_cache
 
 
 /*
diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index 728c4c571f40..6194e20fccca 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -342,6 +342,7 @@ ioremap_cache(phys_addr_t offset, unsigned long size)
 {
 	return __ioremap_mode(offset, size, PAGE_KERNEL);
 }
+#define ioremap_cache ioremap_cache
 
 #ifdef CONFIG_HAVE_IOREMAP_PROT
 static inline void __iomem *
diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h
index c39bb6e61911..867840f5400f 100644
--- a/arch/xtensa/include/asm/io.h
+++ b/arch/xtensa/include/asm/io.h
@@ -57,6 +57,7 @@ static inline void __iomem *ioremap_cache(unsigned long offset,
 	else
 		BUG();
 }
+#define ioremap_cache ioremap_cache
 
 #define ioremap_wc ioremap_nocache
 #define ioremap_wt ioremap_nocache
diff --git a/include/linux/io.h b/include/linux/io.h
index fb5a99800e77..3fcf6256c088 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -121,4 +121,13 @@ static inline int arch_phys_wc_index(int handle)
 #endif
 #endif
 
+enum {
+	/* See memremap() kernel-doc for usage description... */
+	MEMREMAP_WB = 1 << 0,
+	MEMREMAP_WT = 1 << 1,
+};
+
+void *memremap(resource_size_t offset, size_t size, unsigned long flags);
+void memunmap(void *addr);
+
 #endif /* _LINUX_IO_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..92866d36e376 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -99,6 +99,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
 
+obj-$(CONFIG_HAS_IOMEM) += memremap.o
+
 $(obj)/configs.o: $(obj)/config_data.h
 
 # config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/memremap.c b/kernel/memremap.c
new file mode 100644
index 000000000000..a293de52e837
--- /dev/null
+++ b/kernel/memremap.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#ifndef ioremap_cache
+/* temporary while we convert existing ioremap_cache users to memremap */
+__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
+{
+	return ioremap(offset, size);
+}
+#endif
+
+/**
+ * memremap() - remap an iomem_resource as cacheable memory
+ * @offset: iomem resource start address
+ * @size: size of remap
+ * @flags: either MEMREMAP_WB or MEMREMAP_WT
+ *
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable.
+ *
+ * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * the architecture.  This is usually a read-allocate write-back cache.
+ * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * memremap() will bypass establishing a new mapping and instead return
+ * a pointer into the direct map.
+ *
+ * MEMREMAP_WT - establish a mapping whereby writes either bypass the
+ * cache or are written through to memory and never exist in a
+ * cache-dirty state with respect to program visibility.  Attempts to
+ * map "System RAM" with this mapping type will fail.
+ */
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+	int is_ram = region_intersects(offset, size, "System RAM");
+	void *addr = NULL;
+
+	if (is_ram == REGION_MIXED) {
+		WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
+				&offset, (unsigned long) size);
+		return NULL;
+	}
+
+	/* Try all mapping types requested until one returns non-NULL */
+	if (flags & MEMREMAP_WB) {
+		flags &= ~MEMREMAP_WB;
+		/*
+		 * MEMREMAP_WB is special in that it can be satisifed
+		 * from the direct map.  Some archs depend on the
+		 * capability of memremap() to autodetect cases where
+		 * the requested range is potentially in "System RAM"
+		 */
+		if (is_ram == REGION_INTERSECTS)
+			addr = __va(offset);
+		else
+			addr = ioremap_cache(offset, size);
+	}
+
+	/*
+	 * If we don't have a mapping yet and more request flags are
+	 * pending then we will be attempting to establish a new virtual
+	 * address mapping.  Enforce that this mapping is not aliasing
+	 * "System RAM"
+	 */
+	if (!addr && is_ram == REGION_INTERSECTS && flags) {
+		WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
+				&offset, (unsigned long) size);
+		return NULL;
+	}
+
+	if (!addr && (flags & MEMREMAP_WT)) {
+		flags &= ~MEMREMAP_WT;
+		addr = ioremap_wt(offset, size);
+	}
+
+	return addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		iounmap((void __iomem *) addr);
+}
+EXPORT_SYMBOL(memunmap);
-- 
cgit v1.2.3-70-g09d2


From e836a256e8fd579c9d7a3685f22981225a1ca451 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 Aug 2015 18:42:56 -0400
Subject: pmem: convert to generic memremap

Kill arch_memremap_pmem() and just let the architecture specify the
flags to be passed to memremap().  Default to writethrough by default.

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/include/asm/io.h         |  6 +-----
 include/linux/pmem.h              | 28 +++++++++-------------------
 tools/testing/nvdimm/Kbuild       |  4 ++--
 tools/testing/nvdimm/test/iomap.c | 34 +++++++++++++++++++++++++---------
 4 files changed, 37 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index cc9c61bc1abe..d241fbd5c87b 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -248,11 +248,7 @@ static inline void flush_write_buffers(void)
 #endif
 }
 
-static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
-	unsigned long size)
-{
-	return (void __force __pmem *) ioremap_cache(offset, size);
-}
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
 
 #endif /* __KERNEL__ */
 
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index d2114045a6c4..093c35ecefcc 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -28,12 +28,6 @@ static inline bool __arch_has_wmb_pmem(void)
 	return false;
 }
 
-static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
-		unsigned long size)
-{
-	return NULL;
-}
-
 static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
 		size_t n)
 {
@@ -43,8 +37,8 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
 
 /*
  * Architectures that define ARCH_HAS_PMEM_API must provide
- * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(),
- * arch_wmb_pmem(), and __arch_has_wmb_pmem().
+ * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), and
+ * __arch_has_wmb_pmem().
  */
 
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
@@ -54,7 +48,7 @@ static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t si
 
 static inline void memunmap_pmem(void __pmem *addr)
 {
-	iounmap((void __force __iomem *) addr);
+	memunmap((void __force *) addr);
 }
 
 /**
@@ -85,18 +79,12 @@ static inline bool arch_has_pmem_api(void)
  * default_memremap_pmem + default_memcpy_to_pmem is sufficient for
  * making data durable relative to i/o completion.
  */
-static void default_memcpy_to_pmem(void __pmem *dst, const void *src,
+static inline void default_memcpy_to_pmem(void __pmem *dst, const void *src,
 		size_t size)
 {
 	memcpy((void __force *) dst, src, size);
 }
 
-static void __pmem *default_memremap_pmem(resource_size_t offset,
-		unsigned long size)
-{
-	return (void __pmem __force *)ioremap_wt(offset, size);
-}
-
 /**
  * memremap_pmem - map physical persistent memory for pmem api
  * @offset: physical address of persistent memory
@@ -112,9 +100,11 @@ static void __pmem *default_memremap_pmem(resource_size_t offset,
 static inline void __pmem *memremap_pmem(resource_size_t offset,
 		unsigned long size)
 {
-	if (arch_has_pmem_api())
-		return arch_memremap_pmem(offset, size);
-	return default_memremap_pmem(offset, size);
+#ifdef ARCH_MEMREMAP_PMEM
+	return (void __pmem *) memremap(offset, size, ARCH_MEMREMAP_PMEM);
+#else
+	return (void __pmem *) memremap(offset, size, MEMREMAP_WT);
+#endif
 }
 
 /**
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index f56914c7929b..8032a49f7873 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -1,7 +1,7 @@
-ldflags-y += --wrap=ioremap_wt
 ldflags-y += --wrap=ioremap_wc
 ldflags-y += --wrap=devm_ioremap_nocache
-ldflags-y += --wrap=ioremap_cache
+ldflags-y += --wrap=memremap
+ldflags-y += --wrap=memunmap
 ldflags-y += --wrap=ioremap_nocache
 ldflags-y += --wrap=iounmap
 ldflags-y += --wrap=__request_region
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 64bfaa50831c..21288f34a5ca 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -80,11 +80,20 @@ void __iomem *__wrap_devm_ioremap_nocache(struct device *dev,
 }
 EXPORT_SYMBOL(__wrap_devm_ioremap_nocache);
 
-void __iomem *__wrap_ioremap_cache(resource_size_t offset, unsigned long size)
+void *__wrap_memremap(resource_size_t offset, size_t size,
+		unsigned long flags)
 {
-	return __nfit_test_ioremap(offset, size, ioremap_cache);
+	struct nfit_test_resource *nfit_res;
+
+	rcu_read_lock();
+	nfit_res = get_nfit_res(offset);
+	rcu_read_unlock();
+	if (nfit_res)
+		return (void __iomem *) nfit_res->buf + offset
+			- nfit_res->res->start;
+	return memremap(offset, size, flags);
 }
-EXPORT_SYMBOL(__wrap_ioremap_cache);
+EXPORT_SYMBOL(__wrap_memremap);
 
 void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
 {
@@ -92,12 +101,6 @@ void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
 }
 EXPORT_SYMBOL(__wrap_ioremap_nocache);
 
-void __iomem *__wrap_ioremap_wt(resource_size_t offset, unsigned long size)
-{
-	return __nfit_test_ioremap(offset, size, ioremap_wt);
-}
-EXPORT_SYMBOL(__wrap_ioremap_wt);
-
 void __iomem *__wrap_ioremap_wc(resource_size_t offset, unsigned long size)
 {
 	return __nfit_test_ioremap(offset, size, ioremap_wc);
@@ -117,6 +120,19 @@ void __wrap_iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(__wrap_iounmap);
 
+void __wrap_memunmap(void *addr)
+{
+	struct nfit_test_resource *nfit_res;
+
+	rcu_read_lock();
+	nfit_res = get_nfit_res((unsigned long) addr);
+	rcu_read_unlock();
+	if (nfit_res)
+		return;
+	return memunmap(addr);
+}
+EXPORT_SYMBOL(__wrap_memunmap);
+
 struct resource *__wrap___request_region(struct resource *parent,
 		resource_size_t start, resource_size_t n, const char *name,
 		int flags)
-- 
cgit v1.2.3-70-g09d2


From 7d3dcf26a6559fa82af3f53e2c8b163cec95fdaf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Aug 2015 23:07:07 -0400
Subject: devres: add devm_memremap

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/io.h |  4 ++++
 kernel/memremap.c  | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io.h b/include/linux/io.h
index 3fcf6256c088..d8d749abd665 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -80,6 +80,10 @@ int check_signature(const volatile void __iomem *io_addr,
 			const unsigned char *signature, int length);
 void devm_ioremap_release(struct device *dev, void *res);
 
+void *devm_memremap(struct device *dev, resource_size_t offset,
+		size_t size, unsigned long flags);
+void devm_memunmap(struct device *dev, void *addr);
+
 /*
  * Some systems do not have legacy ISA devices.
  * /dev/port is not a valid interface on these systems.
diff --git a/kernel/memremap.c b/kernel/memremap.c
index a293de52e837..5c9b55eaf121 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -10,6 +10,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  */
+#include <linux/device.h>
 #include <linux/types.h>
 #include <linux/io.h>
 #include <linux/mm.h>
@@ -96,3 +97,41 @@ void memunmap(void *addr)
 		iounmap((void __iomem *) addr);
 }
 EXPORT_SYMBOL(memunmap);
+
+static void devm_memremap_release(struct device *dev, void *res)
+{
+	memunmap(res);
+}
+
+static int devm_memremap_match(struct device *dev, void *res, void *match_data)
+{
+	return *(void **)res == match_data;
+}
+
+void *devm_memremap(struct device *dev, resource_size_t offset,
+		size_t size, unsigned long flags)
+{
+	void **ptr, *addr;
+
+	ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return NULL;
+
+	addr = memremap(offset, size, flags);
+	if (addr) {
+		*ptr = addr;
+		devres_add(dev, ptr);
+	} else
+		devres_free(ptr);
+
+	return addr;
+}
+EXPORT_SYMBOL(devm_memremap);
+
+void devm_memunmap(struct device *dev, void *addr)
+{
+	WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match,
+			       addr));
+	memunmap(addr);
+}
+EXPORT_SYMBOL(devm_memunmap);
-- 
cgit v1.2.3-70-g09d2


From 708ab62bef1ed3a3cf065a4138bd87f5d083cfeb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Aug 2015 23:07:08 -0400
Subject: pmem: switch to devm_ allocations

Signed-off-by: Christoph Hellwig <hch@lst.de>
[djbw: tools/testing/nvdimm/ and memunmap_pmem support]
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pmem.c             | 36 ++++++++++--------------------
 include/linux/pmem.h              | 14 +++++++-----
 tools/testing/nvdimm/Kbuild       |  4 ++--
 tools/testing/nvdimm/test/iomap.c | 46 ++++++++++++++++++++++-----------------
 4 files changed, 47 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index bcf48f133443..eb7552d939e1 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -119,7 +119,7 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 {
 	struct pmem_device *pmem;
 
-	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
 		return ERR_PTR(-ENOMEM);
 
@@ -128,19 +128,16 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 	if (!arch_has_pmem_api())
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
-	if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) {
+	if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
+			dev_name(dev))) {
 		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
 				&pmem->phys_addr, pmem->size);
-		kfree(pmem);
 		return ERR_PTR(-EBUSY);
 	}
 
-	pmem->virt_addr = memremap_pmem(pmem->phys_addr, pmem->size);
-	if (!pmem->virt_addr) {
-		release_mem_region(pmem->phys_addr, pmem->size);
-		kfree(pmem);
+	pmem->virt_addr = memremap_pmem(dev, pmem->phys_addr, pmem->size);
+	if (!pmem->virt_addr)
 		return ERR_PTR(-ENXIO);
-	}
 
 	return pmem;
 }
@@ -210,20 +207,12 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
 	return 0;
 }
 
-static void pmem_free(struct pmem_device *pmem)
-{
-	memunmap_pmem(pmem->virt_addr);
-	release_mem_region(pmem->phys_addr, pmem->size);
-	kfree(pmem);
-}
-
 static int nd_pmem_probe(struct device *dev)
 {
 	struct nd_region *nd_region = to_nd_region(dev->parent);
 	struct nd_namespace_common *ndns;
 	struct nd_namespace_io *nsio;
 	struct pmem_device *pmem;
-	int rc;
 
 	ndns = nvdimm_namespace_common_probe(dev);
 	if (IS_ERR(ndns))
@@ -236,16 +225,14 @@ static int nd_pmem_probe(struct device *dev)
 
 	dev_set_drvdata(dev, pmem);
 	ndns->rw_bytes = pmem_rw_bytes;
+
 	if (is_nd_btt(dev))
-		rc = nvdimm_namespace_attach_btt(ndns);
-	else if (nd_btt_probe(ndns, pmem) == 0) {
+		return nvdimm_namespace_attach_btt(ndns);
+
+	if (nd_btt_probe(ndns, pmem) == 0)
 		/* we'll come back as btt-pmem */
-		rc = -ENXIO;
-	} else
-		rc = pmem_attach_disk(ndns, pmem);
-	if (rc)
-		pmem_free(pmem);
-	return rc;
+		return -ENXIO;
+	return pmem_attach_disk(ndns, pmem);
 }
 
 static int nd_pmem_remove(struct device *dev)
@@ -256,7 +243,6 @@ static int nd_pmem_remove(struct device *dev)
 		nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
 	else
 		pmem_detach_disk(pmem);
-	pmem_free(pmem);
 
 	return 0;
 }
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index 093c35ecefcc..20c367cd76e6 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -46,9 +46,9 @@ static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t si
 	memcpy(dst, (void __force const *) src, size);
 }
 
-static inline void memunmap_pmem(void __pmem *addr)
+static inline void memunmap_pmem(struct device *dev, void __pmem *addr)
 {
-	memunmap((void __force *) addr);
+	devm_memunmap(dev, (void __force *) addr);
 }
 
 /**
@@ -97,13 +97,15 @@ static inline void default_memcpy_to_pmem(void __pmem *dst, const void *src,
  * wmb_pmem() arrange for the data to be written through the
  * cache to persistent media.
  */
-static inline void __pmem *memremap_pmem(resource_size_t offset,
-		unsigned long size)
+static inline void __pmem *memremap_pmem(struct device *dev,
+		resource_size_t offset, unsigned long size)
 {
 #ifdef ARCH_MEMREMAP_PMEM
-	return (void __pmem *) memremap(offset, size, ARCH_MEMREMAP_PMEM);
+	return (void __pmem *) devm_memremap(dev, offset, size,
+			ARCH_MEMREMAP_PMEM);
 #else
-	return (void __pmem *) memremap(offset, size, MEMREMAP_WT);
+	return (void __pmem *) devm_memremap(dev, offset, size,
+			MEMREMAP_WT);
 #endif
 }
 
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 8032a49f7873..04c5fc09576d 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -1,9 +1,9 @@
 ldflags-y += --wrap=ioremap_wc
 ldflags-y += --wrap=devm_ioremap_nocache
-ldflags-y += --wrap=memremap
-ldflags-y += --wrap=memunmap
+ldflags-y += --wrap=devm_memremap
 ldflags-y += --wrap=ioremap_nocache
 ldflags-y += --wrap=iounmap
+ldflags-y += --wrap=__devm_request_region
 ldflags-y += --wrap=__request_region
 ldflags-y += --wrap=__release_region
 
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 21288f34a5ca..ff1e00458864 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -80,8 +80,8 @@ void __iomem *__wrap_devm_ioremap_nocache(struct device *dev,
 }
 EXPORT_SYMBOL(__wrap_devm_ioremap_nocache);
 
-void *__wrap_memremap(resource_size_t offset, size_t size,
-		unsigned long flags)
+void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
+		size_t size, unsigned long flags)
 {
 	struct nfit_test_resource *nfit_res;
 
@@ -91,9 +91,9 @@ void *__wrap_memremap(resource_size_t offset, size_t size,
 	if (nfit_res)
 		return (void __iomem *) nfit_res->buf + offset
 			- nfit_res->res->start;
-	return memremap(offset, size, flags);
+	return devm_memremap(dev, offset, size, flags);
 }
-EXPORT_SYMBOL(__wrap_memremap);
+EXPORT_SYMBOL(__wrap_devm_memremap);
 
 void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
 {
@@ -120,22 +120,9 @@ void __wrap_iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(__wrap_iounmap);
 
-void __wrap_memunmap(void *addr)
-{
-	struct nfit_test_resource *nfit_res;
-
-	rcu_read_lock();
-	nfit_res = get_nfit_res((unsigned long) addr);
-	rcu_read_unlock();
-	if (nfit_res)
-		return;
-	return memunmap(addr);
-}
-EXPORT_SYMBOL(__wrap_memunmap);
-
-struct resource *__wrap___request_region(struct resource *parent,
-		resource_size_t start, resource_size_t n, const char *name,
-		int flags)
+static struct resource *nfit_test_request_region(struct device *dev,
+		struct resource *parent, resource_size_t start,
+		resource_size_t n, const char *name, int flags)
 {
 	struct nfit_test_resource *nfit_res;
 
@@ -163,10 +150,29 @@ struct resource *__wrap___request_region(struct resource *parent,
 			return res;
 		}
 	}
+	if (dev)
+		return __devm_request_region(dev, parent, start, n, name);
 	return __request_region(parent, start, n, name, flags);
 }
+
+struct resource *__wrap___request_region(struct resource *parent,
+		resource_size_t start, resource_size_t n, const char *name,
+		int flags)
+{
+	return nfit_test_request_region(NULL, parent, start, n, name, flags);
+}
 EXPORT_SYMBOL(__wrap___request_region);
 
+struct resource *__wrap___devm_request_region(struct device *dev,
+		struct resource *parent, resource_size_t start,
+		resource_size_t n, const char *name)
+{
+	if (!dev)
+		return NULL;
+	return nfit_test_request_region(dev, parent, start, n, name, 0);
+}
+EXPORT_SYMBOL(__wrap___devm_request_region);
+
 void __wrap___release_region(struct resource *parent, resource_size_t start,
 				resource_size_t n)
 {
-- 
cgit v1.2.3-70-g09d2


From f8786a91548df6930643a052e40e5c0b7a8403a5 Mon Sep 17 00:00:00 2001
From: Nikhil Badola <nikhil.badola@freescale.com>
Date: Thu, 6 Aug 2015 14:51:27 +0530
Subject: drivers: usb: fsl: Workaround for USB erratum-A005275

Incoming packets in high speed are randomly corrupted by h/w
resulting in multiple errors. This workaround makes FS as
default mode in all affected socs by disabling HS chirp
signalling.This errata does not affect FS and LS mode.

Forces all HS devices to connect in FS mode for all socs
affected by this erratum:
P3041 and P2041 rev 1.0 and 1.1
P5020 and P5010 rev 1.0 and 2.0
P5040, P1010 and T4240 rev 1.0

Signed-off-by: Ramneek Mehresh <ramneek.mehresh@freescale.com>
Signed-off-by: Nikhil Badola <nikhil.badola@freescale.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/ehci-fsl.c      |  4 ++++
 drivers/usb/host/ehci-hub.c      |  7 +++++++
 drivers/usb/host/ehci.h          | 12 ++++++++++++
 drivers/usb/host/fsl-mph-dr-of.c |  4 ++++
 include/linux/fsl_devices.h      |  1 +
 5 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c
index 202dafb7d0cb..3b6eb219de1a 100644
--- a/drivers/usb/host/ehci-fsl.c
+++ b/drivers/usb/host/ehci-fsl.c
@@ -278,6 +278,10 @@ static int ehci_fsl_usb_setup(struct ehci_hcd *ehci)
 		out_be32(non_ehci + FSL_SOC_USB_SNOOP2, 0x80000000 | SNOOP_SIZE_2GB);
 	}
 
+	/* Deal with USB erratum A-005275 */
+	if (pdata->has_fsl_erratum_a005275 == 1)
+		ehci->has_fsl_hs_errata = 1;
+
 	if ((pdata->operating_mode == FSL_USB2_DR_HOST) ||
 			(pdata->operating_mode == FSL_USB2_DR_OTG))
 		if (ehci_fsl_setup_phy(hcd, pdata->phy_mode, 0))
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index 22abb6830dfa..086a7115d263 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -1221,6 +1221,13 @@ int ehci_hub_control(
 				 */
 				ehci->reset_done [wIndex] = jiffies
 						+ msecs_to_jiffies (50);
+
+				/*
+				 * Force full-speed connect for FSL high-speed
+				 * erratum; disable HS Chirp by setting PFSC bit
+				 */
+				if (ehci_has_fsl_hs_errata(ehci))
+					temp |= (1 << PORTSC_FSL_PFSC);
 			}
 			ehci_writel(ehci, temp, status_reg);
 			break;
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index f700157cd084..46f62e41bcde 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -215,6 +215,7 @@ struct ehci_hcd {			/* one per controller */
 	/* SILICON QUIRKS */
 	unsigned		no_selective_suspend:1;
 	unsigned		has_fsl_port_bug:1; /* FreeScale */
+	unsigned		has_fsl_hs_errata:1;	/* Freescale HS quirk */
 	unsigned		big_endian_mmio:1;
 	unsigned		big_endian_desc:1;
 	unsigned		big_endian_capbase:1;
@@ -686,6 +687,17 @@ ehci_port_speed(struct ehci_hcd *ehci, unsigned int portsc)
 #define	ehci_has_fsl_portno_bug(e)		(0)
 #endif
 
+#define PORTSC_FSL_PFSC	24	/* Port Force Full-Speed Connect */
+
+#if defined(CONFIG_PPC_85xx)
+/* Some Freescale processors have an erratum (USB A-005275) in which
+ * incoming packets get corrupted in HS mode
+ */
+#define ehci_has_fsl_hs_errata(e)	((e)->has_fsl_hs_errata)
+#else
+#define ehci_has_fsl_hs_errata(e)	(0)
+#endif
+
 /*
  * While most USB host controllers implement their registers in
  * little-endian format, a minority (celleb companion chip) implement
diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c
index 9f731413ab3e..534c4c5d278a 100644
--- a/drivers/usb/host/fsl-mph-dr-of.c
+++ b/drivers/usb/host/fsl-mph-dr-of.c
@@ -221,6 +221,10 @@ static int fsl_usb2_mph_dr_of_probe(struct platform_device *ofdev)
 		pdata->has_fsl_erratum_a007792 = 1;
 	else
 		pdata->has_fsl_erratum_a007792 = 0;
+	if (of_get_property(np, "fsl,usb-erratum-a005275", NULL))
+		pdata->has_fsl_erratum_a005275 = 1;
+	else
+		pdata->has_fsl_erratum_a005275 = 0;
 
 	/*
 	 * Determine whether phy_clk_valid needs to be checked
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index cebdbbb4aa69..f2912914141a 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -99,6 +99,7 @@ struct fsl_usb2_platform_data {
 	unsigned	suspended:1;
 	unsigned	already_suspended:1;
 	unsigned        has_fsl_erratum_a007792:1;
+	unsigned        has_fsl_erratum_a005275:1;
 	unsigned        check_phy_clk_valid:1;
 
 	/* register save area for suspend/resume */
-- 
cgit v1.2.3-70-g09d2


From 484ebaedecc5ddf778a30ee1efab367cbee27030 Mon Sep 17 00:00:00 2001
From: Stefan Koch <stefan.koch10@gmail.com>
Date: Sat, 8 Aug 2015 11:32:50 +0200
Subject: usb: interface authorization: Declare authorized attribute

The attribute authorized shows the authorization state for an interface.

Signed-off-by: Stefan Koch <skoch@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 447fe29b55b4..3deccab2ce0a 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -178,6 +178,7 @@ struct usb_interface {
 	unsigned needs_altsetting0:1;	/* switch to altsetting 0 is pending */
 	unsigned needs_binding:1;	/* needs delayed unbind/rebind */
 	unsigned resetting_device:1;	/* true: bandwidth alloc after reset */
+	unsigned authorized:1;		/* used for interface authorization */
 
 	struct device dev;		/* interface specific device info */
 	struct device *usb_dev;
-- 
cgit v1.2.3-70-g09d2


From 1d958bef45030acfc5578263e9de3bb07032b8da Mon Sep 17 00:00:00 2001
From: Stefan Koch <stefan.koch10@gmail.com>
Date: Sat, 8 Aug 2015 11:32:51 +0200
Subject: usb: interface authorization: Introduces the default interface
 authorization

Interfaces are allowed per default.
This can disabled or enabled (again) by writing 0 or 1 to
/sys/bus/usb/devices/usbX/interface_authorized_default

Signed-off-by: Stefan Koch <skoch@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c     | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/usb/core/message.c |  1 +
 include/linux/usb/hcd.h    |  9 +++++++++
 3 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 4d64e5c499e1..83df1dde9c08 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -882,9 +882,53 @@ static ssize_t authorized_default_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(authorized_default);
 
+/*
+ * interface_authorized_default_show - show default authorization status
+ * for USB interfaces
+ *
+ * note: interface_authorized_default is the default value
+ *       for initializing the authorized attribute of interfaces
+ */
+static ssize_t interface_authorized_default_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct usb_device *usb_dev = to_usb_device(dev);
+	struct usb_hcd *hcd = bus_to_hcd(usb_dev->bus);
+
+	return sprintf(buf, "%u\n", !!HCD_INTF_AUTHORIZED(hcd));
+}
+
+/*
+ * interface_authorized_default_store - store default authorization status
+ * for USB interfaces
+ *
+ * note: interface_authorized_default is the default value
+ *       for initializing the authorized attribute of interfaces
+ */
+static ssize_t interface_authorized_default_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct usb_device *usb_dev = to_usb_device(dev);
+	struct usb_hcd *hcd = bus_to_hcd(usb_dev->bus);
+	int rc = count;
+	bool val;
+
+	if (strtobool(buf, &val) != 0)
+		return -EINVAL;
+
+	if (val)
+		set_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags);
+	else
+		clear_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags);
+
+	return rc;
+}
+static DEVICE_ATTR_RW(interface_authorized_default);
+
 /* Group all the USB bus attributes */
 static struct attribute *usb_bus_attrs[] = {
 		&dev_attr_authorized_default.attr,
+		&dev_attr_interface_authorized_default.attr,
 		NULL,
 };
 
@@ -2682,6 +2726,9 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		hcd->authorized_default = authorized_default;
 	set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags);
 
+	/* per default all interfaces are authorized */
+	set_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags);
+
 	/* HC is in reset state, but accessible.  Now do the one-time init,
 	 * bottom up so that hcds can customize the root hubs before hub_wq
 	 * starts talking to them.  (Note, bus id is assigned early too.)
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index f368d2053da5..3d25d89671d7 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1807,6 +1807,7 @@ free_interfaces:
 		intfc = cp->intf_cache[i];
 		intf->altsetting = intfc->altsetting;
 		intf->num_altsetting = intfc->num_altsetting;
+		intf->authorized = !!HCD_INTF_AUTHORIZED(hcd);
 		kref_get(&intfc->ref);
 
 		alt = usb_altnum_to_altsetting(intf, 0);
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index d2784c10bfe2..36ce3d215cad 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -120,6 +120,7 @@ struct usb_hcd {
 #define HCD_FLAG_WAKEUP_PENDING		4	/* root hub is resuming? */
 #define HCD_FLAG_RH_RUNNING		5	/* root hub is running? */
 #define HCD_FLAG_DEAD			6	/* controller has died? */
+#define HCD_FLAG_INTF_AUTHORIZED	7	/* authorize interfaces? */
 
 	/* The flags can be tested using these macros; they are likely to
 	 * be slightly faster than test_bit().
@@ -131,6 +132,14 @@ struct usb_hcd {
 #define HCD_RH_RUNNING(hcd)	((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING))
 #define HCD_DEAD(hcd)		((hcd)->flags & (1U << HCD_FLAG_DEAD))
 
+	/*
+	 * Specifies if interfaces are authorized by default
+	 * or they require explicit user space authorization; this bit is
+	 * settable through /sys/class/usb_host/X/interface_authorized_default
+	 */
+#define HCD_INTF_AUTHORIZED(hcd) \
+	((hcd)->flags & (1U << HCD_FLAG_INTF_AUTHORIZED))
+
 	/* Flags that get set only during HCD registration or removal. */
 	unsigned		rh_registered:1;/* is root hub registered? */
 	unsigned		rh_pollable:1;	/* may we poll the root hub? */
-- 
cgit v1.2.3-70-g09d2


From 3cf1fc80655d3af7083ea4b3615e5f8532543be7 Mon Sep 17 00:00:00 2001
From: Stefan Koch <stefan.koch10@gmail.com>
Date: Sat, 8 Aug 2015 11:32:56 +0200
Subject: usb: interface authorization: Use a flag for the default device
 authorization

With this patch a flag instead of a variable
is used for the default device authorization.

Signed-off-by: Stefan Koch <skoch@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c  | 31 +++++++++++++++++++++----------
 drivers/usb/core/usb.c  |  2 +-
 include/linux/usb/hcd.h | 16 +++++++++-------
 3 files changed, 31 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 83df1dde9c08..86b3d1190500 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -854,10 +854,10 @@ static ssize_t authorized_default_show(struct device *dev,
 {
 	struct usb_device *rh_usb_dev = to_usb_device(dev);
 	struct usb_bus *usb_bus = rh_usb_dev->bus;
-	struct usb_hcd *usb_hcd;
+	struct usb_hcd *hcd;
 
-	usb_hcd = bus_to_hcd(usb_bus);
-	return snprintf(buf, PAGE_SIZE, "%u\n", usb_hcd->authorized_default);
+	hcd = bus_to_hcd(usb_bus);
+	return snprintf(buf, PAGE_SIZE, "%u\n", !!HCD_DEV_AUTHORIZED(hcd));
 }
 
 static ssize_t authorized_default_store(struct device *dev,
@@ -868,12 +868,16 @@ static ssize_t authorized_default_store(struct device *dev,
 	unsigned val;
 	struct usb_device *rh_usb_dev = to_usb_device(dev);
 	struct usb_bus *usb_bus = rh_usb_dev->bus;
-	struct usb_hcd *usb_hcd;
+	struct usb_hcd *hcd;
 
-	usb_hcd = bus_to_hcd(usb_bus);
+	hcd = bus_to_hcd(usb_bus);
 	result = sscanf(buf, "%u\n", &val);
 	if (result == 1) {
-		usb_hcd->authorized_default = val ? 1 : 0;
+		if (val)
+			set_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
+		else
+			clear_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
+
 		result = size;
 	} else {
 		result = -EINVAL;
@@ -2720,10 +2724,17 @@ int usb_add_hcd(struct usb_hcd *hcd,
 	dev_info(hcd->self.controller, "%s\n", hcd->product_desc);
 
 	/* Keep old behaviour if authorized_default is not in [0, 1]. */
-	if (authorized_default < 0 || authorized_default > 1)
-		hcd->authorized_default = hcd->wireless ? 0 : 1;
-	else
-		hcd->authorized_default = authorized_default;
+	if (authorized_default < 0 || authorized_default > 1) {
+		if (hcd->wireless)
+			clear_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
+		else
+			set_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
+	} else {
+		if (authorized_default)
+			set_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
+		else
+			clear_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
+	}
 	set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags);
 
 	/* per default all interfaces are authorized */
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 8d5b2f4113cd..f8bbd0b6d9fe 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -510,7 +510,7 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent,
 	if (root_hub)	/* Root hub always ok [and always wired] */
 		dev->authorized = 1;
 	else {
-		dev->authorized = usb_hcd->authorized_default;
+		dev->authorized = !!HCD_DEV_AUTHORIZED(usb_hcd);
 		dev->wusb = usb_bus_is_wusb(bus) ? 1 : 0;
 	}
 	return dev;
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 36ce3d215cad..4d147277b30d 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -58,12 +58,6 @@
  *
  * Since "struct usb_bus" is so thin, you can't share much code in it.
  * This framework is a layer over that, and should be more sharable.
- *
- * @authorized_default: Specifies if new devices are authorized to
- *                      connect by default or they require explicit
- *                      user space authorization; this bit is settable
- *                      through /sys/class/usb_host/X/authorized_default.
- *                      For the rest is RO, so we don't lock to r/w it.
  */
 
 /*-------------------------------------------------------------------------*/
@@ -121,6 +115,7 @@ struct usb_hcd {
 #define HCD_FLAG_RH_RUNNING		5	/* root hub is running? */
 #define HCD_FLAG_DEAD			6	/* controller has died? */
 #define HCD_FLAG_INTF_AUTHORIZED	7	/* authorize interfaces? */
+#define HCD_FLAG_DEV_AUTHORIZED		8	/* authorize devices? */
 
 	/* The flags can be tested using these macros; they are likely to
 	 * be slightly faster than test_bit().
@@ -140,6 +135,14 @@ struct usb_hcd {
 #define HCD_INTF_AUTHORIZED(hcd) \
 	((hcd)->flags & (1U << HCD_FLAG_INTF_AUTHORIZED))
 
+	/*
+	 * Specifies if devices are authorized by default
+	 * or they require explicit user space authorization; this bit is
+	 * settable through /sys/class/usb_host/X/authorized_default
+	 */
+#define HCD_DEV_AUTHORIZED(hcd) \
+	((hcd)->flags & (1U << HCD_FLAG_DEV_AUTHORIZED))
+
 	/* Flags that get set only during HCD registration or removal. */
 	unsigned		rh_registered:1;/* is root hub registered? */
 	unsigned		rh_pollable:1;	/* may we poll the root hub? */
@@ -150,7 +153,6 @@ struct usb_hcd {
 	 * support the new root-hub polling mechanism. */
 	unsigned		uses_new_polling:1;
 	unsigned		wireless:1;	/* Wireless USB HCD */
-	unsigned		authorized_default:1;
 	unsigned		has_tt:1;	/* Integrated TT in root hub */
 	unsigned		amd_resume_bug:1; /* AMD remote wakeup quirk */
 	unsigned		can_do_streams:1; /* HC supports streams */
-- 
cgit v1.2.3-70-g09d2


From b4f194706dd98d6de21c97eeb888a75abbf75174 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Date: Mon, 10 Aug 2015 16:23:09 +0200
Subject: USB: host: ohci-at91: move at91_usbh_data definition in c file

Move struct at91_usbh_data back in ohci-at91.c as this is the only user
left after switching all at91 platforms to DT only.

Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/ohci-at91.c        | 11 +++++++++++
 include/linux/platform_data/atmel.h | 12 ------------
 2 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ohci-at91.c b/drivers/usb/host/ohci-at91.c
index 15df00cceed9..72dc4f9e2a0f 100644
--- a/drivers/usb/host/ohci-at91.c
+++ b/drivers/usb/host/ohci-at91.c
@@ -36,6 +36,17 @@
 #define hcd_to_ohci_at91_priv(h) \
 	((struct ohci_at91_priv *)hcd_to_ohci(h)->priv)
 
+#define AT91_MAX_USBH_PORTS	3
+struct at91_usbh_data {
+	int vbus_pin[AT91_MAX_USBH_PORTS];	/* port power-control pin */
+	int overcurrent_pin[AT91_MAX_USBH_PORTS];
+	u8 ports;				/* number of ports on root hub */
+	u8 overcurrent_supported;
+	u8 vbus_pin_active_low[AT91_MAX_USBH_PORTS];
+	u8 overcurrent_status[AT91_MAX_USBH_PORTS];
+	u8 overcurrent_changed[AT91_MAX_USBH_PORTS];
+};
+
 struct ohci_at91_priv {
 	struct clk *iclk;
 	struct clk *fclk;
diff --git a/include/linux/platform_data/atmel.h b/include/linux/platform_data/atmel.h
index 4b452c6a2f7b..527a85c61924 100644
--- a/include/linux/platform_data/atmel.h
+++ b/include/linux/platform_data/atmel.h
@@ -46,18 +46,6 @@ struct at91_cf_data {
 #define AT91_IDE_SWAP_A0_A2	0x02
 };
 
- /* USB Host */
-#define AT91_MAX_USBH_PORTS	3
-struct at91_usbh_data {
-	int		vbus_pin[AT91_MAX_USBH_PORTS];	/* port power-control pin */
-	int             overcurrent_pin[AT91_MAX_USBH_PORTS];
-	u8		ports;				/* number of ports on root hub */
-	u8              overcurrent_supported;
-	u8              vbus_pin_active_low[AT91_MAX_USBH_PORTS];
-	u8              overcurrent_status[AT91_MAX_USBH_PORTS];
-	u8              overcurrent_changed[AT91_MAX_USBH_PORTS];
-};
-
  /* NAND / SmartMedia */
 struct atmel_nand_data {
 	int		enable_pin;		/* chip enable */
-- 
cgit v1.2.3-70-g09d2


From bee9182d955227f01ff3b80c4cb6acca9bb40b11 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sun, 19 Jul 2015 23:48:20 +0200
Subject: introduce __sb_writers_{acquired,release}() helpers

Preparation to hide the sb->s_writers internals from xfs and btrfs.
Add 2 trivial define's they can use rather than play with ->s_writers
directly. No changes in btrfs/transaction.o and xfs/xfs_aops.o.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
---
 fs/btrfs/transaction.c | 8 ++------
 fs/xfs/xfs_aops.c      | 6 ++----
 include/linux/fs.h     | 5 +++++
 3 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f5021fcb154e..a8ab8f5ef38e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1638,9 +1638,7 @@ static void do_async_commit(struct work_struct *work)
 	 * Tell lockdep about it.
 	 */
 	if (ac->newtrans->type & __TRANS_FREEZABLE)
-		rwsem_acquire_read(
-		     &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-		     0, 1, _THIS_IP_);
+		__sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS);
 
 	current->journal_info = ac->newtrans;
 
@@ -1679,9 +1677,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	 * async commit thread will be the one to unlock it.
 	 */
 	if (ac->newtrans->type & __TRANS_FREEZABLE)
-		rwsem_release(
-			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			1, _THIS_IP_);
+		__sb_writers_release(root->fs_info->sb, SB_FREEZE_FS);
 
 	schedule_work(&ac->work);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3859f5e27a4d..9bbb3507376a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc(
 	 * We may pass freeze protection with a transaction.  So tell lockdep
 	 * we released it.
 	 */
-	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-		      1, _THIS_IP_);
+	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
 	/*
 	 * We hand off the transaction to the completion thread now, so
 	 * clear the flag here.
@@ -171,8 +170,7 @@ xfs_setfilesize_ioend(
 	 * Similarly for freeze protection.
 	 */
 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
-	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			   0, 1, _THIS_IP_);
+	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
 	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 84b783f277f7..acb7cad84edd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1391,6 +1391,11 @@ extern struct timespec current_fs_time(struct super_block *sb);
 void __sb_end_write(struct super_block *sb, int level);
 int __sb_start_write(struct super_block *sb, int level, bool wait);
 
+#define __sb_writers_acquired(sb, lev)	\
+	rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_)
+#define __sb_writers_release(sb, lev)	\
+	rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_)
+
 /**
  * sb_end_write - drop write access to a superblock
  * @sb: the super we wrote to
-- 
cgit v1.2.3-70-g09d2


From 9287f6925ad9d8fb8c6283066b4f77fd87f123a9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 17:45:57 +0200
Subject: percpu-rwsem: introduce percpu_down_read_trylock()

Add percpu_down_read_trylock(), it will have the user soon.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/percpu-rwsem.h  |  1 +
 kernel/locking/percpu-rwsem.c | 13 +++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 3e88c9a7d57f..16c30cd33501 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -16,6 +16,7 @@ struct percpu_rw_semaphore {
 };
 
 extern void percpu_down_read(struct percpu_rw_semaphore *);
+extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *);
 extern void percpu_up_read(struct percpu_rw_semaphore *);
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..f32567254867 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw)
 	__up_read(&brw->rw_sem);
 }
 
+int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
+{
+	if (unlikely(!update_fast_ctr(brw, +1))) {
+		if (!__down_read_trylock(&brw->rw_sem))
+			return 0;
+		atomic_inc(&brw->slow_read_ctr);
+		__up_read(&brw->rw_sem);
+	}
+
+	rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+	return 1;
+}
+
 void percpu_up_read(struct percpu_rw_semaphore *brw)
 {
 	rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-- 
cgit v1.2.3-70-g09d2


From 55cc156505f2e43fa45dbd4bfe8f9c9d848ca44c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Jul 2015 20:26:44 +0200
Subject: percpu-rwsem: introduce percpu_rwsem_release() and
 percpu_rwsem_acquire()

Add percpu_rwsem_release() and percpu_rwsem_acquire() for the users
which need to return to userspace with percpu-rwsem lock held and/or
pass the ownership to another thread.

TODO: change percpu_rwsem_release() to use rwsem_clear_owner(). We can
either fold kernel/locking/rwsem.h into include/linux/rwsem.h, or add
the non-inline percpu_rwsem_clear_owner().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/percpu-rwsem.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 16c30cd33501..834c4e52cb2d 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -32,4 +32,23 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 	__percpu_init_rwsem(brw, #brw, &rwsem_key);		\
 })
 
+
+#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
+
+static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
+					bool read, unsigned long ip)
+{
+	lock_release(&sem->rw_sem.dep_map, 1, ip);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+	if (!read)
+		sem->rw_sem.owner = NULL;
+#endif
+}
+
+static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
+					bool read, unsigned long ip)
+{
+	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
+}
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From 853b39a7c82826b8413048feec7bf08e98ce7a84 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jul 2015 20:21:13 +0200
Subject: shift percpu_counter_destroy() into destroy_super_work()

Of course, this patch is ugly as hell. It will be (partially)
reverted later. We add it to ensure that other WIP changes in
percpu_rw_semaphore won't break fs/super.c.

We do not even need this change right now, percpu_free_rwsem()
is fine in atomic context. But we are going to change this, it
will be might_sleep() after we merge the rcu_sync() patches.

And even after that we do not really need destroy_super_work(),
we will kill it in any case. Instead, destroy_super_rcu() should
just check that rss->cb_state == CB_IDLE and do call_rcu() again
in the (very unlikely) case this is not true.

So this is just the temporary kludge which helps us to avoid the
conflicts with the changes which will be (hopefully) routed via
rcu tree.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
---
 fs/super.c         | 23 +++++++++++++++++++----
 include/linux/fs.h |  3 ++-
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 8aa3cbc571d1..c937bd7b4d33 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	return total_objects;
 }
 
+static void destroy_super_work(struct work_struct *work)
+{
+	struct super_block *s = container_of(work, struct super_block,
+							destroy_work);
+	int i;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++)
+		percpu_counter_destroy(&s->s_writers.counter[i]);
+	kfree(s);
+}
+
+static void destroy_super_rcu(struct rcu_head *head)
+{
+	struct super_block *s = container_of(head, struct super_block, rcu);
+	INIT_WORK(&s->destroy_work, destroy_super_work);
+	schedule_work(&s->destroy_work);
+}
+
 /**
  *	destroy_super	-	frees a superblock
  *	@s: superblock to free
@@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink,
  */
 static void destroy_super(struct super_block *s)
 {
-	int i;
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
-	for (i = 0; i < SB_FREEZE_LEVELS; i++)
-		percpu_counter_destroy(&s->s_writers.counter[i]);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
 	kfree(s->s_options);
-	kfree_rcu(s, rcu);
+	call_rcu(&s->rcu, destroy_super_rcu);
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index acb7cad84edd..4bed78966c6b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -30,6 +30,7 @@
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
+#include <linux/workqueue.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1375,7 +1376,7 @@ struct super_block {
 	struct list_lru		s_dentry_lru ____cacheline_aligned_in_smp;
 	struct list_lru		s_inode_lru ____cacheline_aligned_in_smp;
 	struct rcu_head		rcu;
-
+	struct work_struct	destroy_work;
 	/*
 	 * Indicates how deep in a filesystem stack this SB is
 	 */
-- 
cgit v1.2.3-70-g09d2


From 8129ed29644bf56ed17ec1bbbeed5c568b43d6a0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 11 Aug 2015 17:05:04 +0200
Subject: change sb_writers to use percpu_rw_semaphore

We can remove everything from struct sb_writers except frozen
and add the array of percpu_rw_semaphore's instead.

This patch doesn't remove sb_writers->wait_unfrozen yet, we keep
it for get_super_thawed(). We will probably remove it later.

This change tries to address the following problems:

	- Firstly, __sb_start_write() looks simply buggy. It does
	  __sb_end_write() if it sees ->frozen, but if it migrates
	  to another CPU before percpu_counter_dec(), sb_wait_write()
	  can wrongly succeed if there is another task which holds
	  the same "semaphore": sb_wait_write() can miss the result
	  of the previous percpu_counter_inc() but see the result
	  of this percpu_counter_dec().

	- As Dave Hansen reports, it is suboptimal. The trivial
	  microbenchmark that writes to a tmpfs file in a loop runs
	  12% faster if we change this code to rely on RCU and kill
	  the memory barriers.

	- This code doesn't look simple. It would be better to rely
	  on the generic locking code.

	  According to Dave, this change adds the same performance
	  improvement.

Note: with this change both freeze_super() and thaw_super() will do
synchronize_sched_expedited() 3 times. This is just ugly. But:

	- This will be "fixed" by the rcu_sync changes we are going
	  to merge. After that freeze_super()->percpu_down_write()
	  will use synchronize_sched(), and thaw_super() won't use
	  synchronize() at all.

	  This doesn't need any changes in fs/super.c.

	- Once we merge rcu_sync changes, we can also change super.c
	  so that all wb_write->rw_sem's will share the single ->rss
	  in struct sb_writes, then freeze_super() will need only one
	  synchronize_sched().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
---
 fs/super.c         | 111 +++++++++++++++--------------------------------------
 include/linux/fs.h |  19 +++------
 2 files changed, 36 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index c937bd7b4d33..767b1e10f6ad 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work)
 	int i;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
-		percpu_counter_destroy(&s->s_writers.counter[i]);
+		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
 	kfree(s);
 }
 
@@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		goto fail;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-		if (percpu_counter_init(&s->s_writers.counter[i], 0,
-					GFP_KERNEL) < 0)
+		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+					sb_writers_name[i],
+					&type->s_writers_key[i]))
 			goto fail;
-		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
-				 &type->s_writers_key[i], 0);
 	}
-	init_waitqueue_head(&s->s_writers.wait);
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
@@ -1161,47 +1159,10 @@ out:
  */
 void __sb_end_write(struct super_block *sb, int level)
 {
-	percpu_counter_dec(&sb->s_writers.counter[level-1]);
-	/*
-	 * Make sure s_writers are updated before we wake up waiters in
-	 * freeze_super().
-	 */
-	smp_mb();
-	if (waitqueue_active(&sb->s_writers.wait))
-		wake_up(&sb->s_writers.wait);
-	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+	percpu_up_read(sb->s_writers.rw_sem + level-1);
 }
 EXPORT_SYMBOL(__sb_end_write);
 
-static int do_sb_start_write(struct super_block *sb, int level, bool wait,
-				unsigned long ip)
-{
-	if (wait)
-		rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
-retry:
-	if (unlikely(sb->s_writers.frozen >= level)) {
-		if (!wait)
-			return 0;
-		wait_event(sb->s_writers.wait_unfrozen,
-			   sb->s_writers.frozen < level);
-	}
-
-	percpu_counter_inc(&sb->s_writers.counter[level-1]);
-	/*
-	 * Make sure counter is updated before we check for frozen.
-	 * freeze_super() first sets frozen and then checks the counter.
-	 */
-	smp_mb();
-	if (unlikely(sb->s_writers.frozen >= level)) {
-		__sb_end_write(sb, level);
-		goto retry;
-	}
-
-	if (!wait)
-		rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
-	return 1;
-}
-
 /*
  * This is an internal function, please use sb_start_{write,pagefault,intwrite}
  * instead.
@@ -1209,7 +1170,7 @@ retry:
 int __sb_start_write(struct super_block *sb, int level, bool wait)
 {
 	bool force_trylock = false;
-	int ret;
+	int ret = 1;
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
 		int i;
 
 		for (i = 0; i < level - 1; i++)
-			if (lock_is_held(&sb->s_writers.lock_map[i])) {
+			if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
 				force_trylock = true;
 				break;
 			}
 	}
 #endif
-	ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_);
+	if (wait && !force_trylock)
+		percpu_down_read(sb->s_writers.rw_sem + level-1);
+	else
+		ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
 	WARN_ON(force_trylock & !ret);
 	return ret;
 }
@@ -1243,15 +1208,11 @@ EXPORT_SYMBOL(__sb_start_write);
  * @level: type of writers we wait for (normal vs page fault)
  *
  * This function waits until there are no writers of given type to given file
- * system. Caller of this function should make sure there can be no new writers
- * of type @level before calling this function. Otherwise this function can
- * livelock.
+ * system.
  */
 static void sb_wait_write(struct super_block *sb, int level)
 {
-	s64 writers;
-
-	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+	percpu_down_write(sb->s_writers.rw_sem + level-1);
 	/*
 	 * We are going to return to userspace and forget about this lock, the
 	 * ownership goes to the caller of thaw_super() which does unlock.
@@ -1262,24 +1223,18 @@ static void sb_wait_write(struct super_block *sb, int level)
 	 * this leads to lockdep false-positives, so currently we do the early
 	 * release right after acquire.
 	 */
-	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
-	do {
-		DEFINE_WAIT(wait);
+	percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}
 
-		/*
-		 * We use a barrier in prepare_to_wait() to separate setting
-		 * of frozen and checking of the counter
-		 */
-		prepare_to_wait(&sb->s_writers.wait, &wait,
-				TASK_UNINTERRUPTIBLE);
+static void sb_freeze_unlock(struct super_block *sb)
+{
+	int level;
 
-		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
-		if (writers)
-			schedule();
+	for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+		percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
 
-		finish_wait(&sb->s_writers.wait, &wait);
-	} while (writers);
+	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+		percpu_up_write(sb->s_writers.rw_sem + level);
 }
 
 /**
@@ -1338,20 +1293,14 @@ int freeze_super(struct super_block *sb)
 		return 0;
 	}
 
-	/* From now on, no new normal writers can start */
 	sb->s_writers.frozen = SB_FREEZE_WRITE;
-	smp_wmb();
-
 	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
 	up_write(&sb->s_umount);
-
 	sb_wait_write(sb, SB_FREEZE_WRITE);
+	down_write(&sb->s_umount);
 
 	/* Now we go and block page faults... */
-	down_write(&sb->s_umount);
 	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
-	smp_wmb();
-
 	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
 
 	/* All writers are done so after syncing there won't be dirty data */
@@ -1359,7 +1308,6 @@ int freeze_super(struct super_block *sb)
 
 	/* Now wait for internal filesystem counter */
 	sb->s_writers.frozen = SB_FREEZE_FS;
-	smp_wmb();
 	sb_wait_write(sb, SB_FREEZE_FS);
 
 	if (sb->s_op->freeze_fs) {
@@ -1368,7 +1316,7 @@ int freeze_super(struct super_block *sb)
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
 			sb->s_writers.frozen = SB_UNFROZEN;
-			smp_wmb();
+			sb_freeze_unlock(sb);
 			wake_up(&sb->s_writers.wait_unfrozen);
 			deactivate_locked_super(sb);
 			return ret;
@@ -1400,8 +1348,10 @@ int thaw_super(struct super_block *sb)
 		return -EINVAL;
 	}
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & MS_RDONLY) {
+		sb->s_writers.frozen = SB_UNFROZEN;
 		goto out;
+	}
 
 	if (sb->s_op->unfreeze_fs) {
 		error = sb->s_op->unfreeze_fs(sb);
@@ -1413,12 +1363,11 @@ int thaw_super(struct super_block *sb)
 		}
 	}
 
-out:
 	sb->s_writers.frozen = SB_UNFROZEN;
-	smp_wmb();
+	sb_freeze_unlock(sb);
+out:
 	wake_up(&sb->s_writers.wait_unfrozen);
 	deactivate_locked_super(sb);
-
 	return 0;
 }
 EXPORT_SYMBOL(thaw_super);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4bed78966c6b..ce356f66cc2a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_FS_H
 #define _LINUX_FS_H
 
-
 #include <linux/linkage.h>
 #include <linux/wait.h>
 #include <linux/kdev_t.h>
@@ -31,6 +30,7 @@
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
 #include <linux/workqueue.h>
+#include <linux/percpu-rwsem.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1275,16 +1275,9 @@ enum {
 #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
 
 struct sb_writers {
-	/* Counters for counting writers at each level */
-	struct percpu_counter	counter[SB_FREEZE_LEVELS];
-	wait_queue_head_t	wait;		/* queue for waiting for
-						   writers / faults to finish */
-	int			frozen;		/* Is sb frozen? */
-	wait_queue_head_t	wait_unfrozen;	/* queue for waiting for
-						   sb to be thawed */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	lock_map[SB_FREEZE_LEVELS];
-#endif
+	int				frozen;		/* Is sb frozen? */
+	wait_queue_head_t		wait_unfrozen;	/* for get_super_thawed() */
+	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
 struct super_block {
@@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level);
 int __sb_start_write(struct super_block *sb, int level, bool wait);
 
 #define __sb_writers_acquired(sb, lev)	\
-	rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_)
+	percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 #define __sb_writers_release(sb, lev)	\
-	rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_)
+	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 
 /**
  * sb_end_write - drop write access to a superblock
-- 
cgit v1.2.3-70-g09d2


From 8025e5ddf9c1cac0e632dad49a63abf7848b78cb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 13 Jul 2015 11:55:44 -0300
Subject: [media] mm: Provide new get_vaddr_frames() helper

Provide new function get_vaddr_frames().  This function maps virtual
addresses from given start and fills given array with page frame numbers of
the corresponding pages. If given start belongs to a normal vma, the function
grabs reference to each of the pages to pin them in memory. If start
belongs to VM_IO | VM_PFNMAP vma, we don't touch page structures. Caller
must make sure pfns aren't reused for anything else while he is using
them.

This function is created for various drivers to simplify handling of
their buffers.

Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/linux/mm.h |  44 ++++++++++
 mm/Kconfig         |   3 +
 mm/Makefile        |   1 +
 mm/frame_vector.c  | 230 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 278 insertions(+)
 create mode 100644 mm/frame_vector.c

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e872f92dbac..79ad29a8a60a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -20,6 +20,7 @@
 #include <linux/shrinker.h>
 #include <linux/resource.h>
 #include <linux/page_ext.h>
+#include <linux/err.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1198,6 +1199,49 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 		    int write, int force, struct page **pages);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
+
+/* Container for pinned pfns / pages */
+struct frame_vector {
+	unsigned int nr_allocated;	/* Number of frames we have space for */
+	unsigned int nr_frames;	/* Number of frames stored in ptrs array */
+	bool got_ref;		/* Did we pin pages by getting page ref? */
+	bool is_pfns;		/* Does array contain pages or pfns? */
+	void *ptrs[0];		/* Array of pinned pfns / pages. Use
+				 * pfns_vector_pages() or pfns_vector_pfns()
+				 * for access */
+};
+
+struct frame_vector *frame_vector_create(unsigned int nr_frames);
+void frame_vector_destroy(struct frame_vector *vec);
+int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
+		     bool write, bool force, struct frame_vector *vec);
+void put_vaddr_frames(struct frame_vector *vec);
+int frame_vector_to_pages(struct frame_vector *vec);
+void frame_vector_to_pfns(struct frame_vector *vec);
+
+static inline unsigned int frame_vector_count(struct frame_vector *vec)
+{
+	return vec->nr_frames;
+}
+
+static inline struct page **frame_vector_pages(struct frame_vector *vec)
+{
+	if (vec->is_pfns) {
+		int err = frame_vector_to_pages(vec);
+
+		if (err)
+			return ERR_PTR(err);
+	}
+	return (struct page **)(vec->ptrs);
+}
+
+static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
+{
+	if (!vec->is_pfns)
+		frame_vector_to_pfns(vec);
+	return (unsigned long *)(vec->ptrs);
+}
+
 struct kvec;
 int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
 			struct page **pages);
diff --git a/mm/Kconfig b/mm/Kconfig
index e79de2bd12cd..7f146dd32fc5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -654,3 +654,6 @@ config DEFERRED_STRUCT_PAGE_INIT
 	  when kswapd starts. This has a potential performance impact on
 	  processes running early in the lifetime of the systemm until kswapd
 	  finishes the initialisation.
+
+config FRAME_VECTOR
+	bool
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eaeabdcb..be5d5c866305 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
new file mode 100644
index 000000000000..cdabcb93c6a6
--- /dev/null
+++ b/mm/frame_vector.c
@@ -0,0 +1,230 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+
+/*
+ * get_vaddr_frames() - map virtual addresses to pfns
+ * @start:	starting user address
+ * @nr_frames:	number of pages / pfns from start to map
+ * @write:	whether pages will be written to by the caller
+ * @force:	whether to force write access even if user mapping is
+ *		readonly. See description of the same argument of
+		get_user_pages().
+ * @vec:	structure which receives pages / pfns of the addresses mapped.
+ *		It should have space for at least nr_frames entries.
+ *
+ * This function maps virtual addresses from @start and fills @vec structure
+ * with page frame numbers or page pointers to corresponding pages (choice
+ * depends on the type of the vma underlying the virtual address). If @start
+ * belongs to a normal vma, the function grabs reference to each of the pages
+ * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
+ * touch page structures and the caller must make sure pfns aren't reused for
+ * anything else while he is using them.
+ *
+ * The function returns number of pages mapped which may be less than
+ * @nr_frames. In particular we stop mapping if there are more vmas of
+ * different type underlying the specified range of virtual addresses.
+ * When the function isn't able to map a single page, it returns error.
+ *
+ * This function takes care of grabbing mmap_sem as necessary.
+ */
+int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
+		     bool write, bool force, struct frame_vector *vec)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int ret = 0;
+	int err;
+	int locked;
+
+	if (nr_frames == 0)
+		return 0;
+
+	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
+		nr_frames = vec->nr_allocated;
+
+	down_read(&mm->mmap_sem);
+	locked = 1;
+	vma = find_vma_intersection(mm, start, start + 1);
+	if (!vma) {
+		ret = -EFAULT;
+		goto out;
+	}
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+		vec->got_ref = true;
+		vec->is_pfns = false;
+		ret = get_user_pages_locked(current, mm, start, nr_frames,
+			write, force, (struct page **)(vec->ptrs), &locked);
+		goto out;
+	}
+
+	vec->got_ref = false;
+	vec->is_pfns = true;
+	do {
+		unsigned long *nums = frame_vector_pfns(vec);
+
+		while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
+			err = follow_pfn(vma, start, &nums[ret]);
+			if (err) {
+				if (ret == 0)
+					ret = err;
+				goto out;
+			}
+			start += PAGE_SIZE;
+			ret++;
+		}
+		/*
+		 * We stop if we have enough pages or if VMA doesn't completely
+		 * cover the tail page.
+		 */
+		if (ret >= nr_frames || start < vma->vm_end)
+			break;
+		vma = find_vma_intersection(mm, start, start + 1);
+	} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
+out:
+	if (locked)
+		up_read(&mm->mmap_sem);
+	if (!ret)
+		ret = -EFAULT;
+	if (ret > 0)
+		vec->nr_frames = ret;
+	return ret;
+}
+EXPORT_SYMBOL(get_vaddr_frames);
+
+/**
+ * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
+ *			them
+ * @vec:	frame vector to put
+ *
+ * Drop references to pages if get_vaddr_frames() acquired them. We also
+ * invalidate the frame vector so that it is prepared for the next call into
+ * get_vaddr_frames().
+ */
+void put_vaddr_frames(struct frame_vector *vec)
+{
+	int i;
+	struct page **pages;
+
+	if (!vec->got_ref)
+		goto out;
+	pages = frame_vector_pages(vec);
+	/*
+	 * frame_vector_pages() might needed to do a conversion when
+	 * get_vaddr_frames() got pages but vec was later converted to pfns.
+	 * But it shouldn't really fail to convert pfns back...
+	 */
+	if (WARN_ON(IS_ERR(pages)))
+		goto out;
+	for (i = 0; i < vec->nr_frames; i++)
+		put_page(pages[i]);
+	vec->got_ref = false;
+out:
+	vec->nr_frames = 0;
+}
+EXPORT_SYMBOL(put_vaddr_frames);
+
+/**
+ * frame_vector_to_pages - convert frame vector to contain page pointers
+ * @vec:	frame vector to convert
+ *
+ * Convert @vec to contain array of page pointers.  If the conversion is
+ * successful, return 0. Otherwise return an error. Note that we do not grab
+ * page references for the page structures.
+ */
+int frame_vector_to_pages(struct frame_vector *vec)
+{
+	int i;
+	unsigned long *nums;
+	struct page **pages;
+
+	if (!vec->is_pfns)
+		return 0;
+	nums = frame_vector_pfns(vec);
+	for (i = 0; i < vec->nr_frames; i++)
+		if (!pfn_valid(nums[i]))
+			return -EINVAL;
+	pages = (struct page **)nums;
+	for (i = 0; i < vec->nr_frames; i++)
+		pages[i] = pfn_to_page(nums[i]);
+	vec->is_pfns = false;
+	return 0;
+}
+EXPORT_SYMBOL(frame_vector_to_pages);
+
+/**
+ * frame_vector_to_pfns - convert frame vector to contain pfns
+ * @vec:	frame vector to convert
+ *
+ * Convert @vec to contain array of pfns.
+ */
+void frame_vector_to_pfns(struct frame_vector *vec)
+{
+	int i;
+	unsigned long *nums;
+	struct page **pages;
+
+	if (vec->is_pfns)
+		return;
+	pages = (struct page **)(vec->ptrs);
+	nums = (unsigned long *)pages;
+	for (i = 0; i < vec->nr_frames; i++)
+		nums[i] = page_to_pfn(pages[i]);
+	vec->is_pfns = true;
+}
+EXPORT_SYMBOL(frame_vector_to_pfns);
+
+/**
+ * frame_vector_create() - allocate & initialize structure for pinned pfns
+ * @nr_frames:	number of pfns slots we should reserve
+ *
+ * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
+ * pfns.
+ */
+struct frame_vector *frame_vector_create(unsigned int nr_frames)
+{
+	struct frame_vector *vec;
+	int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
+
+	if (WARN_ON_ONCE(nr_frames == 0))
+		return NULL;
+	/*
+	 * This is absurdly high. It's here just to avoid strange effects when
+	 * arithmetics overflows.
+	 */
+	if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
+		return NULL;
+	/*
+	 * Avoid higher order allocations, use vmalloc instead. It should
+	 * be rare anyway.
+	 */
+	if (size <= PAGE_SIZE)
+		vec = kmalloc(size, GFP_KERNEL);
+	else
+		vec = vmalloc(size);
+	if (!vec)
+		return NULL;
+	vec->nr_allocated = nr_frames;
+	vec->nr_frames = 0;
+	return vec;
+}
+EXPORT_SYMBOL(frame_vector_create);
+
+/**
+ * frame_vector_destroy() - free memory allocated to carry frame vector
+ * @vec:	Frame vector to free
+ *
+ * Free structure allocated by frame_vector_create() to carry frames.
+ */
+void frame_vector_destroy(struct frame_vector *vec)
+{
+	/* Make sure put_vaddr_frames() got called properly... */
+	VM_BUG_ON(vec->nr_frames > 0);
+	kvfree(vec);
+}
+EXPORT_SYMBOL(frame_vector_destroy);
-- 
cgit v1.2.3-70-g09d2


From ab149b88eb23482cd9c203527bf4e612ee95b50f Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Date: Tue, 21 Jul 2015 10:09:10 -0300
Subject: [media] tw68: Move PCI vendor and device IDs to pci_ids.h

This commits moves the Intersil/Techwell PCI vendor ID, and
the device IDs for the TW68 PCI video capture cards.

This will allow to support future Intersil/Techwell devices
without duplicating the IDs.

Signed-off-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 drivers/media/pci/tw68/tw68-core.c | 21 +++++++++++----------
 drivers/media/pci/tw68/tw68.h      | 16 ----------------
 include/linux/pci_ids.h            |  9 +++++++++
 3 files changed, 20 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/pci/tw68/tw68-core.c b/drivers/media/pci/tw68/tw68-core.c
index c135165a8b26..04706cc9b818 100644
--- a/drivers/media/pci/tw68/tw68-core.c
+++ b/drivers/media/pci/tw68/tw68-core.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/dma-mapping.h>
+#include <linux/pci_ids.h>
 #include <linux/pm.h>
 
 #include <media/v4l2-dev.h>
@@ -70,13 +71,13 @@ static atomic_t tw68_instance = ATOMIC_INIT(0);
  * added under vendor 0x1797 (Techwell Inc.) as subsystem IDs.
  */
 static const struct pci_device_id tw68_pci_tbl[] = {
-	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_6800)},
-	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_6801)},
-	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_6804)},
-	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_6816_1)},
-	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_6816_2)},
-	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_6816_3)},
-	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_6816_4)},
+	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_TECHWELL_6800)},
+	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_TECHWELL_6801)},
+	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_TECHWELL_6804)},
+	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_TECHWELL_6816_1)},
+	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_TECHWELL_6816_2)},
+	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_TECHWELL_6816_3)},
+	{PCI_DEVICE(PCI_VENDOR_ID_TECHWELL, PCI_DEVICE_ID_TECHWELL_6816_4)},
 	{0,}
 };
 
@@ -263,15 +264,15 @@ static int tw68_initdev(struct pci_dev *pci_dev,
 	}
 
 	switch (pci_id->device) {
-	case PCI_DEVICE_ID_6800:	/* TW6800 */
+	case PCI_DEVICE_ID_TECHWELL_6800:	/* TW6800 */
 		dev->vdecoder = TW6800;
 		dev->board_virqmask = TW68_VID_INTS;
 		break;
-	case PCI_DEVICE_ID_6801:	/* Video decoder for TW6802 */
+	case PCI_DEVICE_ID_TECHWELL_6801:	/* Video decoder for TW6802 */
 		dev->vdecoder = TW6801;
 		dev->board_virqmask = TW68_VID_INTS | TW68_VID_INTSX;
 		break;
-	case PCI_DEVICE_ID_6804:	/* Video decoder for TW6804 */
+	case PCI_DEVICE_ID_TECHWELL_6804:	/* Video decoder for TW6804 */
 		dev->vdecoder = TW6804;
 		dev->board_virqmask = TW68_VID_INTS | TW68_VID_INTSX;
 		break;
diff --git a/drivers/media/pci/tw68/tw68.h b/drivers/media/pci/tw68/tw68.h
index 93f2335e004b..ef51e4d48866 100644
--- a/drivers/media/pci/tw68/tw68.h
+++ b/drivers/media/pci/tw68/tw68.h
@@ -42,22 +42,6 @@
 
 #define	UNSET	(-1U)
 
-/* system vendor and device ID's */
-#define	PCI_VENDOR_ID_TECHWELL	0x1797
-#define	PCI_DEVICE_ID_6800	0x6800
-#define	PCI_DEVICE_ID_6801	0x6801
-#define	PCI_DEVICE_ID_AUDIO2	0x6802
-#define	PCI_DEVICE_ID_TS3	0x6803
-#define	PCI_DEVICE_ID_6804	0x6804
-#define	PCI_DEVICE_ID_AUDIO5	0x6805
-#define	PCI_DEVICE_ID_TS6	0x6806
-
-/* tw6816 based cards */
-#define	PCI_DEVICE_ID_6816_1   0x6810
-#define	PCI_DEVICE_ID_6816_2   0x6811
-#define	PCI_DEVICE_ID_6816_3   0x6812
-#define	PCI_DEVICE_ID_6816_4   0x6813
-
 #define TW68_NORMS ( \
 	V4L2_STD_NTSC    | V4L2_STD_PAL       | V4L2_STD_SECAM    | \
 	V4L2_STD_PAL_M   | V4L2_STD_PAL_Nc    | V4L2_STD_PAL_60)
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index fcff8f865341..d9ba49cedc5d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2332,6 +2332,15 @@
 
 #define PCI_VENDOR_ID_CAVIUM		0x177d
 
+#define PCI_VENDOR_ID_TECHWELL		0x1797
+#define PCI_DEVICE_ID_TECHWELL_6800	0x6800
+#define PCI_DEVICE_ID_TECHWELL_6801	0x6801
+#define PCI_DEVICE_ID_TECHWELL_6804	0x6804
+#define PCI_DEVICE_ID_TECHWELL_6816_1	0x6810
+#define PCI_DEVICE_ID_TECHWELL_6816_2	0x6811
+#define PCI_DEVICE_ID_TECHWELL_6816_3	0x6812
+#define PCI_DEVICE_ID_TECHWELL_6816_4	0x6813
+
 #define PCI_VENDOR_ID_BELKIN		0x1799
 #define PCI_DEVICE_ID_BELKIN_F5D7010V7	0x701f
 
-- 
cgit v1.2.3-70-g09d2


From 76b733d15874128ee2d0365b4cbe7d51decd8d37 Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Fri, 14 Aug 2015 22:33:30 +0200
Subject: nfc: st-nci: Remove duplicate file platform_data/st_nci.h

commit "nfc: st-nci: Rename st21nfcb to st-nci" adds
include/linux/platform_data/st_nci.h duplicated with
include/linux/platform_data/st-nci.h.

Only drivers/nfc/st-nci/i2c.c uses platform_data/st_nci.h.

Cc: stable@vger.kernel.org
Reported-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/nfc/st-nci/i2c.c             |  2 +-
 include/linux/platform_data/st_nci.h | 29 -----------------------------
 2 files changed, 1 insertion(+), 30 deletions(-)
 delete mode 100644 include/linux/platform_data/st_nci.h

(limited to 'include/linux')

diff --git a/drivers/nfc/st-nci/i2c.c b/drivers/nfc/st-nci/i2c.c
index 06175ce769bb..b2a467c2d668 100644
--- a/drivers/nfc/st-nci/i2c.c
+++ b/drivers/nfc/st-nci/i2c.c
@@ -25,7 +25,7 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/nfc.h>
-#include <linux/platform_data/st_nci.h>
+#include <linux/platform_data/st-nci.h>
 
 #include "ndlc.h"
 
diff --git a/include/linux/platform_data/st_nci.h b/include/linux/platform_data/st_nci.h
deleted file mode 100644
index d9d400a297bd..000000000000
--- a/include/linux/platform_data/st_nci.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Driver include for ST NCI NFC chip family.
- *
- * Copyright (C) 2014-2015  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _ST_NCI_H_
-#define _ST_NCI_H_
-
-#define ST_NCI_DRIVER_NAME "st_nci"
-
-struct st_nci_nfc_platform_data {
-	unsigned int gpio_reset;
-	unsigned int irq_polarity;
-};
-
-#endif /* _ST_NCI_H_ */
-- 
cgit v1.2.3-70-g09d2


From b0d955ba4688fcba8112884931aea1f1e6f50f03 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 14 Aug 2015 15:30:41 +0800
Subject: crypto: aead - Remove old AEAD interfaces

Now that the AEAD conversion is complete we can rip out the old
AEAD interafce and associated code.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/aead.c                   | 606 +---------------------------------------
 include/crypto/aead.h           | 148 +---------
 include/crypto/internal/aead.h  |  42 +--
 include/crypto/internal/geniv.h |   2 -
 include/linux/crypto.h          |  48 +---
 5 files changed, 28 insertions(+), 818 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/aead.c b/crypto/aead.c
index a4dcd19dcca6..c40df2c4d420 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -3,7 +3,7 @@
  *
  * This file provides API support for AEAD algorithms.
  *
- * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -21,7 +21,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/rtnetlink.h>
-#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
@@ -29,17 +28,6 @@
 
 #include "internal.h"
 
-struct compat_request_ctx {
-	struct scatterlist src[2];
-	struct scatterlist dst[2];
-	struct scatterlist ivbuf[2];
-	struct scatterlist *ivsg;
-	struct aead_givcrypt_request subreq;
-};
-
-static int aead_null_givencrypt(struct aead_givcrypt_request *req);
-static int aead_null_givdecrypt(struct aead_givcrypt_request *req);
-
 static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 			    unsigned int keylen)
 {
@@ -55,7 +43,7 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 
 	alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
 	memcpy(alignbuffer, key, keylen);
-	ret = tfm->setkey(tfm, alignbuffer, keylen);
+	ret = crypto_aead_alg(tfm)->setkey(tfm, alignbuffer, keylen);
 	memset(alignbuffer, 0, keylen);
 	kfree(buffer);
 	return ret;
@@ -66,12 +54,10 @@ int crypto_aead_setkey(struct crypto_aead *tfm,
 {
 	unsigned long alignmask = crypto_aead_alignmask(tfm);
 
-	tfm = tfm->child;
-
 	if ((unsigned long)key & alignmask)
 		return setkey_unaligned(tfm, key, keylen);
 
-	return tfm->setkey(tfm, key, keylen);
+	return crypto_aead_alg(tfm)->setkey(tfm, key, keylen);
 }
 EXPORT_SYMBOL_GPL(crypto_aead_setkey);
 
@@ -82,100 +68,17 @@ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
 	if (authsize > crypto_aead_maxauthsize(tfm))
 		return -EINVAL;
 
-	if (tfm->setauthsize) {
-		err = tfm->setauthsize(tfm->child, authsize);
+	if (crypto_aead_alg(tfm)->setauthsize) {
+		err = crypto_aead_alg(tfm)->setauthsize(tfm, authsize);
 		if (err)
 			return err;
 	}
 
-	tfm->child->authsize = authsize;
 	tfm->authsize = authsize;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(crypto_aead_setauthsize);
 
-struct aead_old_request {
-	struct scatterlist srcbuf[2];
-	struct scatterlist dstbuf[2];
-	struct aead_request subreq;
-};
-
-unsigned int crypto_aead_reqsize(struct crypto_aead *tfm)
-{
-	return tfm->reqsize + sizeof(struct aead_old_request);
-}
-EXPORT_SYMBOL_GPL(crypto_aead_reqsize);
-
-static int old_crypt(struct aead_request *req,
-		     int (*crypt)(struct aead_request *req))
-{
-	struct aead_old_request *nreq = aead_request_ctx(req);
-	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct scatterlist *src, *dst;
-
-	if (req->old)
-		return crypt(req);
-
-	src = scatterwalk_ffwd(nreq->srcbuf, req->src, req->assoclen);
-	dst = req->src == req->dst ?
-	      src : scatterwalk_ffwd(nreq->dstbuf, req->dst, req->assoclen);
-
-	aead_request_set_tfm(&nreq->subreq, aead);
-	aead_request_set_callback(&nreq->subreq, aead_request_flags(req),
-				  req->base.complete, req->base.data);
-	aead_request_set_crypt(&nreq->subreq, src, dst, req->cryptlen,
-			       req->iv);
-	aead_request_set_assoc(&nreq->subreq, req->src, req->assoclen);
-
-	return crypt(&nreq->subreq);
-}
-
-static int old_encrypt(struct aead_request *req)
-{
-	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct old_aead_alg *alg = crypto_old_aead_alg(aead);
-
-	return old_crypt(req, alg->encrypt);
-}
-
-static int old_decrypt(struct aead_request *req)
-{
-	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct old_aead_alg *alg = crypto_old_aead_alg(aead);
-
-	return old_crypt(req, alg->decrypt);
-}
-
-static int no_givcrypt(struct aead_givcrypt_request *req)
-{
-	return -ENOSYS;
-}
-
-static int crypto_old_aead_init_tfm(struct crypto_tfm *tfm)
-{
-	struct old_aead_alg *alg = &tfm->__crt_alg->cra_aead;
-	struct crypto_aead *crt = __crypto_aead_cast(tfm);
-
-	if (max(alg->maxauthsize, alg->ivsize) > PAGE_SIZE / 8)
-		return -EINVAL;
-
-	crt->setkey = alg->setkey;
-	crt->setauthsize = alg->setauthsize;
-	crt->encrypt = old_encrypt;
-	crt->decrypt = old_decrypt;
-	if (alg->ivsize) {
-		crt->givencrypt = alg->givencrypt ?: no_givcrypt;
-		crt->givdecrypt = alg->givdecrypt ?: no_givcrypt;
-	} else {
-		crt->givencrypt = aead_null_givencrypt;
-		crt->givdecrypt = aead_null_givdecrypt;
-	}
-	crt->child = __crypto_aead_cast(tfm);
-	crt->authsize = alg->maxauthsize;
-
-	return 0;
-}
-
 static void crypto_aead_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct crypto_aead *aead = __crypto_aead_cast(tfm);
@@ -189,14 +92,6 @@ static int crypto_aead_init_tfm(struct crypto_tfm *tfm)
 	struct crypto_aead *aead = __crypto_aead_cast(tfm);
 	struct aead_alg *alg = crypto_aead_alg(aead);
 
-	if (crypto_old_aead_alg(aead)->encrypt)
-		return crypto_old_aead_init_tfm(tfm);
-
-	aead->setkey = alg->setkey;
-	aead->setauthsize = alg->setauthsize;
-	aead->encrypt = alg->encrypt;
-	aead->decrypt = alg->decrypt;
-	aead->child = __crypto_aead_cast(tfm);
 	aead->authsize = alg->maxauthsize;
 
 	if (alg->exit)
@@ -208,64 +103,6 @@ static int crypto_aead_init_tfm(struct crypto_tfm *tfm)
 	return 0;
 }
 
-#ifdef CONFIG_NET
-static int crypto_old_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_aead raead;
-	struct old_aead_alg *aead = &alg->cra_aead;
-
-	strncpy(raead.type, "aead", sizeof(raead.type));
-	strncpy(raead.geniv, aead->geniv ?: "<built-in>", sizeof(raead.geniv));
-
-	raead.blocksize = alg->cra_blocksize;
-	raead.maxauthsize = aead->maxauthsize;
-	raead.ivsize = aead->ivsize;
-
-	if (nla_put(skb, CRYPTOCFGA_REPORT_AEAD,
-		    sizeof(struct crypto_report_aead), &raead))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-#else
-static int crypto_old_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	return -ENOSYS;
-}
-#endif
-
-static void crypto_old_aead_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
-static void crypto_old_aead_show(struct seq_file *m, struct crypto_alg *alg)
-{
-	struct old_aead_alg *aead = &alg->cra_aead;
-
-	seq_printf(m, "type         : aead\n");
-	seq_printf(m, "async        : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ?
-					     "yes" : "no");
-	seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
-	seq_printf(m, "ivsize       : %u\n", aead->ivsize);
-	seq_printf(m, "maxauthsize  : %u\n", aead->maxauthsize);
-	seq_printf(m, "geniv        : %s\n", aead->geniv ?: "<built-in>");
-}
-
-const struct crypto_type crypto_aead_type = {
-	.extsize = crypto_alg_extsize,
-	.init_tfm = crypto_aead_init_tfm,
-#ifdef CONFIG_PROC_FS
-	.show = crypto_old_aead_show,
-#endif
-	.report = crypto_old_aead_report,
-	.lookup = crypto_lookup_aead,
-	.maskclear = ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV),
-	.maskset = CRYPTO_ALG_TYPE_MASK,
-	.type = CRYPTO_ALG_TYPE_AEAD,
-	.tfmsize = offsetof(struct crypto_aead, base),
-};
-EXPORT_SYMBOL_GPL(crypto_aead_type);
-
 #ifdef CONFIG_NET
 static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
@@ -321,7 +158,7 @@ static void crypto_aead_free_instance(struct crypto_instance *inst)
 	aead->free(aead);
 }
 
-static const struct crypto_type crypto_new_aead_type = {
+static const struct crypto_type crypto_aead_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_aead_init_tfm,
 	.free = crypto_aead_free_instance,
@@ -335,81 +172,6 @@ static const struct crypto_type crypto_new_aead_type = {
 	.tfmsize = offsetof(struct crypto_aead, base),
 };
 
-static int aead_null_givencrypt(struct aead_givcrypt_request *req)
-{
-	return crypto_aead_encrypt(&req->areq);
-}
-
-static int aead_null_givdecrypt(struct aead_givcrypt_request *req)
-{
-	return crypto_aead_decrypt(&req->areq);
-}
-
-#ifdef CONFIG_NET
-static int crypto_nivaead_report(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_aead raead;
-	struct old_aead_alg *aead = &alg->cra_aead;
-
-	strncpy(raead.type, "nivaead", sizeof(raead.type));
-	strncpy(raead.geniv, aead->geniv, sizeof(raead.geniv));
-
-	raead.blocksize = alg->cra_blocksize;
-	raead.maxauthsize = aead->maxauthsize;
-	raead.ivsize = aead->ivsize;
-
-	if (nla_put(skb, CRYPTOCFGA_REPORT_AEAD,
-		    sizeof(struct crypto_report_aead), &raead))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-#else
-static int crypto_nivaead_report(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	return -ENOSYS;
-}
-#endif
-
-
-static void crypto_nivaead_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
-static void crypto_nivaead_show(struct seq_file *m, struct crypto_alg *alg)
-{
-	struct old_aead_alg *aead = &alg->cra_aead;
-
-	seq_printf(m, "type         : nivaead\n");
-	seq_printf(m, "async        : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ?
-					     "yes" : "no");
-	seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
-	seq_printf(m, "ivsize       : %u\n", aead->ivsize);
-	seq_printf(m, "maxauthsize  : %u\n", aead->maxauthsize);
-	seq_printf(m, "geniv        : %s\n", aead->geniv);
-}
-
-const struct crypto_type crypto_nivaead_type = {
-	.extsize = crypto_alg_extsize,
-	.init_tfm = crypto_aead_init_tfm,
-#ifdef CONFIG_PROC_FS
-	.show = crypto_nivaead_show,
-#endif
-	.report = crypto_nivaead_report,
-	.maskclear = ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV),
-	.maskset = CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV,
-	.type = CRYPTO_ALG_TYPE_AEAD,
-	.tfmsize = offsetof(struct crypto_aead, base),
-};
-EXPORT_SYMBOL_GPL(crypto_nivaead_type);
-
-static int crypto_grab_nivaead(struct crypto_aead_spawn *spawn,
-			       const char *name, u32 type, u32 mask)
-{
-	spawn->base.frontend = &crypto_nivaead_type;
-	return crypto_grab_spawn(&spawn->base, name, type, mask);
-}
-
 static int aead_geniv_setkey(struct crypto_aead *tfm,
 			     const u8 *key, unsigned int keylen)
 {
@@ -426,169 +188,6 @@ static int aead_geniv_setauthsize(struct crypto_aead *tfm,
 	return crypto_aead_setauthsize(ctx->child, authsize);
 }
 
-static void compat_encrypt_complete2(struct aead_request *req, int err)
-{
-	struct compat_request_ctx *rctx = aead_request_ctx(req);
-	struct aead_givcrypt_request *subreq = &rctx->subreq;
-	struct crypto_aead *geniv;
-
-	if (err == -EINPROGRESS)
-		return;
-
-	if (err)
-		goto out;
-
-	geniv = crypto_aead_reqtfm(req);
-	scatterwalk_map_and_copy(subreq->giv, rctx->ivsg, 0,
-				 crypto_aead_ivsize(geniv), 1);
-
-out:
-	kzfree(subreq->giv);
-}
-
-static void compat_encrypt_complete(struct crypto_async_request *base, int err)
-{
-	struct aead_request *req = base->data;
-
-	compat_encrypt_complete2(req, err);
-	aead_request_complete(req, err);
-}
-
-static int compat_encrypt(struct aead_request *req)
-{
-	struct crypto_aead *geniv = crypto_aead_reqtfm(req);
-	struct aead_geniv_ctx *ctx = crypto_aead_ctx(geniv);
-	struct compat_request_ctx *rctx = aead_request_ctx(req);
-	struct aead_givcrypt_request *subreq = &rctx->subreq;
-	unsigned int ivsize = crypto_aead_ivsize(geniv);
-	struct scatterlist *src, *dst;
-	crypto_completion_t compl;
-	void *data;
-	u8 *info;
-	__be64 seq;
-	int err;
-
-	if (req->cryptlen < ivsize)
-		return -EINVAL;
-
-	compl = req->base.complete;
-	data = req->base.data;
-
-	rctx->ivsg = scatterwalk_ffwd(rctx->ivbuf, req->dst, req->assoclen);
-	info = PageHighMem(sg_page(rctx->ivsg)) ? NULL : sg_virt(rctx->ivsg);
-
-	if (!info) {
-		info = kmalloc(ivsize, req->base.flags &
-				       CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL:
-								  GFP_ATOMIC);
-		if (!info)
-			return -ENOMEM;
-
-		compl = compat_encrypt_complete;
-		data = req;
-	}
-
-	memcpy(&seq, req->iv + ivsize - sizeof(seq), sizeof(seq));
-
-	src = scatterwalk_ffwd(rctx->src, req->src, req->assoclen + ivsize);
-	dst = req->src == req->dst ?
-	      src : scatterwalk_ffwd(rctx->dst, rctx->ivsg, ivsize);
-
-	aead_givcrypt_set_tfm(subreq, ctx->child);
-	aead_givcrypt_set_callback(subreq, req->base.flags,
-				   req->base.complete, req->base.data);
-	aead_givcrypt_set_crypt(subreq, src, dst,
-				req->cryptlen - ivsize, req->iv);
-	aead_givcrypt_set_assoc(subreq, req->src, req->assoclen);
-	aead_givcrypt_set_giv(subreq, info, be64_to_cpu(seq));
-
-	err = crypto_aead_givencrypt(subreq);
-	if (unlikely(PageHighMem(sg_page(rctx->ivsg))))
-		compat_encrypt_complete2(req, err);
-	return err;
-}
-
-static int compat_decrypt(struct aead_request *req)
-{
-	struct crypto_aead *geniv = crypto_aead_reqtfm(req);
-	struct aead_geniv_ctx *ctx = crypto_aead_ctx(geniv);
-	struct compat_request_ctx *rctx = aead_request_ctx(req);
-	struct aead_request *subreq = &rctx->subreq.areq;
-	unsigned int ivsize = crypto_aead_ivsize(geniv);
-	struct scatterlist *src, *dst;
-	crypto_completion_t compl;
-	void *data;
-
-	if (req->cryptlen < ivsize)
-		return -EINVAL;
-
-	aead_request_set_tfm(subreq, ctx->child);
-
-	compl = req->base.complete;
-	data = req->base.data;
-
-	src = scatterwalk_ffwd(rctx->src, req->src, req->assoclen + ivsize);
-	dst = req->src == req->dst ?
-	      src : scatterwalk_ffwd(rctx->dst, req->dst,
-				     req->assoclen + ivsize);
-
-	aead_request_set_callback(subreq, req->base.flags, compl, data);
-	aead_request_set_crypt(subreq, src, dst,
-			       req->cryptlen - ivsize, req->iv);
-	aead_request_set_assoc(subreq, req->src, req->assoclen);
-
-	scatterwalk_map_and_copy(req->iv, req->src, req->assoclen, ivsize, 0);
-
-	return crypto_aead_decrypt(subreq);
-}
-
-static int compat_encrypt_first(struct aead_request *req)
-{
-	struct crypto_aead *geniv = crypto_aead_reqtfm(req);
-	struct aead_geniv_ctx *ctx = crypto_aead_ctx(geniv);
-	int err = 0;
-
-	spin_lock_bh(&ctx->lock);
-	if (geniv->encrypt != compat_encrypt_first)
-		goto unlock;
-
-	geniv->encrypt = compat_encrypt;
-
-unlock:
-	spin_unlock_bh(&ctx->lock);
-
-	if (err)
-		return err;
-
-	return compat_encrypt(req);
-}
-
-static int aead_geniv_init_compat(struct crypto_tfm *tfm)
-{
-	struct crypto_aead *geniv = __crypto_aead_cast(tfm);
-	struct aead_geniv_ctx *ctx = crypto_aead_ctx(geniv);
-	int err;
-
-	spin_lock_init(&ctx->lock);
-
-	crypto_aead_set_reqsize(geniv, sizeof(struct compat_request_ctx));
-
-	err = aead_geniv_init(tfm);
-
-	ctx->child = geniv->child;
-	geniv->child = geniv;
-
-	return err;
-}
-
-static void aead_geniv_exit_compat(struct crypto_tfm *tfm)
-{
-	struct crypto_aead *geniv = __crypto_aead_cast(tfm);
-	struct aead_geniv_ctx *ctx = crypto_aead_ctx(geniv);
-
-	crypto_free_aead(ctx->child);
-}
-
 struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
 				       struct rtattr **tb, u32 type, u32 mask)
 {
@@ -605,7 +204,7 @@ struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return ERR_CAST(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_GENIV)) &
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) &
 	    algt->mask & ~CRYPTO_ALG_AEAD_NEW)
 		return ERR_PTR(-EINVAL);
 
@@ -623,9 +222,7 @@ struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
 	mask |= crypto_requires_sync(algt->type, algt->mask);
 
 	crypto_set_aead_spawn(spawn, aead_crypto_instance(inst));
-	err = (algt->mask & CRYPTO_ALG_GENIV) ?
-	      crypto_grab_nivaead(spawn, name, type, mask) :
-	      crypto_grab_aead(spawn, name, type, mask);
+	err = crypto_grab_aead(spawn, name, type, mask);
 	if (err)
 		goto err_free_inst;
 
@@ -638,43 +235,6 @@ struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
 	if (ivsize < sizeof(u64))
 		goto err_drop_alg;
 
-	/*
-	 * This is only true if we're constructing an algorithm with its
-	 * default IV generator.  For the default generator we elide the
-	 * template name and double-check the IV generator.
-	 */
-	if (algt->mask & CRYPTO_ALG_GENIV) {
-		if (!alg->base.cra_aead.encrypt)
-			goto err_drop_alg;
-		if (strcmp(tmpl->name, alg->base.cra_aead.geniv))
-			goto err_drop_alg;
-
-		memcpy(inst->alg.base.cra_name, alg->base.cra_name,
-		       CRYPTO_MAX_ALG_NAME);
-		memcpy(inst->alg.base.cra_driver_name,
-		       alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME);
-
-		inst->alg.base.cra_flags = CRYPTO_ALG_TYPE_AEAD |
-					   CRYPTO_ALG_GENIV;
-		inst->alg.base.cra_flags |= alg->base.cra_flags &
-					    CRYPTO_ALG_ASYNC;
-		inst->alg.base.cra_priority = alg->base.cra_priority;
-		inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
-		inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
-		inst->alg.base.cra_type = &crypto_aead_type;
-
-		inst->alg.base.cra_aead.ivsize = ivsize;
-		inst->alg.base.cra_aead.maxauthsize = maxauthsize;
-
-		inst->alg.base.cra_aead.setkey = alg->base.cra_aead.setkey;
-		inst->alg.base.cra_aead.setauthsize =
-			alg->base.cra_aead.setauthsize;
-		inst->alg.base.cra_aead.encrypt = alg->base.cra_aead.encrypt;
-		inst->alg.base.cra_aead.decrypt = alg->base.cra_aead.decrypt;
-
-		goto out;
-	}
-
 	err = -ENAMETOOLONG;
 	if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
 		     "%s(%s)", tmpl->name, alg->base.cra_name) >=
@@ -698,12 +258,6 @@ struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
 	inst->alg.ivsize = ivsize;
 	inst->alg.maxauthsize = maxauthsize;
 
-	inst->alg.encrypt = compat_encrypt_first;
-	inst->alg.decrypt = compat_decrypt;
-
-	inst->alg.base.cra_init = aead_geniv_init_compat;
-	inst->alg.base.cra_exit = aead_geniv_exit_compat;
-
 out:
 	return inst;
 
@@ -723,31 +277,6 @@ void aead_geniv_free(struct aead_instance *inst)
 }
 EXPORT_SYMBOL_GPL(aead_geniv_free);
 
-int aead_geniv_init(struct crypto_tfm *tfm)
-{
-	struct crypto_instance *inst = (void *)tfm->__crt_alg;
-	struct crypto_aead *child;
-	struct crypto_aead *aead;
-
-	aead = __crypto_aead_cast(tfm);
-
-	child = crypto_spawn_aead(crypto_instance_ctx(inst));
-	if (IS_ERR(child))
-		return PTR_ERR(child);
-
-	aead->child = child;
-	aead->reqsize += crypto_aead_reqsize(child);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(aead_geniv_init);
-
-void aead_geniv_exit(struct crypto_tfm *tfm)
-{
-	crypto_free_aead(__crypto_aead_cast(tfm)->child);
-}
-EXPORT_SYMBOL_GPL(aead_geniv_exit);
-
 int aead_init_geniv(struct crypto_aead *aead)
 {
 	struct aead_geniv_ctx *ctx = crypto_aead_ctx(aead);
@@ -801,123 +330,6 @@ void aead_exit_geniv(struct crypto_aead *tfm)
 }
 EXPORT_SYMBOL_GPL(aead_exit_geniv);
 
-static int crypto_nivaead_default(struct crypto_alg *alg, u32 type, u32 mask)
-{
-	struct rtattr *tb[3];
-	struct {
-		struct rtattr attr;
-		struct crypto_attr_type data;
-	} ptype;
-	struct {
-		struct rtattr attr;
-		struct crypto_attr_alg data;
-	} palg;
-	struct crypto_template *tmpl;
-	struct crypto_instance *inst;
-	struct crypto_alg *larval;
-	const char *geniv;
-	int err;
-
-	larval = crypto_larval_lookup(alg->cra_driver_name,
-				      CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_GENIV,
-				      CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
-	err = PTR_ERR(larval);
-	if (IS_ERR(larval))
-		goto out;
-
-	err = -EAGAIN;
-	if (!crypto_is_larval(larval))
-		goto drop_larval;
-
-	ptype.attr.rta_len = sizeof(ptype);
-	ptype.attr.rta_type = CRYPTOA_TYPE;
-	ptype.data.type = type | CRYPTO_ALG_GENIV;
-	/* GENIV tells the template that we're making a default geniv. */
-	ptype.data.mask = mask | CRYPTO_ALG_GENIV;
-	tb[0] = &ptype.attr;
-
-	palg.attr.rta_len = sizeof(palg);
-	palg.attr.rta_type = CRYPTOA_ALG;
-	/* Must use the exact name to locate ourselves. */
-	memcpy(palg.data.name, alg->cra_driver_name, CRYPTO_MAX_ALG_NAME);
-	tb[1] = &palg.attr;
-
-	tb[2] = NULL;
-
-	geniv = alg->cra_aead.geniv;
-
-	tmpl = crypto_lookup_template(geniv);
-	err = -ENOENT;
-	if (!tmpl)
-		goto kill_larval;
-
-	if (tmpl->create) {
-		err = tmpl->create(tmpl, tb);
-		if (err)
-			goto put_tmpl;
-		goto ok;
-	}
-
-	inst = tmpl->alloc(tb);
-	err = PTR_ERR(inst);
-	if (IS_ERR(inst))
-		goto put_tmpl;
-
-	err = crypto_register_instance(tmpl, inst);
-	if (err) {
-		tmpl->free(inst);
-		goto put_tmpl;
-	}
-
-ok:
-	/* Redo the lookup to use the instance we just registered. */
-	err = -EAGAIN;
-
-put_tmpl:
-	crypto_tmpl_put(tmpl);
-kill_larval:
-	crypto_larval_kill(larval);
-drop_larval:
-	crypto_mod_put(larval);
-out:
-	crypto_mod_put(alg);
-	return err;
-}
-
-struct crypto_alg *crypto_lookup_aead(const char *name, u32 type, u32 mask)
-{
-	struct crypto_alg *alg;
-
-	alg = crypto_alg_mod_lookup(name, type, mask);
-	if (IS_ERR(alg))
-		return alg;
-
-	if (alg->cra_type == &crypto_aead_type)
-		return alg;
-
-	if (!alg->cra_aead.ivsize)
-		return alg;
-
-	crypto_mod_put(alg);
-	alg = crypto_alg_mod_lookup(name, type | CRYPTO_ALG_TESTED,
-				    mask & ~CRYPTO_ALG_TESTED);
-	if (IS_ERR(alg))
-		return alg;
-
-	if (alg->cra_type == &crypto_aead_type) {
-		if (~alg->cra_flags & (type ^ ~mask) & CRYPTO_ALG_TESTED) {
-			crypto_mod_put(alg);
-			alg = ERR_PTR(-ENOENT);
-		}
-		return alg;
-	}
-
-	BUG_ON(!alg->cra_aead.ivsize);
-
-	return ERR_PTR(crypto_nivaead_default(alg, type, mask));
-}
-EXPORT_SYMBOL_GPL(crypto_lookup_aead);
-
 int crypto_grab_aead(struct crypto_aead_spawn *spawn, const char *name,
 		     u32 type, u32 mask)
 {
@@ -939,7 +351,7 @@ static int aead_prepare_alg(struct aead_alg *alg)
 	if (max(alg->maxauthsize, alg->ivsize) > PAGE_SIZE / 8)
 		return -EINVAL;
 
-	base->cra_type = &crypto_new_aead_type;
+	base->cra_type = &crypto_aead_type;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_AEAD;
 
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 14e35364cdfa..077cae1e6b51 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -1,7 +1,7 @@
 /*
  * AEAD: Authenticated Encryption with Associated Data
  * 
- * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -71,14 +71,14 @@
  * in the first scatter gather list entry pointing to a NULL buffer.
  */
 
+struct crypto_aead;
+
 /**
  *	struct aead_request - AEAD request
  *	@base: Common attributes for async crypto requests
- *	@old: Boolean whether the old or new AEAD API is used
  *	@assoclen: Length in bytes of associated data for authentication
  *	@cryptlen: Length of data to be encrypted or decrypted
  *	@iv: Initialisation vector
- *	@assoc: Associated data
  *	@src: Source data
  *	@dst: Destination data
  *	@__ctx: Start of private context data
@@ -86,33 +86,17 @@
 struct aead_request {
 	struct crypto_async_request base;
 
-	bool old;
-
 	unsigned int assoclen;
 	unsigned int cryptlen;
 
 	u8 *iv;
 
-	struct scatterlist *assoc;
 	struct scatterlist *src;
 	struct scatterlist *dst;
 
 	void *__ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
-/**
- *	struct aead_givcrypt_request - AEAD request with IV generation
- *	@seq: Sequence number for IV generation
- *	@giv: Space for generated IV
- *	@areq: The AEAD request itself
- */
-struct aead_givcrypt_request {
-	u64 seq;
-	u8 *giv;
-
-	struct aead_request areq;
-};
-
 /**
  * struct aead_alg - AEAD cipher definition
  * @maxauthsize: Set the maximum authentication tag size supported by the
@@ -165,16 +149,6 @@ struct aead_alg {
 };
 
 struct crypto_aead {
-	int (*setkey)(struct crypto_aead *tfm, const u8 *key,
-	              unsigned int keylen);
-	int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize);
-	int (*encrypt)(struct aead_request *req);
-	int (*decrypt)(struct aead_request *req);
-	int (*givencrypt)(struct aead_givcrypt_request *req);
-	int (*givdecrypt)(struct aead_givcrypt_request *req);
-
-	struct crypto_aead *child;
-
 	unsigned int authsize;
 	unsigned int reqsize;
 
@@ -216,16 +190,6 @@ static inline void crypto_free_aead(struct crypto_aead *tfm)
 	crypto_destroy_tfm(tfm, crypto_aead_tfm(tfm));
 }
 
-static inline struct crypto_aead *crypto_aead_crt(struct crypto_aead *tfm)
-{
-	return tfm;
-}
-
-static inline struct old_aead_alg *crypto_old_aead_alg(struct crypto_aead *tfm)
-{
-	return &crypto_aead_tfm(tfm)->__crt_alg->cra_aead;
-}
-
 static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm)
 {
 	return container_of(crypto_aead_tfm(tfm)->__crt_alg,
@@ -234,8 +198,7 @@ static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm)
 
 static inline unsigned int crypto_aead_alg_ivsize(struct aead_alg *alg)
 {
-	return alg->base.cra_aead.encrypt ? alg->base.cra_aead.ivsize :
-					    alg->ivsize;
+	return alg->ivsize;
 }
 
 /**
@@ -361,7 +324,7 @@ static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req)
  */
 static inline int crypto_aead_encrypt(struct aead_request *req)
 {
-	return crypto_aead_reqtfm(req)->encrypt(req);
+	return crypto_aead_alg(crypto_aead_reqtfm(req))->encrypt(req);
 }
 
 /**
@@ -388,10 +351,12 @@ static inline int crypto_aead_encrypt(struct aead_request *req)
  */
 static inline int crypto_aead_decrypt(struct aead_request *req)
 {
-	if (req->cryptlen < crypto_aead_authsize(crypto_aead_reqtfm(req)))
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+
+	if (req->cryptlen < crypto_aead_authsize(aead))
 		return -EINVAL;
 
-	return crypto_aead_reqtfm(req)->decrypt(req);
+	return crypto_aead_alg(aead)->decrypt(req);
 }
 
 /**
@@ -411,7 +376,10 @@ static inline int crypto_aead_decrypt(struct aead_request *req)
  *
  * Return: number of bytes
  */
-unsigned int crypto_aead_reqsize(struct crypto_aead *tfm);
+static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm)
+{
+	return tfm->reqsize;
+}
 
 /**
  * aead_request_set_tfm() - update cipher handle reference in request
@@ -424,7 +392,7 @@ unsigned int crypto_aead_reqsize(struct crypto_aead *tfm);
 static inline void aead_request_set_tfm(struct aead_request *req,
 					struct crypto_aead *tfm)
 {
-	req->base.tfm = crypto_aead_tfm(tfm->child);
+	req->base.tfm = crypto_aead_tfm(tfm);
 }
 
 /**
@@ -549,23 +517,6 @@ static inline void aead_request_set_crypt(struct aead_request *req,
 	req->iv = iv;
 }
 
-/**
- * aead_request_set_assoc() - set the associated data scatter / gather list
- * @req: request handle
- * @assoc: associated data scatter / gather list
- * @assoclen: number of bytes to process from @assoc
- *
- * Obsolete, do not use.
- */
-static inline void aead_request_set_assoc(struct aead_request *req,
-					  struct scatterlist *assoc,
-					  unsigned int assoclen)
-{
-	req->assoc = assoc;
-	req->assoclen = assoclen;
-	req->old = true;
-}
-
 /**
  * aead_request_set_ad - set associated data information
  * @req: request handle
@@ -578,77 +529,6 @@ static inline void aead_request_set_ad(struct aead_request *req,
 				       unsigned int assoclen)
 {
 	req->assoclen = assoclen;
-	req->old = false;
-}
-
-static inline struct crypto_aead *aead_givcrypt_reqtfm(
-	struct aead_givcrypt_request *req)
-{
-	return crypto_aead_reqtfm(&req->areq);
-}
-
-static inline int crypto_aead_givencrypt(struct aead_givcrypt_request *req)
-{
-	return aead_givcrypt_reqtfm(req)->givencrypt(req);
-};
-
-static inline int crypto_aead_givdecrypt(struct aead_givcrypt_request *req)
-{
-	return aead_givcrypt_reqtfm(req)->givdecrypt(req);
-};
-
-static inline void aead_givcrypt_set_tfm(struct aead_givcrypt_request *req,
-					 struct crypto_aead *tfm)
-{
-	req->areq.base.tfm = crypto_aead_tfm(tfm);
-}
-
-static inline struct aead_givcrypt_request *aead_givcrypt_alloc(
-	struct crypto_aead *tfm, gfp_t gfp)
-{
-	struct aead_givcrypt_request *req;
-
-	req = kmalloc(sizeof(struct aead_givcrypt_request) +
-		      crypto_aead_reqsize(tfm), gfp);
-
-	if (likely(req))
-		aead_givcrypt_set_tfm(req, tfm);
-
-	return req;
-}
-
-static inline void aead_givcrypt_free(struct aead_givcrypt_request *req)
-{
-	kfree(req);
-}
-
-static inline void aead_givcrypt_set_callback(
-	struct aead_givcrypt_request *req, u32 flags,
-	crypto_completion_t compl, void *data)
-{
-	aead_request_set_callback(&req->areq, flags, compl, data);
-}
-
-static inline void aead_givcrypt_set_crypt(struct aead_givcrypt_request *req,
-					   struct scatterlist *src,
-					   struct scatterlist *dst,
-					   unsigned int nbytes, void *iv)
-{
-	aead_request_set_crypt(&req->areq, src, dst, nbytes, iv);
-}
-
-static inline void aead_givcrypt_set_assoc(struct aead_givcrypt_request *req,
-					   struct scatterlist *assoc,
-					   unsigned int assoclen)
-{
-	aead_request_set_assoc(&req->areq, assoc, assoclen);
-}
-
-static inline void aead_givcrypt_set_giv(struct aead_givcrypt_request *req,
-					 u8 *giv, u64 seq)
-{
-	req->giv = giv;
-	req->seq = seq;
 }
 
 #endif	/* _CRYPTO_AEAD_H */
diff --git a/include/crypto/internal/aead.h b/include/crypto/internal/aead.h
index 49f3179b8a17..5554cdd8d6c1 100644
--- a/include/crypto/internal/aead.h
+++ b/include/crypto/internal/aead.h
@@ -1,7 +1,7 @@
 /*
  * AEAD: Authenticated Encryption with Associated Data
  * 
- * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -39,20 +39,11 @@ struct aead_queue {
 	struct crypto_queue base;
 };
 
-extern const struct crypto_type crypto_aead_type;
-extern const struct crypto_type crypto_nivaead_type;
-
 static inline void *crypto_aead_ctx(struct crypto_aead *tfm)
 {
 	return crypto_tfm_ctx(&tfm->base);
 }
 
-static inline struct crypto_instance *crypto_aead_alg_instance(
-	struct crypto_aead *aead)
-{
-	return crypto_tfm_alg_instance(&aead->base);
-}
-
 static inline struct crypto_instance *aead_crypto_instance(
 	struct aead_instance *inst)
 {
@@ -66,7 +57,7 @@ static inline struct aead_instance *aead_instance(struct crypto_instance *inst)
 
 static inline struct aead_instance *aead_alg_instance(struct crypto_aead *aead)
 {
-	return aead_instance(crypto_aead_alg_instance(aead));
+	return aead_instance(crypto_tfm_alg_instance(&aead->base));
 }
 
 static inline void *aead_instance_ctx(struct aead_instance *inst)
@@ -95,8 +86,6 @@ static inline void crypto_set_aead_spawn(
 	crypto_set_spawn(&spawn->base, inst);
 }
 
-struct crypto_alg *crypto_lookup_aead(const char *name, u32 type, u32 mask);
-
 int crypto_grab_aead(struct crypto_aead_spawn *spawn, const char *name,
 		     u32 type, u32 mask);
 
@@ -105,12 +94,6 @@ static inline void crypto_drop_aead(struct crypto_aead_spawn *spawn)
 	crypto_drop_spawn(&spawn->base);
 }
 
-static inline struct crypto_alg *crypto_aead_spawn_alg(
-	struct crypto_aead_spawn *spawn)
-{
-	return spawn->base.alg;
-}
-
 static inline struct aead_alg *crypto_spawn_aead_alg(
 	struct crypto_aead_spawn *spawn)
 {
@@ -123,32 +106,15 @@ static inline struct crypto_aead *crypto_spawn_aead(
 	return crypto_spawn_tfm2(&spawn->base);
 }
 
-static inline struct crypto_aead *aead_geniv_base(struct crypto_aead *geniv)
-{
-	return geniv->child;
-}
-
-static inline void *aead_givcrypt_reqctx(struct aead_givcrypt_request *req)
-{
-	return aead_request_ctx(&req->areq);
-}
-
-static inline void aead_givcrypt_complete(struct aead_givcrypt_request *req,
-					  int err)
-{
-	aead_request_complete(&req->areq, err);
-}
-
 static inline void crypto_aead_set_reqsize(struct crypto_aead *aead,
 					   unsigned int reqsize)
 {
-	crypto_aead_crt(aead)->reqsize = reqsize;
+	aead->reqsize = reqsize;
 }
 
 static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg)
 {
-	return alg->base.cra_aead.encrypt ? alg->base.cra_aead.maxauthsize :
-					    alg->maxauthsize;
+	return alg->maxauthsize;
 }
 
 static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead)
diff --git a/include/crypto/internal/geniv.h b/include/crypto/internal/geniv.h
index b9c55bef7b6d..59333635e712 100644
--- a/include/crypto/internal/geniv.h
+++ b/include/crypto/internal/geniv.h
@@ -27,8 +27,6 @@ struct aead_geniv_ctx {
 struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
 				       struct rtattr **tb, u32 type, u32 mask);
 void aead_geniv_free(struct aead_instance *inst);
-int aead_geniv_init(struct crypto_tfm *tfm);
-void aead_geniv_exit(struct crypto_tfm *tfm);
 int aead_init_geniv(struct crypto_aead *tfm);
 void aead_exit_geniv(struct crypto_aead *tfm);
 
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 81ef938b0a8e..7f4aee9e1f91 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -142,13 +142,10 @@
 struct scatterlist;
 struct crypto_ablkcipher;
 struct crypto_async_request;
-struct crypto_aead;
 struct crypto_blkcipher;
 struct crypto_hash;
 struct crypto_tfm;
 struct crypto_type;
-struct aead_request;
-struct aead_givcrypt_request;
 struct skcipher_givcrypt_request;
 
 typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err);
@@ -274,47 +271,6 @@ struct ablkcipher_alg {
 	unsigned int ivsize;
 };
 
-/**
- * struct old_aead_alg - AEAD cipher definition
- * @maxauthsize: Set the maximum authentication tag size supported by the
- *		 transformation. A transformation may support smaller tag sizes.
- *		 As the authentication tag is a message digest to ensure the
- *		 integrity of the encrypted data, a consumer typically wants the
- *		 largest authentication tag possible as defined by this
- *		 variable.
- * @setauthsize: Set authentication size for the AEAD transformation. This
- *		 function is used to specify the consumer requested size of the
- * 		 authentication tag to be either generated by the transformation
- *		 during encryption or the size of the authentication tag to be
- *		 supplied during the decryption operation. This function is also
- *		 responsible for checking the authentication tag size for
- *		 validity.
- * @setkey: see struct ablkcipher_alg
- * @encrypt: see struct ablkcipher_alg
- * @decrypt: see struct ablkcipher_alg
- * @givencrypt: see struct ablkcipher_alg
- * @givdecrypt: see struct ablkcipher_alg
- * @geniv: see struct ablkcipher_alg
- * @ivsize: see struct ablkcipher_alg
- *
- * All fields except @givencrypt , @givdecrypt , @geniv and @ivsize are
- * mandatory and must be filled.
- */
-struct old_aead_alg {
-	int (*setkey)(struct crypto_aead *tfm, const u8 *key,
-	              unsigned int keylen);
-	int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize);
-	int (*encrypt)(struct aead_request *req);
-	int (*decrypt)(struct aead_request *req);
-	int (*givencrypt)(struct aead_givcrypt_request *req);
-	int (*givdecrypt)(struct aead_givcrypt_request *req);
-
-	const char *geniv;
-
-	unsigned int ivsize;
-	unsigned int maxauthsize;
-};
-
 /**
  * struct blkcipher_alg - synchronous block cipher definition
  * @min_keysize: see struct ablkcipher_alg
@@ -409,7 +365,6 @@ struct compress_alg {
 
 
 #define cra_ablkcipher	cra_u.ablkcipher
-#define cra_aead	cra_u.aead
 #define cra_blkcipher	cra_u.blkcipher
 #define cra_cipher	cra_u.cipher
 #define cra_compress	cra_u.compress
@@ -460,7 +415,7 @@ struct compress_alg {
  *	      struct crypto_type, which implements callbacks common for all
  *	      transformation types. There are multiple options:
  *	      &crypto_blkcipher_type, &crypto_ablkcipher_type,
- *	      &crypto_ahash_type, &crypto_aead_type, &crypto_rng_type.
+ *	      &crypto_ahash_type, &crypto_rng_type.
  *	      This field might be empty. In that case, there are no common
  *	      callbacks. This is the case for: cipher, compress, shash.
  * @cra_u: Callbacks implementing the transformation. This is a union of
@@ -508,7 +463,6 @@ struct crypto_alg {
 
 	union {
 		struct ablkcipher_alg ablkcipher;
-		struct old_aead_alg aead;
 		struct blkcipher_alg blkcipher;
 		struct cipher_alg cipher;
 		struct compress_alg compress;
-- 
cgit v1.2.3-70-g09d2


From 5e4b8c1fcc70016f43926203ae1820c3b380d5cd Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 13 Aug 2015 17:29:06 +0800
Subject: crypto: aead - Remove CRYPTO_ALG_AEAD_NEW flag

This patch removes the CRYPTO_ALG_AEAD_NEW flag now that everyone
has been converted.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/aes-ce-ccm-glue.c      |  1 -
 arch/x86/crypto/aesni-intel_glue.c       |  3 +--
 crypto/aead.c                            |  6 ++----
 crypto/algif_aead.c                      |  3 +--
 crypto/authenc.c                         |  4 +---
 crypto/authencesn.c                      |  4 +---
 crypto/ccm.c                             |  8 ++------
 crypto/chacha20poly1305.c                |  4 +---
 crypto/cryptd.c                          |  8 +++-----
 crypto/gcm.c                             | 12 +++---------
 crypto/pcrypt.c                          |  5 +----
 crypto/tcrypt.c                          |  7 +------
 drivers/crypto/caam/caamalg.c            |  3 +--
 drivers/crypto/ixp4xx_crypto.c           |  1 -
 drivers/crypto/nx/nx-aes-ccm.c           |  6 ++----
 drivers/crypto/nx/nx-aes-gcm.c           |  2 --
 drivers/crypto/picoxcell_crypto.c        |  1 -
 drivers/crypto/qat/qat_common/qat_algs.c |  8 ++++----
 drivers/crypto/talitos.c                 |  1 -
 include/linux/crypto.h                   |  6 ------
 20 files changed, 24 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index f3690fa76a5b..f4bf2f2a014c 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -280,7 +280,6 @@ static struct aead_alg ccm_aes_alg = {
 	.base = {
 		.cra_name		= "ccm(aes)",
 		.cra_driver_name	= "ccm-aes-ce",
-		.cra_flags		= CRYPTO_ALG_AEAD_NEW,
 		.cra_priority		= 300,
 		.cra_blocksize		= 1,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2347ef0a1a6d..3633ad6145c5 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1437,8 +1437,7 @@ static struct aead_alg aesni_aead_algs[] = { {
 		.cra_name		= "rfc4106(gcm(aes))",
 		.cra_driver_name	= "rfc4106-gcm-aesni",
 		.cra_priority		= 400,
-		.cra_flags		= CRYPTO_ALG_ASYNC |
-					  CRYPTO_ALG_AEAD_NEW,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
 		.cra_blocksize		= 1,
 		.cra_ctxsize		= sizeof(struct cryptd_aead *),
 		.cra_module		= THIS_MODULE,
diff --git a/crypto/aead.c b/crypto/aead.c
index c40df2c4d420..9b18a1e40d6a 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -204,8 +204,7 @@ struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return ERR_CAST(algt);
 
-	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) &
-	    algt->mask & ~CRYPTO_ALG_AEAD_NEW)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return ERR_PTR(-EINVAL);
 
 	name = crypto_attr_alg_name(tb[1]);
@@ -245,8 +244,7 @@ struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
 	    CRYPTO_MAX_ALG_NAME)
 		goto err_drop_alg;
 
-	inst->alg.base.cra_flags = alg->base.cra_flags &
-				   (CRYPTO_ALG_ASYNC | CRYPTO_ALG_AEAD_NEW);
+	inst->alg.base.cra_flags = alg->base.cra_flags & CRYPTO_ALG_ASYNC;
 	inst->alg.base.cra_priority = alg->base.cra_priority;
 	inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
 	inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index e0408a480d2f..38a6cab7aeca 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -514,8 +514,7 @@ static struct proto_ops algif_aead_ops = {
 
 static void *aead_bind(const char *name, u32 type, u32 mask)
 {
-	return crypto_alloc_aead(name, type | CRYPTO_ALG_AEAD_NEW,
-				 mask | CRYPTO_ALG_AEAD_NEW);
+	return crypto_alloc_aead(name, type, mask);
 }
 
 static void aead_release(void *private)
diff --git a/crypto/authenc.c b/crypto/authenc.c
index bca3835b6b42..55a354d57251 100644
--- a/crypto/authenc.c
+++ b/crypto/authenc.c
@@ -393,8 +393,7 @@ static int crypto_authenc_create(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return PTR_ERR(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_AEAD_NEW)) &
-	    algt->mask)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
 	auth = ahash_attr_alg(tb[1], CRYPTO_ALG_TYPE_HASH,
@@ -445,7 +444,6 @@ static int crypto_authenc_create(struct crypto_template *tmpl,
 		goto err_drop_enc;
 
 	inst->alg.base.cra_flags = enc->cra_flags & CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 	inst->alg.base.cra_priority = enc->cra_priority * 10 +
 				      auth_base->cra_priority;
 	inst->alg.base.cra_blocksize = enc->cra_blocksize;
diff --git a/crypto/authencesn.c b/crypto/authencesn.c
index c30393e2ad22..0c0468869e25 100644
--- a/crypto/authencesn.c
+++ b/crypto/authencesn.c
@@ -409,8 +409,7 @@ static int crypto_authenc_esn_create(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return PTR_ERR(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_AEAD_NEW)) &
-	    algt->mask)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
 	auth = ahash_attr_alg(tb[1], CRYPTO_ALG_TYPE_HASH,
@@ -458,7 +457,6 @@ static int crypto_authenc_esn_create(struct crypto_template *tmpl,
 		goto err_drop_enc;
 
 	inst->alg.base.cra_flags = enc->cra_flags & CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 	inst->alg.base.cra_priority = enc->cra_priority * 10 +
 				      auth_base->cra_priority;
 	inst->alg.base.cra_blocksize = enc->cra_blocksize;
diff --git a/crypto/ccm.c b/crypto/ccm.c
index b63f96a0b39c..cc31ea4335bf 100644
--- a/crypto/ccm.c
+++ b/crypto/ccm.c
@@ -518,8 +518,7 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return PTR_ERR(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_AEAD_NEW)) &
-	    algt->mask)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
 	cipher = crypto_alg_mod_lookup(cipher_name,  CRYPTO_ALG_TYPE_CIPHER,
@@ -571,7 +570,6 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 	memcpy(inst->alg.base.cra_name, full_name, CRYPTO_MAX_ALG_NAME);
 
 	inst->alg.base.cra_flags = ctr->cra_flags & CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 	inst->alg.base.cra_priority = (cipher->cra_priority +
 				       ctr->cra_priority) / 2;
 	inst->alg.base.cra_blocksize = 1;
@@ -820,8 +818,7 @@ static int crypto_rfc4309_create(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return PTR_ERR(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_AEAD_NEW)) &
-	    algt->mask)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
 	ccm_name = crypto_attr_alg_name(tb[1]);
@@ -861,7 +858,6 @@ static int crypto_rfc4309_create(struct crypto_template *tmpl,
 		goto out_drop_alg;
 
 	inst->alg.base.cra_flags = alg->base.cra_flags & CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 	inst->alg.base.cra_priority = alg->base.cra_priority;
 	inst->alg.base.cra_blocksize = 1;
 	inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index b71445f282ad..99c3cce01290 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -585,8 +585,7 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
 	if (IS_ERR(algt))
 		return PTR_ERR(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_AEAD_NEW)) &
-	    algt->mask)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
 	chacha_name = crypto_attr_alg_name(tb[1]);
@@ -644,7 +643,6 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
 
 	inst->alg.base.cra_flags = (chacha->cra_flags | poly->cra_flags) &
 				   CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 	inst->alg.base.cra_priority = (chacha->cra_priority +
 				       poly->cra_priority) / 2;
 	inst->alg.base.cra_blocksize = 1;
diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index e5076f85b9e4..c81861b1350b 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -177,8 +177,8 @@ static inline void cryptd_check_internal(struct rtattr **tb, u32 *type,
 	if (IS_ERR(algt))
 		return;
 
-	*type |= algt->type & (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_AEAD_NEW);
-	*mask |= algt->mask & (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_AEAD_NEW);
+	*type |= algt->type & CRYPTO_ALG_INTERNAL;
+	*mask |= algt->mask & CRYPTO_ALG_INTERNAL;
 }
 
 static int cryptd_blkcipher_setkey(struct crypto_ablkcipher *parent,
@@ -805,9 +805,7 @@ static int cryptd_create_aead(struct crypto_template *tmpl,
 		goto out_drop_aead;
 
 	inst->alg.base.cra_flags = CRYPTO_ALG_ASYNC |
-				   (alg->base.cra_flags &
-				    (CRYPTO_ALG_INTERNAL |
-				     CRYPTO_ALG_AEAD_NEW));
+				   (alg->base.cra_flags & CRYPTO_ALG_INTERNAL);
 	inst->alg.base.cra_ctxsize = sizeof(struct cryptd_aead_ctx);
 
 	inst->alg.ivsize = crypto_aead_alg_ivsize(alg);
diff --git a/crypto/gcm.c b/crypto/gcm.c
index 0c9e33bdce1a..ddb4f29b2fe6 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -634,8 +634,7 @@ static int crypto_gcm_create_common(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return PTR_ERR(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_AEAD_NEW)) &
-	    algt->mask)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
 	ghash_alg = crypto_find_alg(ghash_name, &crypto_ahash_type,
@@ -690,7 +689,6 @@ static int crypto_gcm_create_common(struct crypto_template *tmpl,
 
 	inst->alg.base.cra_flags = (ghash->base.cra_flags | ctr->cra_flags) &
 				   CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 	inst->alg.base.cra_priority = (ghash->base.cra_priority +
 				       ctr->cra_priority) / 2;
 	inst->alg.base.cra_blocksize = 1;
@@ -935,8 +933,7 @@ static int crypto_rfc4106_create(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return PTR_ERR(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_AEAD_NEW)) &
-	    algt->mask)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
 	ccm_name = crypto_attr_alg_name(tb[1]);
@@ -976,7 +973,6 @@ static int crypto_rfc4106_create(struct crypto_template *tmpl,
 		goto out_drop_alg;
 
 	inst->alg.base.cra_flags = alg->base.cra_flags & CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 	inst->alg.base.cra_priority = alg->base.cra_priority;
 	inst->alg.base.cra_blocksize = 1;
 	inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
@@ -1175,8 +1171,7 @@ static int crypto_rfc4543_create(struct crypto_template *tmpl,
 	if (IS_ERR(algt))
 		return PTR_ERR(algt);
 
-	if ((algt->type ^ (CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_AEAD_NEW)) &
-	    algt->mask)
+	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
 	ccm_name = crypto_attr_alg_name(tb[1]);
@@ -1217,7 +1212,6 @@ static int crypto_rfc4543_create(struct crypto_template *tmpl,
 		goto out_drop_alg;
 
 	inst->alg.base.cra_flags = alg->base.cra_flags & CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 	inst->alg.base.cra_priority = alg->base.cra_priority;
 	inst->alg.base.cra_blocksize = 1;
 	inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 001a3a3e75df..ee9cfb99fe25 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -295,9 +295,7 @@ static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb,
 	ctx = aead_instance_ctx(inst);
 	crypto_set_aead_spawn(&ctx->spawn, aead_crypto_instance(inst));
 
-	err = crypto_grab_aead(&ctx->spawn, name,
-			       algt->type & CRYPTO_ALG_AEAD_NEW,
-			       algt->mask & CRYPTO_ALG_AEAD_NEW);
+	err = crypto_grab_aead(&ctx->spawn, name, 0, 0);
 	if (err)
 		goto out_free_inst;
 
@@ -307,7 +305,6 @@ static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb,
 		goto out_drop_aead;
 
 	inst->alg.base.cra_flags = CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_flags |= alg->base.cra_flags & CRYPTO_ALG_AEAD_NEW;
 
 	inst->alg.ivsize = crypto_aead_alg_ivsize(alg);
 	inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg);
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index e9a05ba2bfb4..2b00b617daab 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -344,12 +344,7 @@ static void test_aead_speed(const char *algo, int enc, unsigned int secs,
 		goto out_nosg;
 	sgout = &sg[9];
 
-	tfm = crypto_alloc_aead(algo, CRYPTO_ALG_AEAD_NEW,
-				CRYPTO_ALG_AEAD_NEW);
-	if (PTR_ERR(tfm) == -ENOENT) {
-		aad_size -= 8;
-		tfm = crypto_alloc_aead(algo, 0, CRYPTO_ALG_AEAD_NEW);
-	}
+	tfm = crypto_alloc_aead(algo, 0, 0);
 
 	if (IS_ERR(tfm)) {
 		pr_err("alg: aead: Failed to load transform for %s: %ld\n", algo,
diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 336125927377..83d2306ce7ab 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -4359,8 +4359,7 @@ static void caam_aead_alg_init(struct caam_aead_alg *t_alg)
 	alg->base.cra_module = THIS_MODULE;
 	alg->base.cra_priority = CAAM_CRA_PRIORITY;
 	alg->base.cra_ctxsize = sizeof(struct caam_ctx);
-	alg->base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY |
-			      CRYPTO_ALG_AEAD_NEW;
+	alg->base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY;
 
 	alg->init = caam_aead_init;
 	alg->exit = caam_aead_exit;
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index 411de261e40c..8f2790353281 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -1451,7 +1451,6 @@ static int __init ixp_module_init(void)
 
 		/* authenc */
 		cra->base.cra_flags = CRYPTO_ALG_KERN_DRIVER_ONLY |
-				      CRYPTO_ALG_AEAD_NEW |
 				      CRYPTO_ALG_ASYNC;
 		cra->setkey = aead_setkey;
 		cra->setauthsize = aead_setauthsize;
diff --git a/drivers/crypto/nx/nx-aes-ccm.c b/drivers/crypto/nx/nx-aes-ccm.c
index 195c9207a98d..73ef49922788 100644
--- a/drivers/crypto/nx/nx-aes-ccm.c
+++ b/drivers/crypto/nx/nx-aes-ccm.c
@@ -559,8 +559,7 @@ struct aead_alg nx_ccm_aes_alg = {
 		.cra_name        = "ccm(aes)",
 		.cra_driver_name = "ccm-aes-nx",
 		.cra_priority    = 300,
-		.cra_flags       = CRYPTO_ALG_NEED_FALLBACK |
-				   CRYPTO_ALG_AEAD_NEW,
+		.cra_flags       = CRYPTO_ALG_NEED_FALLBACK,
 		.cra_blocksize   = 1,
 		.cra_ctxsize     = sizeof(struct nx_crypto_ctx),
 		.cra_module      = THIS_MODULE,
@@ -580,8 +579,7 @@ struct aead_alg nx_ccm4309_aes_alg = {
 		.cra_name        = "rfc4309(ccm(aes))",
 		.cra_driver_name = "rfc4309-ccm-aes-nx",
 		.cra_priority    = 300,
-		.cra_flags       = CRYPTO_ALG_NEED_FALLBACK |
-				   CRYPTO_ALG_AEAD_NEW,
+		.cra_flags       = CRYPTO_ALG_NEED_FALLBACK,
 		.cra_blocksize   = 1,
 		.cra_ctxsize     = sizeof(struct nx_crypto_ctx),
 		.cra_module      = THIS_MODULE,
diff --git a/drivers/crypto/nx/nx-aes-gcm.c b/drivers/crypto/nx/nx-aes-gcm.c
index 5719638b8642..eee624f589b6 100644
--- a/drivers/crypto/nx/nx-aes-gcm.c
+++ b/drivers/crypto/nx/nx-aes-gcm.c
@@ -490,7 +490,6 @@ struct aead_alg nx_gcm_aes_alg = {
 	.base = {
 		.cra_name        = "gcm(aes)",
 		.cra_driver_name = "gcm-aes-nx",
-		.cra_flags	 = CRYPTO_ALG_AEAD_NEW,
 		.cra_priority    = 300,
 		.cra_blocksize   = 1,
 		.cra_ctxsize     = sizeof(struct nx_crypto_ctx),
@@ -509,7 +508,6 @@ struct aead_alg nx_gcm4106_aes_alg = {
 	.base = {
 		.cra_name        = "rfc4106(gcm(aes))",
 		.cra_driver_name = "rfc4106-gcm-aes-nx",
-		.cra_flags	 = CRYPTO_ALG_AEAD_NEW,
 		.cra_priority    = 300,
 		.cra_blocksize   = 1,
 		.cra_ctxsize     = sizeof(struct nx_crypto_ctx),
diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c
index e0f0c3497b12..da36de26a4dc 100644
--- a/drivers/crypto/picoxcell_crypto.c
+++ b/drivers/crypto/picoxcell_crypto.c
@@ -1738,7 +1738,6 @@ static int spacc_probe(struct platform_device *pdev)
 	INIT_LIST_HEAD(&engine->registered_aeads);
 	for (i = 0; i < engine->num_aeads; ++i) {
 		engine->aeads[i].engine = engine;
-		engine->aeads[i].alg.base.cra_flags |= CRYPTO_ALG_AEAD_NEW;
 		err = crypto_register_aead(&engine->aeads[i].alg);
 		if (!err) {
 			list_add_tail(&engine->aeads[i].entry,
diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c
index b7099f2fea28..2bd913aceaeb 100644
--- a/drivers/crypto/qat/qat_common/qat_algs.c
+++ b/drivers/crypto/qat/qat_common/qat_algs.c
@@ -1109,7 +1109,7 @@ static struct aead_alg qat_aeads[] = { {
 		.cra_name = "authenc(hmac(sha1),cbc(aes))",
 		.cra_driver_name = "qat_aes_cbc_hmac_sha1",
 		.cra_priority = 4001,
-		.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_AEAD_NEW,
+		.cra_flags = CRYPTO_ALG_ASYNC,
 		.cra_blocksize = AES_BLOCK_SIZE,
 		.cra_ctxsize = sizeof(struct qat_alg_aead_ctx),
 		.cra_module = THIS_MODULE,
@@ -1126,7 +1126,7 @@ static struct aead_alg qat_aeads[] = { {
 		.cra_name = "authenc(hmac(sha256),cbc(aes))",
 		.cra_driver_name = "qat_aes_cbc_hmac_sha256",
 		.cra_priority = 4001,
-		.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_AEAD_NEW,
+		.cra_flags = CRYPTO_ALG_ASYNC,
 		.cra_blocksize = AES_BLOCK_SIZE,
 		.cra_ctxsize = sizeof(struct qat_alg_aead_ctx),
 		.cra_module = THIS_MODULE,
@@ -1143,7 +1143,7 @@ static struct aead_alg qat_aeads[] = { {
 		.cra_name = "authenc(hmac(sha512),cbc(aes))",
 		.cra_driver_name = "qat_aes_cbc_hmac_sha512",
 		.cra_priority = 4001,
-		.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_AEAD_NEW,
+		.cra_flags = CRYPTO_ALG_ASYNC,
 		.cra_blocksize = AES_BLOCK_SIZE,
 		.cra_ctxsize = sizeof(struct qat_alg_aead_ctx),
 		.cra_module = THIS_MODULE,
@@ -1197,7 +1197,7 @@ int qat_algs_register(void)
 		goto unlock;
 
 	for (i = 0; i < ARRAY_SIZE(qat_aeads); i++)
-		qat_aeads[i].base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_AEAD_NEW;
+		qat_aeads[i].base.cra_flags = CRYPTO_ALG_ASYNC;
 
 	ret = crypto_register_aeads(qat_aeads, ARRAY_SIZE(qat_aeads));
 	if (ret)
diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 1bc8dd91e217..cd774534d987 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -2730,7 +2730,6 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev,
 		break;
 	case CRYPTO_ALG_TYPE_AEAD:
 		alg = &t_alg->algt.alg.aead.base;
-		alg->cra_flags |= CRYPTO_ALG_AEAD_NEW;
 		t_alg->algt.alg.aead.init = talitos_cra_init_aead;
 		t_alg->algt.alg.aead.setkey = aead_setkey;
 		t_alg->algt.alg.aead.encrypt = aead_encrypt;
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 7f4aee9e1f91..e71cb70a1ac2 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -101,12 +101,6 @@
  */
 #define CRYPTO_ALG_INTERNAL		0x00002000
 
-/*
- * Temporary flag used to prevent legacy AEAD implementations from
- * being used by user-space.
- */
-#define CRYPTO_ALG_AEAD_NEW		0x00004000
-
 /*
  * Transform masks and values (for crt_flags).
  */
-- 
cgit v1.2.3-70-g09d2


From 272420214d261e97f08a4c555defb3924de06ae8 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Wed, 5 Aug 2015 08:42:05 +0530
Subject: dmaengine: Add DMA_CTRL_REUSE

This adds new descriptor flag for reusing a descriptor by submitting
multiple times by a client, for example video buffer.
Add helper APIs for this as well

Signed-off-by: Vinod Koul <vinod.koul@intel.com>
Acked-by:Robert Jarzmik <robert.jarzmik@free.fr>
---
 include/linux/dmaengine.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 8ad9a4e839f6..1b866e9a6ed1 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -184,6 +184,8 @@ struct dma_interleaved_template {
  *  operation it continues the calculation with new sources
  * @DMA_PREP_FENCE - tell the driver that subsequent operations depend
  *  on the result of this operation
+ * @DMA_CTRL_REUSE: client can reuse the descriptor and submit again till
+ *  cleared or freed
  */
 enum dma_ctrl_flags {
 	DMA_PREP_INTERRUPT = (1 << 0),
@@ -192,6 +194,7 @@ enum dma_ctrl_flags {
 	DMA_PREP_PQ_DISABLE_Q = (1 << 3),
 	DMA_PREP_CONTINUE = (1 << 4),
 	DMA_PREP_FENCE = (1 << 5),
+	DMA_CTRL_REUSE = (1 << 6),
 };
 
 /**
@@ -401,6 +404,8 @@ enum dma_residue_granularity {
  * @cmd_pause: true, if pause and thereby resume is supported
  * @cmd_terminate: true, if terminate cmd is supported
  * @residue_granularity: granularity of the reported transfer residue
+ * @descriptor_reuse: if a descriptor can be reused by client and
+ * resubmitted multiple times
  */
 struct dma_slave_caps {
 	u32 src_addr_widths;
@@ -409,6 +414,7 @@ struct dma_slave_caps {
 	bool cmd_pause;
 	bool cmd_terminate;
 	enum dma_residue_granularity residue_granularity;
+	bool descriptor_reuse;
 };
 
 static inline const char *dma_chan_name(struct dma_chan *chan)
@@ -468,6 +474,7 @@ struct dma_async_tx_descriptor {
 	dma_addr_t phys;
 	struct dma_chan *chan;
 	dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
+	int (*desc_free)(struct dma_async_tx_descriptor *tx);
 	dma_async_tx_callback callback;
 	void *callback_param;
 	struct dmaengine_unmap_data *unmap;
@@ -1175,6 +1182,39 @@ static inline int dma_get_slave_caps(struct dma_chan *chan,
 }
 #endif
 
+static inline int dmaengine_desc_set_reuse(struct dma_async_tx_descriptor *tx)
+{
+	struct dma_slave_caps caps;
+
+	dma_get_slave_caps(tx->chan, &caps);
+
+	if (caps.descriptor_reuse) {
+		tx->flags |= DMA_CTRL_REUSE;
+		return 0;
+	} else {
+		return -EPERM;
+	}
+}
+
+static inline void dmaengine_desc_clear_reuse(struct dma_async_tx_descriptor *tx)
+{
+	tx->flags &= ~DMA_CTRL_REUSE;
+}
+
+static inline bool dmaengine_desc_test_reuse(struct dma_async_tx_descriptor *tx)
+{
+	return (tx->flags & DMA_CTRL_REUSE) == DMA_CTRL_REUSE;
+}
+
+static inline int dmaengine_desc_free(struct dma_async_tx_descriptor *desc)
+{
+	/* this is supported for reusable desc, so check that */
+	if (dmaengine_desc_test_reuse(desc))
+		return desc->desc_free(desc);
+	else
+		return -EPERM;
+}
+
 /* --- DMA device --- */
 
 int dma_async_device_register(struct dma_device *device);
-- 
cgit v1.2.3-70-g09d2


From 1dc042885456dff457d0b758b69209dcafa688ec Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Wed, 12 Aug 2015 11:30:59 +0300
Subject: dmaengine: Make __dma_request_slave_channel_compat() name argument
 constant

Inline function __dma_request_slave_channel_compat() doesn't modify "name"
argument but passes it to dma_request_slave_channel() which already takes
it as a constant.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/dmaengine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 1b866e9a6ed1..3a732ac95285 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -1229,7 +1229,7 @@ struct dma_chan *dma_get_any_slave_channel(struct dma_device *device);
 static inline struct dma_chan
 *__dma_request_slave_channel_compat(const dma_cap_mask_t *mask,
 				  dma_filter_fn fn, void *fn_param,
-				  struct device *dev, char *name)
+				  struct device *dev, const char *name)
 {
 	struct dma_chan *chan;
 
-- 
cgit v1.2.3-70-g09d2


From 642c28ab86f7666d2ac62a0dc391b4c3121f1d6e Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Tue, 23 Jun 2015 11:43:52 +0200
Subject: mmc: core: Optimize case for exactly one erase-group budget

In the (not so unlikely) case that the mmc controller timeout budget is
enough for exactly one erase-group, the simplification of allowing one
sector has an enormous performance penalty. We optimize this special case
by introducing a flag that prohibits erase-group boundary crossing, so
that we can allow trimming more than one sector at a time.

Signed-off-by: David Jander <david@protonic.nl>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/core.c  | 38 ++++++++++++++++++++++++++++++++++----
 include/linux/mmc/card.h |  1 +
 2 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 9ad73f30f744..083cade3ffc5 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -2168,6 +2168,7 @@ int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
 	      unsigned int arg)
 {
 	unsigned int rem, to = from + nr;
+	int err;
 
 	if (!(card->host->caps & MMC_CAP_ERASE) ||
 	    !(card->csd.cmdclass & CCC_ERASE))
@@ -2218,6 +2219,23 @@ int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
 	/* 'from' and 'to' are inclusive */
 	to -= 1;
 
+	/*
+	 * Special case where only one erase-group fits in the timeout budget:
+	 * If the region crosses an erase-group boundary on this particular
+	 * case, we will be trimming more than one erase-group which, does not
+	 * fit in the timeout budget of the controller, so we need to split it
+	 * and call mmc_do_erase() twice if necessary. This special case is
+	 * identified by the card->eg_boundary flag.
+	 */
+	if ((arg & MMC_TRIM_ARGS) && (card->eg_boundary) &&
+	    (from % card->erase_size)) {
+		rem = card->erase_size - (from % card->erase_size);
+		err = mmc_do_erase(card, from, from + rem - 1, arg);
+		from += rem;
+		if ((err) || (to <= from))
+			return err;
+	}
+
 	return mmc_do_erase(card, from, to, arg);
 }
 EXPORT_SYMBOL(mmc_erase);
@@ -2313,16 +2331,28 @@ static unsigned int mmc_do_calc_max_discard(struct mmc_card *card,
 	if (!qty)
 		return 0;
 
+	/*
+	 * When specifying a sector range to trim, chances are we might cross
+	 * an erase-group boundary even if the amount of sectors is less than
+	 * one erase-group.
+	 * If we can only fit one erase-group in the controller timeout budget,
+	 * we have to care that erase-group boundaries are not crossed by a
+	 * single trim operation. We flag that special case with "eg_boundary".
+	 * In all other cases we can just decrement qty and pretend that we
+	 * always touch (qty + 1) erase-groups as a simple optimization.
+	 */
 	if (qty == 1)
-		return 1;
+		card->eg_boundary = 1;
+	else
+		qty--;
 
 	/* Convert qty to sectors */
 	if (card->erase_shift)
-		max_discard = --qty << card->erase_shift;
+		max_discard = qty << card->erase_shift;
 	else if (mmc_card_sd(card))
-		max_discard = qty;
+		max_discard = qty + 1;
 	else
-		max_discard = --qty * card->erase_size;
+		max_discard = qty * card->erase_size;
 
 	return max_discard;
 }
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 4d3776d25925..8fcbcd13218f 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -283,6 +283,7 @@ struct mmc_card {
 	unsigned int		erase_size;	/* erase size in sectors */
  	unsigned int		erase_shift;	/* if erase unit is power 2 */
  	unsigned int		pref_erase;	/* in sectors */
+	unsigned int		eg_boundary;	/* don't cross erase-group boundaries */
  	u8			erased_byte;	/* value of erased bytes */
 
 	u32			raw_cid[4];	/* raw card CID */
-- 
cgit v1.2.3-70-g09d2


From f13e5b9f3c625916d7658ba526574a5d24e4d664 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@freescale.com>
Date: Fri, 10 Jul 2015 11:36:45 +0800
Subject: mmc: sdio: avoid using NULL sdio_irq_thread pointer

For Freescale QorIQ LS1021AQDS board, there is a SDIO interrupt
in the process of resume without inserting SD adapter because of
some unknown issue. But the driver doesn't assign sdio_irq_thread
pointer. This will block the resume of kernel. This patch is used
to avoid using NULL sdio_irq_thread pointer.

Signed-off-by: Yangbo Lu <yangbo.lu@freescale.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/host.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 1369e54faeb7..83b81fd865f3 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -412,7 +412,8 @@ static inline void mmc_signal_sdio_irq(struct mmc_host *host)
 {
 	host->ops->enable_sdio_irq(host, 0);
 	host->sdio_irq_pending = true;
-	wake_up_process(host->sdio_irq_thread);
+	if (host->sdio_irq_thread)
+		wake_up_process(host->sdio_irq_thread);
 }
 
 void sdio_run_irqs(struct mmc_host *host);
-- 
cgit v1.2.3-70-g09d2


From 575c319dfe872fc8bf1d57b244fb40f497ab3a47 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Mon, 3 Aug 2015 17:04:10 +0200
Subject: mmc: dw_mmc: fix pio mode when internal dmac is enabled

The dw_mci_init_dma() may decide to not use dma, but pio instead, caused
by things like wrong dma settings in the system.

Till now the code dw_mci_init_slot() always assumed that dma is available
when CONFIG_MMC_DW_IDMAC was defined, ignoring the host->use_dma var
set during dma init.

So when now the dma init failed for whatever reason, the transfer sizes
would still be set for dma transfers, especially including the maximum
block-count calculated from host->ring_size and resulting in a

[    4.991109] ------------[ cut here ]------------
[    4.991111] kernel BUG at drivers/mmc/core/core.c:256!
[    4.991113] Internal error: Oops - BUG: 0 [#1] SMP ARM

because host->ring_size is 0 in this case and the slot init code uses
the wrong code to calculate the values.

Fix this by selecting the correct calculations using the host->use_dma
variable instead of the CONFIG_MMC_DW_IDMAC config option.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: Jaehoon Chung <jh80.chung@samsung.com>
---
 drivers/mmc/host/dw_mmc.c  | 27 ++++++++++++++-------------
 include/linux/mmc/dw_mmc.h |  4 ----
 2 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 3f070d9f4086..3c0e1993e737 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -2444,19 +2444,20 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
 		mmc->max_seg_size = host->pdata->blk_settings->max_seg_size;
 	} else {
 		/* Useful defaults if platform data is unset. */
-#ifdef CONFIG_MMC_DW_IDMAC
-		mmc->max_segs = host->ring_size;
-		mmc->max_blk_size = 65536;
-		mmc->max_seg_size = DW_MCI_DESC_DATA_LENGTH;
-		mmc->max_req_size = mmc->max_seg_size * host->ring_size;
-		mmc->max_blk_count = mmc->max_req_size / 512;
-#else
-		mmc->max_segs = 64;
-		mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */
-		mmc->max_blk_count = 512;
-		mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count;
-		mmc->max_seg_size = mmc->max_req_size;
-#endif /* CONFIG_MMC_DW_IDMAC */
+		if (host->use_dma) {
+			mmc->max_segs = host->ring_size;
+			mmc->max_blk_size = 65536;
+			mmc->max_seg_size = 0x1000;
+			mmc->max_req_size = mmc->max_seg_size * host->ring_size;
+			mmc->max_blk_count = mmc->max_req_size / 512;
+		} else {
+			mmc->max_segs = 64;
+			mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */
+			mmc->max_blk_count = 512;
+			mmc->max_req_size = mmc->max_blk_size *
+					    mmc->max_blk_count;
+			mmc->max_seg_size = mmc->max_req_size;
+		}
 	}
 
 	if (dw_mci_get_cd(mmc))
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index 5be97676f1fa..1d88bf72c65f 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -153,11 +153,7 @@ struct dw_mci {
 	dma_addr_t		sg_dma;
 	void			*sg_cpu;
 	const struct dw_mci_dma_ops	*dma_ops;
-#ifdef CONFIG_MMC_DW_IDMAC
 	unsigned int		ring_size;
-#else
-	struct dw_mci_dma_data	*dma_data;
-#endif
 	u32			cmd_status;
 	u32			data_status;
 	u32			stop_cmdr;
-- 
cgit v1.2.3-70-g09d2


From 2b708df2b3edf6e91a4c8d89e7aedb0f861d0e41 Mon Sep 17 00:00:00 2001
From: Jaehoon Chung <jh80.chung@samsung.com>
Date: Thu, 6 Aug 2015 16:23:25 +0900
Subject: mmc: dw_mmc: remove the unused blk_setting

"blk_setting" doesn't use anywhere.

Signed-off-by: Jaehoon Chung <jh80.chung@samsung.com>
---
 drivers/mmc/host/dw_mmc.c  | 34 +++++++++++++---------------------
 include/linux/mmc/dw_mmc.h |  1 -
 2 files changed, 13 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index de88e640097e..091df65f7813 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -2436,28 +2436,20 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
 	if (ret)
 		goto err_host_allocated;
 
-	if (host->pdata->blk_settings) {
-		mmc->max_segs = host->pdata->blk_settings->max_segs;
-		mmc->max_blk_size = host->pdata->blk_settings->max_blk_size;
-		mmc->max_blk_count = host->pdata->blk_settings->max_blk_count;
-		mmc->max_req_size = host->pdata->blk_settings->max_req_size;
-		mmc->max_seg_size = host->pdata->blk_settings->max_seg_size;
+	/* Useful defaults if platform data is unset. */
+	if (host->use_dma) {
+		mmc->max_segs = host->ring_size;
+		mmc->max_blk_size = 65536;
+		mmc->max_seg_size = 0x1000;
+		mmc->max_req_size = mmc->max_seg_size * host->ring_size;
+		mmc->max_blk_count = mmc->max_req_size / 512;
 	} else {
-		/* Useful defaults if platform data is unset. */
-		if (host->use_dma) {
-			mmc->max_segs = host->ring_size;
-			mmc->max_blk_size = 65536;
-			mmc->max_seg_size = 0x1000;
-			mmc->max_req_size = mmc->max_seg_size * host->ring_size;
-			mmc->max_blk_count = mmc->max_req_size / 512;
-		} else {
-			mmc->max_segs = 64;
-			mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */
-			mmc->max_blk_count = 512;
-			mmc->max_req_size = mmc->max_blk_size *
-					    mmc->max_blk_count;
-			mmc->max_seg_size = mmc->max_req_size;
-		}
+		mmc->max_segs = 64;
+		mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */
+		mmc->max_blk_count = 512;
+		mmc->max_req_size = mmc->max_blk_size *
+				    mmc->max_blk_count;
+		mmc->max_seg_size = mmc->max_req_size;
 	}
 
 	if (dw_mci_get_cd(mmc))
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index 1d88bf72c65f..b14fcb5012f1 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -255,7 +255,6 @@ struct dw_mci_board {
 
 	struct dw_mci_dma_ops *dma_ops;
 	struct dma_pdata *data;
-	struct block_settings *blk_settings;
 };
 
 #endif /* LINUX_MMC_DW_MMC_H */
-- 
cgit v1.2.3-70-g09d2


From 57e104864bc4874a36796fd222d8d084dbf90b9b Mon Sep 17 00:00:00 2001
From: Addy Ke <addy.ke@rock-chips.com>
Date: Tue, 11 Aug 2015 01:27:18 +0900
Subject: mmc: dw_mmc: add quirk for broken data transfer over scheme

This patch add a new quirk to add a s/w timer to notify the driver
to terminate current transfer and report a data timeout to the core,
if DTO interrupt does NOT come within the given time.

dw_mmc call mmc_request_done func to finish transfer depends on
DTO interrupt. If DTO interrupt does not come in sending data state,
the current transfer will be blocked.

We got the reply from synopsys:
There are two counters but both use the same value of [31:8] bits.
Data timeout counter doesn't wait for stop clock and you should get
DRTO even when the clock is not stopped.
Host Starvation timeout counter is triggered with stop clock condition.

This means that host should get DRTO and DTO interrupt.

But this case really exists, when driver reads tuning data from
card on RK3288-pink2 board. I measured waveforms by oscilloscope
and found that card clock was always on and data lines were always
holded high level in sending data state.

There are two possibility that data over interrupt doesn't come in
reading data state on RK3X SoCs:
- get command done interrupt, but doesn't get any data-related interrupt.
- get data error interrupt, but doesn't get data over interrupt.

Signed-off-by: Addy Ke <addy.ke@rock-chips.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: Jaehoon Chung <jh80.chung@samsung.com>
---
 drivers/mmc/host/dw_mmc-rockchip.c |  3 ++
 drivers/mmc/host/dw_mmc.c          | 64 ++++++++++++++++++++++++++++++++++++--
 include/linux/mmc/dw_mmc.h         |  4 +++
 3 files changed, 69 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c
index de15121bba7d..bc76aa22473e 100644
--- a/drivers/mmc/host/dw_mmc-rockchip.c
+++ b/drivers/mmc/host/dw_mmc-rockchip.c
@@ -73,6 +73,9 @@ static int dw_mci_rockchip_init(struct dw_mci *host)
 	/* It is slot 8 on Rockchip SoCs */
 	host->sdio_id0 = 8;
 
+	/* It needs this quirk on all Rockchip SoCs */
+	host->pdata->quirks |= DW_MCI_QUIRK_BROKEN_DTO;
+
 	return 0;
 }
 
diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index f8ac14a14693..fcbf5524fd31 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -1574,6 +1574,20 @@ static int dw_mci_data_complete(struct dw_mci *host, struct mmc_data *data)
 	return data->error;
 }
 
+static void dw_mci_set_drto(struct dw_mci *host)
+{
+	unsigned int drto_clks;
+	unsigned int drto_ms;
+
+	drto_clks = mci_readl(host, TMOUT) >> 8;
+	drto_ms = DIV_ROUND_UP(drto_clks, host->bus_hz / 1000);
+
+	/* add a bit spare time */
+	drto_ms += 10;
+
+	mod_timer(&host->dto_timer, jiffies + msecs_to_jiffies(drto_ms));
+}
+
 static void dw_mci_tasklet_func(unsigned long priv)
 {
 	struct dw_mci *host = (struct dw_mci *)priv;
@@ -1651,8 +1665,16 @@ static void dw_mci_tasklet_func(unsigned long priv)
 			}
 
 			if (!test_and_clear_bit(EVENT_XFER_COMPLETE,
-						&host->pending_events))
+						&host->pending_events)) {
+				/*
+				 * If all data-related interrupts don't come
+				 * within the given time in reading data state.
+				 */
+				if ((host->quirks & DW_MCI_QUIRK_BROKEN_DTO) &&
+				    (host->dir_status == DW_MCI_RECV_STATUS))
+					dw_mci_set_drto(host);
 				break;
+			}
 
 			set_bit(EVENT_XFER_COMPLETE, &host->completed_events);
 
@@ -1685,8 +1707,17 @@ static void dw_mci_tasklet_func(unsigned long priv)
 
 		case STATE_DATA_BUSY:
 			if (!test_and_clear_bit(EVENT_DATA_COMPLETE,
-						&host->pending_events))
+						&host->pending_events)) {
+				/*
+				 * If data error interrupt comes but data over
+				 * interrupt doesn't come within the given time.
+				 * in reading data state.
+				 */
+				if ((host->quirks & DW_MCI_QUIRK_BROKEN_DTO) &&
+				    (host->dir_status == DW_MCI_RECV_STATUS))
+					dw_mci_set_drto(host);
 				break;
+			}
 
 			host->data = NULL;
 			set_bit(EVENT_DATA_COMPLETE, &host->completed_events);
@@ -2259,6 +2290,9 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id)
 		}
 
 		if (pending & SDMMC_INT_DATA_OVER) {
+			if (host->quirks & DW_MCI_QUIRK_BROKEN_DTO)
+				del_timer(&host->dto_timer);
+
 			mci_writel(host, RINTSTS, SDMMC_INT_DATA_OVER);
 			if (!host->data_status)
 				host->data_status = pending;
@@ -2644,6 +2678,28 @@ static void dw_mci_cmd11_timer(unsigned long arg)
 	tasklet_schedule(&host->tasklet);
 }
 
+static void dw_mci_dto_timer(unsigned long arg)
+{
+	struct dw_mci *host = (struct dw_mci *)arg;
+
+	switch (host->state) {
+	case STATE_SENDING_DATA:
+	case STATE_DATA_BUSY:
+		/*
+		 * If DTO interrupt does NOT come in sending data state,
+		 * we should notify the driver to terminate current transfer
+		 * and report a data timeout to the core.
+		 */
+		host->data_status = SDMMC_INT_DRTO;
+		set_bit(EVENT_DATA_ERROR, &host->pending_events);
+		set_bit(EVENT_DATA_COMPLETE, &host->pending_events);
+		tasklet_schedule(&host->tasklet);
+		break;
+	default:
+		break;
+	}
+}
+
 #ifdef CONFIG_OF
 static struct dw_mci_of_quirks {
 	char *quirk;
@@ -2822,6 +2878,10 @@ int dw_mci_probe(struct dw_mci *host)
 
 	host->quirks = host->pdata->quirks;
 
+	if (host->quirks & DW_MCI_QUIRK_BROKEN_DTO)
+		setup_timer(&host->dto_timer,
+			    dw_mci_dto_timer, (unsigned long)host);
+
 	spin_lock_init(&host->lock);
 	spin_lock_init(&host->irq_lock);
 	INIT_LIST_HEAD(&host->queue);
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index b14fcb5012f1..134c57422740 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -98,6 +98,7 @@ struct mmc_data;
  * @irq_flags: The flags to be passed to request_irq.
  * @irq: The irq value to be passed to request_irq.
  * @sdio_id0: Number of slot0 in the SDIO interrupt registers.
+ * @dto_timer: Timer for broken data transfer over scheme.
  *
  * Locking
  * =======
@@ -200,6 +201,7 @@ struct dw_mci {
 	int			sdio_id0;
 
 	struct timer_list       cmd11_timer;
+	struct timer_list       dto_timer;
 };
 
 /* DMA ops for Internal/External DMAC interface */
@@ -222,6 +224,8 @@ struct dw_mci_dma_ops {
 #define DW_MCI_QUIRK_HIGHSPEED			BIT(2)
 /* Unreliable card detection */
 #define DW_MCI_QUIRK_BROKEN_CARD_DETECTION	BIT(3)
+/* Timer for broken data transfer over scheme */
+#define DW_MCI_QUIRK_BROKEN_DTO			BIT(4)
 
 struct dma_pdata;
 
-- 
cgit v1.2.3-70-g09d2


From a0a8bcf4670c2c696e6e83742539a5e0dd7a62d6 Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Mon, 17 Aug 2015 15:35:23 +0300
Subject: gpiolib: irqchip: use different lockdep class for each gpio irqchip

Since IRQ chip helpers were introduced drivers lose ability to
register separate lockdep classes for each registered GPIO IRQ
chip and the gpiolib now is using shared lockdep class for
all GPIO IRQ chips (gpiochip_irq_lock_class).
As result, lockdep will produce warning when there are min two
stacked GPIO chips and all of them are interrupt controllers.

HW configuration which generates lockdep warning (TI dra7-evm):

[SOC GPIO bankA.gpioX]
  <- irq - [pcf875x.gpioY]
            <- irq - DevZ.enable_irq_wake(pcf_gpioY_irq);
The issue was reported in [1] and discussed [2].

=============================================
[ INFO: possible recursive locking detected ]
4.2.0-rc6-00013-g5d050ed-dirty #55 Not tainted
---------------------------------------------
sh/63 is trying to acquire lock:
 (class){......}, at: [<c009b91c>] __irq_get_desc_lock+0x50/0x94

but task is already holding lock:
 (class){......}, at: [<c009b91c>] __irq_get_desc_lock+0x50/0x94

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(class);
  lock(class);

 *** DEADLOCK ***

 May be due to missing lock nesting notation

7 locks held by sh/63:
 #0:  (sb_writers#4){.+.+.+}, at: [<c016bbb8>] vfs_write+0x13c/0x164
 #1:  (&of->mutex){+.+.+.}, at: [<c01debf4>] kernfs_fop_write+0x4c/0x1a0
 #2:  (s_active#36){.+.+.+}, at: [<c01debfc>] kernfs_fop_write+0x54/0x1a0
 #3:  (pm_mutex){+.+.+.}, at: [<c009758c>] pm_suspend+0xec/0x4c4
 #4:  (&dev->mutex){......}, at: [<c03f77f8>] __device_suspend+0xd4/0x398
 #5:  (&gpio->lock){+.+.+.}, at: [<c009b940>] __irq_get_desc_lock+0x74/0x94
 #6:  (class){......}, at: [<c009b91c>] __irq_get_desc_lock+0x50/0x94

stack backtrace:
CPU: 0 PID: 63 Comm: sh Not tainted 4.2.0-rc6-00013-g5d050ed-dirty #55
Hardware name: Generic DRA74X (Flattened Device Tree)
[<c0016e24>] (unwind_backtrace) from [<c0013338>] (show_stack+0x10/0x14)
[<c0013338>] (show_stack) from [<c05f6b24>] (dump_stack+0x84/0x9c)
[<c05f6b24>] (dump_stack) from [<c00903f4>] (__lock_acquire+0x19c0/0x1e20)
[<c00903f4>] (__lock_acquire) from [<c0091098>] (lock_acquire+0xa8/0x128)
[<c0091098>] (lock_acquire) from [<c05fd61c>] (_raw_spin_lock_irqsave+0x38/0x4c)
[<c05fd61c>] (_raw_spin_lock_irqsave) from [<c009b91c>] (__irq_get_desc_lock+0x50/0x94)
[<c009b91c>] (__irq_get_desc_lock) from [<c009c4f4>] (irq_set_irq_wake+0x20/0xfc)
[<c009c4f4>] (irq_set_irq_wake) from [<c0393ac4>] (pcf857x_irq_set_wake+0x24/0x54)
[<c0393ac4>] (pcf857x_irq_set_wake) from [<c009c560>] (irq_set_irq_wake+0x8c/0xfc)
[<c009c560>] (irq_set_irq_wake) from [<c04a02ac>] (gpio_keys_suspend+0x70/0xd4)
[<c04a02ac>] (gpio_keys_suspend) from [<c03f6a00>] (dpm_run_callback+0x50/0x124)
[<c03f6a00>] (dpm_run_callback) from [<c03f7830>] (__device_suspend+0x10c/0x398)
[<c03f7830>] (__device_suspend) from [<c03f90f0>] (dpm_suspend+0x134/0x2f4)
[<c03f90f0>] (dpm_suspend) from [<c0096e20>] (suspend_devices_and_enter+0xa8/0x728)
[<c0096e20>] (suspend_devices_and_enter) from [<c00977cc>] (pm_suspend+0x32c/0x4c4)
[<c00977cc>] (pm_suspend) from [<c0096060>] (state_store+0x64/0xb8)
[<c0096060>] (state_store) from [<c01dec64>] (kernfs_fop_write+0xbc/0x1a0)
[<c01dec64>] (kernfs_fop_write) from [<c016b280>] (__vfs_write+0x20/0xd8)
[<c016b280>] (__vfs_write) from [<c016bb0c>] (vfs_write+0x90/0x164)
[<c016bb0c>] (vfs_write) from [<c016c330>] (SyS_write+0x44/0x9c)
[<c016c330>] (SyS_write) from [<c000f500>] (ret_fast_syscall+0x0/0x54)

Lets fix it by using separate lockdep class for each registered GPIO
IRQ Chip. This is done by wrapping gpiochip_irqchip_add call into macros.

The implementation of this patch inspired by solution done by Nicolas
Boichat for regmap [3]

[1] http://www.spinics.net/lists/linux-gpio/msg05844.html
[2] http://www.spinics.net/lists/linux-gpio/msg06021.html
[3] http://www.spinics.net/lists/arm-kernel/msg429834.html

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roger Quadros <rogerq@ti.com>
Reported-by: Roger Quadros <rogerq@ti.com>
Tested-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 27 ++++++++++++++-------------
 include/linux/gpio/driver.h | 26 +++++++++++++++++++++-----
 2 files changed, 35 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 34f95fbc884a..b562dd36c4af 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -462,12 +462,6 @@ void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip,
 }
 EXPORT_SYMBOL_GPL(gpiochip_set_chained_irqchip);
 
-/*
- * This lock class tells lockdep that GPIO irqs are in a different
- * category than their parents, so it won't report false recursion.
- */
-static struct lock_class_key gpiochip_irq_lock_class;
-
 /**
  * gpiochip_irq_map() - maps an IRQ into a GPIO irqchip
  * @d: the irqdomain used by this irqchip
@@ -484,7 +478,11 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq,
 	struct gpio_chip *chip = d->host_data;
 
 	irq_set_chip_data(irq, chip);
-	irq_set_lockdep_class(irq, &gpiochip_irq_lock_class);
+	/*
+	 * This lock class tells lockdep that GPIO irqs are in a different
+	 * category than their parents, so it won't report false recursion.
+	 */
+	irq_set_lockdep_class(irq, chip->lock_key);
 	irq_set_chip_and_handler(irq, chip->irqchip, chip->irq_handler);
 	/* Chips that can sleep need nested thread handlers */
 	if (chip->can_sleep && !chip->irq_not_threaded)
@@ -589,6 +587,7 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip)
  * @handler: the irq handler to use (often a predefined irq core function)
  * @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE
  * to have the core avoid setting up any default type in the hardware.
+ * @lock_key: lockdep class
  *
  * This function closely associates a certain irqchip with a certain
  * gpiochip, providing an irq domain to translate the local IRQs to
@@ -604,11 +603,12 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip)
  * the pins on the gpiochip can generate a unique IRQ. Everything else
  * need to be open coded.
  */
-int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
-			 struct irq_chip *irqchip,
-			 unsigned int first_irq,
-			 irq_flow_handler_t handler,
-			 unsigned int type)
+int _gpiochip_irqchip_add(struct gpio_chip *gpiochip,
+			  struct irq_chip *irqchip,
+			  unsigned int first_irq,
+			  irq_flow_handler_t handler,
+			  unsigned int type,
+			  struct lock_class_key *lock_key)
 {
 	struct device_node *of_node;
 	unsigned int offset;
@@ -634,6 +634,7 @@ int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
 	gpiochip->irq_handler = handler;
 	gpiochip->irq_default_type = type;
 	gpiochip->to_irq = gpiochip_to_irq;
+	gpiochip->lock_key = lock_key;
 	gpiochip->irqdomain = irq_domain_add_simple(of_node,
 					gpiochip->ngpio, first_irq,
 					&gpiochip_domain_ops, gpiochip);
@@ -671,7 +672,7 @@ int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(gpiochip_irqchip_add);
+EXPORT_SYMBOL_GPL(_gpiochip_irqchip_add);
 
 #else /* CONFIG_GPIOLIB_IRQCHIP */
 
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index c8393cd4d44f..0c7004dab437 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -6,6 +6,7 @@
 #include <linux/irq.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/irqdomain.h>
+#include <linux/lockdep.h>
 #include <linux/pinctrl/pinctrl.h>
 
 struct device;
@@ -126,6 +127,7 @@ struct gpio_chip {
 	irq_flow_handler_t	irq_handler;
 	unsigned int		irq_default_type;
 	int			irq_parent;
+	struct lock_class_key	*lock_key;
 #endif
 
 #if defined(CONFIG_OF_GPIO)
@@ -171,11 +173,25 @@ void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip,
 		int parent_irq,
 		irq_flow_handler_t parent_handler);
 
-int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
-		struct irq_chip *irqchip,
-		unsigned int first_irq,
-		irq_flow_handler_t handler,
-		unsigned int type);
+int _gpiochip_irqchip_add(struct gpio_chip *gpiochip,
+			  struct irq_chip *irqchip,
+			  unsigned int first_irq,
+			  irq_flow_handler_t handler,
+			  unsigned int type,
+			  struct lock_class_key *lock_key);
+
+#ifdef CONFIG_LOCKDEP
+#define gpiochip_irqchip_add(...)				\
+(								\
+	({							\
+		static struct lock_class_key _key;		\
+		_gpiochip_irqchip_add(__VA_ARGS__, &_key);	\
+	})							\
+)
+#else
+#define gpiochip_irqchip_add(...)				\
+	_gpiochip_irqchip_add(__VA_ARGS__, NULL)
+#endif
 
 #endif /* CONFIG_GPIOLIB_IRQCHIP */
 
-- 
cgit v1.2.3-70-g09d2


From 6bc7064a69fc5b1f774771ea9e2c50e497311766 Mon Sep 17 00:00:00 2001
From: Thierry Reding <thierry.reding@gmail.com>
Date: Mon, 27 Jul 2015 11:57:28 +0200
Subject: pwm: Remove useless whitespace

Remove useless tabs used for padding in structure definitions as well as
some blank lines.

Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  |  1 -
 include/linux/pwm.h | 71 ++++++++++++++++++++++++-----------------------------
 2 files changed, 32 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index f7c11d2dec37..257cbcdd3ec3 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -924,6 +924,5 @@ static int __init pwm_debugfs_init(void)
 
 	return 0;
 }
-
 subsys_initcall(pwm_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 6f286df30021..7c4b6f35241d 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -80,16 +80,16 @@ enum {
 };
 
 struct pwm_device {
-	const char		*label;
-	unsigned long		flags;
-	unsigned int		hwpwm;
-	unsigned int		pwm;
-	struct pwm_chip		*chip;
-	void			*chip_data;
-
-	unsigned int		period; 	/* in nanoseconds */
-	unsigned int		duty_cycle;	/* in nanoseconds */
-	enum pwm_polarity	polarity;
+	const char *label;
+	unsigned long flags;
+	unsigned int hwpwm;
+	unsigned int pwm;
+	struct pwm_chip *chip;
+	void *chip_data;
+
+	unsigned int period;
+	unsigned int duty_cycle;
+	enum pwm_polarity polarity;
 };
 
 static inline bool pwm_is_enabled(const struct pwm_device *pwm)
@@ -141,25 +141,18 @@ static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm)
  * @owner: helps prevent removal of modules exporting active PWMs
  */
 struct pwm_ops {
-	int			(*request)(struct pwm_chip *chip,
-					   struct pwm_device *pwm);
-	void			(*free)(struct pwm_chip *chip,
-					struct pwm_device *pwm);
-	int			(*config)(struct pwm_chip *chip,
-					  struct pwm_device *pwm,
-					  int duty_ns, int period_ns);
-	int			(*set_polarity)(struct pwm_chip *chip,
-					  struct pwm_device *pwm,
-					  enum pwm_polarity polarity);
-	int			(*enable)(struct pwm_chip *chip,
-					  struct pwm_device *pwm);
-	void			(*disable)(struct pwm_chip *chip,
-					   struct pwm_device *pwm);
+	int (*request)(struct pwm_chip *chip, struct pwm_device *pwm);
+	void (*free)(struct pwm_chip *chip, struct pwm_device *pwm);
+	int (*config)(struct pwm_chip *chip, struct pwm_device *pwm,
+		      int duty_ns, int period_ns);
+	int (*set_polarity)(struct pwm_chip *chip, struct pwm_device *pwm,
+			    enum pwm_polarity polarity);
+	int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
+	void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
 #ifdef CONFIG_DEBUG_FS
-	void			(*dbg_show)(struct pwm_chip *chip,
-					    struct seq_file *s);
+	void (*dbg_show)(struct pwm_chip *chip, struct seq_file *s);
 #endif
-	struct module		*owner;
+	struct module *owner;
 };
 
 /**
@@ -174,18 +167,18 @@ struct pwm_ops {
  *             operations may sleep
  */
 struct pwm_chip {
-	struct device		*dev;
-	struct list_head	list;
-	const struct pwm_ops	*ops;
-	int			base;
-	unsigned int		npwm;
-
-	struct pwm_device	*pwms;
-
-	struct pwm_device *	(*of_xlate)(struct pwm_chip *pc,
-					    const struct of_phandle_args *args);
-	unsigned int		of_pwm_n_cells;
-	bool			can_sleep;
+	struct device *dev;
+	struct list_head list;
+	const struct pwm_ops *ops;
+	int base;
+	unsigned int npwm;
+
+	struct pwm_device *pwms;
+
+	struct pwm_device * (*of_xlate)(struct pwm_chip *pc,
+					const struct of_phandle_args *args);
+	unsigned int of_pwm_n_cells;
+	bool can_sleep;
 };
 
 #if IS_ENABLED(CONFIG_PWM)
-- 
cgit v1.2.3-70-g09d2


From 048838027667872a75d3af40c51a22088bafd968 Mon Sep 17 00:00:00 2001
From: Thierry Reding <thierry.reding@gmail.com>
Date: Mon, 27 Jul 2015 11:58:32 +0200
Subject: pwm: Clean up kerneldoc

Clean up kerneldoc in preparation for including the PWM documentation in
DocBook.

Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 44 ++++++++++++++++++++++++++++++++++++++------
 include/linux/pwm.h | 14 ++++++++++++++
 2 files changed, 52 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 257cbcdd3ec3..3f9df3ea3350 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -200,6 +200,8 @@ static void of_pwmchip_remove(struct pwm_chip *chip)
  * pwm_set_chip_data() - set private chip data for a PWM
  * @pwm: PWM device
  * @data: pointer to chip-specific data
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwm_set_chip_data(struct pwm_device *pwm, void *data)
 {
@@ -215,6 +217,8 @@ EXPORT_SYMBOL_GPL(pwm_set_chip_data);
 /**
  * pwm_get_chip_data() - get private chip data for a PWM
  * @pwm: PWM device
+ *
+ * Returns: A pointer to the chip-private data for the PWM device.
  */
 void *pwm_get_chip_data(struct pwm_device *pwm)
 {
@@ -230,6 +234,8 @@ EXPORT_SYMBOL_GPL(pwm_get_chip_data);
  * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
  * will be used. The initial polarity for all channels is specified by the
  * @polarity parameter.
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwmchip_add_with_polarity(struct pwm_chip *chip,
 			      enum pwm_polarity polarity)
@@ -291,6 +297,8 @@ EXPORT_SYMBOL_GPL(pwmchip_add_with_polarity);
  *
  * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
  * will be used. The initial polarity for all channels is normal.
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwmchip_add(struct pwm_chip *chip)
 {
@@ -304,6 +312,8 @@ EXPORT_SYMBOL_GPL(pwmchip_add);
  *
  * Removes a PWM chip. This function may return busy if the PWM chip provides
  * a PWM device that is still requested.
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwmchip_remove(struct pwm_chip *chip)
 {
@@ -338,10 +348,13 @@ EXPORT_SYMBOL_GPL(pwmchip_remove);
 
 /**
  * pwm_request() - request a PWM device
- * @pwm_id: global PWM device index
+ * @pwm: global PWM device index
  * @label: PWM device label
  *
  * This function is deprecated, use pwm_get() instead.
+ *
+ * Returns: A pointer to a PWM device or an ERR_PTR()-encoded error code on
+ * failure.
  */
 struct pwm_device *pwm_request(int pwm, const char *label)
 {
@@ -376,9 +389,9 @@ EXPORT_SYMBOL_GPL(pwm_request);
  * @index: per-chip index of the PWM to request
  * @label: a literal description string of this PWM
  *
- * Returns the PWM at the given index of the given PWM chip. A negative error
- * code is returned if the index is not valid for the specified PWM chip or
- * if the PWM device cannot be requested.
+ * Returns: A pointer to the PWM device at the given index of the given PWM
+ * chip. A negative error code is returned if the index is not valid for the
+ * specified PWM chip or if the PWM device cannot be requested.
  */
 struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 					 unsigned int index,
@@ -419,6 +432,8 @@ EXPORT_SYMBOL_GPL(pwm_free);
  * @pwm: PWM device
  * @duty_ns: "on" time (in nanoseconds)
  * @period_ns: duration (in nanoseconds) of one cycle
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
 {
@@ -443,7 +458,10 @@ EXPORT_SYMBOL_GPL(pwm_config);
  * @pwm: PWM device
  * @polarity: new polarity of the PWM signal
  *
- * Note that the polarity cannot be configured while the PWM device is enabled
+ * Note that the polarity cannot be configured while the PWM device is
+ * enabled.
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
 {
@@ -471,6 +489,8 @@ EXPORT_SYMBOL_GPL(pwm_set_polarity);
 /**
  * pwm_enable() - start a PWM output toggling
  * @pwm: PWM device
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwm_enable(struct pwm_device *pwm)
 {
@@ -524,6 +544,9 @@ static struct pwm_chip *of_node_to_pwmchip(struct device_node *np)
  * lookup of the PWM index. This also means that the "pwm-names" property
  * becomes mandatory for devices that look up the PWM device via the con_id
  * parameter.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
  */
 struct pwm_device *of_pwm_get(struct device_node *np, const char *con_id)
 {
@@ -630,6 +653,9 @@ void pwm_remove_table(struct pwm_lookup *table, size_t num)
  *
  * Once a PWM chip has been found the specified PWM device will be requested
  * and is ready to be used.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
  */
 struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 {
@@ -752,6 +778,9 @@ static void devm_pwm_release(struct device *dev, void *res)
  *
  * This function performs like pwm_get() but the acquired PWM device will
  * automatically be released on driver detach.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
  */
 struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id)
 {
@@ -781,6 +810,9 @@ EXPORT_SYMBOL_GPL(devm_pwm_get);
  *
  * This function performs like of_pwm_get() but the acquired PWM device will
  * automatically be released on driver detach.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
  */
 struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np,
 				   const char *con_id)
@@ -832,7 +864,7 @@ EXPORT_SYMBOL_GPL(devm_pwm_put);
   * pwm_can_sleep() - report whether PWM access will sleep
   * @pwm: PWM device
   *
-  * It returns true if accessing the PWM can sleep, false otherwise.
+  * Returns: True if accessing the PWM can sleep, false otherwise.
   */
 bool pwm_can_sleep(struct pwm_device *pwm)
 {
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 7c4b6f35241d..d681f6875aef 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -79,6 +79,18 @@ enum {
 	PWMF_EXPORTED = 1 << 2,
 };
 
+/**
+ * struct pwm_device - PWM channel object
+ * @label: name of the PWM device
+ * @flags: flags associated with the PWM device
+ * @hwpwm: per-chip relative index of the PWM device
+ * @pwm: global index of the PWM device
+ * @chip: PWM chip providing this PWM device
+ * @chip_data: chip-private data associated with the PWM device
+ * @period: period of the PWM signal (in nanoseconds)
+ * @duty_cycle: duty cycle of the PWM signal (in nanoseconds)
+ * @polarity: polarity of the PWM signal
+ */
 struct pwm_device {
 	const char *label;
 	unsigned long flags;
@@ -163,6 +175,8 @@ struct pwm_ops {
  * @base: number of first PWM controlled by this chip
  * @npwm: number of PWMs controlled by this chip
  * @pwms: array of PWM devices allocated by the framework
+ * @of_xlate: request a PWM device given a device tree PWM specifier
+ * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
  * @can_sleep: must be true if the .config(), .enable() or .disable()
  *             operations may sleep
  */
-- 
cgit v1.2.3-70-g09d2


From 10c95ed9aa2970e05fedb4ac8b3ce1b934dab17b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 7 Aug 2015 18:15:11 +0200
Subject: scatterlist: allow limited chaining without ARCH_HAS_SG_CHAIN

There are a couple of uses of struct scatterlist that never go to
the dma_map_sg() helper and thus don't care about ARCH_HAS_SG_CHAIN
which indicates that we can map chained S/G list.

The most important one is the crypto code, which currently has
to open code a few helpers to always allow chaining.  This patch
removes a few #ifdef ARCH_HAS_SG_CHAIN statements so that we can
switch the crypto code to these common helpers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/scatterlist.h | 4 ----
 lib/scatterlist.c           | 4 ----
 2 files changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 9b1ef0c820a7..698e906ca730 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -161,10 +161,6 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
 static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
 			    struct scatterlist *sgl)
 {
-#ifndef CONFIG_ARCH_HAS_SG_CHAIN
-	BUG();
-#endif
-
 	/*
 	 * offset and length are unused for chain entry.  Clear them.
 	 */
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index d105a9f56878..bafa9933fa76 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -105,16 +105,12 @@ EXPORT_SYMBOL(sg_nents_for_len);
  **/
 struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents)
 {
-#ifndef CONFIG_ARCH_HAS_SG_CHAIN
-	struct scatterlist *ret = &sgl[nents - 1];
-#else
 	struct scatterlist *sg, *ret = NULL;
 	unsigned int i;
 
 	for_each_sg(sgl, sg, nents, i)
 		ret = sg;
 
-#endif
 #ifdef CONFIG_DEBUG_SG
 	BUG_ON(sgl[0].sg_magic != SG_MAGIC);
 	BUG_ON(!sg_is_last(ret));
-- 
cgit v1.2.3-70-g09d2


From 8bb28975823aee062f82b99ddacc499601c0cfd1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Aug 2015 18:41:01 +0200
Subject: pnfs: move common blocklayout XDR defintions to nfs4.h

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.h | 19 +------------------
 fs/nfs/blocklayout/extent_tree.c |  2 +-
 fs/nfsd/blocklayoutxdr.c         |  2 +-
 fs/nfsd/blocklayoutxdr.h         | 15 ---------------
 include/linux/nfs4.h             | 18 ++++++++++++++++++
 5 files changed, 21 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 92dca9e90d8d..c556640dcf3b 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -46,13 +46,6 @@
 
 struct pnfs_block_dev;
 
-enum pnfs_block_volume_type {
-	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
-	PNFS_BLOCK_VOLUME_SLICE		= 1,
-	PNFS_BLOCK_VOLUME_CONCAT	= 2,
-	PNFS_BLOCK_VOLUME_STRIPE	= 3,
-};
-
 #define PNFS_BLOCK_MAX_UUIDS	4
 #define PNFS_BLOCK_MAX_DEVICES	64
 
@@ -117,13 +110,6 @@ struct pnfs_block_dev {
 			struct pnfs_block_dev_map *map);
 };
 
-enum exstate4 {
-	PNFS_BLOCK_READWRITE_DATA	= 0,
-	PNFS_BLOCK_READ_DATA		= 1,
-	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
-	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
-};
-
 /* sector_t fields are all in 512-byte sectors */
 struct pnfs_block_extent {
 	union {
@@ -134,15 +120,12 @@ struct pnfs_block_extent {
 	sector_t	be_f_offset;	/* the starting offset in the file */
 	sector_t	be_length;	/* the size of the extent */
 	sector_t	be_v_offset;	/* the starting offset in the volume */
-	enum exstate4	be_state;	/* the state of this extent */
+	enum pnfs_block_extent_state be_state;	/* the state of this extent */
 #define EXTENT_WRITTEN		1
 #define EXTENT_COMMITTING	2
 	unsigned int	be_tag;
 };
 
-/* on the wire size of the extent */
-#define BL_EXTENT_SIZE	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
-
 struct pnfs_block_layout {
 	struct pnfs_layout_hdr	bl_layout;
 	struct rb_root		bl_ext_rw;
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index a11b759d294a..c59a59c37f3d 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -465,7 +465,7 @@ out:
 static size_t ext_tree_layoutupdate_size(size_t count)
 {
 	return sizeof(__be32) /* number of entries */ +
-		BL_EXTENT_SIZE * count;
+		PNFS_BLOCK_EXTENT_SIZE * count;
 }
 
 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 9aa2796da90d..6d834dc9bbc8 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 	}
 
 	nr_iomaps = be32_to_cpup(p++);
-	expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+	expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
 	if (len != expected) {
 		dprintk("%s: extent array size mismatch: %u/%u\n",
 			__func__, len, expected);
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index fdc79037c0e7..6de925fe8499 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -7,13 +7,6 @@
 struct iomap;
 struct xdr_stream;
 
-enum pnfs_block_extent_state {
-	PNFS_BLOCK_READWRITE_DATA	= 0,
-	PNFS_BLOCK_READ_DATA		= 1,
-	PNFS_BLOCK_INVALID_DATA		= 2,
-	PNFS_BLOCK_NONE_DATA		= 3,
-};
-
 struct pnfs_block_extent {
 	struct nfsd4_deviceid		vol_id;
 	u64				foff;
@@ -21,14 +14,6 @@ struct pnfs_block_extent {
 	u64				soff;
 	enum pnfs_block_extent_state	es;
 };
-#define NFS4_BLOCK_EXTENT_SIZE		44
-
-enum pnfs_block_volume_type {
-	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
-	PNFS_BLOCK_VOLUME_SLICE		= 1,
-	PNFS_BLOCK_VOLUME_CONCAT	= 2,
-	PNFS_BLOCK_VOLUME_STRIPE	= 3,
-};
 
 /*
  * Random upper cap for the uuid length to avoid unbounded allocation.
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index b8e72aad919c..00121f298269 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -547,6 +547,24 @@ enum pnfs_notify_deviceid_type4 {
 	NOTIFY_DEVICEID4_DELETE = 1 << 2,
 };
 
+enum pnfs_block_volume_type {
+	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
+	PNFS_BLOCK_VOLUME_SLICE		= 1,
+	PNFS_BLOCK_VOLUME_CONCAT	= 2,
+	PNFS_BLOCK_VOLUME_STRIPE	= 3,
+};
+
+enum pnfs_block_extent_state {
+	PNFS_BLOCK_READWRITE_DATA	= 0,
+	PNFS_BLOCK_READ_DATA		= 1,
+	PNFS_BLOCK_INVALID_DATA		= 2,
+	PNFS_BLOCK_NONE_DATA		= 3,
+};
+
+/* on the wire size of a block layout extent */
+#define PNFS_BLOCK_EXTENT_SIZE \
+	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
+
 #define NFL4_UFLG_MASK			0x0000003F
 #define NFL4_UFLG_DENSE			0x00000001
 #define NFL4_UFLG_COMMIT_THRU_MDS	0x00000002
-- 
cgit v1.2.3-70-g09d2


From 19a46fe57a005a884ccb38419071f772ac2fca2b Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Wed, 29 Jul 2015 19:58:15 +0800
Subject: time: Introduce struct itimerspec64

The struct itimerspec is not year 2038 safe on 32bit systems due to
the limitation of the struct timespec members. Introduce itimerspec64
which uses struct timespec64 instead and provide conversion functions.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/time64.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 77b5df2acd2a..367d5af899e8 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -12,11 +12,18 @@ typedef __s64 time64_t;
  */
 #if __BITS_PER_LONG == 64
 # define timespec64 timespec
+#define itimerspec64 itimerspec
 #else
 struct timespec64 {
 	time64_t	tv_sec;			/* seconds */
 	long		tv_nsec;		/* nanoseconds */
 };
+
+struct itimerspec64 {
+	struct timespec64 it_interval;
+	struct timespec64 it_value;
+};
+
 #endif
 
 /* Parameters used to convert the timespec values: */
@@ -45,6 +52,16 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
 	return ts;
 }
 
+static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64)
+{
+	return *its64;
+}
+
+static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its)
+{
+	return *its;
+}
+
 # define timespec64_equal		timespec_equal
 # define timespec64_compare		timespec_compare
 # define set_normalized_timespec64	set_normalized_timespec
@@ -77,6 +94,24 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
 	return ret;
 }
 
+static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64)
+{
+	struct itimerspec ret;
+
+	ret.it_interval = timespec64_to_timespec(its64->it_interval);
+	ret.it_value = timespec64_to_timespec(its64->it_value);
+	return ret;
+}
+
+static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its)
+{
+	struct itimerspec64 ret;
+
+	ret.it_interval = timespec_to_timespec64(its->it_interval);
+	ret.it_value = timespec_to_timespec64(its->it_value);
+	return ret;
+}
+
 static inline int timespec64_equal(const struct timespec64 *a,
 				   const struct timespec64 *b)
 {
-- 
cgit v1.2.3-70-g09d2


From 8758a240e2d74c5932ab51a73377e6507b7fd441 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Wed, 29 Jul 2015 20:09:43 +0800
Subject: time: Introduce current_kernel_time64()

The current_kernel_time() is not year 2038 safe on 32bit systems
since it returns a timespec value. Introduce current_kernel_time64()
which returns a timespec64 value.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h | 9 ++++++++-
 kernel/time/timekeeping.c   | 6 +++---
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 3aa72e648650..657ea03fff40 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -18,10 +18,17 @@ extern int do_sys_settimeofday(const struct timespec *tv,
  * Kernel time accessors
  */
 unsigned long get_seconds(void);
-struct timespec current_kernel_time(void);
+struct timespec64 current_kernel_time64(void);
 /* does not take xtime_lock */
 struct timespec __current_kernel_time(void);
 
+static inline struct timespec current_kernel_time(void)
+{
+	struct timespec64 now = current_kernel_time64();
+
+	return timespec64_to_timespec(now);
+}
+
 /*
  * timespec based interfaces
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4cdb771913c8..f6ee2e6b6f5d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1881,7 +1881,7 @@ struct timespec __current_kernel_time(void)
 	return timespec64_to_timespec(tk_xtime(tk));
 }
 
-struct timespec current_kernel_time(void)
+struct timespec64 current_kernel_time64(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct timespec64 now;
@@ -1893,9 +1893,9 @@ struct timespec current_kernel_time(void)
 		now = tk_xtime(tk);
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
-	return timespec64_to_timespec(now);
+	return now;
 }
-EXPORT_SYMBOL(current_kernel_time);
+EXPORT_SYMBOL(current_kernel_time64);
 
 struct timespec64 get_monotonic_coarse64(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 9ca308506062fc4a4ee8ca7ad2f71033c831c2fb Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Wed, 29 Jul 2015 20:18:31 +0800
Subject: time: Introduce timespec64_to_jiffies()/jiffies_to_timespec64()

The conversion between struct timespec and jiffies is not year 2038
safe on 32bit systems. Introduce timespec64_to_jiffies() and
jiffies_to_timespec64() functions which use struct timespec64 to
make it ready for 2038 issue.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/jiffies.h | 22 +++++++++++++++++++---
 kernel/time/time.c      | 21 +++++++++++++--------
 2 files changed, 32 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 535fd3bb1ba8..bf96d9f27bd0 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -416,9 +416,25 @@ static inline unsigned long usecs_to_jiffies(const unsigned int u)
 	}
 }
 
-extern unsigned long timespec_to_jiffies(const struct timespec *value);
-extern void jiffies_to_timespec(const unsigned long jiffies,
-				struct timespec *value);
+extern unsigned long timespec64_to_jiffies(const struct timespec64 *value);
+extern void jiffies_to_timespec64(const unsigned long jiffies,
+				  struct timespec64 *value);
+static inline unsigned long timespec_to_jiffies(const struct timespec *value)
+{
+	struct timespec64 ts = timespec_to_timespec64(*value);
+
+	return timespec64_to_jiffies(&ts);
+}
+
+static inline void jiffies_to_timespec(const unsigned long jiffies,
+				       struct timespec *value)
+{
+	struct timespec64 ts;
+
+	jiffies_to_timespec64(jiffies, &ts);
+	*value = timespec64_to_timespec(ts);
+}
+
 extern unsigned long timeval_to_jiffies(const struct timeval *value);
 extern void jiffies_to_timeval(const unsigned long jiffies,
 			       struct timeval *value);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 34dbd4209e4a..f18ab105ed87 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -540,7 +540,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
  * value to a scaled second value.
  */
 static unsigned long
-__timespec_to_jiffies(unsigned long sec, long nsec)
+__timespec64_to_jiffies(u64 sec, long nsec)
 {
 	nsec = nsec + TICK_NSEC - 1;
 
@@ -548,22 +548,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec)
 		sec = MAX_SEC_IN_JIFFIES;
 		nsec = 0;
 	}
-	return (((u64)sec * SEC_CONVERSION) +
+	return ((sec * SEC_CONVERSION) +
 		(((u64)nsec * NSEC_CONVERSION) >>
 		 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
 
 }
 
-unsigned long
-timespec_to_jiffies(const struct timespec *value)
+static unsigned long
+__timespec_to_jiffies(unsigned long sec, long nsec)
 {
-	return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
+	return __timespec64_to_jiffies((u64)sec, nsec);
 }
 
-EXPORT_SYMBOL(timespec_to_jiffies);
+unsigned long
+timespec64_to_jiffies(const struct timespec64 *value)
+{
+	return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
+}
+EXPORT_SYMBOL(timespec64_to_jiffies);
 
 void
-jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
 {
 	/*
 	 * Convert jiffies to nanoseconds and separate with
@@ -574,7 +579,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
 				    NSEC_PER_SEC, &rem);
 	value->tv_nsec = rem;
 }
-EXPORT_SYMBOL(jiffies_to_timespec);
+EXPORT_SYMBOL(jiffies_to_timespec64);
 
 /*
  * We could use a similar algorithm to timespec_to_jiffies (with a
-- 
cgit v1.2.3-70-g09d2


From 7b0ce60c0b203a2a037f308e387152acbf99c445 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:26 -0400
Subject: SUNRPC: Drop double-underscores from rpc_cmp_addr{4|6}()

I'm planning on using these functions inside the client, so remove the
underscores to make it feel like I'm using a public interface.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/addr.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/addr.h b/include/linux/sunrpc/addr.h
index 07d8e53bedfc..772faefc15aa 100644
--- a/include/linux/sunrpc/addr.h
+++ b/include/linux/sunrpc/addr.h
@@ -46,8 +46,8 @@ static inline void rpc_set_port(struct sockaddr *sap,
 #define IPV6_SCOPE_DELIMITER		'%'
 #define IPV6_SCOPE_ID_LEN		sizeof("%nnnnnnnnnn")
 
-static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1,
-				   const struct sockaddr *sap2)
+static inline bool rpc_cmp_addr4(const struct sockaddr *sap1,
+				 const struct sockaddr *sap2)
 {
 	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
 	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
@@ -67,8 +67,8 @@ static inline bool __rpc_copy_addr4(struct sockaddr *dst,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
-				   const struct sockaddr *sap2)
+static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
+				 const struct sockaddr *sap2)
 {
 	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
 	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
@@ -122,9 +122,9 @@ static inline bool rpc_cmp_addr(const struct sockaddr *sap1,
 	if (sap1->sa_family == sap2->sa_family) {
 		switch (sap1->sa_family) {
 		case AF_INET:
-			return __rpc_cmp_addr4(sap1, sap2);
+			return rpc_cmp_addr4(sap1, sap2);
 		case AF_INET6:
-			return __rpc_cmp_addr6(sap1, sap2);
+			return rpc_cmp_addr6(sap1, sap2);
 		}
 	}
 	return false;
-- 
cgit v1.2.3-70-g09d2


From 58cc8a55aa38655d472a4c381df465c9ab97b4d6 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:27 -0400
Subject: SUNRPC: Add an rpc_cmp_addr_port() function

This function is to help determine if two sockaddrs are really the same
socket.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/addr.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/addr.h b/include/linux/sunrpc/addr.h
index 772faefc15aa..9bc3538d09a9 100644
--- a/include/linux/sunrpc/addr.h
+++ b/include/linux/sunrpc/addr.h
@@ -130,6 +130,19 @@ static inline bool rpc_cmp_addr(const struct sockaddr *sap1,
 	return false;
 }
 
+/**
+ * rpc_cmp_addr_port - compare the address and port number of two sockaddrs.
+ * @sap1: first sockaddr
+ * @sap2: second sockaddr
+ */
+static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1,
+				     const struct sockaddr *sap2)
+{
+	if (!rpc_cmp_addr(sap1, sap2))
+		return false;
+	return rpc_get_port(sap1) == rpc_get_port(sap2);
+}
+
 /**
  * rpc_copy_addr - copy the address portion of one sockaddr to another
  * @dst: destination sockaddr
-- 
cgit v1.2.3-70-g09d2


From aff8d8dc4c34804c6a1de04f1b3313aa9063bf46 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:33 -0400
Subject: NFS: Remove nfs_release()

And call nfs_file_clear_open_context() directly.  This makes it obvious
that nfs_file_release() will always return 0.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c          | 3 ++-
 fs/nfs/inode.c         | 8 +-------
 include/linux/nfs_fs.h | 2 +-
 3 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cc4fa1ed61fc..7538a8582384 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp)
 	dprintk("NFS: release(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
-	return nfs_release(inode, filp);
+	nfs_file_clear_open_context(filp);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_file_release);
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0adc7d245b3d..382c8a46c078 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -888,7 +888,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 	return ctx;
 }
 
-static void nfs_file_clear_open_context(struct file *filp)
+void nfs_file_clear_open_context(struct file *filp)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
@@ -919,12 +919,6 @@ int nfs_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int nfs_release(struct inode *inode, struct file *filp)
-{
-	nfs_file_clear_open_context(filp);
-	return 0;
-}
-
 /*
  * This function is called whenever some part of NFS notices that
  * the cached attributes have to be refreshed.
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 874b77228fb9..c0e961474a52 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -353,7 +353,6 @@ extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
-extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
@@ -371,6 +370,7 @@ extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struc
 extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode);
 extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx);
 extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
+extern void nfs_file_clear_open_context(struct file *flip);
 extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
 extern u64 nfs_compat_user_ino64(u64 fileid);
-- 
cgit v1.2.3-70-g09d2


From fa8187c96471c49419c25d4ec3299d17d3f274b2 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 13 Aug 2015 19:01:06 +0200
Subject: net: declare new net_device priv_flag IFF_NO_QUEUE

This private net_device flag can be set by drivers to inform that a
device runs fine without a qdisc attached. This was formerly done by
setting tx_queue_len to zero.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f7a6ef2fae3a..d131755516ca 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1262,6 +1262,7 @@ struct net_device_ops {
  * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
  *	change when it's running
  * @IFF_MACVLAN: Macvlan device
+ * @IFF_NO_QUEUE: device can run without qdisc attached
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1290,6 +1291,7 @@ enum netdev_priv_flags {
 	IFF_IPVLAN_MASTER		= 1<<23,
 	IFF_IPVLAN_SLAVE		= 1<<24,
 	IFF_VRF_MASTER			= 1<<25,
+	IFF_NO_QUEUE			= 1<<26,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1318,6 +1320,7 @@ enum netdev_priv_flags {
 #define IFF_IPVLAN_MASTER		IFF_IPVLAN_MASTER
 #define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
 #define IFF_VRF_MASTER			IFF_VRF_MASTER
+#define IFF_NO_QUEUE			IFF_NO_QUEUE
 
 /**
  *	struct net_device - The DEVICE structure.
-- 
cgit v1.2.3-70-g09d2


From fbaff3ef859a86dc3df2128d2de9f8a6e255a967 Mon Sep 17 00:00:00 2001
From: Jesse Brandeburg <jesse.brandeburg@intel.com>
Date: Thu, 13 Aug 2015 18:34:03 -0700
Subject: net: fix endian check warning in etherdevice.h

Sparse builds have been warning for a really long time now
that etherdevice.h has a conversion that is unsafe.

  include/linux/etherdevice.h:79:32: warning: restricted __be16 degrades to integer

This code change fixes the issue and generates the exact
same assembly before/after (checked on x86_64)

Fixes: 2c722fe1c821 (etherdevice: Optimize a few is_<foo>_ether_addr functions)
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 9012f8775208..eb049c622208 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -76,7 +76,7 @@ static inline bool is_link_local_ether_addr(const u8 *addr)
 
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 	return (((*(const u32 *)addr) ^ (*(const u32 *)b)) |
-		((a[2] ^ b[2]) & m)) == 0;
+		(__force int)((a[2] ^ b[2]) & m)) == 0;
 #else
 	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
 #endif
-- 
cgit v1.2.3-70-g09d2


From 9fba8e30f704d49627ab8a4a57213862206b3e5f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Aug 2015 14:46:51 -0500
Subject: SUNRPC: Drop double-underscores from __rpc_cmp_addr6()

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Fixes: 7b0ce60c0b20 ("SUNRPC: Drop double-underscores from rpc_cmp_addr{4|6}()")
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/addr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/addr.h b/include/linux/sunrpc/addr.h
index 9bc3538d09a9..5c9c6cd08d3b 100644
--- a/include/linux/sunrpc/addr.h
+++ b/include/linux/sunrpc/addr.h
@@ -93,7 +93,7 @@ static inline bool __rpc_copy_addr6(struct sockaddr *dst,
 	return true;
 }
 #else	/* !(IS_ENABLED(CONFIG_IPV6) */
-static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
+static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
 				   const struct sockaddr *sap2)
 {
 	return false;
-- 
cgit v1.2.3-70-g09d2


From 355c06633e233a57155b827ebe99b91c35bc1f5c Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Thu, 13 Aug 2015 17:52:02 -0600
Subject: workqueue: fix some docbook warnings

There are some errors in the docbook comments in workqueue.h that cause
warnings when the docs are built; this only recently came to light because
these comments were not used until now.  Fix the comments to make the
warnings go away.

The "args..." "fix" is a hack.  kerneldoc doesn't deal properly with named
variadic arguments in macros, so all I've really achieved here is to make
it shut up.  Fixing kerneldoc will have to wait for more time.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 738b30b39b68..0197358f1e81 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -265,7 +265,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 /**
  * delayed_work_pending - Find out whether a delayable work item is currently
  * pending
- * @work: The work item in question
+ * @w: The work item in question
  */
 #define delayed_work_pending(w) \
 	work_pending(&(w)->work)
@@ -366,7 +366,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
  * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags
  * @max_active: max in-flight work items, 0 for default
- * @args: args for @fmt
+ * @args...: args for @fmt
  *
  * Allocate a workqueue with the specified parameters.  For detailed
  * information on WQ_* flags, please refer to Documentation/workqueue.txt.
@@ -398,7 +398,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
  * alloc_ordered_workqueue - allocate an ordered workqueue
  * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
- * @args: args for @fmt
+ * @args...: args for @fmt
  *
  * Allocate an ordered workqueue.  An ordered workqueue executes at
  * most one work item at any given time in the queued order.  They are
-- 
cgit v1.2.3-70-g09d2


From 30f93ca8323f0c21b789bea0f7db8e8e3a7915c6 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 17 Aug 2015 08:16:51 +0530
Subject: regulator: core: Define regulator_set_voltage_triplet()

Voltage tolerance isn't necessarily same on both sides of the target
voltage and regulator_set_voltage_tol() wouldn't be suitable in such
cases.

Add another routine regulator_set_voltage_triplet(), which accepts
target, min and max voltages as arguments.

This first tries to set the voltage between the target voltage and the
upper limit, then fall back on the full range. The idea behind this is
to set regulator's voltage as close to the target voltage, as possible.

Based on regulator_set_voltage_tol().

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/consumer.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index f8a689ed62a5..e325d4606b62 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -552,6 +552,16 @@ static inline int regulator_count_voltages(struct regulator *regulator)
 }
 #endif
 
+static inline int regulator_set_voltage_triplet(struct regulator *regulator,
+						int min_uV, int target_uV,
+						int max_uV)
+{
+	if (regulator_set_voltage(regulator, target_uV, max_uV) == 0)
+		return 0;
+
+	return regulator_set_voltage(regulator, min_uV, max_uV);
+}
+
 static inline int regulator_set_voltage_tol(struct regulator *regulator,
 					    int new_uV, int tol_uV)
 {
-- 
cgit v1.2.3-70-g09d2


From cbedaac63481dea52327127a9f1c60f092bd6b07 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 12 Mar 2015 08:19:11 -0400
Subject: inode: add hlist_fake to avoid the inode hash lock in evict

Some filesystems don't use the VFS inode hash and fake the fact they
are hashed so that all the writeback code works correctly. However,
this means the evict() path still tries to remove the inode from the
hash, meaning that the inode_hash_lock() needs to be taken
unnecessarily. Hence under certain workloads the inode_hash_lock can
be contended even if the inode is never actually hashed.

To avoid this add hlist_fake to test if the inode isn't actually
hashed to avoid taking the hash lock on inodes that have never been
hashed.  Based on Dave Chinner's

inode: add IOP_NOTHASHED to avoid inode hash lock in evict

basd on Al's suggestions.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 include/linux/fs.h   | 2 +-
 include/linux/list.h | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 84b783f277f7..4a40fa843040 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2608,7 +2608,7 @@ static inline void insert_inode_hash(struct inode *inode)
 extern void __remove_inode_hash(struct inode *);
 static inline void remove_inode_hash(struct inode *inode)
 {
-	if (!inode_unhashed(inode))
+	if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash))
 		__remove_inode_hash(inode);
 }
 
diff --git a/include/linux/list.h b/include/linux/list.h
index feb773c76ee0..3e3e64a61002 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -672,6 +672,11 @@ static inline void hlist_add_fake(struct hlist_node *n)
 	n->pprev = &n->next;
 }
 
+static inline bool hlist_fake(struct hlist_node *h)
+{
+	return h->pprev == &h->next;
+}
+
 /*
  * Move a list from one list head to another. Fixup the pprev
  * reference of the first entry if it exists.
-- 
cgit v1.2.3-70-g09d2


From 74278da9f70d84d715601fe794567a6d2bfdf078 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 12:37:22 -0500
Subject: inode: convert inode_sb_list_lock to per-sb

The process of reducing contention on per-superblock inode lists
starts with moving the locking to match the per-superblock inode
list. This takes the global lock out of the picture and reduces the
contention problems to within a single filesystem. This doesn't get
rid of contention as the locks still have global CPU scope, but it
does isolate operations on different superblocks form each other.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 fs/block_dev.c                   | 12 ++++++------
 fs/drop_caches.c                 | 10 ++++++----
 fs/fs-writeback.c                | 12 ++++++------
 fs/inode.c                       | 28 +++++++++++++---------------
 fs/internal.h                    |  1 -
 fs/notify/inode_mark.c           | 20 ++++++++++----------
 fs/quota/dquot.c                 | 16 ++++++++--------
 fs/super.c                       |  3 ++-
 include/linux/fs.h               |  5 ++++-
 include/linux/fsnotify_backend.h |  4 ++--
 10 files changed, 57 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 198243717da5..33b813e04f79 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1769,7 +1769,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
 {
 	struct inode *inode, *old_inode = NULL;
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&blockdev_superblock->s_inode_list_lock);
 	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
 		struct address_space *mapping = inode->i_mapping;
 
@@ -1781,13 +1781,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
 		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_sb_list_lock);
+		spin_unlock(&blockdev_superblock->s_inode_list_lock);
 		/*
 		 * We hold a reference to 'inode' so it couldn't have been
 		 * removed from s_inodes list while we dropped the
-		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * s_inode_list_lock  We cannot iput the inode now as we can
 		 * be holding the last reference and we cannot iput it under
-		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * s_inode_list_lock. So we keep the reference and iput it
 		 * later.
 		 */
 		iput(old_inode);
@@ -1795,8 +1795,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
 
 		func(I_BDEV(inode), arg);
 
-		spin_lock(&inode_sb_list_lock);
+		spin_lock(&blockdev_superblock->s_inode_list_lock);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&blockdev_superblock->s_inode_list_lock);
 	iput(old_inode);
 }
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 5718cb9f7273..d72d52b90433 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
 		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -27,13 +27,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_sb_list_lock);
+		spin_unlock(&sb->s_inode_list_lock);
+
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
-		spin_lock(&inode_sb_list_lock);
+
+		spin_lock(&sb->s_inode_list_lock);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&sb->s_inode_list_lock);
 	iput(toput_inode);
 }
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d98e37bbf417..f45bf876579f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2124,7 +2124,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	 */
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&sb->s_inode_list_lock);
 
 	/*
 	 * Data integrity sync. Must wait for all pages under writeback,
@@ -2144,14 +2144,14 @@ static void wait_sb_inodes(struct super_block *sb)
 		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_sb_list_lock);
+		spin_unlock(&sb->s_inode_list_lock);
 
 		/*
 		 * We hold a reference to 'inode' so it couldn't have been
 		 * removed from s_inodes list while we dropped the
-		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * s_inode_list_lock.  We cannot iput the inode now as we can
 		 * be holding the last reference and we cannot iput it under
-		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * s_inode_list_lock. So we keep the reference and iput it
 		 * later.
 		 */
 		iput(old_inode);
@@ -2161,9 +2161,9 @@ static void wait_sb_inodes(struct super_block *sb)
 
 		cond_resched();
 
-		spin_lock(&inode_sb_list_lock);
+		spin_lock(&sb->s_inode_list_lock);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&sb->s_inode_list_lock);
 	iput(old_inode);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index d30640f7a193..a2de294f6b77 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -28,8 +28,8 @@
  *   inode->i_state, inode->i_hash, __iget()
  * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
- * inode_sb_list_lock protects:
- *   sb->s_inodes, inode->i_sb_list
+ * inode->i_sb->s_inode_list_lock protects:
+ *   inode->i_sb->s_inodes, inode->i_sb_list
  * bdi->wb.list_lock protects:
  *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
  * inode_hash_lock protects:
@@ -37,7 +37,7 @@
  *
  * Lock ordering:
  *
- * inode_sb_list_lock
+ * inode->i_sb->s_inode_list_lock
  *   inode->i_lock
  *     Inode LRU list locks
  *
@@ -45,7 +45,7 @@
  *   inode->i_lock
  *
  * inode_hash_lock
- *   inode_sb_list_lock
+ *   inode->i_sb->s_inode_list_lock
  *   inode->i_lock
  *
  * iunique_lock
@@ -57,8 +57,6 @@ static unsigned int i_hash_shift __read_mostly;
 static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
-
 /*
  * Empty aops. Can be used for the cases where the user does not
  * define any of the address_space operations.
@@ -426,18 +424,18 @@ static void inode_lru_list_del(struct inode *inode)
  */
 void inode_sb_list_add(struct inode *inode)
 {
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&inode->i_sb->s_inode_list_lock);
 	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&inode->i_sb->s_inode_list_lock);
 }
 EXPORT_SYMBOL_GPL(inode_sb_list_add);
 
 static inline void inode_sb_list_del(struct inode *inode)
 {
 	if (!list_empty(&inode->i_sb_list)) {
-		spin_lock(&inode_sb_list_lock);
+		spin_lock(&inode->i_sb->s_inode_list_lock);
 		list_del_init(&inode->i_sb_list);
-		spin_unlock(&inode_sb_list_lock);
+		spin_unlock(&inode->i_sb->s_inode_list_lock);
 	}
 }
 
@@ -594,7 +592,7 @@ void evict_inodes(struct super_block *sb)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (atomic_read(&inode->i_count))
 			continue;
@@ -610,7 +608,7 @@ void evict_inodes(struct super_block *sb)
 		spin_unlock(&inode->i_lock);
 		list_add(&inode->i_lru, &dispose);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&sb->s_inode_list_lock);
 
 	dispose_list(&dispose);
 }
@@ -631,7 +629,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
@@ -654,7 +652,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 		spin_unlock(&inode->i_lock);
 		list_add(&inode->i_lru, &dispose);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&sb->s_inode_list_lock);
 
 	dispose_list(&dispose);
 
@@ -890,7 +888,7 @@ struct inode *new_inode(struct super_block *sb)
 {
 	struct inode *inode;
 
-	spin_lock_prefetch(&inode_sb_list_lock);
+	spin_lock_prefetch(&sb->s_inode_list_lock);
 
 	inode = new_inode_pseudo(sb);
 	if (inode)
diff --git a/fs/internal.h b/fs/internal.h
index 4d5af583ab03..ee1209c54eb1 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -112,7 +112,6 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *);
 /*
  * inode.c
  */
-extern spinlock_t inode_sb_list_lock;
 extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
 extern void inode_add_lru(struct inode *inode);
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3daf513ee99e..a4e1a8f6c329 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -163,17 +163,17 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 
 /**
  * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @list: list of inodes being unmounted (sb->s_inodes)
+ * @sb: superblock being unmounted.
  *
  * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
+ * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
  */
-void fsnotify_unmount_inodes(struct list_head *list)
+void fsnotify_unmount_inodes(struct super_block *sb)
 {
 	struct inode *inode, *next_i, *need_iput = NULL;
 
-	spin_lock(&inode_sb_list_lock);
-	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
 		struct inode *need_iput_tmp;
 
 		/*
@@ -209,7 +209,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		spin_unlock(&inode->i_lock);
 
 		/* In case the dropping of a reference would nuke next_i. */
-		while (&next_i->i_sb_list != list) {
+		while (&next_i->i_sb_list != &sb->s_inodes) {
 			spin_lock(&next_i->i_lock);
 			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
 						atomic_read(&next_i->i_count)) {
@@ -224,12 +224,12 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		}
 
 		/*
-		 * We can safely drop inode_sb_list_lock here because either
+		 * We can safely drop s_inode_list_lock here because either
 		 * we actually hold references on both inode and next_i or
 		 * end of list.  Also no new inodes will be added since the
 		 * umount has begun.
 		 */
-		spin_unlock(&inode_sb_list_lock);
+		spin_unlock(&sb->s_inode_list_lock);
 
 		if (need_iput_tmp)
 			iput(need_iput_tmp);
@@ -241,7 +241,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 
 		iput(inode);
 
-		spin_lock(&inode_sb_list_lock);
+		spin_lock(&sb->s_inode_list_lock);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&sb->s_inode_list_lock);
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 20d1f74561cf..2863ec6cbadf 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -923,7 +923,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	int reserved = 0;
 #endif
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
 		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -934,7 +934,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_sb_list_lock);
+		spin_unlock(&sb->s_inode_list_lock);
 
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -946,15 +946,15 @@ static void add_dquot_ref(struct super_block *sb, int type)
 		/*
 		 * We hold a reference to 'inode' so it couldn't have been
 		 * removed from s_inodes list while we dropped the
-		 * inode_sb_list_lock We cannot iput the inode now as we can be
+		 * s_inode_list_lock. We cannot iput the inode now as we can be
 		 * holding the last reference and we cannot iput it under
-		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * s_inode_list_lock. So we keep the reference and iput it
 		 * later.
 		 */
 		old_inode = inode;
-		spin_lock(&inode_sb_list_lock);
+		spin_lock(&sb->s_inode_list_lock);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&sb->s_inode_list_lock);
 	iput(old_inode);
 
 #ifdef CONFIG_QUOTA_DEBUG
@@ -1023,7 +1023,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 	struct inode *inode;
 	int reserved = 0;
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		/*
 		 *  We have to scan also I_NEW inodes because they can already
@@ -1039,7 +1039,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 		}
 		spin_unlock(&dq_data_lock);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&sb->s_inode_list_lock);
 #ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/fs/super.c b/fs/super.c
index b61372354f2b..c808183554a2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -191,6 +191,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
+	spin_lock_init(&s->s_inode_list_lock);
 
 	if (list_lru_init_memcg(&s->s_dentry_lru))
 		goto fail;
@@ -399,7 +400,7 @@ void generic_shutdown_super(struct super_block *sb)
 		sync_filesystem(sb);
 		sb->s_flags &= ~MS_ACTIVE;
 
-		fsnotify_unmount_inodes(&sb->s_inodes);
+		fsnotify_unmount_inodes(sb);
 
 		evict_inodes(sb);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a40fa843040..09bbd38485f9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1309,7 +1309,6 @@ struct super_block {
 #endif
 	const struct xattr_handler **s_xattr;
 
-	struct list_head	s_inodes;	/* all inodes */
 	struct hlist_bl_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;
@@ -1380,6 +1379,10 @@ struct super_block {
 	 * Indicates how deep in a filesystem stack this SB is
 	 */
 	int s_stack_depth;
+
+	/* s_inode_list_lock protects s_inodes */
+	spinlock_t		s_inode_list_lock ____cacheline_aligned_in_smp;
+	struct list_head	s_inodes;	/* all inodes */
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 65a517dd32f7..0390ee69c439 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -357,7 +357,7 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark *mark);
 extern void fsnotify_put_mark(struct fsnotify_mark *mark);
-extern void fsnotify_unmount_inodes(struct list_head *list);
+extern void fsnotify_unmount_inodes(struct super_block *sb);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern void fsnotify_init_event(struct fsnotify_event *event,
@@ -393,7 +393,7 @@ static inline u32 fsnotify_get_cookie(void)
 	return 0;
 }
 
-static inline void fsnotify_unmount_inodes(struct list_head *list)
+static inline void fsnotify_unmount_inodes(struct super_block *sb)
 {}
 
 #endif	/* CONFIG_FSNOTIFY */
-- 
cgit v1.2.3-70-g09d2


From e97fedb9ef9868ff24d588be781906cf7c1b59ae Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 13:40:00 -0500
Subject: sync: serialise per-superblock sync operations

When competing sync(2) calls walk the same filesystem, they need to
walk the list of inodes on the superblock to find all the inodes
that we need to wait for IO completion on. However, when multiple
wait_sb_inodes() calls do this at the same time, they contend on the
the inode_sb_list_lock and the contention causes system wide
slowdowns. In effect, concurrent sync(2) calls can take longer and
burn more CPU than if they were serialised.

Stop the worst of the contention by adding a per-sb mutex to wrap
around wait_sb_inodes() so that we only execute one sync(2) IO
completion walk per superblock superblock at a time and hence avoid
contention being triggered by concurrent sync(2) calls.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c  | 11 +++++++++++
 fs/super.c         |  1 +
 include/linux/fs.h |  2 ++
 3 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f45bf876579f..3c974442bdf0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2114,6 +2114,15 @@ out_unlock_inode:
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
+/*
+ * The @s_sync_lock is used to serialise concurrent sync operations
+ * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
+ * Concurrent callers will block on the s_sync_lock rather than doing contending
+ * walks. The queueing maintains sync(2) required behaviour as all the IO that
+ * has been issued up to the time this function is enter is guaranteed to be
+ * completed by the time we have gained the lock and waited for all IO that is
+ * in progress regardless of the order callers are granted the lock.
+ */
 static void wait_sb_inodes(struct super_block *sb)
 {
 	struct inode *inode, *old_inode = NULL;
@@ -2124,6 +2133,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	 */
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+	mutex_lock(&sb->s_sync_lock);
 	spin_lock(&sb->s_inode_list_lock);
 
 	/*
@@ -2165,6 +2175,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	}
 	spin_unlock(&sb->s_inode_list_lock);
 	iput(old_inode);
+	mutex_unlock(&sb->s_sync_lock);
 }
 
 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
diff --git a/fs/super.c b/fs/super.c
index c808183554a2..fd427ec0b372 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -190,6 +190,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	s->s_flags = flags;
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
+	mutex_init(&s->s_sync_lock);
 	INIT_LIST_HEAD(&s->s_inodes);
 	spin_lock_init(&s->s_inode_list_lock);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 09bbd38485f9..82dfc5519b4b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1375,6 +1375,8 @@ struct super_block {
 	struct list_lru		s_inode_lru ____cacheline_aligned_in_smp;
 	struct rcu_head		rcu;
 
+	struct mutex		s_sync_lock;	/* sync serialisation lock */
+
 	/*
 	 * Indicates how deep in a filesystem stack this SB is
 	 */
-- 
cgit v1.2.3-70-g09d2


From 6fa1bcab6be6e9bd93f80e345c7e9a4ec7861df9 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Sun, 16 Aug 2015 16:04:50 +0300
Subject: net/mlx5e: Ethtool link speed setting fixes

- Port speed settings are applied by the device only upon
  port admin status transition from DOWN to UP.
  So we enforce this transition regardless of the port's
  current operation state (which may be occasionally DOWN if
  for example the network cable is disconnected).
- Fix the PORT_UP/DOWN device interface enum
- Set the local_port bit in the device PAOS register
- EXPORT the PAOS (Port Administrative and Operational Status)
  register set/query access functions.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 25 ++++++----------------
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 14 ++++++++----
 include/linux/mlx5/driver.h                        | 11 +++++-----
 3 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 975828521913..77158b31eb0b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -628,7 +628,7 @@ static int mlx5e_set_settings(struct net_device *netdev,
 	u32 link_modes;
 	u32 speed;
 	u32 eth_proto_cap, eth_proto_admin;
-	u8 port_status;
+	enum mlx5_port_status ps;
 	int err;
 
 	speed = ethtool_cmd_speed(cmd);
@@ -662,24 +662,13 @@ static int mlx5e_set_settings(struct net_device *netdev,
 	if (link_modes == eth_proto_admin)
 		goto out;
 
-	err = mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN);
-	if (err) {
-		netdev_err(netdev, "%s: set port eth proto admin failed: %d\n",
-			   __func__, err);
-		goto out;
-	}
+	mlx5_query_port_admin_status(mdev, &ps);
+	if (ps == MLX5_PORT_UP)
+		mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN);
+	mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN);
+	if (ps == MLX5_PORT_UP)
+		mlx5_set_port_admin_status(mdev, MLX5_PORT_UP);
 
-	err = mlx5_query_port_status(mdev, &port_status);
-	if (err)
-		goto out;
-
-	if (port_status == MLX5_PORT_DOWN)
-		return 0;
-
-	err = mlx5_set_port_status(mdev, MLX5_PORT_DOWN);
-	if (err)
-		goto out;
-	err = mlx5_set_port_status(mdev, MLX5_PORT_UP);
 out:
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 70147999f657..f8db52bd8f18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -216,22 +216,25 @@ int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_proto);
 
-int mlx5_set_port_status(struct mlx5_core_dev *dev,
-			 enum mlx5_port_status status)
+int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
+			       enum mlx5_port_status status)
 {
 	u32 in[MLX5_ST_SZ_DW(paos_reg)];
 	u32 out[MLX5_ST_SZ_DW(paos_reg)];
 
 	memset(in, 0, sizeof(in));
 
+	MLX5_SET(paos_reg, in, local_port, 1);
 	MLX5_SET(paos_reg, in, admin_status, status);
 	MLX5_SET(paos_reg, in, ase, 1);
 
 	return mlx5_core_access_reg(dev, in, sizeof(in), out,
 				    sizeof(out), MLX5_REG_PAOS, 0, 1);
 }
+EXPORT_SYMBOL_GPL(mlx5_set_port_admin_status);
 
-int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status)
+int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
+				 enum mlx5_port_status *status)
 {
 	u32 in[MLX5_ST_SZ_DW(paos_reg)];
 	u32 out[MLX5_ST_SZ_DW(paos_reg)];
@@ -239,14 +242,17 @@ int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status)
 
 	memset(in, 0, sizeof(in));
 
+	MLX5_SET(paos_reg, in, local_port, 1);
+
 	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
 				   sizeof(out), MLX5_REG_PAOS, 0, 0);
 	if (err)
 		return err;
 
-	*status = MLX5_GET(paos_reg, out, oper_status);
+	*status = MLX5_GET(paos_reg, out, admin_status);
 	return err;
 }
+EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status);
 
 static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, int *admin_mtu,
 				int *max_mtu, int *oper_mtu, u8 port)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2039546b0ec6..4b5d7fc88d0f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -152,8 +152,8 @@ enum mlx5_dev_event {
 };
 
 enum mlx5_port_status {
-	MLX5_PORT_UP        = 1 << 1,
-	MLX5_PORT_DOWN      = 1 << 2,
+	MLX5_PORT_UP        = 1,
+	MLX5_PORT_DOWN      = 2,
 };
 
 struct mlx5_uuar_info {
@@ -761,9 +761,10 @@ int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
 			       u8 local_port);
 int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 			int proto_mask);
-int mlx5_set_port_status(struct mlx5_core_dev *dev,
-			 enum mlx5_port_status status);
-int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status);
+int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
+			       enum mlx5_port_status status);
+int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
+				 enum mlx5_port_status *status);
 
 int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
 void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
-- 
cgit v1.2.3-70-g09d2


From 3c2d18ef22df1bdccfb11a5b85b29e4e61b9d9c6 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Sun, 16 Aug 2015 16:04:51 +0300
Subject: net/mlx5e: Support ethtool get/set_pauseparam

Only rx/tx pause settings.
Autoneg setting is currently not supported.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 38 ++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 42 ++++++++++++++++++++++
 include/linux/mlx5/driver.h                        |  5 +++
 3 files changed, 85 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 77158b31eb0b..bce912688ca8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -820,6 +820,42 @@ static int mlx5e_set_tunable(struct net_device *dev,
 	return err;
 }
 
+static void mlx5e_get_pauseparam(struct net_device *netdev,
+				 struct ethtool_pauseparam *pauseparam)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx5_query_port_pause(mdev, &pauseparam->rx_pause,
+				    &pauseparam->tx_pause);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_query_port_pause failed:0x%x\n",
+			   __func__, err);
+	}
+}
+
+static int mlx5e_set_pauseparam(struct net_device *netdev,
+				struct ethtool_pauseparam *pauseparam)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	if (pauseparam->autoneg)
+		return -EINVAL;
+
+	err = mlx5_set_port_pause(mdev,
+				  pauseparam->rx_pause ? 1 : 0,
+				  pauseparam->tx_pause ? 1 : 0);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_set_port_pause failed:0x%x\n",
+			   __func__, err);
+	}
+
+	return err;
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -841,4 +877,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_rxnfc         = mlx5e_get_rxnfc,
 	.get_tunable       = mlx5e_get_tunable,
 	.set_tunable       = mlx5e_set_tunable,
+	.get_pauseparam    = mlx5e_get_pauseparam,
+	.set_pauseparam    = mlx5e_set_pauseparam,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index f8db52bd8f18..821caaab9bfb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -334,3 +334,45 @@ int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap);
+
+int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)];
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pfcc_reg, in, local_port, 1);
+	MLX5_SET(pfcc_reg, in, pptx, tx_pause);
+	MLX5_SET(pfcc_reg, in, pprx, rx_pause);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PFCC, 0, 1);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_pause);
+
+int mlx5_query_port_pause(struct mlx5_core_dev *dev,
+			  u32 *rx_pause, u32 *tx_pause)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)];
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pfcc_reg, in, local_port, 1);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PFCC, 0, 0);
+	if (err)
+		return err;
+
+	if (rx_pause)
+		*rx_pause = MLX5_GET(pfcc_reg, out, pprx);
+
+	if (tx_pause)
+		*tx_pause = MLX5_GET(pfcc_reg, out, pptx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_pause);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 4b5d7fc88d0f..8b6d6f2154a4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -103,6 +103,7 @@ enum {
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
 	MLX5_REG_PAOS		 = 0x5006,
+	MLX5_REG_PFCC            = 0x5007,
 	MLX5_REG_PPCNT		 = 0x5008,
 	MLX5_REG_PMAOS		 = 0x5012,
 	MLX5_REG_PUDE		 = 0x5009,
@@ -774,6 +775,10 @@ void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
 int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
 			      u8 *vl_hw_cap, u8 local_port);
 
+int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause);
+int mlx5_query_port_pause(struct mlx5_core_dev *dev,
+			  u32 *rx_pause, u32 *tx_pause);
+
 int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
-- 
cgit v1.2.3-70-g09d2


From 2f52bdcf6ba1bd81597b1505a222430676548b3a Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 16 Aug 2015 07:49:20 -0600
Subject: net: Updates to netif_index_is_vrf

As Eric noted netif_index_is_vrf is not called with rcu_read_lock held,
so wrap the dev_get_by_index_rcu in rcu_read_lock and unlock.

If VRF is not enabled or oif is 0 skip the device lookup. In both cases
index cannot be the VRF master.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d131755516ca..8a530f930754 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3822,12 +3822,22 @@ static inline bool netif_is_vrf(const struct net_device *dev)
 
 static inline bool netif_index_is_vrf(struct net *net, int ifindex)
 {
-	struct net_device *dev = dev_get_by_index_rcu(net, ifindex);
 	bool rc = false;
 
+#if IS_ENABLED(CONFIG_NET_VRF)
+	struct net_device *dev;
+
+	if (ifindex == 0)
+		return false;
+
+	rcu_read_lock();
+
+	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
 		rc = netif_is_vrf(dev);
 
+	rcu_read_unlock();
+#endif
 	return rc;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 808d28c468a89e8680396d98aea96024b6f5afdc Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 16 Aug 2015 10:26:49 -0600
Subject: net: Fix docbook warning for IFF_VRF_MASTER enum

kbuild test robot reported:
tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   d52736e24fe2e927c26817256f8d1a3c8b5d51a0
commit: 4e3c89920cd3a6cfce22c6f537690747c26128dd [751/762] net: Introduce VRF related flags and helpers
reproduce: make htmldocs

>> Warning(include/linux/netdevice.h:1293): Enum value 'IFF_VRF_MASTER' not described in enum 'netdev_priv_flags'

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8a530f930754..4bd177faa90c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1262,6 +1262,7 @@ struct net_device_ops {
  * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
  *	change when it's running
  * @IFF_MACVLAN: Macvlan device
+ * @IFF_VRF_MASTER: device is a VRF master
  * @IFF_NO_QUEUE: device can run without qdisc attached
  */
 enum netdev_priv_flags {
-- 
cgit v1.2.3-70-g09d2


From c7f5408493aeb01532927b2276316797a03ed6ee Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 14:07:22 -0500
Subject: inode: rename i_wb_list to i_io_list

There's a small consistency problem between the inode and writeback
naming. Writeback calls the "for IO" inode queues b_io and
b_more_io, but the inode calls these the "writeback list" or
i_wb_list. This makes it hard to an new "under writeback" list to
the inode, or call it an "under IO" list on the bdi because either
way we'll have writeback on IO and IO on writeback and it'll just be
confusing. I'm getting confused just writing this!

So, rename the inode "for IO" list variable to i_io_list so we can
add a new "writeback list" in a subsequent patch.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c  | 46 +++++++++++++++++++++++-----------------------
 fs/inode.c         |  8 ++++----
 fs/internal.h      |  2 +-
 include/linux/fs.h |  2 +-
 mm/backing-dev.c   |  8 ++++----
 5 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3c974442bdf0..63e00f11022e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -88,7 +88,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60;
 
 static inline struct inode *wb_inode(struct list_head *head)
 {
-	return list_entry(head, struct inode, i_wb_list);
+	return list_entry(head, struct inode, i_io_list);
 }
 
 /*
@@ -125,22 +125,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb)
 }
 
 /**
- * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
  * @inode: inode to be moved
  * @wb: target bdi_writeback
  * @head: one of @wb->b_{dirty|io|more_io}
  *
- * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
  * Returns %true if @inode is the first occupant of the !dirty_time IO
  * lists; otherwise, %false.
  */
-static bool inode_wb_list_move_locked(struct inode *inode,
+static bool inode_io_list_move_locked(struct inode *inode,
 				      struct bdi_writeback *wb,
 				      struct list_head *head)
 {
 	assert_spin_locked(&wb->list_lock);
 
-	list_move(&inode->i_wb_list, head);
+	list_move(&inode->i_io_list, head);
 
 	/* dirty_time doesn't count as dirty_io until expiration */
 	if (head != &wb->b_dirty_time)
@@ -151,19 +151,19 @@ static bool inode_wb_list_move_locked(struct inode *inode,
 }
 
 /**
- * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
  * @inode: inode to be removed
  * @wb: bdi_writeback @inode is being removed from
  *
  * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
  * clear %WB_has_dirty_io if all are empty afterwards.
  */
-static void inode_wb_list_del_locked(struct inode *inode,
+static void inode_io_list_del_locked(struct inode *inode,
 				     struct bdi_writeback *wb)
 {
 	assert_spin_locked(&wb->list_lock);
 
-	list_del_init(&inode->i_wb_list);
+	list_del_init(&inode->i_io_list);
 	wb_io_lists_depopulated(wb);
 }
 
@@ -351,7 +351,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 
 	/*
 	 * Once I_FREEING is visible under i_lock, the eviction path owns
-	 * the inode and we shouldn't modify ->i_wb_list.
+	 * the inode and we shouldn't modify ->i_io_list.
 	 */
 	if (unlikely(inode->i_state & I_FREEING))
 		goto skip_switch;
@@ -390,16 +390,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	 * is always correct including from ->b_dirty_time.  The transfer
 	 * preserves @inode->dirtied_when ordering.
 	 */
-	if (!list_empty(&inode->i_wb_list)) {
+	if (!list_empty(&inode->i_io_list)) {
 		struct inode *pos;
 
-		inode_wb_list_del_locked(inode, old_wb);
+		inode_io_list_del_locked(inode, old_wb);
 		inode->i_wb = new_wb;
-		list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+		list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
 			if (time_after_eq(inode->dirtied_when,
 					  pos->dirtied_when))
 				break;
-		inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+		inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
 	} else {
 		inode->i_wb = new_wb;
 	}
@@ -961,12 +961,12 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
 /*
  * Remove the inode from the writeback list it is on.
  */
-void inode_wb_list_del(struct inode *inode)
+void inode_io_list_del(struct inode *inode)
 {
 	struct bdi_writeback *wb;
 
 	wb = inode_to_wb_and_lock_list(inode);
-	inode_wb_list_del_locked(inode, wb);
+	inode_io_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 }
 
@@ -988,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
+	inode_io_list_move_locked(inode, wb, &wb->b_dirty);
 }
 
 /*
@@ -996,7 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  */
 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 {
-	inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
+	inode_io_list_move_locked(inode, wb, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -1055,7 +1055,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
-		list_move(&inode->i_wb_list, &tmp);
+		list_move(&inode->i_io_list, &tmp);
 		moved++;
 		if (flags & EXPIRE_DIRTY_ATIME)
 			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
@@ -1078,7 +1078,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 		list_for_each_prev_safe(pos, node, &tmp) {
 			inode = wb_inode(pos);
 			if (inode->i_sb == sb)
-				list_move(&inode->i_wb_list, dispatch_queue);
+				list_move(&inode->i_io_list, dispatch_queue);
 		}
 	}
 out:
@@ -1232,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		redirty_tail(inode, wb);
 	} else if (inode->i_state & I_DIRTY_TIME) {
 		inode->dirtied_when = jiffies;
-		inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
+		inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
-		inode_wb_list_del_locked(inode, wb);
+		inode_io_list_del_locked(inode, wb);
 	}
 }
 
@@ -1378,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * touch it. See comment above for explanation.
 	 */
 	if (!(inode->i_state & I_DIRTY_ALL))
-		inode_wb_list_del_locked(inode, wb);
+		inode_io_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
 out:
@@ -2091,7 +2091,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			else
 				dirty_list = &wb->b_dirty_time;
 
-			wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+			wakeup_bdi = inode_io_list_move_locked(inode, wb,
 							       dirty_list);
 
 			spin_unlock(&wb->list_lock);
diff --git a/fs/inode.c b/fs/inode.c
index a2de294f6b77..f09148e07198 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -31,7 +31,7 @@
  * inode->i_sb->s_inode_list_lock protects:
  *   inode->i_sb->s_inodes, inode->i_sb_list
  * bdi->wb.list_lock protects:
- *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
+ *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
  * inode_hash_lock protects:
  *   inode_hashtable, inode->i_hash
  *
@@ -357,7 +357,7 @@ void inode_init_once(struct inode *inode)
 	memset(inode, 0, sizeof(*inode));
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_devices);
-	INIT_LIST_HEAD(&inode->i_wb_list);
+	INIT_LIST_HEAD(&inode->i_io_list);
 	INIT_LIST_HEAD(&inode->i_lru);
 	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
@@ -525,8 +525,8 @@ static void evict(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(!list_empty(&inode->i_lru));
 
-	if (!list_empty(&inode->i_wb_list))
-		inode_wb_list_del(inode);
+	if (!list_empty(&inode->i_io_list))
+		inode_io_list_del(inode);
 
 	inode_sb_list_del(inode);
 
diff --git a/fs/internal.h b/fs/internal.h
index ee1209c54eb1..71859c4d0b41 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -118,7 +118,7 @@ extern void inode_add_lru(struct inode *inode);
 /*
  * fs-writeback.c
  */
-extern void inode_wb_list_del(struct inode *inode);
+extern void inode_io_list_del(struct inode *inode);
 
 extern long get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 82dfc5519b4b..34cfa60db678 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -636,7 +636,7 @@ struct inode {
 	unsigned long		dirtied_time_when;
 
 	struct hlist_node	i_hash;
-	struct list_head	i_wb_list;	/* backing dev IO list */
+	struct list_head	i_io_list;	/* backing dev IO list */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dac5bf59309d..ee8d7fd07be3 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,13 +55,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 
 	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
 	spin_lock(&wb->list_lock);
-	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
+	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
 		nr_dirty++;
-	list_for_each_entry(inode, &wb->b_io, i_wb_list)
+	list_for_each_entry(inode, &wb->b_io, i_io_list)
 		nr_io++;
-	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
+	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
 		nr_more_io++;
-	list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
 		if (inode->i_state & I_DIRTY_TIME)
 			nr_dirty_time++;
 	spin_unlock(&wb->list_lock);
-- 
cgit v1.2.3-70-g09d2


From 74f4e0cc61080f63f28e8d519bdf437957e64217 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 14 Aug 2015 00:21:45 +0200
Subject: bcma: switch GPIO portions to use GPIOLIB_IRQCHIP

This switches the BCMA GPIO driver to use GPIOLIB_IRQCHIP to
handle its interrupts instead of rolling its own copy of the
irqdomain handling etc.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/bcma/Kconfig                        |  2 +-
 drivers/bcma/driver_gpio.c                  | 92 ++++++++++-------------------
 include/linux/bcma/bcma_driver_chipcommon.h |  1 -
 3 files changed, 31 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bcma/Kconfig b/drivers/bcma/Kconfig
index be5fffb6da24..023d448ed3fa 100644
--- a/drivers/bcma/Kconfig
+++ b/drivers/bcma/Kconfig
@@ -92,7 +92,7 @@ config BCMA_DRIVER_GMAC_CMN
 config BCMA_DRIVER_GPIO
 	bool "BCMA GPIO driver"
 	depends on BCMA && GPIOLIB
-	select IRQ_DOMAIN if BCMA_HOST_SOC
+	select GPIOLIB_IRQCHIP if BCMA_HOST_SOC
 	help
 	  Driver to provide access to the GPIO pins of the bcma bus.
 
diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c
index 5f6018e7cd4c..504899a72966 100644
--- a/drivers/bcma/driver_gpio.c
+++ b/drivers/bcma/driver_gpio.c
@@ -8,10 +8,8 @@
  * Licensed under the GNU/GPL. See COPYING for details.
  */
 
-#include <linux/gpio.h>
-#include <linux/irq.h>
+#include <linux/gpio/driver.h>
 #include <linux/interrupt.h>
-#include <linux/irqdomain.h>
 #include <linux/export.h>
 #include <linux/bcma/bcma.h>
 
@@ -79,19 +77,11 @@ static void bcma_gpio_free(struct gpio_chip *chip, unsigned gpio)
 }
 
 #if IS_BUILTIN(CONFIG_BCM47XX) || IS_BUILTIN(CONFIG_ARCH_BCM_5301X)
-static int bcma_gpio_to_irq(struct gpio_chip *chip, unsigned gpio)
-{
-	struct bcma_drv_cc *cc = bcma_gpio_get_cc(chip);
-
-	if (cc->core->bus->hosttype == BCMA_HOSTTYPE_SOC)
-		return irq_find_mapping(cc->irq_domain, gpio);
-	else
-		return -EINVAL;
-}
 
 static void bcma_gpio_irq_unmask(struct irq_data *d)
 {
-	struct bcma_drv_cc *cc = irq_data_get_irq_chip_data(d);
+	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+	struct bcma_drv_cc *cc = bcma_gpio_get_cc(gc);
 	int gpio = irqd_to_hwirq(d);
 	u32 val = bcma_chipco_gpio_in(cc, BIT(gpio));
 
@@ -101,7 +91,8 @@ static void bcma_gpio_irq_unmask(struct irq_data *d)
 
 static void bcma_gpio_irq_mask(struct irq_data *d)
 {
-	struct bcma_drv_cc *cc = irq_data_get_irq_chip_data(d);
+	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+	struct bcma_drv_cc *cc = bcma_gpio_get_cc(gc);
 	int gpio = irqd_to_hwirq(d);
 
 	bcma_chipco_gpio_intmask(cc, BIT(gpio), 0);
@@ -116,6 +107,7 @@ static struct irq_chip bcma_gpio_irq_chip = {
 static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id)
 {
 	struct bcma_drv_cc *cc = dev_id;
+	struct gpio_chip *gc = &cc->gpio;
 	u32 val = bcma_cc_read32(cc, BCMA_CC_GPIOIN);
 	u32 mask = bcma_cc_read32(cc, BCMA_CC_GPIOIRQ);
 	u32 pol = bcma_cc_read32(cc, BCMA_CC_GPIOPOL);
@@ -125,81 +117,58 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id)
 	if (!irqs)
 		return IRQ_NONE;
 
-	for_each_set_bit(gpio, &irqs, cc->gpio.ngpio)
-		generic_handle_irq(bcma_gpio_to_irq(&cc->gpio, gpio));
+	for_each_set_bit(gpio, &irqs, gc->ngpio)
+		generic_handle_irq(irq_find_mapping(gc->irqdomain, gpio));
 	bcma_chipco_gpio_polarity(cc, irqs, val & irqs);
 
 	return IRQ_HANDLED;
 }
 
-static int bcma_gpio_irq_domain_init(struct bcma_drv_cc *cc)
+static int bcma_gpio_irq_init(struct bcma_drv_cc *cc)
 {
 	struct gpio_chip *chip = &cc->gpio;
-	int gpio, hwirq, err;
+	int hwirq, err;
 
 	if (cc->core->bus->hosttype != BCMA_HOSTTYPE_SOC)
 		return 0;
 
-	cc->irq_domain = irq_domain_add_linear(NULL, chip->ngpio,
-					       &irq_domain_simple_ops, cc);
-	if (!cc->irq_domain) {
-		err = -ENODEV;
-		goto err_irq_domain;
-	}
-	for (gpio = 0; gpio < chip->ngpio; gpio++) {
-		int irq = irq_create_mapping(cc->irq_domain, gpio);
-
-		irq_set_chip_data(irq, cc);
-		irq_set_chip_and_handler(irq, &bcma_gpio_irq_chip,
-					 handle_simple_irq);
-	}
-
 	hwirq = bcma_core_irq(cc->core, 0);
 	err = request_irq(hwirq, bcma_gpio_irq_handler, IRQF_SHARED, "gpio",
 			  cc);
 	if (err)
-		goto err_req_irq;
+		return err;
 
 	bcma_chipco_gpio_intmask(cc, ~0, 0);
 	bcma_cc_set32(cc, BCMA_CC_IRQMASK, BCMA_CC_IRQ_GPIO);
 
-	return 0;
-
-err_req_irq:
-	for (gpio = 0; gpio < chip->ngpio; gpio++) {
-		int irq = irq_find_mapping(cc->irq_domain, gpio);
-
-		irq_dispose_mapping(irq);
+	err =  gpiochip_irqchip_add(chip,
+				    &bcma_gpio_irq_chip,
+				    0,
+				    handle_simple_irq,
+				    IRQ_TYPE_NONE);
+	if (err) {
+		free_irq(hwirq, cc);
+		return err;
 	}
-	irq_domain_remove(cc->irq_domain);
-err_irq_domain:
-	return err;
+
+	return 0;
 }
 
-static void bcma_gpio_irq_domain_exit(struct bcma_drv_cc *cc)
+static void bcma_gpio_irq_exit(struct bcma_drv_cc *cc)
 {
-	struct gpio_chip *chip = &cc->gpio;
-	int gpio;
-
 	if (cc->core->bus->hosttype != BCMA_HOSTTYPE_SOC)
 		return;
 
 	bcma_cc_mask32(cc, BCMA_CC_IRQMASK, ~BCMA_CC_IRQ_GPIO);
 	free_irq(bcma_core_irq(cc->core, 0), cc);
-	for (gpio = 0; gpio < chip->ngpio; gpio++) {
-		int irq = irq_find_mapping(cc->irq_domain, gpio);
-
-		irq_dispose_mapping(irq);
-	}
-	irq_domain_remove(cc->irq_domain);
 }
 #else
-static int bcma_gpio_irq_domain_init(struct bcma_drv_cc *cc)
+static int bcma_gpio_irq_init(struct bcma_drv_cc *cc)
 {
 	return 0;
 }
 
-static void bcma_gpio_irq_domain_exit(struct bcma_drv_cc *cc)
+static void bcma_gpio_irq_exit(struct bcma_drv_cc *cc)
 {
 }
 #endif
@@ -218,9 +187,8 @@ int bcma_gpio_init(struct bcma_drv_cc *cc)
 	chip->set		= bcma_gpio_set_value;
 	chip->direction_input	= bcma_gpio_direction_input;
 	chip->direction_output	= bcma_gpio_direction_output;
-#if IS_BUILTIN(CONFIG_BCM47XX) || IS_BUILTIN(CONFIG_ARCH_BCM_5301X)
-	chip->to_irq		= bcma_gpio_to_irq;
-#endif
+	chip->owner		= THIS_MODULE;
+	chip->dev		= bcma_bus_get_host_dev(bus);
 #if IS_BUILTIN(CONFIG_OF)
 	if (cc->core->bus->hosttype == BCMA_HOSTTYPE_SOC)
 		chip->of_node	= cc->core->dev.of_node;
@@ -248,13 +216,13 @@ int bcma_gpio_init(struct bcma_drv_cc *cc)
 	else
 		chip->base		= -1;
 
-	err = bcma_gpio_irq_domain_init(cc);
+	err = gpiochip_add(chip);
 	if (err)
 		return err;
 
-	err = gpiochip_add(chip);
+	err = bcma_gpio_irq_init(cc);
 	if (err) {
-		bcma_gpio_irq_domain_exit(cc);
+		gpiochip_remove(chip);
 		return err;
 	}
 
@@ -263,7 +231,7 @@ int bcma_gpio_init(struct bcma_drv_cc *cc)
 
 int bcma_gpio_unregister(struct bcma_drv_cc *cc)
 {
-	bcma_gpio_irq_domain_exit(cc);
+	bcma_gpio_irq_exit(cc);
 	gpiochip_remove(&cc->gpio);
 	return 0;
 }
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index 6cceedf65ca2..cf038431a5cc 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -640,7 +640,6 @@ struct bcma_drv_cc {
 	spinlock_t gpio_lock;
 #ifdef CONFIG_BCMA_DRIVER_GPIO
 	struct gpio_chip gpio;
-	struct irq_domain *irq_domain;
 #endif
 };
 
-- 
cgit v1.2.3-70-g09d2


From a1b93ab71587b8b44d45d114937cb4e75f9a5f27 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 18 Aug 2015 09:56:04 -0700
Subject: Revert "usb: interface authorization: Use a flag for the default
 device authorization"

This reverts commit 3cf1fc80655d3af7083ea4b3615e5f8532543be7 as the
signed-off-by address is invalid.

Cc: Stefan Koch <stefan.koch10@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c  | 31 ++++++++++---------------------
 drivers/usb/core/usb.c  |  2 +-
 include/linux/usb/hcd.h | 16 +++++++---------
 3 files changed, 18 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 86b3d1190500..83df1dde9c08 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -854,10 +854,10 @@ static ssize_t authorized_default_show(struct device *dev,
 {
 	struct usb_device *rh_usb_dev = to_usb_device(dev);
 	struct usb_bus *usb_bus = rh_usb_dev->bus;
-	struct usb_hcd *hcd;
+	struct usb_hcd *usb_hcd;
 
-	hcd = bus_to_hcd(usb_bus);
-	return snprintf(buf, PAGE_SIZE, "%u\n", !!HCD_DEV_AUTHORIZED(hcd));
+	usb_hcd = bus_to_hcd(usb_bus);
+	return snprintf(buf, PAGE_SIZE, "%u\n", usb_hcd->authorized_default);
 }
 
 static ssize_t authorized_default_store(struct device *dev,
@@ -868,16 +868,12 @@ static ssize_t authorized_default_store(struct device *dev,
 	unsigned val;
 	struct usb_device *rh_usb_dev = to_usb_device(dev);
 	struct usb_bus *usb_bus = rh_usb_dev->bus;
-	struct usb_hcd *hcd;
+	struct usb_hcd *usb_hcd;
 
-	hcd = bus_to_hcd(usb_bus);
+	usb_hcd = bus_to_hcd(usb_bus);
 	result = sscanf(buf, "%u\n", &val);
 	if (result == 1) {
-		if (val)
-			set_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
-		else
-			clear_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
-
+		usb_hcd->authorized_default = val ? 1 : 0;
 		result = size;
 	} else {
 		result = -EINVAL;
@@ -2724,17 +2720,10 @@ int usb_add_hcd(struct usb_hcd *hcd,
 	dev_info(hcd->self.controller, "%s\n", hcd->product_desc);
 
 	/* Keep old behaviour if authorized_default is not in [0, 1]. */
-	if (authorized_default < 0 || authorized_default > 1) {
-		if (hcd->wireless)
-			clear_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
-		else
-			set_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
-	} else {
-		if (authorized_default)
-			set_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
-		else
-			clear_bit(HCD_FLAG_DEV_AUTHORIZED, &hcd->flags);
-	}
+	if (authorized_default < 0 || authorized_default > 1)
+		hcd->authorized_default = hcd->wireless ? 0 : 1;
+	else
+		hcd->authorized_default = authorized_default;
 	set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags);
 
 	/* per default all interfaces are authorized */
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index f8bbd0b6d9fe..8d5b2f4113cd 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -510,7 +510,7 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent,
 	if (root_hub)	/* Root hub always ok [and always wired] */
 		dev->authorized = 1;
 	else {
-		dev->authorized = !!HCD_DEV_AUTHORIZED(usb_hcd);
+		dev->authorized = usb_hcd->authorized_default;
 		dev->wusb = usb_bus_is_wusb(bus) ? 1 : 0;
 	}
 	return dev;
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 4d147277b30d..36ce3d215cad 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -58,6 +58,12 @@
  *
  * Since "struct usb_bus" is so thin, you can't share much code in it.
  * This framework is a layer over that, and should be more sharable.
+ *
+ * @authorized_default: Specifies if new devices are authorized to
+ *                      connect by default or they require explicit
+ *                      user space authorization; this bit is settable
+ *                      through /sys/class/usb_host/X/authorized_default.
+ *                      For the rest is RO, so we don't lock to r/w it.
  */
 
 /*-------------------------------------------------------------------------*/
@@ -115,7 +121,6 @@ struct usb_hcd {
 #define HCD_FLAG_RH_RUNNING		5	/* root hub is running? */
 #define HCD_FLAG_DEAD			6	/* controller has died? */
 #define HCD_FLAG_INTF_AUTHORIZED	7	/* authorize interfaces? */
-#define HCD_FLAG_DEV_AUTHORIZED		8	/* authorize devices? */
 
 	/* The flags can be tested using these macros; they are likely to
 	 * be slightly faster than test_bit().
@@ -135,14 +140,6 @@ struct usb_hcd {
 #define HCD_INTF_AUTHORIZED(hcd) \
 	((hcd)->flags & (1U << HCD_FLAG_INTF_AUTHORIZED))
 
-	/*
-	 * Specifies if devices are authorized by default
-	 * or they require explicit user space authorization; this bit is
-	 * settable through /sys/class/usb_host/X/authorized_default
-	 */
-#define HCD_DEV_AUTHORIZED(hcd) \
-	((hcd)->flags & (1U << HCD_FLAG_DEV_AUTHORIZED))
-
 	/* Flags that get set only during HCD registration or removal. */
 	unsigned		rh_registered:1;/* is root hub registered? */
 	unsigned		rh_pollable:1;	/* may we poll the root hub? */
@@ -153,6 +150,7 @@ struct usb_hcd {
 	 * support the new root-hub polling mechanism. */
 	unsigned		uses_new_polling:1;
 	unsigned		wireless:1;	/* Wireless USB HCD */
+	unsigned		authorized_default:1;
 	unsigned		has_tt:1;	/* Integrated TT in root hub */
 	unsigned		amd_resume_bug:1; /* AMD remote wakeup quirk */
 	unsigned		can_do_streams:1; /* HC supports streams */
-- 
cgit v1.2.3-70-g09d2


From 12e1a6a0f16168346cb8f33aff135ed523a9f097 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 18 Aug 2015 09:58:45 -0700
Subject: Revert "usb: interface authorization: Introduces the default
 interface authorization"

This reverts commit 1d958bef45030acfc5578263e9de3bb07032b8da as the
signed-off-by address is invalid.

Cc: Stefan Koch <stefan.koch10@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c     | 47 ----------------------------------------------
 drivers/usb/core/message.c |  1 -
 include/linux/usb/hcd.h    |  9 ---------
 3 files changed, 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 83df1dde9c08..4d64e5c499e1 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -882,53 +882,9 @@ static ssize_t authorized_default_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(authorized_default);
 
-/*
- * interface_authorized_default_show - show default authorization status
- * for USB interfaces
- *
- * note: interface_authorized_default is the default value
- *       for initializing the authorized attribute of interfaces
- */
-static ssize_t interface_authorized_default_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct usb_device *usb_dev = to_usb_device(dev);
-	struct usb_hcd *hcd = bus_to_hcd(usb_dev->bus);
-
-	return sprintf(buf, "%u\n", !!HCD_INTF_AUTHORIZED(hcd));
-}
-
-/*
- * interface_authorized_default_store - store default authorization status
- * for USB interfaces
- *
- * note: interface_authorized_default is the default value
- *       for initializing the authorized attribute of interfaces
- */
-static ssize_t interface_authorized_default_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t count)
-{
-	struct usb_device *usb_dev = to_usb_device(dev);
-	struct usb_hcd *hcd = bus_to_hcd(usb_dev->bus);
-	int rc = count;
-	bool val;
-
-	if (strtobool(buf, &val) != 0)
-		return -EINVAL;
-
-	if (val)
-		set_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags);
-	else
-		clear_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags);
-
-	return rc;
-}
-static DEVICE_ATTR_RW(interface_authorized_default);
-
 /* Group all the USB bus attributes */
 static struct attribute *usb_bus_attrs[] = {
 		&dev_attr_authorized_default.attr,
-		&dev_attr_interface_authorized_default.attr,
 		NULL,
 };
 
@@ -2726,9 +2682,6 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		hcd->authorized_default = authorized_default;
 	set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags);
 
-	/* per default all interfaces are authorized */
-	set_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags);
-
 	/* HC is in reset state, but accessible.  Now do the one-time init,
 	 * bottom up so that hcds can customize the root hubs before hub_wq
 	 * starts talking to them.  (Note, bus id is assigned early too.)
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 3d25d89671d7..f368d2053da5 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1807,7 +1807,6 @@ free_interfaces:
 		intfc = cp->intf_cache[i];
 		intf->altsetting = intfc->altsetting;
 		intf->num_altsetting = intfc->num_altsetting;
-		intf->authorized = !!HCD_INTF_AUTHORIZED(hcd);
 		kref_get(&intfc->ref);
 
 		alt = usb_altnum_to_altsetting(intf, 0);
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 36ce3d215cad..d2784c10bfe2 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -120,7 +120,6 @@ struct usb_hcd {
 #define HCD_FLAG_WAKEUP_PENDING		4	/* root hub is resuming? */
 #define HCD_FLAG_RH_RUNNING		5	/* root hub is running? */
 #define HCD_FLAG_DEAD			6	/* controller has died? */
-#define HCD_FLAG_INTF_AUTHORIZED	7	/* authorize interfaces? */
 
 	/* The flags can be tested using these macros; they are likely to
 	 * be slightly faster than test_bit().
@@ -132,14 +131,6 @@ struct usb_hcd {
 #define HCD_RH_RUNNING(hcd)	((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING))
 #define HCD_DEAD(hcd)		((hcd)->flags & (1U << HCD_FLAG_DEAD))
 
-	/*
-	 * Specifies if interfaces are authorized by default
-	 * or they require explicit user space authorization; this bit is
-	 * settable through /sys/class/usb_host/X/interface_authorized_default
-	 */
-#define HCD_INTF_AUTHORIZED(hcd) \
-	((hcd)->flags & (1U << HCD_FLAG_INTF_AUTHORIZED))
-
 	/* Flags that get set only during HCD registration or removal. */
 	unsigned		rh_registered:1;/* is root hub registered? */
 	unsigned		rh_pollable:1;	/* may we poll the root hub? */
-- 
cgit v1.2.3-70-g09d2


From 475908bc4d29b95b24c03206f673f46b41e7124d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 18 Aug 2015 09:59:12 -0700
Subject: Revert "usb: interface authorization: Declare authorized attribute"

This reverts commit 484ebaedecc5ddf778a30ee1efab367cbee27030 as the
signed-off-by address is invalid.

Cc: Stefan Koch <stefan.koch10@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 3deccab2ce0a..447fe29b55b4 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -178,7 +178,6 @@ struct usb_interface {
 	unsigned needs_altsetting0:1;	/* switch to altsetting 0 is pending */
 	unsigned needs_binding:1;	/* needs delayed unbind/rebind */
 	unsigned resetting_device:1;	/* true: bandwidth alloc after reset */
-	unsigned authorized:1;		/* used for interface authorization */
 
 	struct device dev;		/* interface specific device info */
 	struct device *usb_dev;
-- 
cgit v1.2.3-70-g09d2


From dfbac8c7ac5f58448b2216fe42ff52aaf175421d Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 10 Aug 2015 15:20:40 -0600
Subject: NVMe: Add nvme subsystem reset support

Controllers part of an NVMe subsystem may be reset by any other controller
in the subsystem. If the device is capable of subsystem resets, this
patch adds detection for such events and performs appropriate controller
initialization upon subsystem reset detection.

The register bit is a RW1C type, so the driver needs to write a 1 to the
status bit to clear the subsystem reset occured bit during initialization.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 11 ++++++++++-
 include/linux/nvme.h      |  3 +++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 85cd33bf9607..e318a992e2e6 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1735,6 +1735,12 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 		page_shift = dev_page_max;
 	}
 
+	dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
+						NVME_CAP_NSSRC(cap) : 0;
+
+	if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO))
+		writel(NVME_CSTS_NSSRO, &dev->bar->csts);
+
 	result = nvme_disable_ctrl(dev, cap);
 	if (result < 0)
 		return result;
@@ -2059,7 +2065,10 @@ static int nvme_kthread(void *data)
 		spin_lock(&dev_list_lock);
 		list_for_each_entry_safe(dev, next, &dev_list, node) {
 			int i;
-			if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
+			u32 csts = readl(&dev->bar->csts);
+
+			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
+							csts & NVME_CSTS_CFS) {
 				if (work_busy(&dev->reset_work))
 					continue;
 				list_del_init(&dev->node);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index fa3fe160c6cb..d6b5600cfa47 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -39,6 +39,7 @@ struct nvme_bar {
 #define NVME_CAP_MQES(cap)	((cap) & 0xffff)
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
+#define NVME_CAP_NSSRC(cap)	(((cap) >> 36) & 0x1)
 #define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)
 #define NVME_CAP_MPSMAX(cap)	(((cap) >> 52) & 0xf)
 
@@ -68,6 +69,7 @@ enum {
 	NVME_CC_IOCQES		= 4 << 20,
 	NVME_CSTS_RDY		= 1 << 0,
 	NVME_CSTS_CFS		= 1 << 1,
+	NVME_CSTS_NSSRO		= 1 << 4,
 	NVME_CSTS_SHST_NORMAL	= 0 << 2,
 	NVME_CSTS_SHST_OCCUR	= 1 << 2,
 	NVME_CSTS_SHST_CMPLT	= 2 << 2,
@@ -110,6 +112,7 @@ struct nvme_dev {
 	char serial[20];
 	char model[40];
 	char firmware_rev[8];
+	bool subsystem;
 	u32 max_hw_sectors;
 	u32 stripe_size;
 	u32 page_size;
-- 
cgit v1.2.3-70-g09d2


From 81f03fedcce7ee7e83c37237ecaa2f68aad236fd Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Mon, 10 Aug 2015 15:20:41 -0600
Subject: NVMe: Add nvme subsystem reset IOCTL

Controllers can perform optional subsystem resets as introduced in NVMe
1.1. This patch adds an IOCTL to trigger the subsystem reset by writing
"NVMe" to the NSSR register.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Acked-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 11 +++++++++++
 include/linux/nvme.h      |  2 +-
 include/uapi/linux/nvme.h |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e318a992e2e6..3a35c5807bb7 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1901,6 +1901,15 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 	return status;
 }
 
+static int nvme_subsys_reset(struct nvme_dev *dev)
+{
+	if (!dev->subsystem)
+		return -ENOTTY;
+
+	writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */
+	return 0;
+}
+
 static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 							unsigned long arg)
 {
@@ -2932,6 +2941,8 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 	case NVME_IOCTL_RESET:
 		dev_warn(dev->dev, "resetting controller\n");
 		return nvme_reset(dev);
+	case NVME_IOCTL_SUBSYS_RESET:
+		return nvme_subsys_reset(dev);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d6b5600cfa47..b5812c395351 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -28,7 +28,7 @@ struct nvme_bar {
 	__u32			cc;	/* Controller Configuration */
 	__u32			rsvd1;	/* Reserved */
 	__u32			csts;	/* Controller Status */
-	__u32			rsvd2;	/* Reserved */
+	__u32			nssr;	/* Subsystem Reset */
 	__u32			aqa;	/* Admin Queue Attributes */
 	__u64			asq;	/* Admin SQ Base Address */
 	__u64			acq;	/* Admin CQ Base Address */
diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h
index 732b32e92b02..8864194a4151 100644
--- a/include/uapi/linux/nvme.h
+++ b/include/uapi/linux/nvme.h
@@ -584,5 +584,6 @@ struct nvme_passthru_cmd {
 #define NVME_IOCTL_SUBMIT_IO	_IOW('N', 0x42, struct nvme_user_io)
 #define NVME_IOCTL_IO_CMD	_IOWR('N', 0x43, struct nvme_passthru_cmd)
 #define NVME_IOCTL_RESET	_IO('N', 0x44)
+#define NVME_IOCTL_SUBSYS_RESET	_IO('N', 0x45)
 
 #endif /* _UAPI_LINUX_NVME_H */
-- 
cgit v1.2.3-70-g09d2


From 30e2bc08b2bb7c069246feee78f7ed4006e130fe Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Thu, 13 Aug 2015 14:57:56 -0400
Subject: Revert "block: remove artifical max_hw_sectors cap"

This reverts commit 34b48db66e08ca1c1bc07cf305d672ac940268dc.
That commit caused performance regressions for streaming I/O
workloads on a number of different storage devices, from
SATA disks to external RAID arrays.  It also managed to
trip up some buggy firmware in at least one drive, causing
data corruption.

The next patch will bump the default max_sectors_kb value to
1280, which will accommodate a 10-data-disk stripe write
with chunk size 128k.  In the testing I've done using iozone,
fio, and aio-stress, a value of 1280 does not show a big
performance difference from 512.  This will hopefully still
help the software RAID setup that Christoph saw the original
performance gains with while still not regressing other
storage configurations.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-settings.c       | 4 +++-
 drivers/block/aoe/aoeblk.c | 2 +-
 include/linux/blkdev.h     | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9df73991b231..d27b4e272356 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -236,7 +236,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_
 		       __func__, max_hw_sectors);
 	}
 
-	limits->max_sectors = limits->max_hw_sectors = max_hw_sectors;
+	limits->max_hw_sectors = max_hw_sectors;
+	limits->max_sectors = min_t(unsigned int, max_hw_sectors,
+				    BLK_DEF_MAX_SECTORS);
 }
 EXPORT_SYMBOL(blk_limits_max_hw_sectors);
 
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 46c282fff104..dd73e1ff1759 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp)
 	WARN_ON(d->flags & DEVFL_TKILL);
 	WARN_ON(d->gd);
 	WARN_ON(d->flags & DEVFL_UP);
-	blk_queue_max_hw_sectors(q, 1024);
+	blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
 	q->backing_dev_info.name = "aoe";
 	q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
 	d->bufpool = mp;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a1feff54aeab..9f1e3f8eeed7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1132,6 +1132,7 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 enum blk_default_limits {
 	BLK_MAX_SEGMENTS	= 128,
 	BLK_SAFE_MAX_SECTORS	= 255,
+	BLK_DEF_MAX_SECTORS	= 1024,
 	BLK_MAX_SEGMENT_SIZE	= 65536,
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
-- 
cgit v1.2.3-70-g09d2


From d2be537c3ba3568acd79cd178327b842e60d035e Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Thu, 13 Aug 2015 14:57:57 -0400
Subject: block: bump BLK_DEF_MAX_SECTORS to 2560

A value of 2560 (1280k) will accommodate a 10-data-disk stripe
write with chunk size 128k.  In the testing I've done using
iozone, fio, and aio-stress across a number of different storage
devices, a value of 1280 does not show a big performance
difference from 512, but will hopefully help software RAID
setups using SATA disks, as reported by Christoph.

NOTE: drivers/block/aoe/aoeblk.c sets its own max_hw_sectors_kb to
BLK_DEF_MAX_SECTORS.  So, this patch essentially changes aeoblk to
Use a larger maximum sector size, and I did not test this.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9f1e3f8eeed7..e427debc7008 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1132,7 +1132,7 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 enum blk_default_limits {
 	BLK_MAX_SEGMENTS	= 128,
 	BLK_SAFE_MAX_SECTORS	= 255,
-	BLK_DEF_MAX_SECTORS	= 1024,
+	BLK_DEF_MAX_SECTORS	= 2560,
 	BLK_MAX_SEGMENT_SIZE	= 65536,
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
-- 
cgit v1.2.3-70-g09d2


From 3e1d2eed39d804e48282931835c7203fa47fe1d9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 13:58:16 -0700
Subject: cgroup: introduce cgroup_subsys->legacy_name

This allows cgroup subsystems to use a different name on the unified
hierarchy.  cgroup_subsys->name is used on the unified hierarchy,
->legacy_name elsewhere.  If ->legacy_name is not explicitly set, it's
automatically set to ->name and the userland visible behavior remains
unchanged.

v2: Make parse_cgroupfs_options() only consider ->legacy_name as mount
    options are used only on legacy hierarchies.  Suggested by Li
    Zefan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: cgroups@vger.kernel.org
---
 include/linux/cgroup-defs.h |  3 +++
 kernel/cgroup.c             | 29 ++++++++++++++++++-----------
 2 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8f5770a7d85a..7d0bb53e4553 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -434,6 +434,9 @@ struct cgroup_subsys {
 	int id;
 	const char *name;
 
+	/* optional, initialized automatically during boot if not set */
+	const char *legacy_name;
+
 	/* link to parent, protected by cgroup_lock() */
 	struct cgroup_root *root;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d9f854defa78..9c8eb17d593a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1028,10 +1028,13 @@ static const struct file_operations proc_cgroupstats_operations;
 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			      char *buf)
 {
+	struct cgroup_subsys *ss = cft->ss;
+
 	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
 	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
 		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
-			 cft->ss->name, cft->name);
+			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+			 cft->name);
 	else
 		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
@@ -1336,7 +1339,7 @@ static int cgroup_show_options(struct seq_file *seq,
 	if (root != &cgrp_dfl_root)
 		for_each_subsys(ss, ssid)
 			if (root->subsys_mask & (1 << ssid))
-				seq_printf(seq, ",%s", ss->name);
+				seq_printf(seq, ",%s", ss->legacy_name);
 	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
@@ -1449,7 +1452,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 		}
 
 		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->name))
+			if (strcmp(token, ss->legacy_name))
 				continue;
 			if (ss->disabled)
 				continue;
@@ -4995,6 +4998,8 @@ int __init cgroup_init_early(void)
 
 		ss->id = i;
 		ss->name = cgroup_subsys_name[i];
+		if (!ss->legacy_name)
+			ss->legacy_name = cgroup_subsys_name[i];
 
 		if (ss->early_init)
 			cgroup_init_subsys(ss, true);
@@ -5142,7 +5147,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			for_each_subsys(ss, ssid)
 				if (root->subsys_mask & (1 << ssid))
 					seq_printf(m, "%s%s", count++ ? "," : "",
-						   ss->name);
+						   ss->legacy_name);
 		if (strlen(root->name))
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
@@ -5182,7 +5187,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
-			   ss->name, ss->root->hierarchy_id,
+			   ss->legacy_name, ss->root->hierarchy_id,
 			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
 
 	mutex_unlock(&cgroup_mutex);
@@ -5404,12 +5409,14 @@ static int __init cgroup_disable(char *str)
 			continue;
 
 		for_each_subsys(ss, i) {
-			if (!strcmp(token, ss->name)) {
-				ss->disabled = 1;
-				printk(KERN_INFO "Disabling %s control group"
-					" subsystem\n", ss->name);
-				break;
-			}
+			if (strcmp(token, ss->name) &&
+			    strcmp(token, ss->legacy_name))
+				continue;
+
+			ss->disabled = 1;
+			printk(KERN_INFO "Disabling %s control group subsystem\n",
+			       ss->name);
+			break;
 		}
 	}
 	return 1;
-- 
cgit v1.2.3-70-g09d2


From 1ed8d48c57bf7400eac7b8dc622ab0413715cafb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:54:52 -0700
Subject: writeback: bdi_for_each_wb() iteration is memcg ID based not blkcg

wb's (bdi_writeback's) are currently keyed by memcg ID; however, in an
earlier implementation, wb's were keyed by blkcg ID.
bdi_for_each_wb() walks bdi->cgwb_tree in the ascending ID order and
allows iterations to start from an arbitrary ID which is used to
interrupt and resume iterations.

Unfortunately, while changing wb to be keyed by memcg ID instead of
blkcg, bdi_for_each_wb() was missed and is still assuming that wb's
are keyed by blkcg ID.  This doesn't affect iterations which don't get
interrupted but bdi_split_work_to_wbs() makes use of iteration
resuming on allocation failures and thus may incorrectly skip or
repeat wb's.

Fix it by changing bdi_for_each_wb() to take memcg IDs instead of
blkcg IDs and updating bdi_split_work_to_wbs() accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           |  6 +++---
 include/linux/backing-dev.h | 24 ++++++++++++------------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 518c6294bf6c..c9def2115aca 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -839,7 +839,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 				  bool skip_if_busy)
 {
 	long nr_pages = base_work->nr_pages;
-	int next_blkcg_id = 0;
+	int next_memcg_id = 0;
 	struct bdi_writeback *wb;
 	struct wb_iter iter;
 
@@ -849,14 +849,14 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 		return;
 restart:
 	rcu_read_lock();
-	bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+	bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
 		if (!wb_has_dirty_io(wb) ||
 		    (skip_if_busy && writeback_in_progress(wb)))
 			continue;
 
 		base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
 		if (!wb_clone_and_queue_work(wb, base_work)) {
-			next_blkcg_id = wb->blkcg_css->id + 1;
+			next_memcg_id = wb->memcg_css->id + 1;
 			rcu_read_unlock();
 			wb_wait_for_single_work(bdi, base_work);
 			goto restart;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0fe9df983ab7..23ebb946e66f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
 }
 
 struct wb_iter {
-	int			start_blkcg_id;
+	int			start_memcg_id;
 	struct radix_tree_iter	tree_iter;
 	void			**slot;
 };
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	if (iter->start_blkcg_id >= 0) {
-		iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
-		iter->start_blkcg_id = -1;
+	if (iter->start_memcg_id >= 0) {
+		iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
+		iter->start_memcg_id = -1;
 	} else {
 		iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
 	}
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 
 static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
 						   struct backing_dev_info *bdi,
-						   int start_blkcg_id)
+						   int start_memcg_id)
 {
-	iter->start_blkcg_id = start_blkcg_id;
+	iter->start_memcg_id = start_memcg_id;
 
-	if (start_blkcg_id)
+	if (start_memcg_id)
 		return __wb_iter_next(iter, bdi);
 	else
 		return &bdi->wb;
 }
 
 /**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
  * @wb_cur: cursor struct bdi_writeback pointer
  * @bdi: bdi to walk wb's of
  * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_blkcg_id: blkcg ID to start iteration from
+ * @start_memcg_id: memcg ID to start iteration from
  *
  * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
  * to be used as temp storage during iteration.  rcu_read_lock() must be
  * held throughout iteration.
  */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
-	for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);	\
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)		\
+	for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);	\
 	     (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
-- 
cgit v1.2.3-70-g09d2


From 9acee9c551f045d2c5b5261aa587331423fd7d92 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:54:55 -0700
Subject: kernfs: implement kernfs_path_len()

Add a function to determine the path length of a kernfs node.  This
for now will be used by writeback tracepoint updates.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/kernfs/dir.c        | 23 +++++++++++++++++++++++
 include/linux/kernfs.h |  4 ++++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2d48d28e1640..91e004518237 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -91,6 +91,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
 	return ret;
 }
 
+/**
+ * kernfs_path_len - determine the length of the full path of a given node
+ * @kn: kernfs_node of interest
+ *
+ * The returned length doesn't include the space for the terminating '\0'.
+ */
+size_t kernfs_path_len(struct kernfs_node *kn)
+{
+	size_t len = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+	do {
+		len += strlen(kn->name) + 1;
+		kn = kn->parent;
+	} while (kn && kn->parent);
+
+	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+	return len;
+}
+
 /**
  * kernfs_path - build full path of a given node
  * @kn: kernfs_node of interest
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 123be25ea15a..5d4e9c4b821d 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 }
 
 int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+size_t kernfs_path_len(struct kernfs_node *kn);
 char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
 				size_t buflen);
 void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
 { return -ENOSYS; }
 
+static inline size_t kernfs_path_len(struct kernfs_node *kn)
+{ return 0; }
+
 static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
 					      size_t buflen)
 { return NULL; }
-- 
cgit v1.2.3-70-g09d2


From 401efbf835040dd2ebca54f78d58fc8e3c51f91d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:06 -0700
Subject: blkcg: remove unnecessary request_list->blkg NULL test in
 blk_put_rl()

Since ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root
blkcg"), a request_list always has its blkg associated.  Drop
unnecessary rl->blkg NULL test from blk_put_rl().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-cgroup.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 1b62d768c7df..9711fc277c02 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -394,8 +394,7 @@ root_rl:
  */
 static inline void blk_put_rl(struct request_list *rl)
 {
-	/* root_rl may not have blkg set */
-	if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+	if (rl->blkg->blkcg != &blkcg_root)
 		blkg_put(rl->blkg);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:09 -0700
Subject: blkcg: restructure blkg_policy_data allocation in
 blkcg_activate_policy()

When a policy gets activated, it needs to allocate and install its
policy data on all existing blkg's (blkcg_gq's).  Because blkg
iteration is protected by a spinlock, it currently counts the total
number of blkg's in the system, allocates the matching number of
policy data on a list and installs them during a single iteration.

This can be simplified by using speculative GFP_NOWAIT allocations
while iterating and falling back to a preallocated policy data on
failure.  If the preallocated one has already been consumed, it
releases the lock, preallocate with GFP_KERNEL and then restarts the
iteration.  This can be a bit more expensive than before but policy
activation is a very cold path and shouldn't matter.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 55 ++++++++++++++++++----------------------------
 include/linux/blk-cgroup.h |  3 ---
 2 files changed, 21 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f91a4e09e0c9..9e9b0df339ee 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1047,65 +1047,52 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
 int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
-	LIST_HEAD(pds);
+	struct blkg_policy_data *pd_prealloc = NULL;
 	struct blkcg_gq *blkg;
-	struct blkg_policy_data *pd, *nd;
-	int cnt = 0, ret;
+	int ret;
 
 	if (blkcg_policy_enabled(q, pol))
 		return 0;
 
-	/* count and allocate policy_data for all existing blkgs */
 	blk_queue_bypass_start(q);
-	spin_lock_irq(q->queue_lock);
-	list_for_each_entry(blkg, &q->blkg_list, q_node)
-		cnt++;
-	spin_unlock_irq(q->queue_lock);
-
-	/* allocate per-blkg policy data for all existing blkgs */
-	while (cnt--) {
-		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
-		if (!pd) {
+pd_prealloc:
+	if (!pd_prealloc) {
+		pd_prealloc = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
+		if (!pd_prealloc) {
 			ret = -ENOMEM;
-			goto out_free;
+			goto out_bypass_end;
 		}
-		list_add_tail(&pd->alloc_node, &pds);
 	}
 
-	/*
-	 * Install the allocated pds and cpds. With @q bypassing, no new blkg
-	 * should have been created while the queue lock was dropped.
-	 */
 	spin_lock_irq(q->queue_lock);
 
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		if (WARN_ON(list_empty(&pds))) {
-			/* umm... this shouldn't happen, just abort */
-			ret = -ENOMEM;
-			goto out_unlock;
-		}
-		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
-		list_del_init(&pd->alloc_node);
+		struct blkg_policy_data *pd;
 
-		/* grab blkcg lock too while installing @pd on @blkg */
-		spin_lock(&blkg->blkcg->lock);
+		if (blkg->pd[pol->plid])
+			continue;
+
+		pd = kzalloc_node(pol->pd_size, GFP_NOWAIT, q->node);
+		if (!pd)
+			swap(pd, pd_prealloc);
+		if (!pd) {
+			spin_unlock_irq(q->queue_lock);
+			goto pd_prealloc;
+		}
 
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
 		pol->pd_init_fn(blkg);
-
-		spin_unlock(&blkg->blkcg->lock);
 	}
 
 	__set_bit(pol->plid, q->blkcg_pols);
 	ret = 0;
-out_unlock:
+
 	spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
 	blk_queue_bypass_end(q);
-	list_for_each_entry_safe(pd, nd, &pds, alloc_node)
-		kfree(pd);
+	kfree(pd_prealloc);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 9711fc277c02..db822880242a 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -80,9 +80,6 @@ struct blkg_policy_data {
 	/* the blkg and policy id this per-policy data belongs to */
 	struct blkcg_gq			*blkg;
 	int				plid;
-
-	/* used during policy activation */
-	struct list_head		alloc_node;
 };
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 001bea73e70efdf48a9e00188cf302f6b6aed2bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:11 -0700
Subject: blkcg: replace blkcg_policy->pd_size with ->pd_alloc/free_fn()
 methods

A blkg (blkcg_gq) represents the relationship between a cgroup and
request_queue.  Each active policy has a pd (blkg_policy_data) on each
blkg.  The pd's were allocated by blkcg core and each policy could
request to allocate extra space at the end by setting
blkcg_policy->pd_size larger than the size of pd.

This is a bit unusual but was done this way mostly to simplify error
handling and all the existing use cases could be handled this way;
however, this is becoming too restrictive now that percpu memory can
be allocated without blocking.

This introduces two new mandatory blkcg_policy methods - pd_alloc_fn()
and pd_free_fn() - which are used to allocate and release pd for a
given policy.  As pd allocation is now done from policy side, it can
simply allocate a larger area which embeds pd at the beginning.  This
change makes ->pd_size pointless.  Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 21 +++++++++++----------
 block/blk-throttle.c       | 13 ++++++++++++-
 block/cfq-iosched.c        | 13 ++++++++++++-
 include/linux/blk-cgroup.h | 18 +++++++++---------
 4 files changed, 44 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4defbbabc0ff..d1bc6099bd1e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -68,7 +68,8 @@ static void blkg_free(struct blkcg_gq *blkg)
 		return;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++)
-		kfree(blkg->pd[i]);
+		if (blkg->pd[i])
+			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
 	if (blkg->blkcg != &blkcg_root)
 		blk_exit_rl(&blkg->rl);
@@ -114,7 +115,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 			continue;
 
 		/* alloc per-policy data and attach it to blkg */
-		pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+		pd = pol->pd_alloc_fn(gfp_mask, q->node);
 		if (!pd)
 			goto err_free;
 
@@ -1057,7 +1058,7 @@ int blkcg_activate_policy(struct request_queue *q,
 	blk_queue_bypass_start(q);
 pd_prealloc:
 	if (!pd_prealloc) {
-		pd_prealloc = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
+		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
 		if (!pd_prealloc) {
 			ret = -ENOMEM;
 			goto out_bypass_end;
@@ -1072,7 +1073,7 @@ pd_prealloc:
 		if (blkg->pd[pol->plid])
 			continue;
 
-		pd = kzalloc_node(pol->pd_size, GFP_NOWAIT, q->node);
+		pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
 		if (!pd)
 			swap(pd, pd_prealloc);
 		if (!pd) {
@@ -1093,7 +1094,8 @@ pd_prealloc:
 	spin_unlock_irq(q->queue_lock);
 out_bypass_end:
 	blk_queue_bypass_end(q);
-	kfree(pd_prealloc);
+	if (pd_prealloc)
+		pol->pd_free_fn(pd_prealloc);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1128,8 +1130,10 @@ void blkcg_deactivate_policy(struct request_queue *q,
 		if (pol->pd_exit_fn)
 			pol->pd_exit_fn(blkg);
 
-		kfree(blkg->pd[pol->plid]);
-		blkg->pd[pol->plid] = NULL;
+		if (blkg->pd[pol->plid]) {
+			pol->pd_free_fn(blkg->pd[pol->plid]);
+			blkg->pd[pol->plid] = NULL;
+		}
 
 		spin_unlock(&blkg->blkcg->lock);
 	}
@@ -1151,9 +1155,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	struct blkcg *blkcg;
 	int i, ret;
 
-	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
-		return -EINVAL;
-
 	mutex_lock(&blkcg_pol_register_mutex);
 	mutex_lock(&blkcg_pol_mutex);
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b23193518ac7..f1dd691c5359 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -403,6 +403,11 @@ static void throtl_service_queue_exit(struct throtl_service_queue *sq)
 	del_timer_sync(&sq->pending_timer);
 }
 
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
+{
+	return kzalloc_node(sizeof(struct throtl_grp), gfp, node);
+}
+
 static void throtl_pd_init(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -493,6 +498,11 @@ static void throtl_pd_exit(struct blkcg_gq *blkg)
 	throtl_service_queue_exit(&tg->service_queue);
 }
 
+static void throtl_pd_free(struct blkg_policy_data *pd)
+{
+	kfree(pd);
+}
+
 static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1468,12 +1478,13 @@ static void throtl_shutdown_wq(struct request_queue *q)
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
-	.pd_size		= sizeof(struct throtl_grp),
 	.cftypes		= throtl_files,
 
+	.pd_alloc_fn		= throtl_pd_alloc,
 	.pd_init_fn		= throtl_pd_init,
 	.pd_online_fn		= throtl_pd_online,
 	.pd_exit_fn		= throtl_pd_exit,
+	.pd_free_fn		= throtl_pd_free,
 	.pd_reset_stats_fn	= throtl_pd_reset_stats,
 };
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9c9ec7cc9f99..69ce2883099e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1582,6 +1582,11 @@ static void cfq_cpd_init(const struct blkcg *blkcg)
 	}
 }
 
+static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
+{
+	return kzalloc_node(sizeof(struct cfq_group), gfp, node);
+}
+
 static void cfq_pd_init(struct blkcg_gq *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1618,6 +1623,11 @@ static void cfq_pd_offline(struct blkcg_gq *blkg)
 	cfqg_stats_xfer_dead(cfqg);
 }
 
+static void cfq_pd_free(struct blkg_policy_data *pd)
+{
+	return kfree(pd);
+}
+
 /* offset delta from cfqg->stats to cfqg->dead_stats */
 static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
 					offsetof(struct cfq_group, stats);
@@ -4633,13 +4643,14 @@ static struct elevator_type iosched_cfq = {
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
-	.pd_size		= sizeof(struct cfq_group),
 	.cpd_size		= sizeof(struct cfq_group_data),
 	.cftypes		= cfq_blkcg_files,
 
 	.cpd_init_fn		= cfq_cpd_init,
+	.pd_alloc_fn		= cfq_pd_alloc,
 	.pd_init_fn		= cfq_pd_init,
 	.pd_offline_fn		= cfq_pd_offline,
+	.pd_free_fn		= cfq_pd_free,
 	.pd_reset_stats_fn	= cfq_pd_reset_stats,
 };
 #endif
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index db822880242a..bd173ea360ce 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -68,13 +68,11 @@ struct blkg_rwstat {
  * request_queue (q).  This is used by blkcg policies which need to track
  * information per blkcg - q pair.
  *
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
  */
 struct blkg_policy_data {
 	/* the blkg and policy id this per-policy data belongs to */
@@ -126,16 +124,16 @@ struct blkcg_gq {
 };
 
 typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
 typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
 
 struct blkcg_policy {
 	int				plid;
-	/* policy specific private data size */
-	size_t				pd_size;
 	/* policy specific per-blkcg data size */
 	size_t				cpd_size;
 	/* cgroup files for the policy */
@@ -143,10 +141,12 @@ struct blkcg_policy {
 
 	/* operations */
 	blkcg_pol_init_cpd_fn		*cpd_init_fn;
+	blkcg_pol_alloc_pd_fn		*pd_alloc_fn;
 	blkcg_pol_init_pd_fn		*pd_init_fn;
 	blkcg_pol_online_pd_fn		*pd_online_fn;
 	blkcg_pol_offline_pd_fn		*pd_offline_fn;
 	blkcg_pol_exit_pd_fn		*pd_exit_fn;
+	blkcg_pol_free_pd_fn		*pd_free_fn;
 	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
 
-- 
cgit v1.2.3-70-g09d2


From b2ce2643cc705aa9043642d7b6248ccfd8e20629 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:13 -0700
Subject: blk-throttle: clean up blkg_policy_data alloc/init/exit/free methods

With the recent addition of alloc and free methods, things became
messier.  This patch reorganizes them according to the followings.

* ->pd_alloc_fn()

  Responsible for allocation and static initializations - the ones
  which can be done independent of where the pd might be attached.

* ->pd_init_fn()

  Initializations which require the knowledge of where the pd is
  attached.

* ->pd_free_fn()

  The counter part of pd_alloc_fn().  Static de-init and freeing.

This leaves ->pd_exit_fn() without any users.  Removed.

While at it, collapse an one liner function throtl_pd_exit(), which
has only one user, into its user.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 11 ---------
 block/blk-throttle.c       | 57 ++++++++++++++++------------------------------
 block/cfq-iosched.c        | 15 ++++++++----
 include/linux/blk-cgroup.h |  2 --
 4 files changed, 31 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d1bc6099bd1e..acfb09af58a5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -402,15 +402,6 @@ static void blkg_destroy_all(struct request_queue *q)
 void __blkg_release_rcu(struct rcu_head *rcu_head)
 {
 	struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-	int i;
-
-	/* tell policies that this one is being freed */
-	for (i = 0; i < BLKCG_MAX_POLS; i++) {
-		struct blkcg_policy *pol = blkcg_policy[i];
-
-		if (blkg->pd[i] && pol->pd_exit_fn)
-			pol->pd_exit_fn(blkg);
-	}
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -1127,8 +1118,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 		if (pol->pd_offline_fn)
 			pol->pd_offline_fn(blkg);
-		if (pol->pd_exit_fn)
-			pol->pd_exit_fn(blkg);
 
 		if (blkg->pd[pol->plid]) {
 			pol->pd_free_fn(blkg->pd[pol->plid]);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3c869768cfdd..c3a235b8ec7e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -330,26 +330,19 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
 }
 
 /* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq,
-				      struct throtl_service_queue *parent_sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
 	INIT_LIST_HEAD(&sq->queued[0]);
 	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
-	sq->parent_sq = parent_sq;
 	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
 		    (unsigned long)sq);
 }
 
-static void throtl_service_queue_exit(struct throtl_service_queue *sq)
-{
-	del_timer_sync(&sq->pending_timer);
-}
-
 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
 	struct throtl_grp *tg;
-	int cpu;
+	int rw, cpu;
 
 	tg = kzalloc_node(sizeof(*tg), gfp, node);
 	if (!tg)
@@ -361,6 +354,19 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 		return NULL;
 	}
 
+	throtl_service_queue_init(&tg->service_queue);
+
+	for (rw = READ; rw <= WRITE; rw++) {
+		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+	}
+
+	RB_CLEAR_NODE(&tg->rb_node);
+	tg->bps[READ] = -1;
+	tg->bps[WRITE] = -1;
+	tg->iops[READ] = -1;
+	tg->iops[WRITE] = -1;
+
 	for_each_possible_cpu(cpu) {
 		struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu);
 
@@ -375,8 +381,7 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	struct throtl_data *td = blkg->q->td;
-	struct throtl_service_queue *parent_sq;
-	int rw;
+	struct throtl_service_queue *sq = &tg->service_queue;
 
 	/*
 	 * If on the default hierarchy, we switch to properly hierarchical
@@ -391,25 +396,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	 * Limits of a group don't interact with limits of other groups
 	 * regardless of the position of the group in the hierarchy.
 	 */
-	parent_sq = &td->service_queue;
-
+	sq->parent_sq = &td->service_queue;
 	if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
-		parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
-
-	throtl_service_queue_init(&tg->service_queue, parent_sq);
-
-	for (rw = READ; rw <= WRITE; rw++) {
-		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
-		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
-	}
-
-	RB_CLEAR_NODE(&tg->rb_node);
+		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
-
-	tg->bps[READ] = -1;
-	tg->bps[WRITE] = -1;
-	tg->iops[READ] = -1;
-	tg->iops[WRITE] = -1;
 }
 
 /*
@@ -436,17 +426,11 @@ static void throtl_pd_online(struct blkcg_gq *blkg)
 	tg_update_has_rules(blkg_to_tg(blkg));
 }
 
-static void throtl_pd_exit(struct blkcg_gq *blkg)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-
-	throtl_service_queue_exit(&tg->service_queue);
-}
-
 static void throtl_pd_free(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
 
+	del_timer_sync(&tg->service_queue.pending_timer);
 	free_percpu(tg->stats_cpu);
 	kfree(tg);
 }
@@ -1421,7 +1405,6 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_alloc_fn		= throtl_pd_alloc,
 	.pd_init_fn		= throtl_pd_init,
 	.pd_online_fn		= throtl_pd_online,
-	.pd_exit_fn		= throtl_pd_exit,
 	.pd_free_fn		= throtl_pd_free,
 	.pd_reset_stats_fn	= throtl_pd_reset_stats,
 };
@@ -1616,7 +1599,7 @@ int blk_throtl_init(struct request_queue *q)
 		return -ENOMEM;
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
-	throtl_service_queue_init(&td->service_queue, NULL);
+	throtl_service_queue_init(&td->service_queue);
 
 	q->td = td;
 	td->queue = q;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 69ce2883099e..4b795c7250ef 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1584,7 +1584,17 @@ static void cfq_cpd_init(const struct blkcg *blkcg)
 
 static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
 {
-	return kzalloc_node(sizeof(struct cfq_group), gfp, node);
+	struct cfq_group *cfqg;
+
+	cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
+	if (!cfqg)
+		return NULL;
+
+	cfq_init_cfqg_base(cfqg);
+	cfqg_stats_init(&cfqg->stats);
+	cfqg_stats_init(&cfqg->dead_stats);
+
+	return &cfqg->pd;
 }
 
 static void cfq_pd_init(struct blkcg_gq *blkg)
@@ -1592,11 +1602,8 @@ static void cfq_pd_init(struct blkcg_gq *blkg)
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
 
-	cfq_init_cfqg_base(cfqg);
 	cfqg->weight = cgd->weight;
 	cfqg->leaf_weight = cgd->leaf_weight;
-	cfqg_stats_init(&cfqg->stats);
-	cfqg_stats_init(&cfqg->dead_stats);
 }
 
 static void cfq_pd_offline(struct blkcg_gq *blkg)
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index bd173ea360ce..9879469b1b38 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -128,7 +128,6 @@ typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
 typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
 
@@ -145,7 +144,6 @@ struct blkcg_policy {
 	blkcg_pol_init_pd_fn		*pd_init_fn;
 	blkcg_pol_online_pd_fn		*pd_online_fn;
 	blkcg_pol_offline_pd_fn		*pd_offline_fn;
-	blkcg_pol_exit_pd_fn		*pd_exit_fn;
 	blkcg_pol_free_pd_fn		*pd_free_fn;
 	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
-- 
cgit v1.2.3-70-g09d2


From a9520cd6f2ac1fbbf206b915946534c6dddbaae2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:14 -0700
Subject: blkcg: make blkcg_policy methods take a pointer to blkcg_policy_data

The newly added ->pd_alloc_fn() and ->pd_free_fn() deal with pd
(blkg_policy_data) while the older ones use blkg (blkcg_gq).  As using
blkg doesn't make sense for ->pd_alloc_fn() and after allocation pd
can always be mapped to blkg and given that these are policy-specific
methods, it makes sense to converge on pd.

This patch makes all methods deal with pd instead of blkg.  Most
conversions are trivial.  In blk-cgroup.c, a couple method invocation
sites now test whether pd exists instead of policy state for
consistency.  This shouldn't cause any behavioral differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 18 ++++++++----------
 block/blk-throttle.c       | 13 +++++++------
 block/cfq-iosched.c        | 14 +++++++-------
 include/linux/blk-cgroup.h |  8 ++++----
 4 files changed, 26 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index acfb09af58a5..8343450cffe2 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -242,7 +242,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		struct blkcg_policy *pol = blkcg_policy[i];
 
 		if (blkg->pd[i] && pol->pd_init_fn)
-			pol->pd_init_fn(blkg);
+			pol->pd_init_fn(blkg->pd[i]);
 	}
 
 	/* insert */
@@ -256,7 +256,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 			struct blkcg_policy *pol = blkcg_policy[i];
 
 			if (blkg->pd[i] && pol->pd_online_fn)
-				pol->pd_online_fn(blkg);
+				pol->pd_online_fn(blkg->pd[i]);
 		}
 	}
 	blkg->online = true;
@@ -347,7 +347,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 		struct blkcg_policy *pol = blkcg_policy[i];
 
 		if (blkg->pd[i] && pol->pd_offline_fn)
-			pol->pd_offline_fn(blkg);
+			pol->pd_offline_fn(blkg->pd[i]);
 	}
 	blkg->online = false;
 
@@ -468,9 +468,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkcg_policy *pol = blkcg_policy[i];
 
-			if (blkcg_policy_enabled(blkg->q, pol) &&
-			    pol->pd_reset_stats_fn)
-				pol->pd_reset_stats_fn(blkg);
+			if (blkg->pd[i] && pol->pd_reset_stats_fn)
+				pol->pd_reset_stats_fn(blkg->pd[i]);
 		}
 	}
 
@@ -1076,7 +1075,7 @@ pd_prealloc:
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
 		if (pol->pd_init_fn)
-			pol->pd_init_fn(blkg);
+			pol->pd_init_fn(pd);
 	}
 
 	__set_bit(pol->plid, q->blkcg_pols);
@@ -1116,10 +1115,9 @@ void blkcg_deactivate_policy(struct request_queue *q,
 		/* grab blkcg lock too while removing @pd from @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
-		if (pol->pd_offline_fn)
-			pol->pd_offline_fn(blkg);
-
 		if (blkg->pd[pol->plid]) {
+			if (pol->pd_offline_fn)
+				pol->pd_offline_fn(blkg->pd[pol->plid]);
 			pol->pd_free_fn(blkg->pd[pol->plid]);
 			blkg->pd[pol->plid] = NULL;
 		}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c3a235b8ec7e..c2c75477a6b2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -377,9 +377,10 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 	return &tg->pd;
 }
 
-static void throtl_pd_init(struct blkcg_gq *blkg)
+static void throtl_pd_init(struct blkg_policy_data *pd)
 {
-	struct throtl_grp *tg = blkg_to_tg(blkg);
+	struct throtl_grp *tg = pd_to_tg(pd);
+	struct blkcg_gq *blkg = tg_to_blkg(tg);
 	struct throtl_data *td = blkg->q->td;
 	struct throtl_service_queue *sq = &tg->service_queue;
 
@@ -417,13 +418,13 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 				    (tg->bps[rw] != -1 || tg->iops[rw] != -1);
 }
 
-static void throtl_pd_online(struct blkcg_gq *blkg)
+static void throtl_pd_online(struct blkg_policy_data *pd)
 {
 	/*
 	 * We don't want new groups to escape the limits of its ancestors.
 	 * Update has_rules[] after a new group is brought online.
 	 */
-	tg_update_has_rules(blkg_to_tg(blkg));
+	tg_update_has_rules(pd_to_tg(pd));
 }
 
 static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -435,9 +436,9 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
 	kfree(tg);
 }
 
-static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
+static void throtl_pd_reset_stats(struct blkg_policy_data *pd)
 {
-	struct throtl_grp *tg = blkg_to_tg(blkg);
+	struct throtl_grp *tg = pd_to_tg(pd);
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4b795c7250ef..95e6b0cbaa84 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1597,18 +1597,18 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
 	return &cfqg->pd;
 }
 
-static void cfq_pd_init(struct blkcg_gq *blkg)
+static void cfq_pd_init(struct blkg_policy_data *pd)
 {
-	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
+	struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
 
 	cfqg->weight = cgd->weight;
 	cfqg->leaf_weight = cgd->leaf_weight;
 }
 
-static void cfq_pd_offline(struct blkcg_gq *blkg)
+static void cfq_pd_offline(struct blkg_policy_data *pd)
 {
-	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
 	int i;
 
 	for (i = 0; i < IOPRIO_BE_NR; i++) {
@@ -1661,9 +1661,9 @@ static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *
 	return a;
 }
 
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
 {
-	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
 
 	cfqg_stats_reset(&cfqg->stats);
 	cfqg_stats_reset(&cfqg->dead_stats);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 9879469b1b38..ddd4b8b252c7 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -125,11 +125,11 @@ struct blkcg_gq {
 
 typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
 typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
 
 struct blkcg_policy {
 	int				plid;
-- 
cgit v1.2.3-70-g09d2


From 814376483e7d85b69a70634633f1f9d01c6ee0cf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:15 -0700
Subject: blkcg: minor updates around blkcg_policy_data

* Rename blkcg->pd[] to blkcg->cpd[] so that cpd is consistently used
  for blkcg_policy_data.

* Make blkcg_policy->cpd_init_fn() take blkcg_policy_data instead of
  blkcg.  This makes it consistent with blkg_policy_data methods and
  to-be-added cpd alloc/free methods.

* blkcg_policy_data->blkcg and cpd_to_blkcg() added so that
  cpd_init_fn() can determine the associated blkcg from
  blkcg_policy_data.

v2: blkcg_policy_data->blkcg initializations were missing.  Added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 24 +++++++++++++-----------
 block/cfq-iosched.c        | 11 +++++------
 include/linux/blk-cgroup.h | 14 ++++++++++----
 3 files changed, 28 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8343450cffe2..247c42c8c83b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -821,7 +821,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 	mutex_unlock(&blkcg_pol_mutex);
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++)
-		kfree(blkcg->pd[i]);
+		kfree(blkcg->cpd[i]);
 	kfree(blkcg);
 }
 
@@ -857,15 +857,16 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		if (!pol || !pol->cpd_size)
 			continue;
 
-		BUG_ON(blkcg->pd[i]);
+		BUG_ON(blkcg->cpd[i]);
 		cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
 		if (!cpd) {
 			ret = ERR_PTR(-ENOMEM);
 			goto free_pd_blkcg;
 		}
-		blkcg->pd[i] = cpd;
+		blkcg->cpd[i] = cpd;
+		cpd->blkcg = blkcg;
 		cpd->plid = i;
-		pol->cpd_init_fn(blkcg);
+		pol->cpd_init_fn(cpd);
 	}
 
 	spin_lock_init(&blkcg->lock);
@@ -881,7 +882,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 
 free_pd_blkcg:
 	for (i--; i >= 0; i--)
-		kfree(blkcg->pd[i]);
+		kfree(blkcg->cpd[i]);
 free_blkcg:
 	kfree(blkcg);
 	mutex_unlock(&blkcg_pol_mutex);
@@ -1168,9 +1169,10 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 				goto err_free_cpds;
 			}
 
-			blkcg->pd[pol->plid] = cpd;
+			blkcg->cpd[pol->plid] = cpd;
+			cpd->blkcg = blkcg;
 			cpd->plid = pol->plid;
-			pol->cpd_init_fn(blkcg);
+			pol->cpd_init_fn(cpd);
 		}
 	}
 
@@ -1186,8 +1188,8 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 err_free_cpds:
 	if (pol->cpd_size) {
 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-			kfree(blkcg->pd[pol->plid]);
-			blkcg->pd[pol->plid] = NULL;
+			kfree(blkcg->cpd[pol->plid]);
+			blkcg->cpd[pol->plid] = NULL;
 		}
 	}
 	blkcg_policy[pol->plid] = NULL;
@@ -1222,8 +1224,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 
 	if (pol->cpd_size) {
 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-			kfree(blkcg->pd[pol->plid]);
-			blkcg->pd[pol->plid] = NULL;
+			kfree(blkcg->cpd[pol->plid]);
+			blkcg->cpd[pol->plid] = NULL;
 		}
 	}
 	blkcg_policy[pol->plid] = NULL;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 95e6b0cbaa84..dd6ea9ee6245 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -220,7 +220,7 @@ struct cfqg_stats {
 /* Per-cgroup data */
 struct cfq_group_data {
 	/* must be the first member */
-	struct blkcg_policy_data pd;
+	struct blkcg_policy_data cpd;
 
 	unsigned int weight;
 	unsigned int leaf_weight;
@@ -612,7 +612,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 static struct cfq_group_data
 *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 {
-	return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+	return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
 }
 
 static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
@@ -1568,12 +1568,11 @@ static void cfqg_stats_init(struct cfqg_stats *stats)
 #endif
 }
 
-static void cfq_cpd_init(const struct blkcg *blkcg)
+static void cfq_cpd_init(struct blkcg_policy_data *cpd)
 {
-	struct cfq_group_data *cgd =
-		cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+	struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
 
-	if (blkcg == &blkcg_root) {
+	if (cpd_to_blkcg(cpd) == &blkcg_root) {
 		cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
 		cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
 	} else {
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index ddd4b8b252c7..7988d4749fff 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -45,7 +45,7 @@ struct blkcg {
 	struct blkcg_gq			*blkg_hint;
 	struct hlist_head		blkg_list;
 
-	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
+	struct blkcg_policy_data	*cpd[BLKCG_MAX_POLS];
 
 	struct list_head		all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -88,7 +88,8 @@ struct blkg_policy_data {
  * each policy handle per-blkcg data.
  */
 struct blkcg_policy_data {
-	/* the policy id this per-policy data belongs to */
+	/* the blkcg and policy id this per-policy data belongs to */
+	struct blkcg			*blkcg;
 	int				plid;
 };
 
@@ -123,7 +124,7 @@ struct blkcg_gq {
 	struct rcu_head			rcu_head;
 };
 
-typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
 typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
 typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
@@ -243,7 +244,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
 						     struct blkcg_policy *pol)
 {
-	return blkcg ? blkcg->pd[pol->plid] : NULL;
+	return blkcg ? blkcg->cpd[pol->plid] : NULL;
 }
 
 /**
@@ -257,6 +258,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
 	return pd ? pd->blkg : NULL;
 }
 
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+	return cpd ? cpd->blkcg : NULL;
+}
+
 /**
  * blkg_path - format cgroup path of blkg
  * @blkg: blkg of interest
-- 
cgit v1.2.3-70-g09d2


From e4a9bde9589fdc51283755cdd75d47b27ca7c6fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:16 -0700
Subject: blkcg: replace blkcg_policy->cpd_size with ->cpd_alloc/free_fn()
 methods

Each active policy has a cpd (blkcg_policy_data) on each blkcg.  The
cpd's were allocated by blkcg core and each policy could request to
allocate extra space at the end by setting blkcg_policy->cpd_size
larger than the size of cpd.

This is a bit unusual but blkg (blkcg_gq) policy data used to be
handled this way too so it made sense to be consistent; however, blkg
policy data switched to alloc/free callbacks.

This patch makes similar changes to cpd handling.
blkcg_policy->cpd_alloc/free_fn() are added to replace ->cpd_size.  As
cpd allocation is now done from policy side, it can simply allocate a
larger area which embeds cpd at the beginning.

As ->cpd_alloc_fn() may be able to perform all necessary
initializations, this patch makes ->cpd_init_fn() optional.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 39 ++++++++++++++++++++++++---------------
 block/cfq-iosched.c        | 19 ++++++++++++++++++-
 include/linux/blk-cgroup.h | 17 ++++++++++-------
 3 files changed, 52 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 247c42c8c83b..2b4354b6b5de 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -817,11 +817,15 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 	int i;
 
 	mutex_lock(&blkcg_pol_mutex);
+
 	list_del(&blkcg->all_blkcgs_node);
-	mutex_unlock(&blkcg_pol_mutex);
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++)
-		kfree(blkcg->cpd[i]);
+		if (blkcg->cpd[i])
+			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
+
+	mutex_unlock(&blkcg_pol_mutex);
+
 	kfree(blkcg);
 }
 
@@ -854,11 +858,10 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		 * check if the policy requires any specific per-cgroup
 		 * data: if it does, allocate and initialize it.
 		 */
-		if (!pol || !pol->cpd_size)
+		if (!pol || !pol->cpd_alloc_fn)
 			continue;
 
-		BUG_ON(blkcg->cpd[i]);
-		cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
 		if (!cpd) {
 			ret = ERR_PTR(-ENOMEM);
 			goto free_pd_blkcg;
@@ -866,7 +869,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		blkcg->cpd[i] = cpd;
 		cpd->blkcg = blkcg;
 		cpd->plid = i;
-		pol->cpd_init_fn(cpd);
+		if (pol->cpd_init_fn)
+			pol->cpd_init_fn(cpd);
 	}
 
 	spin_lock_init(&blkcg->lock);
@@ -882,7 +886,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 
 free_pd_blkcg:
 	for (i--; i >= 0; i--)
-		kfree(blkcg->cpd[i]);
+		if (blkcg->cpd[i])
+			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 free_blkcg:
 	kfree(blkcg);
 	mutex_unlock(&blkcg_pol_mutex);
@@ -1159,11 +1164,11 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	blkcg_policy[pol->plid] = pol;
 
 	/* allocate and install cpd's */
-	if (pol->cpd_size) {
+	if (pol->cpd_alloc_fn) {
 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
 			struct blkcg_policy_data *cpd;
 
-			cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
 			if (!cpd) {
 				mutex_unlock(&blkcg_pol_mutex);
 				goto err_free_cpds;
@@ -1186,10 +1191,12 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	return 0;
 
 err_free_cpds:
-	if (pol->cpd_size) {
+	if (pol->cpd_alloc_fn) {
 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-			kfree(blkcg->cpd[pol->plid]);
-			blkcg->cpd[pol->plid] = NULL;
+			if (blkcg->cpd[pol->plid]) {
+				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+				blkcg->cpd[pol->plid] = NULL;
+			}
 		}
 	}
 	blkcg_policy[pol->plid] = NULL;
@@ -1222,10 +1229,12 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 	/* remove cpds and unregister */
 	mutex_lock(&blkcg_pol_mutex);
 
-	if (pol->cpd_size) {
+	if (pol->cpd_alloc_fn) {
 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-			kfree(blkcg->cpd[pol->plid]);
-			blkcg->cpd[pol->plid] = NULL;
+			if (blkcg->cpd[pol->plid]) {
+				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+				blkcg->cpd[pol->plid] = NULL;
+			}
 		}
 	}
 	blkcg_policy[pol->plid] = NULL;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index dd6ea9ee6245..a4429b366820 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1568,6 +1568,16 @@ static void cfqg_stats_init(struct cfqg_stats *stats)
 #endif
 }
 
+static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
+{
+	struct cfq_group_data *cgd;
+
+	cgd = kzalloc(sizeof(*cgd), GFP_KERNEL);
+	if (!cgd)
+		return NULL;
+	return &cgd->cpd;
+}
+
 static void cfq_cpd_init(struct blkcg_policy_data *cpd)
 {
 	struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
@@ -1581,6 +1591,11 @@ static void cfq_cpd_init(struct blkcg_policy_data *cpd)
 	}
 }
 
+static void cfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+	kfree(cpd_to_cfqgd(cpd));
+}
+
 static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
 {
 	struct cfq_group *cfqg;
@@ -4649,10 +4664,12 @@ static struct elevator_type iosched_cfq = {
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
-	.cpd_size		= sizeof(struct cfq_group_data),
 	.cftypes		= cfq_blkcg_files,
 
+	.cpd_alloc_fn		= cfq_cpd_alloc,
 	.cpd_init_fn		= cfq_cpd_init,
+	.cpd_free_fn		= cfq_cpd_free,
+
 	.pd_alloc_fn		= cfq_pd_alloc,
 	.pd_init_fn		= cfq_pd_init,
 	.pd_offline_fn		= cfq_pd_offline,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 7988d4749fff..15f2382bc723 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -81,11 +81,11 @@ struct blkg_policy_data {
 };
 
 /*
- * Policies that need to keep per-blkcg data which is independent
- * from any request_queue associated to it must specify its size
- * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it.  cpd_init() is invoked to let
- * each policy handle per-blkcg data.
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods.  A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
  */
 struct blkcg_policy_data {
 	/* the blkcg and policy id this per-policy data belongs to */
@@ -124,7 +124,9 @@ struct blkcg_gq {
 	struct rcu_head			rcu_head;
 };
 
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
 typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
 typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
 typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
@@ -134,13 +136,14 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
 
 struct blkcg_policy {
 	int				plid;
-	/* policy specific per-blkcg data size */
-	size_t				cpd_size;
 	/* cgroup files for the policy */
 	struct cftype			*cftypes;
 
 	/* operations */
+	blkcg_pol_alloc_cpd_fn		*cpd_alloc_fn;
 	blkcg_pol_init_cpd_fn		*cpd_init_fn;
+	blkcg_pol_free_cpd_fn		*cpd_free_fn;
+
 	blkcg_pol_alloc_pd_fn		*pd_alloc_fn;
 	blkcg_pol_init_pd_fn		*pd_init_fn;
 	blkcg_pol_online_pd_fn		*pd_online_fn;
-- 
cgit v1.2.3-70-g09d2


From 24f290466f79a6497f1654f64b9a841872cba3ca Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:17 -0700
Subject: blkcg: inline [__]blkg_lookup()

blkg_lookup() checks whether the target queue is bypassing and, if
not, calls __blkg_lookup() which first checks the lookup hint and then
performs radix tree walk.  The operations upto hint checking are
trivial and there are many users of this function.  This patch inlines
blkg_lookup() and the fast path part of __blkg_lookup().  The radix
tree lookup and hint update are now in blkg_lookup_slowpath().

This will help consolidating blkg handling by easing moving root blkcg
short-circuit to inlined lookup fast path.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 38 ++---------------------------------
 include/linux/blk-cgroup.h | 49 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 47 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2b4354b6b5de..52e637ed9d7e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -131,26 +131,11 @@ err_free:
 	return NULL;
 }
 
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state.  If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-			       bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+				      struct request_queue *q, bool update_hint)
 {
 	struct blkcg_gq *blkg;
 
-	blkg = rcu_dereference(blkcg->blkg_hint);
-	if (blkg && blkg->q == q)
-		return blkg;
-
 	/*
 	 * Hint didn't match.  Look up from the radix tree.  Note that the
 	 * hint can only be updated under queue_lock as otherwise @blkg
@@ -169,25 +154,6 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 	return NULL;
 }
 
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	if (unlikely(blk_queue_bypass(q)))
-		return NULL;
-	return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
-
 /*
  * If @new_blkg is %NULL, this function tries to allocate a new one as
  * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 15f2382bc723..d5b54aa50582 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -155,7 +155,8 @@ struct blkcg_policy {
 extern struct blkcg blkcg_root;
 extern struct cgroup_subsys_state * const blkcg_root_css;
 
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+				      struct request_queue *q, bool update_hint);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 				    struct request_queue *q);
 int blkcg_init_queue(struct request_queue *q);
@@ -231,6 +232,49 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
 	return css_to_blkcg(blkcg->css.parent);
 }
 
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state.  If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+					     struct request_queue *q,
+					     bool update_hint)
+{
+	struct blkcg_gq *blkg;
+
+	blkg = rcu_dereference(blkcg->blkg_hint);
+	if (blkg && blkg->q == q)
+		return blkg;
+
+	return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.  This function should be called
+ * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
+ * - see blk_queue_bypass_start() for details.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+					   struct request_queue *q)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (unlikely(blk_queue_bypass(q)))
+		return NULL;
+	return __blkg_lookup(blkcg, q, false);
+}
+
 /**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
@@ -313,9 +357,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
 }
 
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-			       bool update_hint);
-
 /**
  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
  * @d_blkg: loop cursor pointing to the current descendant
-- 
cgit v1.2.3-70-g09d2


From 85b6bc9db6d5ab6980b43c38b5cbd11d24414ce4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:18 -0700
Subject: blkcg: move root blkg lookup optimization from throtl_lookup_tg() to
 __blkg_lookup()

Currently, both throttle and cfq policies implement their own root
blkg (blkcg_gq) lookup fast path.  This patch moves root blkg
optimization from throtl_lookup_tg() to __blkg_lookup().  cfq-iosched
currently doesn't use blkg_lookup() but will be converted and drop the
optimization too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-throttle.c       | 7 -------
 include/linux/blk-cgroup.h | 3 +++
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c2c75477a6b2..1f63fc834dc3 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -452,13 +452,6 @@ static void throtl_pd_reset_stats(struct blkg_policy_data *pd)
 static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
 					   struct blkcg *blkcg)
 {
-	/*
-	 * This is the common case when there are no blkcgs.  Avoid lookup
-	 * in this case
-	 */
-	if (blkcg == &blkcg_root)
-		return td_root_tg(td);
-
 	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
 }
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index d5b54aa50582..0609bce69f68 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -249,6 +249,9 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
 {
 	struct blkcg_gq *blkg;
 
+	if (blkcg == &blkcg_root)
+		return q->root_blkg;
+
 	blkg = rcu_dereference(blkcg->blkg_hint);
 	if (blkg && blkg->q == q)
 		return blkg;
-- 
cgit v1.2.3-70-g09d2


From ae11889636111199dbcf47283b4167f578b69472 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:20 -0700
Subject: blkcg: consolidate blkg creation in blkcg_bio_issue_check()

blkg (blkcg_gq) currently is created by blkcg policies invoking
blkg_lookup_create() which ends up repeating about the same code in
different policies.  Theoretically, this can avoid the overhead of
looking and/or creating blkg's if blkcg is enabled but no policy is in
use; however, the cost of blkg lookup / creation is very low
especially if only the root blkcg is in use which is highly likely if
no blkcg policy is in active use - it boils down to a single very
predictable conditional and surrounding RCU protection.

This patch consolidates blkg creation to a new function
blkcg_bio_issue_check() which is called during bio issue from
generic_make_request_checks().  blkcg_bio_issue_check() is now the
only function which tries to create missing blkg's.  The subsequent
policy and request_list operations just perform blkg_lookup() and if
missing falls back to the root.

* blk_get_rl() no longer tries to create blkg.  It uses blkg_lookup()
  instead of blkg_lookup_create().

* blk_throtl_bio() is now called from blkcg_bio_issue_check() with rcu
  read locked and blkg already looked up.  Both throtl_lookup_tg() and
  throtl_lookup_create_tg() are dropped.

* cfq is similarly updated.  cfq_lookup_create_cfqg() is replaced with
  cfq_lookup_cfqg()which uses blkg_lookup().

This consolidates blkg handling and avoids unnecessary blkg creation
retries under memory pressure.  In addition, this provides a common
bio entry point into blkcg where things like common accounting can be
performed.

v2: Build fixes for !CONFIG_CFQ_GROUP_IOSCHED and
    !CONFIG_BLK_DEV_THROTTLING.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         |  2 +-
 block/blk-core.c           |  4 +--
 block/blk-throttle.c       | 72 ++++------------------------------------------
 block/blk.h                |  5 ----
 block/cfq-iosched.c        | 33 +++++++--------------
 include/linux/blk-cgroup.h | 40 ++++++++++++++++++++++++--
 6 files changed, 57 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 52e637ed9d7e..097c4a670fa4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -153,6 +153,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 
 /*
  * If @new_blkg is %NULL, this function tries to allocate a new one as
@@ -295,7 +296,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 			return blkg;
 	}
 }
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
diff --git a/block/blk-core.c b/block/blk-core.c
index 627ed0c593fb..140094f4eaca 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1889,8 +1889,8 @@ generic_make_request_checks(struct bio *bio)
 	 */
 	create_io_context(GFP_ATOMIC, q->node);
 
-	if (blk_throtl_bio(q, bio))
-		return false;	/* throttled, will be resubmitted later */
+	if (!blkcg_bio_issue_check(q, bio))
+		return false;
 
 	trace_block_bio_queue(q, bio);
 	return true;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 900a777e01c2..29c22ed4b073 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -182,11 +182,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
 	return pd_to_blkg(&tg->pd);
 }
 
-static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
-{
-	return blkg_to_tg(td->queue->root_blkg);
-}
-
 /**
  * sq_to_tg - return the throl_grp the specified service queue belongs to
  * @sq: the throtl_service_queue of interest
@@ -449,39 +444,6 @@ static void throtl_pd_reset_stats(struct blkg_policy_data *pd)
 	}
 }
 
-static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
-					   struct blkcg *blkcg)
-{
-	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
-}
-
-static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
-						  struct blkcg *blkcg)
-{
-	struct request_queue *q = td->queue;
-	struct throtl_grp *tg = NULL;
-
-	/*
-	 * This is the common case when there are no blkcgs.  Avoid lookup
-	 * in this case
-	 */
-	if (blkcg == &blkcg_root) {
-		tg = td_root_tg(td);
-	} else {
-		struct blkcg_gq *blkg;
-
-		blkg = blkg_lookup_create(blkcg, q);
-
-		/* if %NULL and @q is alive, fall back to root_tg */
-		if (!IS_ERR(blkg))
-			tg = blkg_to_tg(blkg);
-		else
-			tg = td_root_tg(td);
-	}
-
-	return tg;
-}
-
 static struct throtl_grp *
 throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
@@ -1403,46 +1365,26 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_reset_stats_fn	= throtl_pd_reset_stats,
 };
 
-bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
+bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+		    struct bio *bio)
 {
-	struct throtl_data *td = q->td;
 	struct throtl_qnode *qn = NULL;
-	struct throtl_grp *tg;
+	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
-	struct blkcg *blkcg;
 	bool throttled = false;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
 	/* see throtl_charge_bio() */
-	if (bio->bi_rw & REQ_THROTTLED)
+	if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
 		goto out;
 
-	/*
-	 * A throtl_grp pointer retrieved under rcu can be used to access
-	 * basic fields like stats and io rates. If a group has no rules,
-	 * just update the dispatch stats in lockless manner and return.
-	 */
-	rcu_read_lock();
-	blkcg = bio_blkcg(bio);
-	tg = throtl_lookup_tg(td, blkcg);
-	if (tg) {
-		if (!tg->has_rules[rw]) {
-			throtl_update_dispatch_stats(tg_to_blkg(tg),
-					bio->bi_iter.bi_size, bio->bi_rw);
-			goto out_unlock_rcu;
-		}
-	}
-
-	/*
-	 * Either group has not been allocated yet or it is not an unlimited
-	 * IO group
-	 */
 	spin_lock_irq(q->queue_lock);
 
 	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
-	tg = throtl_lookup_create_tg(td, blkcg);
 	sq = &tg->service_queue;
 
 	while (true) {
@@ -1507,8 +1449,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
-out_unlock_rcu:
-	rcu_read_unlock();
 out:
 	/*
 	 * As multiple blk-throtls may stack in the same issue path, we
diff --git a/block/blk.h b/block/blk.h
index 026d9594142b..d905b26fbff5 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -266,15 +266,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
  * Internal throttling interface
  */
 #ifdef CONFIG_BLK_DEV_THROTTLING
-extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
 extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
-static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
-{
-	return false;
-}
 static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a4429b366820..0994f3b523a8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1683,28 +1683,15 @@ static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
 	cfqg_stats_reset(&cfqg->dead_stats);
 }
 
-/*
- * Search for the cfq group current task belongs to. request_queue lock must
- * be held.
- */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-						struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+					 struct blkcg *blkcg)
 {
-	struct request_queue *q = cfqd->queue;
-	struct cfq_group *cfqg = NULL;
-
-	/* avoid lookup for the common case where there's no blkcg */
-	if (blkcg == &blkcg_root) {
-		cfqg = cfqd->root_group;
-	} else {
-		struct blkcg_gq *blkg;
-
-		blkg = blkg_lookup_create(blkcg, q);
-		if (!IS_ERR(blkg))
-			cfqg = blkg_to_cfqg(blkg);
-	}
+	struct blkcg_gq *blkg;
 
-	return cfqg;
+	blkg = blkg_lookup(blkcg, cfqd->queue);
+	if (likely(blkg))
+		return blkg_to_cfqg(blkg);
+	return NULL;
 }
 
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
@@ -2108,8 +2095,8 @@ static struct cftype cfq_blkcg_files[] = {
 	{ }	/* terminate */
 };
 #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-						struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+					 struct blkcg *blkcg)
 {
 	return cfqd->root_group;
 }
@@ -3716,7 +3703,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 	struct cfq_group *cfqg;
 
 	rcu_read_lock();
-	cfqg = cfq_lookup_create_cfqg(cfqd, bio_blkcg(bio));
+	cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
 	if (!cfqg) {
 		cfqq = &cfqd->oom_cfqq;
 		goto out;
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 0609bce69f68..4d1659c7f84b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -421,8 +421,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
 	 * or if either the blkcg or queue is going away.  Fall back to
 	 * root_rl in such cases.
 	 */
-	blkg = blkg_lookup_create(blkcg, q);
-	if (unlikely(IS_ERR(blkg)))
+	blkg = blkg_lookup(blkcg, q);
+	if (unlikely(!blkg))
 		goto root_rl;
 
 	blkg_get(blkg);
@@ -636,6 +636,39 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
 	u64_stats_update_end(&to->syncp);
 }
 
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+			   struct bio *bio);
+#else
+static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+				  struct bio *bio) { return false; }
+#endif
+
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+					 struct bio *bio)
+{
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+	bool throtl = false;
+
+	rcu_read_lock();
+	blkcg = bio_blkcg(bio);
+
+	blkg = blkg_lookup(blkcg, q);
+	if (unlikely(!blkg)) {
+		spin_lock_irq(q->queue_lock);
+		blkg = blkg_lookup_create(blkcg, q);
+		if (IS_ERR(blkg))
+			blkg = NULL;
+		spin_unlock_irq(q->queue_lock);
+	}
+
+	throtl = blk_throtl_bio(q, blkg, bio);
+
+	rcu_read_unlock();
+	return !throtl;
+}
+
 #else	/* CONFIG_BLK_CGROUP */
 
 struct blkcg {
@@ -689,6 +722,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
 static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
 static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
 
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+					 struct bio *bio) { return true; }
+
 #define blk_queue_for_each_rl(rl, q)	\
 	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
 
-- 
cgit v1.2.3-70-g09d2


From e6269c44546755094979ab53609e6e203a68c8ff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:21 -0700
Subject: blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats
 with it

cgroup stats are local to each cgroup and doesn't propagate to
ancestors by default.  When recursive stats are necessary, the sum is
calculated over all the descendants.  This initially was for backward
compatibility to support both group-local and recursive stats but this
mode of operation makes general sense as stat update is much hotter
thafn reporting those stats.

This however ends up losing recursive stats when a child is removed.
To work around this, cfq-iosched adds its stats to its parent
cfq_group->dead_stats which is summed up together when calculating
recursive stats.

It's planned that the core stats will be moved to blkcg_gq, so we want
to move the mechanism for keeping track of the stats of dead children
from cfq to blkcg core.  This patch adds blkg_[rw]stat->aux_cnt which
are atomic64_t's keeping track of auxiliary counts which are excluded
when reading local counts but included for recursive.

blkg_[rw]stat_merge() which were used by cfq to implement dead_stats
are replaced by blkg_[rw]stat_add_aux(), and cfq now forwards stats of
a dead cgroup to the aux counts of parent->stats instead of separate
->dead_stats.

This will also help making blkg_[rw]stats per-cpu.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 10 ++++---
 block/cfq-iosched.c        | 67 +++++++++++++---------------------------------
 include/linux/blk-cgroup.h | 46 ++++++++++++++++++++++---------
 3 files changed, 57 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 097c4a670fa4..ff79b52d1a0e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -584,7 +584,7 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
  * @off: offset to the blkg_stat in @pd
  *
  * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
+ * descendants and their aux counts.  The caller must be holding the queue
  * lock for online tests.
  */
 u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
@@ -602,7 +602,8 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
 		struct blkg_stat *stat = (void *)pos_pd + off;
 
 		if (pos_blkg->online)
-			sum += blkg_stat_read(stat);
+			sum += blkg_stat_read(stat) +
+				atomic64_read(&stat->aux_cnt);
 	}
 	rcu_read_unlock();
 
@@ -616,7 +617,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  * @off: offset to the blkg_stat in @pd
  *
  * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
+ * descendants and their aux counts.  The caller must be holding the queue
  * lock for online tests.
  */
 struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
@@ -642,7 +643,8 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
 		tmp = blkg_rwstat_read(rwstat);
 
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			sum.cnt[i] += tmp.cnt[i];
+			sum.cnt[i] += tmp.cnt[i] +
+				atomic64_read(&rwstat->aux_cnt[i]);
 	}
 	rcu_read_unlock();
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 0994f3b523a8..b272cfff7364 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -304,7 +304,6 @@ struct cfq_group {
 	int dispatched;
 	struct cfq_ttime ttime;
 	struct cfqg_stats stats;	/* stats for this cfqg */
-	struct cfqg_stats dead_stats;	/* stats pushed from dead children */
 
 	/* async queue for each priority case */
 	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
@@ -736,28 +735,28 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
 }
 
 /* @to += @from */
-static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
 {
 	/* queued stats shouldn't be cleared */
-	blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
-	blkg_rwstat_merge(&to->serviced, &from->serviced);
-	blkg_rwstat_merge(&to->merged, &from->merged);
-	blkg_rwstat_merge(&to->service_time, &from->service_time);
-	blkg_rwstat_merge(&to->wait_time, &from->wait_time);
-	blkg_stat_merge(&from->time, &from->time);
+	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);
+	blkg_rwstat_add_aux(&to->serviced, &from->serviced);
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&from->time, &from->time);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
-	blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-	blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
-	blkg_stat_merge(&to->dequeue, &from->dequeue);
-	blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
-	blkg_stat_merge(&to->idle_time, &from->idle_time);
-	blkg_stat_merge(&to->empty_time, &from->empty_time);
+	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
 #endif
 }
 
 /*
- * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
  * recursive stats can still account for the amount used by this cfqg after
  * it's gone.
  */
@@ -770,10 +769,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
 	if (unlikely(!parent))
 		return;
 
-	cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
-	cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+	cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
 	cfqg_stats_reset(&cfqg->stats);
-	cfqg_stats_reset(&cfqg->dead_stats);
 }
 
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
@@ -1606,7 +1603,6 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
 
 	cfq_init_cfqg_base(cfqg);
 	cfqg_stats_init(&cfqg->stats);
-	cfqg_stats_init(&cfqg->dead_stats);
 
 	return &cfqg->pd;
 }
@@ -1649,38 +1645,11 @@ static void cfq_pd_free(struct blkg_policy_data *pd)
 	return kfree(pd);
 }
 
-/* offset delta from cfqg->stats to cfqg->dead_stats */
-static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
-					offsetof(struct cfq_group, stats);
-
-/* to be used by recursive prfill, sums live and dead stats recursively */
-static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
-{
-	u64 sum = 0;
-
-	sum += blkg_stat_recursive_sum(pd, off);
-	sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
-	return sum;
-}
-
-/* to be used by recursive prfill, sums live and dead rwstats recursively */
-static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
-						       int off)
-{
-	struct blkg_rwstat a, b;
-
-	a = blkg_rwstat_recursive_sum(pd, off);
-	b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
-	blkg_rwstat_merge(&a, &b);
-	return a;
-}
-
 static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
 {
 	struct cfq_group *cfqg = pd_to_cfqg(pd);
 
 	cfqg_stats_reset(&cfqg->stats);
-	cfqg_stats_reset(&cfqg->dead_stats);
 }
 
 static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
@@ -1883,7 +1852,7 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v)
 static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
-	u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
+	u64 sum = blkg_stat_recursive_sum(pd, off);
 
 	return __blkg_prfill_u64(sf, pd, sum);
 }
@@ -1891,7 +1860,7 @@ static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
 static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
 					struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
+	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off);
 
 	return __blkg_prfill_rwstat(sf, pd, &sum);
 }
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 4d1659c7f84b..e8092276af58 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -53,14 +53,20 @@ struct blkcg {
 #endif
 };
 
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive.  Used to carry stats of dead children.
+ */
 struct blkg_stat {
 	struct u64_stats_sync		syncp;
 	uint64_t			cnt;
+	atomic64_t			aux_cnt;
 };
 
 struct blkg_rwstat {
 	struct u64_stats_sync		syncp;
 	uint64_t			cnt[BLKG_RWSTAT_NR];
+	atomic64_t			aux_cnt[BLKG_RWSTAT_NR];
 };
 
 /*
@@ -483,6 +489,7 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
 static inline void blkg_stat_init(struct blkg_stat *stat)
 {
 	u64_stats_init(&stat->syncp);
+	atomic64_set(&stat->aux_cnt, 0);
 }
 
 /**
@@ -504,8 +511,9 @@ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
  * blkg_stat_read - read the current value of a blkg_stat
  * @stat: blkg_stat to read
  *
- * Read the current value of @stat.  This function can be called without
- * synchroniztion and takes care of u64 atomicity.
+ * Read the current value of @stat.  The returned value doesn't include the
+ * aux count.  This function can be called without synchroniztion and takes
+ * care of u64 atomicity.
  */
 static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
 {
@@ -527,23 +535,31 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
 static inline void blkg_stat_reset(struct blkg_stat *stat)
 {
 	stat->cnt = 0;
+	atomic64_set(&stat->aux_cnt, 0);
 }
 
 /**
- * blkg_stat_merge - merge a blkg_stat into another
+ * blkg_stat_add_aux - add a blkg_stat into another's aux count
  * @to: the destination blkg_stat
  * @from: the source
  *
- * Add @from's count to @to.
+ * Add @from's count including the aux one to @to's aux count.
  */
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+static inline void blkg_stat_add_aux(struct blkg_stat *to,
+				     struct blkg_stat *from)
 {
-	blkg_stat_add(to, blkg_stat_read(from));
+	atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
+		     &to->aux_cnt);
 }
 
 static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
 {
+	int i;
+
 	u64_stats_init(&rwstat->syncp);
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		atomic64_set(&rwstat->aux_cnt[i], 0);
 }
 
 /**
@@ -614,26 +630,30 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
  */
 static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 {
+	int i;
+
 	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		atomic64_set(&rwstat->aux_cnt[i], 0);
 }
 
 /**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
  * @to: the destination blkg_rwstat
  * @from: the source
  *
- * Add @from's counts to @to.
+ * Add @from's count including the aux one to @to's aux count.
  */
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
-				     struct blkg_rwstat *from)
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+				       struct blkg_rwstat *from)
 {
 	struct blkg_rwstat v = blkg_rwstat_read(from);
 	int i;
 
-	u64_stats_update_begin(&to->syncp);
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		to->cnt[i] += v.cnt[i];
-	u64_stats_update_end(&to->syncp);
+		atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]),
+			     &to->aux_cnt[i]);
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING
-- 
cgit v1.2.3-70-g09d2


From 24bdb8ef068ebdc2a57ce715f0ab22d5da32832a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:22 -0700
Subject: blkcg: make blkcg_[rw]stat per-cpu

blkcg_[rw]stat are used as stat counters for blkcg policies.  It isn't
per-cpu by itself and blk-throttle makes it per-cpu by wrapping around
it.  This patch makes blkcg_[rw]stat per-cpu and drop the ad-hoc
per-cpu wrapping in blk-throttle.

* blkg_[rw]stat->cnt is replaced with cpu_cnt which is struct
  percpu_counter.  This makes syncp unnecessary as remote accesses are
  handled by percpu_counter itself.

* blkg_[rw]stat_init() can now fail due to percpu allocation failure
  and thus are updated to return int.

* percpu_counters need explicit freeing.  blkg_[rw]stat_exit() added.

* As blkg_rwstat->cpu_cnt[] can't be read directly anymore, reading
  and summing results are stored in ->aux_cnt[] instead.

* Custom per-cpu stat implementation in blk-throttle is removed.

This makes all blkcg stat counters per-cpu without complicating policy
implmentations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         |  10 ++--
 block/blk-throttle.c       |  89 +++++++++++----------------------
 block/cfq-iosched.c        |  70 +++++++++++++++++++-------
 include/linux/blk-cgroup.h | 120 +++++++++++++++++++++++++--------------------
 4 files changed, 153 insertions(+), 136 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ff79b52d1a0e..02a2d029b5a5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -539,9 +539,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
 		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-			   (unsigned long long)rwstat->cnt[i]);
+			   (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 
-	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+	v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
 	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
 	return v;
 }
@@ -643,8 +644,9 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
 		tmp = blkg_rwstat_read(rwstat);
 
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			sum.cnt[i] += tmp.cnt[i] +
-				atomic64_read(&rwstat->aux_cnt[i]);
+			atomic64_add(atomic64_read(&tmp.aux_cnt[i]) +
+				     atomic64_read(&rwstat->aux_cnt[i]),
+				     &sum.aux_cnt[i]);
 	}
 	rcu_read_unlock();
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 29c22ed4b073..c0b2263a222a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -83,14 +83,6 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
-/* Per-cpu group stats */
-struct tg_stats_cpu {
-	/* total bytes transferred */
-	struct blkg_rwstat		service_bytes;
-	/* total IOs serviced, post merge */
-	struct blkg_rwstat		serviced;
-};
-
 struct throtl_grp {
 	/* must be the first member */
 	struct blkg_policy_data pd;
@@ -142,8 +134,10 @@ struct throtl_grp {
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
 
-	/* Per cpu stats pointer */
-	struct tg_stats_cpu __percpu *stats_cpu;
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
 };
 
 struct throtl_data
@@ -337,17 +331,15 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
 	struct throtl_grp *tg;
-	int rw, cpu;
+	int rw;
 
 	tg = kzalloc_node(sizeof(*tg), gfp, node);
 	if (!tg)
-		return NULL;
+		goto err;
 
-	tg->stats_cpu = alloc_percpu_gfp(struct tg_stats_cpu, gfp);
-	if (!tg->stats_cpu) {
-		kfree(tg);
-		return NULL;
-	}
+	if (blkg_rwstat_init(&tg->service_bytes, gfp) ||
+	    blkg_rwstat_init(&tg->serviced, gfp))
+		goto err_free_tg;
 
 	throtl_service_queue_init(&tg->service_queue);
 
@@ -362,14 +354,14 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 	tg->iops[READ] = -1;
 	tg->iops[WRITE] = -1;
 
-	for_each_possible_cpu(cpu) {
-		struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu);
-
-		blkg_rwstat_init(&stats_cpu->service_bytes);
-		blkg_rwstat_init(&stats_cpu->serviced);
-	}
-
 	return &tg->pd;
+
+err_free_tg:
+	blkg_rwstat_exit(&tg->serviced);
+	blkg_rwstat_exit(&tg->service_bytes);
+	kfree(tg);
+err:
+	return NULL;
 }
 
 static void throtl_pd_init(struct blkg_policy_data *pd)
@@ -427,21 +419,17 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
 	struct throtl_grp *tg = pd_to_tg(pd);
 
 	del_timer_sync(&tg->service_queue.pending_timer);
-	free_percpu(tg->stats_cpu);
+	blkg_rwstat_exit(&tg->serviced);
+	blkg_rwstat_exit(&tg->service_bytes);
 	kfree(tg);
 }
 
 static void throtl_pd_reset_stats(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
-	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-		blkg_rwstat_reset(&sc->service_bytes);
-		blkg_rwstat_reset(&sc->serviced);
-	}
+	blkg_rwstat_reset(&tg->service_bytes);
+	blkg_rwstat_reset(&tg->serviced);
 }
 
 static struct throtl_grp *
@@ -855,7 +843,6 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
 					 int rw)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
-	struct tg_stats_cpu *stats_cpu;
 	unsigned long flags;
 
 	/*
@@ -865,10 +852,8 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
 	 */
 	local_irq_save(flags);
 
-	stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
-	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
+	blkg_rwstat_add(&tg->serviced, rw, 1);
+	blkg_rwstat_add(&tg->service_bytes, rw, bytes);
 
 	local_irq_restore(flags);
 }
@@ -1176,27 +1161,9 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	}
 }
 
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
-				struct blkg_policy_data *pd, int off)
-{
-	struct throtl_grp *tg = pd_to_tg(pd);
-	struct blkg_rwstat rwstat = { }, tmp;
-	int i, cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-		tmp = blkg_rwstat_read((void *)sc + off);
-		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			rwstat.cnt[i] += tmp.cnt[i];
-	}
-
-	return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
+static int tg_print_rwstat(struct seq_file *sf, void *v)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
 			  &blkcg_policy_throtl, seq_cft(sf)->private, true);
 	return 0;
 }
@@ -1337,13 +1304,13 @@ static struct cftype throtl_files[] = {
 	},
 	{
 		.name = "throttle.io_service_bytes",
-		.private = offsetof(struct tg_stats_cpu, service_bytes),
-		.seq_show = tg_print_cpu_rwstat,
+		.private = offsetof(struct throtl_grp, service_bytes),
+		.seq_show = tg_print_rwstat,
 	},
 	{
 		.name = "throttle.io_serviced",
-		.private = offsetof(struct tg_stats_cpu, serviced),
-		.seq_show = tg_print_cpu_rwstat,
+		.private = offsetof(struct throtl_grp, serviced),
+		.seq_show = tg_print_rwstat,
 	},
 	{ }	/* terminate */
 };
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b272cfff7364..71e55c91ee98 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1542,27 +1542,55 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfqg_stats_init(struct cfqg_stats *stats)
+static void cfqg_stats_exit(struct cfqg_stats *stats)
 {
-	blkg_rwstat_init(&stats->service_bytes);
-	blkg_rwstat_init(&stats->serviced);
-	blkg_rwstat_init(&stats->merged);
-	blkg_rwstat_init(&stats->service_time);
-	blkg_rwstat_init(&stats->wait_time);
-	blkg_rwstat_init(&stats->queued);
+	blkg_rwstat_exit(&stats->service_bytes);
+	blkg_rwstat_exit(&stats->serviced);
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
 
-	blkg_stat_init(&stats->sectors);
-	blkg_stat_init(&stats->time);
+	blkg_stat_exit(&stats->sectors);
+	blkg_stat_exit(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	blkg_stat_exit(&stats->unaccounted_time);
+	blkg_stat_exit(&stats->avg_queue_size_sum);
+	blkg_stat_exit(&stats->avg_queue_size_samples);
+	blkg_stat_exit(&stats->dequeue);
+	blkg_stat_exit(&stats->group_wait_time);
+	blkg_stat_exit(&stats->idle_time);
+	blkg_stat_exit(&stats->empty_time);
+#endif
+}
+
+static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
+{
+	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||
+	    blkg_rwstat_init(&stats->serviced, gfp) ||
+	    blkg_rwstat_init(&stats->merged, gfp) ||
+	    blkg_rwstat_init(&stats->service_time, gfp) ||
+	    blkg_rwstat_init(&stats->wait_time, gfp) ||
+	    blkg_rwstat_init(&stats->queued, gfp) ||
+
+	    blkg_stat_init(&stats->sectors, gfp) ||
+	    blkg_stat_init(&stats->time, gfp))
+		goto err;
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg_stat_init(&stats->unaccounted_time);
-	blkg_stat_init(&stats->avg_queue_size_sum);
-	blkg_stat_init(&stats->avg_queue_size_samples);
-	blkg_stat_init(&stats->dequeue);
-	blkg_stat_init(&stats->group_wait_time);
-	blkg_stat_init(&stats->idle_time);
-	blkg_stat_init(&stats->empty_time);
+	if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    blkg_stat_init(&stats->dequeue, gfp) ||
+	    blkg_stat_init(&stats->group_wait_time, gfp) ||
+	    blkg_stat_init(&stats->idle_time, gfp) ||
+	    blkg_stat_init(&stats->empty_time, gfp))
+		goto err;
 #endif
+	return 0;
+err:
+	cfqg_stats_exit(stats);
+	return -ENOMEM;
 }
 
 static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
@@ -1602,7 +1630,10 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
 		return NULL;
 
 	cfq_init_cfqg_base(cfqg);
-	cfqg_stats_init(&cfqg->stats);
+	if (cfqg_stats_init(&cfqg->stats, gfp)) {
+		kfree(cfqg);
+		return NULL;
+	}
 
 	return &cfqg->pd;
 }
@@ -1642,7 +1673,10 @@ static void cfq_pd_offline(struct blkg_policy_data *pd)
 
 static void cfq_pd_free(struct blkg_policy_data *pd)
 {
-	return kfree(pd);
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
+
+	cfqg_stats_exit(&cfqg->stats);
+	return kfree(cfqg);
 }
 
 static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e8092276af58..fdc7ac08b1ce 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -14,12 +14,15 @@
  */
 
 #include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/percpu_counter.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
 #include <linux/atomic.h>
 
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH	(INT_MAX / 2)
+
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
 
@@ -55,17 +58,16 @@ struct blkcg {
 
 /*
  * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
- * recursive.  Used to carry stats of dead children.
+ * recursive.  Used to carry stats of dead children, and, for blkg_rwstat,
+ * to carry result values from read and sum operations.
  */
 struct blkg_stat {
-	struct u64_stats_sync		syncp;
-	uint64_t			cnt;
+	struct percpu_counter		cpu_cnt;
 	atomic64_t			aux_cnt;
 };
 
 struct blkg_rwstat {
-	struct u64_stats_sync		syncp;
-	uint64_t			cnt[BLKG_RWSTAT_NR];
+	struct percpu_counter		cpu_cnt[BLKG_RWSTAT_NR];
 	atomic64_t			aux_cnt[BLKG_RWSTAT_NR];
 };
 
@@ -486,10 +488,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
 #define blk_queue_for_each_rl(rl, q)	\
 	for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
 
-static inline void blkg_stat_init(struct blkg_stat *stat)
+static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
 {
-	u64_stats_init(&stat->syncp);
+	int ret;
+
+	ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+	if (ret)
+		return ret;
+
 	atomic64_set(&stat->aux_cnt, 0);
+	return 0;
+}
+
+static inline void blkg_stat_exit(struct blkg_stat *stat)
+{
+	percpu_counter_destroy(&stat->cpu_cnt);
 }
 
 /**
@@ -497,35 +510,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
  * @stat: target blkg_stat
  * @val: value to add
  *
- * Add @val to @stat.  The caller is responsible for synchronizing calls to
- * this function.
+ * Add @val to @stat.  The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
  */
 static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
 {
-	u64_stats_update_begin(&stat->syncp);
-	stat->cnt += val;
-	u64_stats_update_end(&stat->syncp);
+	__percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_stat_read - read the current value of a blkg_stat
  * @stat: blkg_stat to read
- *
- * Read the current value of @stat.  The returned value doesn't include the
- * aux count.  This function can be called without synchroniztion and takes
- * care of u64 atomicity.
  */
 static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
 {
-	unsigned int start;
-	uint64_t v;
-
-	do {
-		start = u64_stats_fetch_begin_irq(&stat->syncp);
-		v = stat->cnt;
-	} while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
-	return v;
+	return percpu_counter_sum_positive(&stat->cpu_cnt);
 }
 
 /**
@@ -534,7 +533,7 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
  */
 static inline void blkg_stat_reset(struct blkg_stat *stat)
 {
-	stat->cnt = 0;
+	percpu_counter_set(&stat->cpu_cnt, 0);
 	atomic64_set(&stat->aux_cnt, 0);
 }
 
@@ -552,14 +551,28 @@ static inline void blkg_stat_add_aux(struct blkg_stat *to,
 		     &to->aux_cnt);
 }
 
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
 {
-	int i;
+	int i, ret;
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+		ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+		if (ret) {
+			while (--i >= 0)
+				percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+			return ret;
+		}
+		atomic64_set(&rwstat->aux_cnt[i], 0);
+	}
+	return 0;
+}
 
-	u64_stats_init(&rwstat->syncp);
+static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+	int i;
 
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		atomic64_set(&rwstat->aux_cnt[i], 0);
+		percpu_counter_destroy(&rwstat->cpu_cnt[i]);
 }
 
 /**
@@ -574,39 +587,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
 static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 				   int rw, uint64_t val)
 {
-	u64_stats_update_begin(&rwstat->syncp);
+	struct percpu_counter *cnt;
 
 	if (rw & REQ_WRITE)
-		rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
 	else
-		rwstat->cnt[BLKG_RWSTAT_READ] += val;
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+
 	if (rw & REQ_SYNC)
-		rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
 	else
-		rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
 
-	u64_stats_update_end(&rwstat->syncp);
+	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_rwstat_read - read the current values of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
  *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
+ * Read the current snapshot of @rwstat and return it in the aux counts.
  */
 static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 {
-	unsigned int start;
-	struct blkg_rwstat tmp;
-
-	do {
-		start = u64_stats_fetch_begin_irq(&rwstat->syncp);
-		tmp = *rwstat;
-	} while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+	struct blkg_rwstat result;
+	int i;
 
-	return tmp;
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		atomic64_set(&result.aux_cnt[i],
+			     percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+	return result;
 }
 
 /**
@@ -621,7 +633,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
 	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
 
-	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+	return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 }
 
 /**
@@ -632,10 +645,10 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 {
 	int i;
 
-	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+	for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+		percpu_counter_set(&rwstat->cpu_cnt[i], 0);
 		atomic64_set(&rwstat->aux_cnt[i], 0);
+	}
 }
 
 /**
@@ -652,7 +665,8 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
 	int i;
 
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]),
+		atomic64_add(atomic64_read(&v.aux_cnt[i]) +
+			     atomic64_read(&from->aux_cnt[i]),
 			     &to->aux_cnt[i]);
 }
 
-- 
cgit v1.2.3-70-g09d2


From f12c74cab1635d67077ce8cc40da88b57980f637 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:23 -0700
Subject: blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into
 blkcg_gq

Currently, blkg_[rw]stat_recursive_sum() assume that the target
counter is located in pd (blkg_policy_data); however, some counters
are planned to be moved to blkg (blkcg_gq).

This patch updates blkg_[rw]stat_recursive_sum() to take blkg and
blkg_policy pointers instead of pd.  If policy is NULL, it indexes
into blkg.  If non-NULL, into the blkg's pd of the policy.

The existing usages are updated to maintain the current behaviors.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 69 ++++++++++++++++++++++++++++------------------
 block/cfq-iosched.c        |  8 +++---
 include/linux/blk-cgroup.h |  7 +++--
 3 files changed, 50 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 02a2d029b5a5..b26320720a3c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -581,30 +581,39 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
 /**
  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
  *
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and their aux counts.  The caller must be holding the queue
- * lock for online tests.
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
+ *
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+			    struct blkcg_policy *pol, int off)
 {
-	struct blkcg_policy *pol = blkcg_policy[pd->plid];
 	struct blkcg_gq *pos_blkg;
 	struct cgroup_subsys_state *pos_css;
 	u64 sum = 0;
 
-	lockdep_assert_held(pd->blkg->q->queue_lock);
+	lockdep_assert_held(blkg->q->queue_lock);
 
 	rcu_read_lock();
-	blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-		struct blkg_stat *stat = (void *)pos_pd + off;
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+		struct blkg_stat *stat;
+
+		if (!pos_blkg->online)
+			continue;
 
-		if (pos_blkg->online)
-			sum += blkg_stat_read(stat) +
-				atomic64_read(&stat->aux_cnt);
+		if (pol)
+			stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+		else
+			stat = (void *)blkg + off;
+
+		sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
 	}
 	rcu_read_unlock();
 
@@ -614,33 +623,39 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
 
 /**
  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
  *
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and their aux counts.  The caller must be holding the queue
- * lock for online tests.
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
+ *
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-					     int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+					     struct blkcg_policy *pol, int off)
 {
-	struct blkcg_policy *pol = blkcg_policy[pd->plid];
 	struct blkcg_gq *pos_blkg;
 	struct cgroup_subsys_state *pos_css;
 	struct blkg_rwstat sum = { };
 	int i;
 
-	lockdep_assert_held(pd->blkg->q->queue_lock);
+	lockdep_assert_held(blkg->q->queue_lock);
 
 	rcu_read_lock();
-	blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-		struct blkg_rwstat *rwstat = (void *)pos_pd + off;
-		struct blkg_rwstat tmp;
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+		struct blkg_rwstat *rwstat, tmp;
 
 		if (!pos_blkg->online)
 			continue;
 
+		if (pol)
+			rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+		else
+			rwstat = (void *)pos_blkg + off;
+
 		tmp = blkg_rwstat_read(rwstat);
 
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 71e55c91ee98..e861cc1ebd62 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1886,16 +1886,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v)
 static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
-	u64 sum = blkg_stat_recursive_sum(pd, off);
-
+	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+					  &blkcg_policy_cfq, off);
 	return __blkg_prfill_u64(sf, pd, sum);
 }
 
 static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
 					struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off);
-
+	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+							&blkcg_policy_cfq, off);
 	return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index fdc7ac08b1ce..4630ce8f9425 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -191,9 +191,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off);
 
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-					     int off);
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+			    struct blkcg_policy *pol, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+					     struct blkcg_policy *pol, int off);
 
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
-- 
cgit v1.2.3-70-g09d2


From 77ea733884eb5520f22c36def1309fe2ab61633e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:24 -0700
Subject: blkcg: move io_service_bytes and io_serviced stats into blkcg_gq

Currently, both cfq-iosched and blk-throttle keep track of
io_service_bytes and io_serviced stats.  While keeping track of them
separately may be useful during development, it doesn't make much
sense otherwise.  Also, blk-throttle was counting bio's as IOs while
cfq-iosched request's, which is more confusing than informative.

This patch adds ->stat_bytes and ->stat_ios to blkg (blkcg_gq),
removes the counterparts from cfq-iosched and blk-throttle and let
them print from the common blkg counters.  The common counters are
incremented during bio issue in blkcg_bio_issue_check().

The outputs are still filtered by whether the policy has
blkg_policy_data on a given blkg, so cfq's output won't show up if it
has never been used for a given blkg.  The only times when the outputs
would differ significantly are when policies are attached on the fly
or elevators are switched back and forth.  Those are quite exceptional
operations and I don't think they warrant keeping separate counters.

v3: Update blkio-controller.txt accordingly.

v2: Account IOs during bio issues instead of request completions so
    that bio-based drivers can be handled the same way.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/cgroups/blkio-controller.txt | 24 ++------
 block/blk-cgroup.c                         | 98 ++++++++++++++++++++++++++++++
 block/blk-throttle.c                       | 73 ++--------------------
 block/cfq-iosched.c                        | 32 +++-------
 include/linux/blk-cgroup.h                 | 14 +++++
 5 files changed, 133 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 68b6a6a470b0..12686bec37b9 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -201,7 +201,7 @@ Proportional weight policy files
 	  specifies the number of bytes.
 
 - blkio.io_serviced
-	- Number of IOs completed to/from the disk by the group. These
+	- Number of IOs (bio) issued to the disk by the group. These
 	  are further divided by the type of operation - read or write, sync
 	  or async. First two fields specify the major and minor number of the
 	  device, third field specifies the operation type and the fourth field
@@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
       subjected to both the constraints.
 
 - blkio.throttle.io_serviced
-	- Number of IOs (bio) completed to/from the disk by the group (as
-	  seen by throttling policy). These are further divided by the type
-	  of operation - read or write, sync or async. First two fields specify
-	  the major and minor number of the device, third field specifies the
-	  operation type and the fourth field specifies the number of IOs.
-
-	  blkio.io_serviced does accounting as seen by CFQ and counts are in
-	  number of requests (struct request). On the other hand,
-	  blkio.throttle.io_serviced counts number of IO in terms of number
-	  of bios as seen by throttling policy.  These bios can later be
-	  merged by elevator and total number of requests completed can be
-	  lesser.
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
 
 - blkio.throttle.io_service_bytes
 	- Number of bytes transferred to/from the disk by the group. These
@@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
 	  device, third field specifies the operation type and the fourth field
 	  specifies the number of bytes.
 
-	  These numbers should roughly be same as blkio.io_service_bytes as
-	  updated by CFQ. The difference between two is that
-	  blkio.io_service_bytes will not be updated if CFQ is not operating
-	  on request queue.
-
 Common files among various policies
 -----------------------------------
 - blkio.reset_stats
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b26320720a3c..a25263ca39ca 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -73,6 +73,9 @@ static void blkg_free(struct blkcg_gq *blkg)
 
 	if (blkg->blkcg != &blkcg_root)
 		blk_exit_rl(&blkg->rl);
+
+	blkg_rwstat_exit(&blkg->stat_ios);
+	blkg_rwstat_exit(&blkg->stat_bytes);
 	kfree(blkg);
 }
 
@@ -95,6 +98,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	if (!blkg)
 		return NULL;
 
+	if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
+	    blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+		goto err_free;
+
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
@@ -300,6 +307,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
 	struct blkcg *blkcg = blkg->blkcg;
+	struct blkcg_gq *parent = blkg->parent;
 	int i;
 
 	lockdep_assert_held(blkg->q->queue_lock);
@@ -315,6 +323,12 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 		if (blkg->pd[i] && pol->pd_offline_fn)
 			pol->pd_offline_fn(blkg->pd[i]);
 	}
+
+	if (parent) {
+		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
+		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
+	}
+
 	blkg->online = false;
 
 	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -431,6 +445,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 	 * anyway.  If you get hit by a race, retry.
 	 */
 	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+		blkg_rwstat_reset(&blkg->stat_bytes);
+		blkg_rwstat_reset(&blkg->stat_ios);
+
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkcg_policy *pol = blkcg_policy[i];
 
@@ -579,6 +596,87 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
+static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
+				    struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_bytes.
+ * cftype->private must be set to the blkcg_policy.
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+			  offsetof(struct blkcg_gq, stat_bytes), true);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+			  offsetof(struct blkcg_gq, stat_ios), true);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+					      struct blkg_policy_data *pd,
+					      int off)
+{
+	struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+							      NULL, off);
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat_field_recursive,
+			  (void *)seq_cft(sf)->private,
+			  offsetof(struct blkcg_gq, stat_bytes), true);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat_field_recursive,
+			  (void *)seq_cft(sf)->private,
+			  offsetof(struct blkcg_gq, stat_ios), true);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
+
 /**
  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
  * @blkg: blkg of interest
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c0b2263a222a..bd3e4b2c7a9d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -133,11 +133,6 @@ struct throtl_grp {
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
-
-	/* total bytes transferred */
-	struct blkg_rwstat		service_bytes;
-	/* total IOs serviced, post merge */
-	struct blkg_rwstat		serviced;
 };
 
 struct throtl_data
@@ -335,11 +330,7 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 
 	tg = kzalloc_node(sizeof(*tg), gfp, node);
 	if (!tg)
-		goto err;
-
-	if (blkg_rwstat_init(&tg->service_bytes, gfp) ||
-	    blkg_rwstat_init(&tg->serviced, gfp))
-		goto err_free_tg;
+		return NULL;
 
 	throtl_service_queue_init(&tg->service_queue);
 
@@ -355,13 +346,6 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 	tg->iops[WRITE] = -1;
 
 	return &tg->pd;
-
-err_free_tg:
-	blkg_rwstat_exit(&tg->serviced);
-	blkg_rwstat_exit(&tg->service_bytes);
-	kfree(tg);
-err:
-	return NULL;
 }
 
 static void throtl_pd_init(struct blkg_policy_data *pd)
@@ -419,19 +403,9 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
 	struct throtl_grp *tg = pd_to_tg(pd);
 
 	del_timer_sync(&tg->service_queue.pending_timer);
-	blkg_rwstat_exit(&tg->serviced);
-	blkg_rwstat_exit(&tg->service_bytes);
 	kfree(tg);
 }
 
-static void throtl_pd_reset_stats(struct blkg_policy_data *pd)
-{
-	struct throtl_grp *tg = pd_to_tg(pd);
-
-	blkg_rwstat_reset(&tg->service_bytes);
-	blkg_rwstat_reset(&tg->serviced);
-}
-
 static struct throtl_grp *
 throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
@@ -839,25 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
-					 int rw)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-	unsigned long flags;
-
-	/*
-	 * Disabling interrupts to provide mutual exclusion between two
-	 * writes on same cpu. It probably is not needed for 64bit. Not
-	 * optimizing that case yet.
-	 */
-	local_irq_save(flags);
-
-	blkg_rwstat_add(&tg->serviced, rw, 1);
-	blkg_rwstat_add(&tg->service_bytes, rw, bytes);
-
-	local_irq_restore(flags);
-}
-
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
@@ -871,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	 * more than once as a throttled bio will go through blk-throtl the
 	 * second time when it eventually gets issued.  Set it when a bio
 	 * is being charged to a tg.
-	 *
-	 * Dispatch stats aren't recursive and each @bio should only be
-	 * accounted by the @tg it was originally associated with.  Let's
-	 * update the stats when setting REQ_THROTTLED for the first time
-	 * which is guaranteed to be for the @bio's original tg.
 	 */
-	if (!(bio->bi_rw & REQ_THROTTLED)) {
+	if (!(bio->bi_rw & REQ_THROTTLED))
 		bio->bi_rw |= REQ_THROTTLED;
-		throtl_update_dispatch_stats(tg_to_blkg(tg),
-					     bio->bi_iter.bi_size, bio->bi_rw);
-	}
 }
 
 /**
@@ -1161,13 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	}
 }
 
-static int tg_print_rwstat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
-			  &blkcg_policy_throtl, seq_cft(sf)->private, true);
-	return 0;
-}
-
 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
 			      int off)
 {
@@ -1304,13 +1244,13 @@ static struct cftype throtl_files[] = {
 	},
 	{
 		.name = "throttle.io_service_bytes",
-		.private = offsetof(struct throtl_grp, service_bytes),
-		.seq_show = tg_print_rwstat,
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_bytes,
 	},
 	{
 		.name = "throttle.io_serviced",
-		.private = offsetof(struct throtl_grp, serviced),
-		.seq_show = tg_print_rwstat,
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_ios,
 	},
 	{ }	/* terminate */
 };
@@ -1329,7 +1269,6 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_init_fn		= throtl_pd_init,
 	.pd_online_fn		= throtl_pd_online,
 	.pd_free_fn		= throtl_pd_free,
-	.pd_reset_stats_fn	= throtl_pd_reset_stats,
 };
 
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e861cc1ebd62..a948d4df3fc3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -177,10 +177,6 @@ enum wl_type_t {
 
 struct cfqg_stats {
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-	/* total bytes transferred */
-	struct blkg_rwstat		service_bytes;
-	/* total IOs serviced, post merge */
-	struct blkg_rwstat		serviced;
 	/* number of ios merged */
 	struct blkg_rwstat		merged;
 	/* total time spent on device in ns, may not be accurate w/ queueing */
@@ -696,8 +692,6 @@ static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
 					      uint64_t bytes, int rw)
 {
 	blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
-	blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
-	blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
 }
 
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
@@ -717,8 +711,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 static void cfqg_stats_reset(struct cfqg_stats *stats)
 {
 	/* queued stats shouldn't be cleared */
-	blkg_rwstat_reset(&stats->service_bytes);
-	blkg_rwstat_reset(&stats->serviced);
 	blkg_rwstat_reset(&stats->merged);
 	blkg_rwstat_reset(&stats->service_time);
 	blkg_rwstat_reset(&stats->wait_time);
@@ -738,8 +730,6 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
 static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
 {
 	/* queued stats shouldn't be cleared */
-	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);
-	blkg_rwstat_add_aux(&to->serviced, &from->serviced);
 	blkg_rwstat_add_aux(&to->merged, &from->merged);
 	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
 	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
@@ -1544,8 +1534,6 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static void cfqg_stats_exit(struct cfqg_stats *stats)
 {
-	blkg_rwstat_exit(&stats->service_bytes);
-	blkg_rwstat_exit(&stats->serviced);
 	blkg_rwstat_exit(&stats->merged);
 	blkg_rwstat_exit(&stats->service_time);
 	blkg_rwstat_exit(&stats->wait_time);
@@ -1566,9 +1554,7 @@ static void cfqg_stats_exit(struct cfqg_stats *stats)
 
 static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
 {
-	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||
-	    blkg_rwstat_init(&stats->serviced, gfp) ||
-	    blkg_rwstat_init(&stats->merged, gfp) ||
+	if (blkg_rwstat_init(&stats->merged, gfp) ||
 	    blkg_rwstat_init(&stats->service_time, gfp) ||
 	    blkg_rwstat_init(&stats->wait_time, gfp) ||
 	    blkg_rwstat_init(&stats->queued, gfp) ||
@@ -1994,13 +1980,13 @@ static struct cftype cfq_blkcg_files[] = {
 	},
 	{
 		.name = "io_service_bytes",
-		.private = offsetof(struct cfq_group, stats.service_bytes),
-		.seq_show = cfqg_print_rwstat,
+		.private = (unsigned long)&blkcg_policy_cfq,
+		.seq_show = blkg_print_stat_bytes,
 	},
 	{
 		.name = "io_serviced",
-		.private = offsetof(struct cfq_group, stats.serviced),
-		.seq_show = cfqg_print_rwstat,
+		.private = (unsigned long)&blkcg_policy_cfq,
+		.seq_show = blkg_print_stat_ios,
 	},
 	{
 		.name = "io_service_time",
@@ -2036,13 +2022,13 @@ static struct cftype cfq_blkcg_files[] = {
 	},
 	{
 		.name = "io_service_bytes_recursive",
-		.private = offsetof(struct cfq_group, stats.service_bytes),
-		.seq_show = cfqg_print_rwstat_recursive,
+		.private = (unsigned long)&blkcg_policy_cfq,
+		.seq_show = blkg_print_stat_bytes_recursive,
 	},
 	{
 		.name = "io_serviced_recursive",
-		.private = offsetof(struct cfq_group, stats.serviced),
-		.seq_show = cfqg_print_rwstat_recursive,
+		.private = (unsigned long)&blkcg_policy_cfq,
+		.seq_show = blkg_print_stat_ios_recursive,
 	},
 	{
 		.name = "io_service_time_recursive",
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 4630ce8f9425..286e1bde249f 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -127,6 +127,9 @@ struct blkcg_gq {
 	/* is this blkg online? protected by both blkcg and q locks */
 	bool				online;
 
+	struct blkg_rwstat		stat_bytes;
+	struct blkg_rwstat		stat_ios;
+
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
 	struct rcu_head			rcu_head;
@@ -190,6 +193,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off);
+int blkg_print_stat_bytes(struct seq_file *sf, void *v);
+int blkg_print_stat_ios(struct seq_file *sf, void *v);
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
 
 u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
 			    struct blkcg_policy *pol, int off);
@@ -700,6 +707,13 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	throtl = blk_throtl_bio(q, blkg, bio);
 
+	if (!throtl) {
+		blkg = blkg ?: q->root_blkg;
+		blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags,
+				bio->bi_iter.bi_size);
+		blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1);
+	}
+
 	rcu_read_unlock();
 	return !throtl;
 }
-- 
cgit v1.2.3-70-g09d2


From c165b3e3c7bb68c2ed55a5ac2623f030d01d9567 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:29 -0700
Subject: blkcg: rename subsystem name from blkio to io

blkio interface has become messy over time and is currently the
largest.  In addition to the inconsistent naming scheme, it has
multiple stat files which report more or less the same thing, a number
of debug stat files which expose internal details which shouldn't have
been part of the public interface in the first place, recursive and
non-recursive stats and leaf and non-leaf knobs.

Both recursive vs. non-recursive and leaf vs. non-leaf distinctions
don't make any sense on the unified hierarchy as only leaf cgroups can
contain processes.  cgroups is going through a major interface
revision with the unified hierarchy involving significant fundamental
usage changes and given that a significant portion of the interface
doesn't make sense anymore, it's a good time to reorganize the
interface.

As the first step, this patch renames the external visible subsystem
name from "blkio" to "io".  This is more concise, matches the other
two major subsystem names, "cpu" and "memory", and better suited as
blkcg will be involved in anything writeback related too whether an
actual block device is involved or not.

As the subsystem legacy_name is set to "blkio", the only userland
visible change outside the unified hierarchy is that blkcg is reported
as "io" instead of "blkio" in the subsystem initialized message during
boot.  On the unified hierarchy, blkcg now appears as "io".

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: cgroups@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c                   | 2 +-
 block/blk-cgroup.c            | 7 ++++---
 include/linux/backing-dev.h   | 2 +-
 include/linux/blk-cgroup.h    | 4 ++--
 include/linux/cgroup_subsys.h | 2 +-
 mm/backing-dev.c              | 4 ++--
 6 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index d6e5ba3399f0..c52222c6c69b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2046,7 +2046,7 @@ int bio_associate_current(struct bio *bio)
 
 	get_io_context_active(ioc);
 	bio->bi_ioc = ioc;
-	bio->bi_css = task_get_css(current, blkio_cgrp_id);
+	bio->bi_css = task_get_css(current, io_cgrp_id);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(bio_associate_current);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0af3bff198ed..fc197ea4c992 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1089,12 +1089,13 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
 	return ret;
 }
 
-struct cgroup_subsys blkio_cgrp_subsys = {
+struct cgroup_subsys io_cgrp_subsys = {
 	.css_alloc = blkcg_css_alloc,
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
 	.legacy_cftypes = blkcg_files,
+	.legacy_name = "blkio",
 #ifdef CONFIG_MEMCG
 	/*
 	 * This ensures that, if available, memcg is automatically enabled
@@ -1104,7 +1105,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
 	.depends_on = 1 << memory_cgrp_id,
 #endif
 };
-EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
+EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
 /**
  * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1266,7 +1267,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 
 	/* everything is in place, add intf files for the new policy */
 	if (pol->cftypes)
-		WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
+		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
 						  pol->cftypes));
 	mutex_unlock(&blkcg_pol_register_mutex);
 	return 0;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 23ebb946e66f..5a5d79ee256f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
 	 * %current's blkcg equals the effective blkcg of its memcg.  No
 	 * need to use the relatively expensive cgroup_get_e_css().
 	 */
-	if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+	if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
 		return wb;
 	return NULL;
 }
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 286e1bde249f..db89acd2a864 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -221,7 +221,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 
 static inline struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-	return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+	return css_to_blkcg(task_css(tsk, io_cgrp_id));
 }
 
 static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -234,7 +234,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 static inline struct cgroup_subsys_state *
 task_get_blkcg_css(struct task_struct *task)
 {
-	return task_get_css(task, blkio_cgrp_id);
+	return task_get_css(task, io_cgrp_id);
 }
 
 /**
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb14403..86b5056104df 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -16,7 +16,7 @@ SUBSYS(cpuacct)
 #endif
 
 #if IS_ENABLED(CONFIG_BLK_CGROUP)
-SUBSYS(blkio)
+SUBSYS(io)
 #endif
 
 #if IS_ENABLED(CONFIG_MEMCG)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dac5bf59309d..d0ee90e72867 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	int ret = 0;
 
 	memcg = mem_cgroup_from_css(memcg_css);
-	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
 	blkcg = css_to_blkcg(blkcg_css);
 	memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
 	blkcg_cgwb_list = &blkcg->cgwb_list;
@@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 
 			/* see whether the blkcg association has changed */
 			blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
-						     &blkio_cgrp_subsys);
+						     &io_cgrp_subsys);
 			if (unlikely(wb->blkcg_css != blkcg_css ||
 				     !wb_tryget(wb)))
 				wb = NULL;
-- 
cgit v1.2.3-70-g09d2


From 880f50e228f80626dff6327a6e281e40286f5228 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:30 -0700
Subject: blkcg: mark existing cftypes as legacy

blkcg is about to grow interface for the unified hierarchy.  Add
legacy to existing cftypes.

* blkcg_policy->cftypes -> blkcg_policy->legacy_cftypes
* blk-cgroup.c:blkcg_files -> blkcg_legacy_files
* cfq-iosched.c:cfq_blkcg_files -> cfq_blkcg_legacy_files
* blk-throttle.c:throtl_files -> throtl_legacy_files

Pure renames.  No functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 12 ++++++------
 block/blk-throttle.c       |  4 ++--
 block/cfq-iosched.c        |  4 ++--
 include/linux/blk-cgroup.h |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index fc197ea4c992..429726bed560 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -847,7 +847,7 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
-struct cftype blkcg_files[] = {
+struct cftype blkcg_legacy_files[] = {
 	{
 		.name = "reset_stats",
 		.write_u64 = blkcg_reset_stats,
@@ -1094,7 +1094,7 @@ struct cgroup_subsys io_cgrp_subsys = {
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
-	.legacy_cftypes = blkcg_files,
+	.legacy_cftypes = blkcg_legacy_files,
 	.legacy_name = "blkio",
 #ifdef CONFIG_MEMCG
 	/*
@@ -1266,9 +1266,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	mutex_unlock(&blkcg_pol_mutex);
 
 	/* everything is in place, add intf files for the new policy */
-	if (pol->cftypes)
+	if (pol->legacy_cftypes)
 		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
-						  pol->cftypes));
+						  pol->legacy_cftypes));
 	mutex_unlock(&blkcg_pol_register_mutex);
 	return 0;
 
@@ -1305,8 +1305,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 		goto out_unlock;
 
 	/* kill the intf files first */
-	if (pol->cftypes)
-		cgroup_rm_cftypes(pol->cftypes);
+	if (pol->legacy_cftypes)
+		cgroup_rm_cftypes(pol->legacy_cftypes);
 
 	/* remove cpds and unregister */
 	mutex_lock(&blkcg_pol_mutex);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index bd3e4b2c7a9d..8b4f6b81bb72 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1217,7 +1217,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
 	return tg_set_conf(of, buf, nbytes, off, false);
 }
 
-static struct cftype throtl_files[] = {
+static struct cftype throtl_legacy_files[] = {
 	{
 		.name = "throttle.read_bps_device",
 		.private = offsetof(struct throtl_grp, bps[READ]),
@@ -1263,7 +1263,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
-	.cftypes		= throtl_files,
+	.legacy_cftypes		= throtl_legacy_files,
 
 	.pd_alloc_fn		= throtl_pd_alloc,
 	.pd_init_fn		= throtl_pd_init,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 38277e3ce286..baa8459f7064 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1944,7 +1944,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
 }
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 
-static struct cftype cfq_blkcg_files[] = {
+static struct cftype cfq_blkcg_legacy_files[] = {
 	/* on root, weight is mapped to leaf_weight */
 	{
 		.name = "weight_device",
@@ -4654,7 +4654,7 @@ static struct elevator_type iosched_cfq = {
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
-	.cftypes		= cfq_blkcg_files,
+	.legacy_cftypes		= cfq_blkcg_legacy_files,
 
 	.cpd_alloc_fn		= cfq_cpd_alloc,
 	.cpd_init_fn		= cfq_cpd_init,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index db89acd2a864..6e016e6fee87 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -148,7 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
 struct blkcg_policy {
 	int				plid;
 	/* cgroup files for the policy */
-	struct cftype			*cftypes;
+	struct cftype			*legacy_cftypes;
 
 	/* operations */
 	blkcg_pol_alloc_cpd_fn		*cpd_alloc_fn;
-- 
cgit v1.2.3-70-g09d2


From 36aa9e5f591e84d67aad2c5bff75e413d77660dd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:31 -0700
Subject: blkcg: move body parsing from blkg_conf_prep() to its callers

Currently, blkg_conf_prep() expects input to be of the following form

 MAJ:MIN NUM

and reads the NUM part into blkg_conf_ctx->v.  This is quite
restrictive and gets in the way in implementing blkcg interface for
the unified hierarchy.  This patch updates blkg_conf_prep() so that it
expects

 MAJ:MIN BODY_STR

where BODY_STR is an arbitrary string.  blkg_conf_ctx->v is replaced
with ->body which is a char pointer pointing to the start of BODY_STR.
Parsing of the body is moved to blkg_conf_prep()'s callers.

To allow using, for example, strsep() on blkg_conf_ctx->val, it is a
non-const pointer and to accommodate that const is dropped from @input
too.

This doesn't cause any behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 22 ++++++++++++++--------
 block/blk-throttle.c       | 18 ++++++++++++------
 block/cfq-iosched.c        | 17 +++++++++++------
 include/linux/blk-cgroup.h |  4 ++--
 4 files changed, 39 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 429726bed560..8ee1ca4d4f2f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -24,6 +24,7 @@
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 
@@ -773,23 +774,28 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
  * @ctx: blkg_conf_ctx to be filled
  *
  * Parse per-blkg config update from @input and initialize @ctx with the
- * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value.  This function returns with RCU read lock and queue lock held and
- * must be paired with blkg_conf_finish().
+ * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
+ * part of @input following MAJ:MIN.  This function returns with RCU read
+ * lock and queue lock held and must be paired with blkg_conf_finish().
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-		   const char *input, struct blkg_conf_ctx *ctx)
+		   char *input, struct blkg_conf_ctx *ctx)
 	__acquires(rcu) __acquires(disk->queue->queue_lock)
 {
 	struct gendisk *disk;
 	struct blkcg_gq *blkg;
 	unsigned int major, minor;
-	unsigned long long v;
-	int part, ret;
+	int key_len, part, ret;
+	char *body;
 
-	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
 		return -EINVAL;
 
+	body = input + key_len;
+	if (!isspace(*body))
+		return -EINVAL;
+	body = skip_spaces(body);
+
 	disk = get_gendisk(MKDEV(major, minor), &part);
 	if (!disk)
 		return -ENODEV;
@@ -826,7 +832,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 
 	ctx->disk = disk;
 	ctx->blkg = blkg;
-	ctx->v = v;
+	ctx->body = body;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8b4f6b81bb72..0e17c8f92d25 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1154,21 +1154,25 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	struct blkcg_gq *blkg;
 	struct cgroup_subsys_state *pos_css;
 	int ret;
+	u64 v;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
 	if (ret)
 		return ret;
 
+	ret = -EINVAL;
+	if (sscanf(ctx.body, "%llu", &v) != 1)
+		goto out_finish;
+	if (!v)
+		v = -1;
+
 	tg = blkg_to_tg(ctx.blkg);
 	sq = &tg->service_queue;
 
-	if (!ctx.v)
-		ctx.v = -1;
-
 	if (is_u64)
-		*(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
+		*(u64 *)((void *)tg + of_cft(of)->private) = v;
 	else
-		*(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
+		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
 
 	throtl_log(&tg->service_queue,
 		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1201,8 +1205,10 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		throtl_schedule_next_dispatch(sq->parent_sq, true);
 	}
 
+	ret = 0;
+out_finish:
 	blkg_conf_finish(&ctx);
-	return nbytes;
+	return ret ?: nbytes;
 }
 
 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index baa8459f7064..ea88d8905bbd 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1747,26 +1747,31 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 	struct cfq_group *cfqg;
 	struct cfq_group_data *cfqgd;
 	int ret;
+	u64 v;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
 	if (ret)
 		return ret;
 
+	ret = -EINVAL;
+	if (sscanf(ctx.body, "%llu", &v) != 1)
+		goto out_finish;
+
 	cfqg = blkg_to_cfqg(ctx.blkg);
 	cfqgd = blkcg_to_cfqgd(blkcg);
 
 	ret = -ERANGE;
-	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+	if (!v || (v >= CFQ_WEIGHT_MIN && v <= CFQ_WEIGHT_MAX)) {
 		if (!is_leaf_weight) {
-			cfqg->dev_weight = ctx.v;
-			cfqg->new_weight = ctx.v ?: cfqgd->weight;
+			cfqg->dev_weight = v;
+			cfqg->new_weight = v ?: cfqgd->weight;
 		} else {
-			cfqg->dev_leaf_weight = ctx.v;
-			cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
+			cfqg->dev_leaf_weight = v;
+			cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
 		}
 		ret = 0;
 	}
-
+out_finish:
 	blkg_conf_finish(&ctx);
 	return ret ?: nbytes;
 }
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 6e016e6fee87..85a4d989ae43 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -206,11 +206,11 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
 	struct blkcg_gq			*blkg;
-	u64				v;
+	char				*body;
 };
 
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-		   const char *input, struct blkg_conf_ctx *ctx);
+		   char *input, struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
 
-- 
cgit v1.2.3-70-g09d2


From dd165eb3bb4ef16bcdb75417add40633f38c52b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:33 -0700
Subject: blkcg: misc preparations for unified hierarchy interface

* Export blkg_dev_name()

* Drop unnecessary @cft from __cfq_set_weight().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 3 ++-
 block/cfq-iosched.c        | 8 ++++----
 include/linux/blk-cgroup.h | 1 +
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8ee1ca4d4f2f..b5e72d756be1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -462,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 	return 0;
 }
 
-static const char *blkg_dev_name(struct blkcg_gq *blkg)
+const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
 	/* some drivers (floppy) instantiate a queue w/o disk registered */
 	if (blkg->q->backing_dev_info.dev)
 		return dev_name(blkg->q->backing_dev_info.dev);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(blkg_dev_name);
 
 /**
  * blkcg_print_blkgs - helper for printing per-blkg data
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ea88d8905bbd..7a7230136e43 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1788,8 +1788,8 @@ static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
 	return __cfqg_set_weight_device(of, buf, nbytes, off, true);
 }
 
-static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
-			    u64 val, bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+			    bool is_leaf_weight)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
@@ -1834,13 +1834,13 @@ out:
 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 			  u64 val)
 {
-	return __cfq_set_weight(css, cft, val, false);
+	return __cfq_set_weight(css, val, false);
 }
 
 static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
 			       struct cftype *cft, u64 val)
 {
-	return __cfq_set_weight(css, cft, val, true);
+	return __cfq_set_weight(css, val, true);
 }
 
 static int cfqg_print_stat(struct seq_file *sf, void *v)
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 85a4d989ae43..b270aef519c6 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -182,6 +182,7 @@ int blkcg_activate_policy(struct request_queue *q,
 void blkcg_deactivate_policy(struct request_queue *q,
 			     const struct blkcg_policy *pol);
 
+const char *blkg_dev_name(struct blkcg_gq *blkg);
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       u64 (*prfill)(struct seq_file *,
 				     struct blkg_policy_data *, int),
-- 
cgit v1.2.3-70-g09d2


From 2ee867dcfa2eaef1063b686da55c35878b2da4a2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:34 -0700
Subject: blkcg: implement interface for the unified hierarchy

blkcg interface grew to be the biggest of all controllers and
unfortunately most inconsistent too.  The interface files are
inconsistent with a number of cloes duplicates.  Some files have
recursive variants while others don't.  There's distinction between
normal and leaf weights which isn't intuitive and there are a lot of
stat knobs which don't make much sense outside of debugging and expose
too much implementation details to userland.

In the unified hierarchy, everything is always hierarchical and
internal nodes can't have tasks rendering the two structural issues
twisting the current interface.  The interface has to be updated in a
significant anyway and this is a good chance to revamp it as a whole.
This patch implements blkcg interface for the unified hierarchy.

* (from a previous patch) blkcg is identified by "io" instead of
  "blkio" on the unified hierarchy.  Given that the whole interface is
  updated anyway, the rename shouldn't carry noticeable conversion
  overhead.

* The original interface consisted of 27 files is replaced with the
  following three files.

  blkio.stat	: per-blkcg stats
  blkio.weight	: per-cgroup and per-cgroup-queue weight settings
  blkio.max	: per-cgroup-queue bps and iops max limits

Documentation/cgroups/unified-hierarchy.txt updated accordingly.

v2: blkcg_policy->dfl_cftypes wasn't removed on
    blkcg_policy_unregister() corrupting the cftypes list.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/cgroups/unified-hierarchy.txt |  61 ++++++++++++++-
 block/blk-cgroup.c                          |  53 +++++++++++++
 block/blk-throttle.c                        | 112 ++++++++++++++++++++++++++++
 block/cfq-iosched.c                         |  61 +++++++++++++--
 include/linux/blk-cgroup.h                  |   1 +
 5 files changed, 279 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 1ee9caf29e57..bd1ce15d5178 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -27,7 +27,7 @@ CONTENTS
     5-3-1. Format
     5-3-2. Control Knobs
   5-4. Per-Controller Changes
-    5-4-1. blkio
+    5-4-1. io
     5-4-2. cpuset
     5-4-3. memory
 6. Planned Changes
@@ -203,7 +203,7 @@ other issues.  The mapping from nice level to weight isn't obvious or
 universal, and there are various other knobs which simply aren't
 available for tasks.
 
-The blkio controller implicitly creates a hidden leaf node for each
+The io controller implicitly creates a hidden leaf node for each
 cgroup to host the tasks.  The hidden leaf has its own copies of all
 the knobs with "leaf_" prefixed.  While this allows equivalent control
 over internal tasks, it's with serious drawbacks.  It always adds an
@@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified.
 
 5-4. Per-Controller Changes
 
-5-4-1. blkio
+5-4-1. io
 
-- blk-throttle becomes properly hierarchical.
+- blkio is renamed to io.  The interface is overhauled anyway.  The
+  new name is more in line with the other two major controllers, cpu
+  and memory, and better suited given that it may be used for cgroup
+  writeback without involving block layer.
+
+- Everything including stat is always hierarchical making separate
+  recursive stat files pointless and, as no internal node can have
+  tasks, leaf weights are meaningless.  The operation model is
+  simplified and the interface is overhauled accordingly.
+
+  io.stat
+
+	The stat file.  The reported stats are from the point where
+	bio's are issued to request_queue.  The stats are counted
+	independent of which policies are enabled.  Each line in the
+	file follows the following format.  More fields may later be
+	added at the end.
+
+	  $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS
+
+  io.weight
+
+	The weight setting, currently only available and effective if
+	cfq-iosched is in use for the target device.  The weight is
+	between 10 and 1000 and defaults to 500.  The first line
+	always contains the default weight in the following format to
+	use when per-device setting is missing.
+
+	  default $WEIGHT
+
+	Subsequent lines list per-device weights of the following
+	format.
+
+	  $MAJ:$MIN $WEIGHT
+
+	Writing "$WEIGHT" or "default $WEIGHT" changes the default
+	setting.  Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
+	while "$MAJ:$MIN default" clears it.
+
+	This file is available only on non-root cgroups.
+
+  io.max
+
+	The maximum bandwidth and/or iops setting, only available if
+	blk-throttle is enabled.  The file is of the following format.
+
+	  $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
+
+	${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
+	read/write IOs per second.  "max" indicates no limit.  Writing
+	to the file follows the same format but the individual
+	settings may be ommitted or specified in any order.
+
+	This file is available only on non-root cgroups.
 
 
 5-4-2. cpuset
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b5e72d756be1..88bdb73bd5e0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -854,6 +854,53 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+		const char *dname;
+		struct blkg_rwstat rwstat;
+		u64 rbytes, wbytes, rios, wios;
+
+		dname = blkg_dev_name(blkg);
+		if (!dname)
+			continue;
+
+		spin_lock_irq(blkg->q->queue_lock);
+
+		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+					offsetof(struct blkcg_gq, stat_bytes));
+		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+					offsetof(struct blkcg_gq, stat_ios));
+		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+		spin_unlock_irq(blkg->q->queue_lock);
+
+		if (rbytes || wbytes || rios || wios)
+			seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+				   dname, rbytes, wbytes, rios, wios);
+	}
+
+	rcu_read_unlock();
+	return 0;
+}
+
+struct cftype blkcg_files[] = {
+	{
+		.name = "stat",
+		.seq_show = blkcg_print_stat,
+	},
+	{ }	/* terminate */
+};
+
 struct cftype blkcg_legacy_files[] = {
 	{
 		.name = "reset_stats",
@@ -1101,6 +1148,7 @@ struct cgroup_subsys io_cgrp_subsys = {
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
+	.dfl_cftypes = blkcg_files,
 	.legacy_cftypes = blkcg_legacy_files,
 	.legacy_name = "blkio",
 #ifdef CONFIG_MEMCG
@@ -1273,6 +1321,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	mutex_unlock(&blkcg_pol_mutex);
 
 	/* everything is in place, add intf files for the new policy */
+	if (pol->dfl_cftypes)
+		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+					       pol->dfl_cftypes));
 	if (pol->legacy_cftypes)
 		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
 						  pol->legacy_cftypes));
@@ -1312,6 +1363,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 		goto out_unlock;
 
 	/* kill the intf files first */
+	if (pol->dfl_cftypes)
+		cgroup_rm_cftypes(pol->dfl_cftypes);
 	if (pol->legacy_cftypes)
 		cgroup_rm_cftypes(pol->legacy_cftypes);
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a8bb2fd8f523..c75a2636dd40 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1265,6 +1265,117 @@ static struct cftype throtl_legacy_files[] = {
 	{ }	/* terminate */
 };
 
+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+			 int off)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+	const char *dname = blkg_dev_name(pd->blkg);
+	char bufs[4][21] = { "max", "max", "max", "max" };
+
+	if (!dname)
+		return 0;
+	if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+	    tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+		return 0;
+
+	if (tg->bps[READ] != -1)
+		snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+	if (tg->bps[WRITE] != -1)
+		snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+	if (tg->iops[READ] != -1)
+		snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+	if (tg->iops[WRITE] != -1)
+		snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+		   dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+	return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
+	return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+			  char *buf, size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
+	u64 v[4];
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+	if (ret)
+		return ret;
+
+	tg = blkg_to_tg(ctx.blkg);
+
+	v[0] = tg->bps[READ];
+	v[1] = tg->bps[WRITE];
+	v[2] = tg->iops[READ];
+	v[3] = tg->iops[WRITE];
+
+	while (true) {
+		char tok[27];	/* wiops=18446744073709551616 */
+		char *p;
+		u64 val = -1;
+		int len;
+
+		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+			break;
+		if (tok[0] == '\0')
+			break;
+		ctx.body += len;
+
+		ret = -EINVAL;
+		p = tok;
+		strsep(&p, "=");
+		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+			goto out_finish;
+
+		ret = -ERANGE;
+		if (!val)
+			goto out_finish;
+
+		ret = -EINVAL;
+		if (!strcmp(tok, "rbps"))
+			v[0] = val;
+		else if (!strcmp(tok, "wbps"))
+			v[1] = val;
+		else if (!strcmp(tok, "riops"))
+			v[2] = min_t(u64, val, UINT_MAX);
+		else if (!strcmp(tok, "wiops"))
+			v[3] = min_t(u64, val, UINT_MAX);
+		else
+			goto out_finish;
+	}
+
+	tg->bps[READ] = v[0];
+	tg->bps[WRITE] = v[1];
+	tg->iops[READ] = v[2];
+	tg->iops[WRITE] = v[3];
+
+	tg_conf_updated(tg);
+	ret = 0;
+out_finish:
+	blkg_conf_finish(&ctx);
+	return ret ?: nbytes;
+}
+
+static struct cftype throtl_files[] = {
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = tg_print_max,
+		.write = tg_set_max,
+	},
+	{ }	/* terminate */
+};
+
 static void throtl_shutdown_wq(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
@@ -1273,6 +1384,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
+	.dfl_cftypes		= throtl_files,
 	.legacy_cftypes		= throtl_legacy_files,
 
 	.pd_alloc_fn		= throtl_pd_alloc,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7a7230136e43..97da5719c87a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1740,7 +1740,7 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
 
 static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 					char *buf, size_t nbytes, loff_t off,
-					bool is_leaf_weight)
+					bool on_dfl, bool is_leaf_weight)
 {
 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
@@ -1753,9 +1753,17 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 	if (ret)
 		return ret;
 
-	ret = -EINVAL;
-	if (sscanf(ctx.body, "%llu", &v) != 1)
+	if (sscanf(ctx.body, "%llu", &v) == 1) {
+		/* require "default" on dfl */
+		ret = -ERANGE;
+		if (!v && on_dfl)
+			goto out_finish;
+	} else if (!strcmp(strim(ctx.body), "default")) {
+		v = 0;
+	} else {
+		ret = -EINVAL;
 		goto out_finish;
+	}
 
 	cfqg = blkg_to_cfqg(ctx.blkg);
 	cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1779,13 +1787,13 @@ out_finish:
 static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
 				      char *buf, size_t nbytes, loff_t off)
 {
-	return __cfqg_set_weight_device(of, buf, nbytes, off, false);
+	return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
 }
 
 static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
 					   char *buf, size_t nbytes, loff_t off)
 {
-	return __cfqg_set_weight_device(of, buf, nbytes, off, true);
+	return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
 }
 
 static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
@@ -2103,6 +2111,48 @@ static struct cftype cfq_blkcg_legacy_files[] = {
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 	{ }	/* terminate */
 };
+
+static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+
+	seq_printf(sf, "default %u\n", cgd->weight);
+	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
+			  &blkcg_policy_cfq, 0, false);
+	return 0;
+}
+
+static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
+				     char *buf, size_t nbytes, loff_t off)
+{
+	char *endp;
+	int ret;
+	u64 v;
+
+	buf = strim(buf);
+
+	/* "WEIGHT" or "default WEIGHT" sets the default weight */
+	v = simple_strtoull(buf, &endp, 0);
+	if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+		ret = __cfq_set_weight(of_css(of), v, false);
+		return ret ?: nbytes;
+	}
+
+	/* "MAJ:MIN WEIGHT" */
+	return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
+}
+
+static struct cftype cfq_blkcg_files[] = {
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cfq_print_weight_on_dfl,
+		.write = cfq_set_weight_on_dfl,
+	},
+	{ }	/* terminate */
+};
+
 #else /* GROUP_IOSCHED */
 static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
 					 struct blkcg *blkcg)
@@ -4659,6 +4709,7 @@ static struct elevator_type iosched_cfq = {
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
+	.dfl_cftypes		= cfq_blkcg_files,
 	.legacy_cftypes		= cfq_blkcg_legacy_files,
 
 	.cpd_alloc_fn		= cfq_cpd_alloc,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index b270aef519c6..9a7c4bd45fff 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -148,6 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
 struct blkcg_policy {
 	int				plid;
 	/* cgroup files for the policy */
+	struct cftype			*dfl_cftypes;
 	struct cftype			*legacy_cftypes;
 
 	/* operations */
-- 
cgit v1.2.3-70-g09d2


From 69d7fde5909b614114343974cfc52cb8ff30b544 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 14:55:36 -0700
Subject: blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified
 hierarchy

cgroup is trying to make interface consistent across different
controllers.  For weight based resource control, the knob should have
the range [1, 10000] and default to 100.  This patch updates
cfq-iosched so that the weight range conforms.  The internal
calculations have enough range and the widening of the weight range
shouldn't cause any problem.

* blkcg_policy->cpd_bind_fn() is added.  If present, this is invoked
  when blkcg is attached to a hierarchy.

* cfq_cpd_init() is updated to use the new default value on the
  unified hierarchy.

* cfq_cpd_bind() callback is implemented to clear per-blkg configs and
  apply the default config matching the hierarchy type.

* cfqd->root_group->[leaf_]weight initialization in cfq_init_queue()
  is moved into !CONFIG_CFQ_GROUP_IOSCHED block.  cfq_cpd_bind() is
  now responsible for initializing the initial weights when blkcg is
  enabled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/cgroups/unified-hierarchy.txt |  2 +-
 block/blk-cgroup.c                          | 21 +++++++++++
 block/cfq-iosched.c                         | 55 +++++++++++++++++++++--------
 include/linux/blk-cgroup.h                  |  2 ++
 4 files changed, 64 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index bd1ce15d5178..e0975c2cf03d 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -464,7 +464,7 @@ may be specified in any order and not all pairs have to be specified.
 
 	The weight setting, currently only available and effective if
 	cfq-iosched is in use for the target device.  The weight is
-	between 10 and 1000 and defaults to 500.  The first line
+	between 1 and 10000 and defaults to 100.  The first line
 	always contains the default weight in the following format to
 	use when per-device setting is missing.
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 88bdb73bd5e0..ac8370cb2515 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1143,11 +1143,32 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
 	return ret;
 }
 
+static void blkcg_bind(struct cgroup_subsys_state *root_css)
+{
+	int i;
+
+	mutex_lock(&blkcg_pol_mutex);
+
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+		struct blkcg *blkcg;
+
+		if (!pol || !pol->cpd_bind_fn)
+			continue;
+
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
+			if (blkcg->cpd[pol->plid])
+				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
+	}
+	mutex_unlock(&blkcg_pol_mutex);
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
 	.css_alloc = blkcg_css_alloc,
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
+	.bind = blkcg_bind,
 	.dfl_cftypes = blkcg_files,
 	.legacy_cftypes = blkcg_legacy_files,
 	.legacy_name = "blkio",
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 0fe721eaff28..04de88463a98 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1522,6 +1522,9 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+			    bool on_dfl, bool reset_dev, bool is_leaf_weight);
+
 static void cfqg_stats_exit(struct cfqg_stats *stats)
 {
 	blkg_rwstat_exit(&stats->merged);
@@ -1578,14 +1581,14 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
 static void cfq_cpd_init(struct blkcg_policy_data *cpd)
 {
 	struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
+	unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
+			      CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
 
-	if (cpd_to_blkcg(cpd) == &blkcg_root) {
-		cgd->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
-		cgd->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
-	} else {
-		cgd->weight = CFQ_WEIGHT_LEGACY_DFL;
-		cgd->leaf_weight = CFQ_WEIGHT_LEGACY_DFL;
-	}
+	if (cpd_to_blkcg(cpd) == &blkcg_root)
+		weight *= 2;
+
+	cgd->weight = weight;
+	cgd->leaf_weight = weight;
 }
 
 static void cfq_cpd_free(struct blkcg_policy_data *cpd)
@@ -1593,6 +1596,19 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd)
 	kfree(cpd_to_cfqgd(cpd));
 }
 
+static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
+{
+	struct blkcg *blkcg = cpd_to_blkcg(cpd);
+	bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
+	unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+	if (blkcg == &blkcg_root)
+		weight *= 2;
+
+	WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
+	WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
+}
+
 static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
 {
 	struct cfq_group *cfqg;
@@ -1742,6 +1758,8 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 					char *buf, size_t nbytes, loff_t off,
 					bool on_dfl, bool is_leaf_weight)
 {
+	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
 	struct cfq_group *cfqg;
@@ -1769,7 +1787,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 	cfqgd = blkcg_to_cfqgd(blkcg);
 
 	ret = -ERANGE;
-	if (!v || (v >= CFQ_WEIGHT_LEGACY_MIN && v <= CFQ_WEIGHT_LEGACY_MAX)) {
+	if (!v || (v >= min && v <= max)) {
 		if (!is_leaf_weight) {
 			cfqg->dev_weight = v;
 			cfqg->new_weight = v ?: cfqgd->weight;
@@ -1797,15 +1815,17 @@ static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
 }
 
 static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
-			    bool is_leaf_weight)
+			    bool on_dfl, bool reset_dev, bool is_leaf_weight)
 {
+	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
 	struct cfq_group_data *cfqgd;
 	int ret = 0;
 
-	if (val < CFQ_WEIGHT_LEGACY_MIN || val > CFQ_WEIGHT_LEGACY_MAX)
-		return -EINVAL;
+	if (val < min || val > max)
+		return -ERANGE;
 
 	spin_lock_irq(&blkcg->lock);
 	cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1826,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
 			continue;
 
 		if (!is_leaf_weight) {
+			if (reset_dev)
+				cfqg->dev_weight = 0;
 			if (!cfqg->dev_weight)
 				cfqg->new_weight = cfqgd->weight;
 		} else {
+			if (reset_dev)
+				cfqg->dev_leaf_weight = 0;
 			if (!cfqg->dev_leaf_weight)
 				cfqg->new_leaf_weight = cfqgd->leaf_weight;
 		}
@@ -1842,13 +1866,13 @@ out:
 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 			  u64 val)
 {
-	return __cfq_set_weight(css, val, false);
+	return __cfq_set_weight(css, val, false, false, false);
 }
 
 static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
 			       struct cftype *cft, u64 val)
 {
-	return __cfq_set_weight(css, val, true);
+	return __cfq_set_weight(css, val, false, false, true);
 }
 
 static int cfqg_print_stat(struct seq_file *sf, void *v)
@@ -2135,7 +2159,7 @@ static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
 	/* "WEIGHT" or "default WEIGHT" sets the default weight */
 	v = simple_strtoull(buf, &endp, 0);
 	if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
-		ret = __cfq_set_weight(of_css(of), v, false);
+		ret = __cfq_set_weight(of_css(of), v, true, false, false);
 		return ret ?: nbytes;
 	}
 
@@ -4512,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
 		goto out_free;
 
 	cfq_init_cfqg_base(cfqd->root_group);
-#endif
 	cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
 	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
+#endif
 
 	/*
 	 * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4715,6 +4739,7 @@ static struct blkcg_policy blkcg_policy_cfq = {
 	.cpd_alloc_fn		= cfq_cpd_alloc,
 	.cpd_init_fn		= cfq_cpd_init,
 	.cpd_free_fn		= cfq_cpd_free,
+	.cpd_bind_fn		= cfq_cpd_bind,
 
 	.pd_alloc_fn		= cfq_pd_alloc,
 	.pd_init_fn		= cfq_pd_init,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 9a7c4bd45fff..0a5cc7a1109b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -138,6 +138,7 @@ struct blkcg_gq {
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
 typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
 typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
 typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
 typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
@@ -155,6 +156,7 @@ struct blkcg_policy {
 	blkcg_pol_alloc_cpd_fn		*cpd_alloc_fn;
 	blkcg_pol_init_cpd_fn		*cpd_init_fn;
 	blkcg_pol_free_cpd_fn		*cpd_free_fn;
+	blkcg_pol_bind_cpd_fn		*cpd_bind_fn;
 
 	blkcg_pol_alloc_pd_fn		*pd_alloc_fn;
 	blkcg_pol_init_pd_fn		*pd_init_fn;
-- 
cgit v1.2.3-70-g09d2


From 03100aada96f0645bbcb89aea24c01f02d0ef1fa Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 19 Aug 2015 14:24:05 -0700
Subject: block: Replace SG_GAPS with new queue limits mask

The SG_GAPS queue flag caused checks for bio vector alignment against
PAGE_SIZE, but the device may have different constraints. This patch
adds a queue limits so a driver with such constraints can set to allow
requests that would have been unnecessarily split. The new gaps check
takes the request_queue as a parameter to simplify the logic around
invoking this function.

This new limit makes the queue flag redundant, so removing it and
all usage. Device-mappers will inherit the correct settings through
blk_stack_limits().

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c               |  3 +--
 block/blk-merge.c         | 23 +++++++----------------
 block/blk-settings.c      | 14 ++++++++++++++
 drivers/block/nvme-core.c |  2 +-
 drivers/md/dm-table.c     | 13 -------------
 include/linux/bio.h       |  9 ---------
 include/linux/blkdev.h    | 21 ++++++++++++++++++++-
 7 files changed, 43 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 425d6d4a2f7a..b1f198f9a317 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -742,8 +742,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) &&
-		    bvec_gap_to_prev(prev, offset))
+		if (bvec_gap_to_prev(q, prev, offset))
 			return 0;
 	}
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 0027def35f5a..0e0d9fd01c40 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -82,8 +82,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) &&
-		    prev && bvec_gap_to_prev(&bvprv, bv.bv_offset))
+		if (prev && bvec_gap_to_prev(q, &bvprv, bv.bv_offset))
 			goto split;
 
 		if (prev && blk_queue_cluster(q)) {
@@ -484,12 +483,12 @@ static bool req_no_special_merge(struct request *req)
 	return !q->mq_ops && req->special;
 }
 
-static int req_gap_to_prev(struct request *req, struct request *next)
+static int req_gap_to_prev(struct request *req, struct bio *next)
 {
 	struct bio *prev = req->biotail;
 
-	return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1],
-				next->bio->bi_io_vec[0].bv_offset);
+	return bvec_gap_to_prev(req->q, &prev->bi_io_vec[prev->bi_vcnt - 1],
+			next->bi_io_vec[1].bv_offset);
 }
 
 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
@@ -506,8 +505,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	if (req_no_special_merge(req) || req_no_special_merge(next))
 		return 0;
 
-	if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) &&
-	    req_gap_to_prev(req, next))
+	if (req_gap_to_prev(req, next->bio))
 		return 0;
 
 	/*
@@ -692,8 +690,6 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 
 bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 {
-	struct request_queue *q = rq->q;
-
 	if (!rq_mergeable(rq) || !bio_mergeable(bio))
 		return false;
 
@@ -718,13 +714,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 		return false;
 
 	/* Only check gaps if the bio carries data */
-	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
-		struct bio_vec *bprev;
-
-		bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
-		if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset))
-			return false;
-	}
+	if (bio_has_data(bio) && req_gap_to_prev(rq, bio))
+		return false;
 
 	return true;
 }
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d27b4e272356..f96c72116931 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -89,6 +89,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_segments = BLK_MAX_SEGMENTS;
 	lim->max_integrity_segments = 0;
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+	lim->virt_boundary_mask = 0;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
 	lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
 	lim->chunk_sectors = 0;
@@ -532,6 +533,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
+	t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask,
+					    b->virt_boundary_mask);
 
 	t->max_segments = min_not_zero(t->max_segments, b->max_segments);
 	t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
@@ -771,6 +774,17 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
 }
 EXPORT_SYMBOL(blk_queue_segment_boundary);
 
+/**
+ * blk_queue_virt_boundary - set boundary rules for bio merging
+ * @q:  the request queue for the device
+ * @mask:  the memory boundary mask
+ **/
+void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
+{
+	q->limits.virt_boundary_mask = mask;
+}
+EXPORT_SYMBOL(blk_queue_virt_boundary);
+
 /**
  * blk_queue_dma_alignment - set dma length and memory alignment
  * @q:     the request queue for the device
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d844ec4a2b85..2f694d78da55 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -2067,7 +2067,6 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
 		goto out_free_ns;
 	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
-	queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
 	ns->dev = dev;
 	ns->queue->queuedata = ns;
 
@@ -2087,6 +2086,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
 		blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
 	if (dev->vwc & NVME_CTRL_VWC_PRESENT)
 		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
+	blk_queue_virt_boundary(ns->queue, dev->page_size - 1);
 
 	disk->major = nvme_major;
 	disk->first_minor = 0;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index afb4ad3dfeb3..e76ed003769e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1380,14 +1380,6 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
 	return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
 }
 
-static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev,
-				  sector_t start, sector_t len, void *data)
-{
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags);
-}
-
 static bool dm_table_all_devices_attribute(struct dm_table *t,
 					   iterate_devices_callout_fn func)
 {
@@ -1508,11 +1500,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
 
-	if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps))
-		queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q);
-	else
-		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q);
-
 	dm_table_set_integrity(t);
 
 	/*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ad7217458812..b9b6e046b52e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -186,15 +186,6 @@ static inline void *bio_data(struct bio *bio)
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
 	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q)))
 
-/*
- * Check if adding a bio_vec after bprv with offset would create a gap in
- * the SG list. Most drivers don't care about this, but some do.
- */
-static inline bool bvec_gap_to_prev(struct bio_vec *bprv, unsigned int offset)
-{
-	return offset || ((bprv->bv_offset + bprv->bv_len) & (PAGE_SIZE - 1));
-}
-
 /*
  * drivers should _never_ use the all version - the bio may have been split
  * before it got to the driver and the driver won't own all of it
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e427debc7008..a622f270f09e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -250,6 +250,7 @@ struct blk_queue_tag {
 struct queue_limits {
 	unsigned long		bounce_pfn;
 	unsigned long		seg_boundary_mask;
+	unsigned long		virt_boundary_mask;
 
 	unsigned int		max_hw_sectors;
 	unsigned int		chunk_sectors;
@@ -479,7 +480,6 @@ struct request_queue {
 #define QUEUE_FLAG_DEAD        19	/* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
 #define QUEUE_FLAG_NO_SG_MERGE 21	/* don't attempt to merge SG segments*/
-#define QUEUE_FLAG_SG_GAPS     22	/* queue doesn't support SG gaps */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -981,6 +981,7 @@ extern int blk_queue_dma_drain(struct request_queue *q,
 			       void *buf, unsigned int size);
 extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
+extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
 extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
@@ -1149,6 +1150,11 @@ static inline unsigned long queue_segment_boundary(struct request_queue *q)
 	return q->limits.seg_boundary_mask;
 }
 
+static inline unsigned long queue_virt_boundary(struct request_queue *q)
+{
+	return q->limits.virt_boundary_mask;
+}
+
 static inline unsigned int queue_max_sectors(struct request_queue *q)
 {
 	return q->limits.max_sectors;
@@ -1349,6 +1355,19 @@ static inline void put_dev_sector(Sector p)
 	page_cache_release(p.v);
 }
 
+/*
+ * Check if adding a bio_vec after bprv with offset would create a gap in
+ * the SG list. Most drivers don't care about this, but some do.
+ */
+static inline bool bvec_gap_to_prev(struct request_queue *q,
+				struct bio_vec *bprv, unsigned int offset)
+{
+	if (!queue_virt_boundary(q))
+		return false;
+	return offset ||
+		((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
 struct work_struct;
 int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
-- 
cgit v1.2.3-70-g09d2


From b7560de198222994374c1340a389f12d5efb244a Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Fri, 14 Aug 2015 15:20:26 +0300
Subject: genirq: Introduce irq_chip_set_type_parent() helper

This helper is required for irq chips which do not implement a
irq_set_type callback and need to call down the irq domain hierarchy
for the actual trigger type change.

This helper is required to fix further wreckage caused by the
conversion of TI OMAP to hierarchical irq domains and therefor tagged
for stable.

[ tglx: Massaged changelog ]

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: <linux@arm.linux.org.uk>
Cc: <nsekhar@ti.com>
Cc: <jason@lakedaemon.net>
Cc: <balbi@ti.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: <tony@atomide.com>
Cc: <marc.zyngier@arm.com>
Cc: stable@vger.kernel.org # 4.1
Link: http://lkml.kernel.org/r/1439554830-19502-3-git-send-email-grygorii.strashko@ti.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h |  1 +
 kernel/irq/chip.c   | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 92188b0225bb..51744bcf74ee 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -484,6 +484,7 @@ extern int irq_chip_set_affinity_parent(struct irq_data *data,
 extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on);
 extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data,
 					     void *vcpu_info);
+extern int irq_chip_set_type_parent(struct irq_data *data, unsigned int type);
 #endif
 
 /* Handling of unhandled and spurious interrupts: */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6de638bccba7..ae216824e8ca 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -984,6 +984,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
 	return -ENOSYS;
 }
 
+/**
+ * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
+ * @data:	Pointer to interrupt specific data
+ * @type:	IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
+{
+	data = data->parent_data;
+
+	if (data->chip->irq_set_type)
+		return data->chip->irq_set_type(data, type);
+
+	return -ENOSYS;
+}
+
 /**
  * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
  * @data:	Pointer to interrupt specific data
-- 
cgit v1.2.3-70-g09d2


From 2a606188c55990fa65cba3fd9b64f2b7542b7692 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Aug 2015 22:30:00 -0500
Subject: NFSv4: Enable delegated opens even when reboot recovery is pending

Unlike the previous attempt, this takes into account the fact that
we may be calling it from the recovery thread itself. Detect this
by looking at what kind of open we're doing, and checking the state
of the NFS_DELEGATION_NEED_RECLAIM if it turns out we're doing a
reboot reclaim-type open.

Cc: Olga Kornievskaia <aglo@umich.edu>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 27 +++++++++++++++++++--------
 include/linux/nfs_xdr.h |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 95c5e8d39bef..6e988fd92f69 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1150,16 +1150,25 @@ out:
 	return ret;
 }
 
-static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+		enum open_claim_type4 claim)
 {
 	if (delegation == NULL)
 		return 0;
 	if ((delegation->type & fmode) != fmode)
 		return 0;
-	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
-		return 0;
 	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
 		return 0;
+	switch (claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_FH:
+		break;
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+		if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+			break;
+	default:
+		return 0;
+	}
 	nfs_mark_delegation_referenced(delegation);
 	return 1;
 }
@@ -1378,6 +1387,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs_delegation *delegation;
 	int open_mode = opendata->o_arg.open_flags;
 	fmode_t fmode = opendata->o_arg.fmode;
+	enum open_claim_type4 claim = opendata->o_arg.claim;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
 
@@ -1391,7 +1401,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		spin_unlock(&state->owner->so_lock);
 		rcu_read_lock();
 		delegation = rcu_dereference(nfsi->delegation);
-		if (!can_open_delegated(delegation, fmode)) {
+		if (!can_open_delegated(delegation, fmode, claim)) {
 			rcu_read_unlock();
 			break;
 		}
@@ -1854,6 +1864,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	struct nfs4_opendata *data = calldata;
 	struct nfs4_state_owner *sp = data->owner;
 	struct nfs_client *clp = sp->so_server->nfs_client;
+	enum open_claim_type4 claim = data->o_arg.claim;
 
 	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
 		goto out_wait;
@@ -1868,15 +1879,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 			goto out_no_action;
 		rcu_read_lock();
 		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
-		if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
-		    data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
-		    can_open_delegated(delegation, data->o_arg.fmode))
+		if (can_open_delegated(delegation, data->o_arg.fmode, claim))
 			goto unlock_no_action;
 		rcu_read_unlock();
 	}
 	/* Update client id. */
 	data->o_arg.clientid = clp->cl_clientid;
-	switch (data->o_arg.claim) {
+	switch (claim) {
+	default:
+		break;
 	case NFS4_OPEN_CLAIM_PREVIOUS:
 	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
 	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7bbe50504211..b9b530409ff7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -389,7 +389,7 @@ struct nfs_openargs {
 	const struct nfs_server *server;	 /* Needed for ID mapping */
 	const u32 *		bitmask;
 	const u32 *		open_bitmap;
-	__u32			claim;
+	enum open_claim_type4	claim;
 	enum createmode4	createmode;
 	const struct nfs4_label *label;
 };
-- 
cgit v1.2.3-70-g09d2


From 7dfffb9541bca80bbf8df1869564f9220ee150d2 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 17 Aug 2015 15:08:55 +0200
Subject: dmaengine: Stricter legacy checking in
 dma_request_slave_channel_compat()

dma_request_slave_channel_compat() is meant for drivers that support
both DT and legacy platform device based probing: if DT channel DMA
setup fails, it will fall back to platform data based DMA channel setup,
using hardcoded DMA channel IDs and a filter function.

However, if the DTS doesn't provide a "dmas" property for the device,
the fallback is also used. If the legacy filter function is not
hardcoded in the DMA slave driver, but comes from platform data, it will
be NULL. Then dma_request_slave_channel_compat() will succeed
incorrectly, and return a DMA channel, as a NULL legacy filter function
actually means "all channels are OK", not "do not match".

Later, when trying to use that DMA channel, it will fail with:

    rcar-dmac e6700000.dma-controller: rcar_dmac_prep_slave_sg: bad parameter: len=1, id=-22

To fix this, ensure that both the filter function and the DMA channel ID
are not NULL before using the legacy fallback.

Note that some DMA slave drivers can handle this failure, and will fall
back to PIO.

See also commit 056f6c87028544de ("dmaengine: shdma: Make dummy
shdma_chan_filter() always return false"), which fixed the same issue
for the case where shdma_chan_filter() is hardcoded in a DMA slave
driver.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/dmaengine.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3a732ac95285..7ea9184eaa13 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -1237,6 +1237,9 @@ static inline struct dma_chan
 	if (chan)
 		return chan;
 
+	if (!fn || !fn_param)
+		return NULL;
+
 	return __dma_request_channel(mask, fn, fn_param);
 }
 #endif /* DMAENGINE_H */
-- 
cgit v1.2.3-70-g09d2


From 6080a89357cc46f3450839a84af75c3d18f57772 Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ti.com>
Date: Thu, 15 Jan 2015 13:47:19 +0200
Subject: fbdev: fix cea_modes array size

CEA defines 64 modes, indexed from 1 to 64. modedb has cea_modes arrays,
which contains 64 entries. However, the code uses the CEA indices
directly, i.e. the first mode is at cea_modes[1]. This means the array
is one too short.

This does not cause references to uninitialized memory as the code in
fbmon only allows indexes up to 63, and the cea_modes does not contain
an entry for the mode 64 so it could not be used in any case.

However, the code contains a check 'if (idx > ARRAY_SIZE(cea_modes)',
and while that check is a no-op as at that point idx cannot be >= 63, it
upsets static checkers.

Fix this by increasing the cea_array size to be 65, and change the code
to allow mode 64.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 drivers/video/fbdev/core/fbmon.c  | 4 ++--
 drivers/video/fbdev/core/modedb.c | 2 +-
 include/linux/fb.h                | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c
index d787533d9c8b..47c3191ec313 100644
--- a/drivers/video/fbdev/core/fbmon.c
+++ b/drivers/video/fbdev/core/fbmon.c
@@ -1072,9 +1072,9 @@ void fb_edid_add_monspecs(unsigned char *edid, struct fb_monspecs *specs)
 
 	for (i = specs->modedb_len + num; i < specs->modedb_len + num + svd_n; i++) {
 		int idx = svd[i - specs->modedb_len - num];
-		if (!idx || idx > 63) {
+		if (!idx || idx >= ARRAY_SIZE(cea_modes)) {
 			pr_warning("Reserved SVD code %d\n", idx);
-		} else if (idx > ARRAY_SIZE(cea_modes) || !cea_modes[idx].xres) {
+		} else if (!cea_modes[idx].xres) {
 			pr_warning("Unimplemented SVD code %d\n", idx);
 		} else {
 			memcpy(&m[i], cea_modes + idx, sizeof(m[i]));
diff --git a/drivers/video/fbdev/core/modedb.c b/drivers/video/fbdev/core/modedb.c
index 7d07cf824b64..2510fa728d77 100644
--- a/drivers/video/fbdev/core/modedb.c
+++ b/drivers/video/fbdev/core/modedb.c
@@ -289,7 +289,7 @@ static const struct fb_videomode modedb[] = {
 };
 
 #ifdef CONFIG_FB_MODE_HELPERS
-const struct fb_videomode cea_modes[64] = {
+const struct fb_videomode cea_modes[65] = {
 	/* #1: 640x480p@59.94/60Hz */
 	[1] = {
 		NULL, 60, 640, 480, 39722, 48, 16, 33, 10, 96, 2, 0,
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 043f3283b71c..bc9afa74ee11 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -788,7 +788,7 @@ struct dmt_videomode {
 
 extern const char *fb_mode_option;
 extern const struct fb_videomode vesa_modes[];
-extern const struct fb_videomode cea_modes[64];
+extern const struct fb_videomode cea_modes[65];
 extern const struct dmt_videomode dmt_modes[];
 
 struct fb_modelist {
-- 
cgit v1.2.3-70-g09d2


From d2a7926d42b3b46e45b4e44dc3302b2701ec0856 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Mon, 3 Aug 2015 21:27:10 -0500
Subject: PCI: Add pci_scan_root_bus_msi()

Add a pci_scan_root_bus_msi() interface so an arch can specify the MSI
controller up front.  This removes the need for a pcibios callback to set
the MSI controller later.

This is not exported because I'd like to replace the variety of "scan root
bus" interfaces with a single, more extensible interface that can handle
the MSI controller, domain, pci_ops, resources, etc.  I hope this interface
is temporary.

[bhelgaas: changelog, split into separate patch]
Suggested-by: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jingoo Han <jingoohan1@gmail.com>
---
 drivers/pci/probe.c | 14 ++++++++++++--
 include/linux/pci.h |  4 ++++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index cefd636681b6..9ff4df0db0c3 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2096,8 +2096,9 @@ void pci_bus_release_busn_res(struct pci_bus *b)
 			res, ret ? "can not be" : "is");
 }
 
-struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
-		struct pci_ops *ops, void *sysdata, struct list_head *resources)
+struct pci_bus *pci_scan_root_bus_msi(struct device *parent, int bus,
+		struct pci_ops *ops, void *sysdata,
+		struct list_head *resources, struct msi_controller *msi)
 {
 	struct resource_entry *window;
 	bool found = false;
@@ -2114,6 +2115,8 @@ struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
 	if (!b)
 		return NULL;
 
+	b->msi = msi;
+
 	if (!found) {
 		dev_info(&b->dev,
 		 "No busn resource found for root bus, will use [bus %02x-ff]\n",
@@ -2128,6 +2131,13 @@ struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
 
 	return b;
 }
+
+struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
+		struct pci_ops *ops, void *sysdata, struct list_head *resources)
+{
+	return pci_scan_root_bus_msi(parent, bus, ops, sysdata, resources,
+				     NULL);
+}
 EXPORT_SYMBOL(pci_scan_root_bus);
 
 struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a0321a8fb59..4d4f9d29fcc9 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -787,6 +787,10 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
 int pci_bus_insert_busn_res(struct pci_bus *b, int bus, int busmax);
 int pci_bus_update_busn_res_end(struct pci_bus *b, int busmax);
 void pci_bus_release_busn_res(struct pci_bus *b);
+struct pci_bus *pci_scan_root_bus_msi(struct device *parent, int bus,
+				      struct pci_ops *ops, void *sysdata,
+				      struct list_head *resources,
+				      struct msi_controller *msi);
 struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
 					     struct pci_ops *ops, void *sysdata,
 					     struct list_head *resources);
-- 
cgit v1.2.3-70-g09d2


From 40603526569b304dd92f720f2f8ab11e828ea145 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 18 Aug 2015 13:55:36 -0600
Subject: pmem, x86: move x86 PMEM API to new pmem.h header

Move the x86 PMEM API implementation out of asm/cacheflush.h and into
its own header asm/pmem.h.  This will allow members of the PMEM API to
be more easily identified on this and other architectures.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 MAINTAINERS                       |  1 +
 arch/x86/include/asm/cacheflush.h | 71 ------------------------------
 arch/x86/include/asm/pmem.h       | 92 +++++++++++++++++++++++++++++++++++++++
 include/linux/pmem.h              |  2 +-
 4 files changed, 94 insertions(+), 72 deletions(-)
 create mode 100644 arch/x86/include/asm/pmem.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 9289ecb57b68..8fcde3717ab7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6161,6 +6161,7 @@ Q:	https://patchwork.kernel.org/project/linux-nvdimm/list/
 S:	Supported
 F:	drivers/nvdimm/pmem.c
 F:	include/linux/pmem.h
+F:	arch/*/include/asm/pmem.h
 
 LINUX FOR IBM pSERIES (RS/6000)
 M:	Paul Mackerras <paulus@au.ibm.com>
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 9bf3ea14b9f0..471418ac1ff9 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -109,75 +109,4 @@ static inline int rodata_test(void)
 }
 #endif
 
-#ifdef ARCH_HAS_NOCACHE_UACCESS
-
-/**
- * arch_memcpy_to_pmem - copy data to persistent memory
- * @dst: destination buffer for the copy
- * @src: source buffer for the copy
- * @n: length of the copy in bytes
- *
- * Copy data to persistent memory media via non-temporal stores so that
- * a subsequent arch_wmb_pmem() can flush cpu and memory controller
- * write buffers to guarantee durability.
- */
-static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
-		size_t n)
-{
-	int unwritten;
-
-	/*
-	 * We are copying between two kernel buffers, if
-	 * __copy_from_user_inatomic_nocache() returns an error (page
-	 * fault) we would have already reported a general protection fault
-	 * before the WARN+BUG.
-	 */
-	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
-			(void __user *) src, n);
-	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
-				__func__, dst, src, unwritten))
-		BUG();
-}
-
-/**
- * arch_wmb_pmem - synchronize writes to persistent memory
- *
- * After a series of arch_memcpy_to_pmem() operations this drains data
- * from cpu write buffers and any platform (memory controller) buffers
- * to ensure that written data is durable on persistent memory media.
- */
-static inline void arch_wmb_pmem(void)
-{
-	/*
-	 * wmb() to 'sfence' all previous writes such that they are
-	 * architecturally visible to 'pcommit'.  Note, that we've
-	 * already arranged for pmem writes to avoid the cache via
-	 * arch_memcpy_to_pmem().
-	 */
-	wmb();
-	pcommit_sfence();
-}
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-#ifdef CONFIG_X86_64
-	/*
-	 * We require that wmb() be an 'sfence', that is only guaranteed on
-	 * 64-bit builds
-	 */
-	return static_cpu_has(X86_FEATURE_PCOMMIT);
-#else
-	return false;
-#endif
-}
-#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
-extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
-extern void arch_wmb_pmem(void);
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-	return false;
-}
-#endif
-
 #endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
new file mode 100644
index 000000000000..f43462cc91aa
--- /dev/null
+++ b/arch/x86/include/asm/pmem.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ASM_X86_PMEM_H__
+#define __ASM_X86_PMEM_H__
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+
+#ifdef ARCH_HAS_NOCACHE_UACCESS
+
+/**
+ * arch_memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Copy data to persistent memory media via non-temporal stores so that
+ * a subsequent arch_wmb_pmem() can flush cpu and memory controller
+ * write buffers to guarantee durability.
+ */
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+		size_t n)
+{
+	int unwritten;
+
+	/*
+	 * We are copying between two kernel buffers, if
+	 * __copy_from_user_inatomic_nocache() returns an error (page
+	 * fault) we would have already reported a general protection fault
+	 * before the WARN+BUG.
+	 */
+	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
+			(void __user *) src, n);
+	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+				__func__, dst, src, unwritten))
+		BUG();
+}
+
+/**
+ * arch_wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of arch_memcpy_to_pmem() operations this drains data
+ * from cpu write buffers and any platform (memory controller) buffers
+ * to ensure that written data is durable on persistent memory media.
+ */
+static inline void arch_wmb_pmem(void)
+{
+	/*
+	 * wmb() to 'sfence' all previous writes such that they are
+	 * architecturally visible to 'pcommit'.  Note, that we've
+	 * already arranged for pmem writes to avoid the cache via
+	 * arch_memcpy_to_pmem().
+	 */
+	wmb();
+	pcommit_sfence();
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * We require that wmb() be an 'sfence', that is only guaranteed on
+	 * 64-bit builds
+	 */
+	return static_cpu_has(X86_FEATURE_PCOMMIT);
+#else
+	return false;
+#endif
+}
+#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
+extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
+extern void arch_wmb_pmem(void);
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+	return false;
+}
+#endif
+
+#endif /* __ASM_X86_PMEM_H__ */
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index 20c367cd76e6..c2af613ec297 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -16,7 +16,7 @@
 #include <linux/io.h>
 
 #ifdef CONFIG_ARCH_HAS_PMEM_API
-#include <asm/cacheflush.h>
+#include <asm/pmem.h>
 #else
 static inline void arch_wmb_pmem(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 18279b467a9d89afe44afbc19d768e834dbf4545 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 18 Aug 2015 13:55:37 -0600
Subject: pmem: remove layer when calling arch_has_wmb_pmem()

Prior to this change arch_has_wmb_pmem() was only called by
arch_has_pmem_api().  Both arch_has_wmb_pmem() and arch_has_pmem_api()
checked to make sure that CONFIG_ARCH_HAS_PMEM_API was enabled.

Instead, remove the old arch_has_wmb_pmem() wrapper to be rid of one
extra layer of indirection and the redundant CONFIG_ARCH_HAS_PMEM_API
check. Rename __arch_has_wmb_pmem() to arch_has_wmb_pmem() since we no
longer have a wrapper, and just have arch_has_pmem_api() call the
architecture specific arch_has_wmb_pmem() directly.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/include/asm/pmem.h |  2 +-
 include/linux/pmem.h        | 13 +++----------
 2 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index f43462cc91aa..1e8dbb72d6ee 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -67,7 +67,7 @@ static inline void arch_wmb_pmem(void)
 	pcommit_sfence();
 }
 
-static inline bool __arch_has_wmb_pmem(void)
+static inline bool arch_has_wmb_pmem(void)
 {
 #ifdef CONFIG_X86_64
 	/*
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index c2af613ec297..a0706ea04efd 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -23,7 +23,7 @@ static inline void arch_wmb_pmem(void)
 	BUG();
 }
 
-static inline bool __arch_has_wmb_pmem(void)
+static inline bool arch_has_wmb_pmem(void)
 {
 	return false;
 }
@@ -38,7 +38,7 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
 /*
  * Architectures that define ARCH_HAS_PMEM_API must provide
  * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), and
- * __arch_has_wmb_pmem().
+ * arch_has_wmb_pmem().
  */
 
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
@@ -52,7 +52,7 @@ static inline void memunmap_pmem(struct device *dev, void __pmem *addr)
 }
 
 /**
- * arch_has_wmb_pmem - true if wmb_pmem() ensures durability
+ * arch_has_pmem_api - true if wmb_pmem() ensures durability
  *
  * For a given cpu implementation within an architecture it is possible
  * that wmb_pmem() resolves to a nop.  In the case this returns
@@ -60,13 +60,6 @@ static inline void memunmap_pmem(struct device *dev, void __pmem *addr)
  * fall back to a different data consistency model, or otherwise notify
  * the user.
  */
-static inline bool arch_has_wmb_pmem(void)
-{
-	if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API))
-		return __arch_has_wmb_pmem();
-	return false;
-}
-
 static inline bool arch_has_pmem_api(void)
 {
 	return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem();
-- 
cgit v1.2.3-70-g09d2


From 5de490daec8b6354b90d5c9d3e2415b195f5adb6 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 18 Aug 2015 13:55:39 -0600
Subject: pmem: add copy_from_iter_pmem() and clear_pmem()

Add support for two new PMEM APIs, copy_from_iter_pmem() and
clear_pmem().  copy_from_iter_pmem() is used to copy data from an
iterator into a PMEM buffer.  clear_pmem() zeros a PMEM memory range.

Both of these new APIs must be explicitly ordered using a wmb_pmem()
function call and are implemented in such a way that the wmb_pmem()
will make the stores to PMEM durable.  Because both APIs are unordered
they can be called as needed without introducing any unwanted memory
barriers.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/include/asm/pmem.h | 75 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pmem.h        | 64 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 137 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index 7f3413fce46c..a3a0df6545ee 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -66,6 +66,81 @@ static inline void arch_wmb_pmem(void)
 	pcommit_sfence();
 }
 
+/**
+ * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * @vaddr:	virtual start address
+ * @size:	number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction.  This function requires explicit ordering with an
+ * arch_wmb_pmem() call.  This API is internal to the x86 PMEM implementation.
+ */
+static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+{
+	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+	unsigned long clflush_mask = x86_clflush_size - 1;
+	void *vend = vaddr + size;
+	void *p;
+
+	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+	     p < vend; p += x86_clflush_size)
+		clwb(p);
+}
+
+/*
+ * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
+ * iterators, so for other types (bvec & kvec) we must do a cache write-back.
+ */
+static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
+{
+	return iter_is_iovec(i) == false;
+}
+
+/**
+ * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
+ * @addr:	PMEM destination address
+ * @bytes:	number of bytes to copy
+ * @i:		iterator with source data
+ *
+ * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+		struct iov_iter *i)
+{
+	void *vaddr = (void __force *)addr;
+	size_t len;
+
+	/* TODO: skip the write-back by always using non-temporal stores */
+	len = copy_from_iter_nocache(vaddr, bytes, i);
+
+	if (__iter_needs_pmem_wb(i))
+		__arch_wb_cache_pmem(vaddr, bytes);
+
+	return len;
+}
+
+/**
+ * arch_clear_pmem - zero a PMEM memory range
+ * @addr:	virtual start address
+ * @size:	number of bytes to zero
+ *
+ * Write zeros into the memory range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline void arch_clear_pmem(void __pmem *addr, size_t size)
+{
+	void *vaddr = (void __force *)addr;
+
+	/* TODO: implement the zeroing via non-temporal writes */
+	if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
+		clear_page(vaddr);
+	else
+		memset(vaddr, 0, size);
+
+	__arch_wb_cache_pmem(vaddr, size);
+}
+
 static inline bool arch_has_wmb_pmem(void)
 {
 #ifdef CONFIG_X86_64
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index a0706ea04efd..a9d84bf335ee 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -14,6 +14,7 @@
 #define __PMEM_H__
 
 #include <linux/io.h>
+#include <linux/uio.h>
 
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 #include <asm/pmem.h>
@@ -33,12 +34,24 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
 {
 	BUG();
 }
+
+static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+		struct iov_iter *i)
+{
+	BUG();
+	return 0;
+}
+
+static inline void arch_clear_pmem(void __pmem *addr, size_t size)
+{
+	BUG();
+}
 #endif
 
 /*
  * Architectures that define ARCH_HAS_PMEM_API must provide
- * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), and
- * arch_has_wmb_pmem().
+ * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
+ * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
  */
 
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
@@ -78,6 +91,20 @@ static inline void default_memcpy_to_pmem(void __pmem *dst, const void *src,
 	memcpy((void __force *) dst, src, size);
 }
 
+static inline size_t default_copy_from_iter_pmem(void __pmem *addr,
+		size_t bytes, struct iov_iter *i)
+{
+	return copy_from_iter_nocache((void __force *)addr, bytes, i);
+}
+
+static inline void default_clear_pmem(void __pmem *addr, size_t size)
+{
+	if (size == PAGE_SIZE && ((unsigned long)addr & ~PAGE_MASK) == 0)
+		clear_page((void __force *)addr);
+	else
+		memset((void __force *)addr, 0, size);
+}
+
 /**
  * memremap_pmem - map physical persistent memory for pmem api
  * @offset: physical address of persistent memory
@@ -134,4 +161,37 @@ static inline void wmb_pmem(void)
 	if (arch_has_pmem_api())
 		arch_wmb_pmem();
 }
+
+/**
+ * copy_from_iter_pmem - copy data from an iterator to PMEM
+ * @addr:	PMEM destination address
+ * @bytes:	number of bytes to copy
+ * @i:		iterator with source data
+ *
+ * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
+ * This function requires explicit ordering with a wmb_pmem() call.
+ */
+static inline size_t copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+		struct iov_iter *i)
+{
+	if (arch_has_pmem_api())
+		return arch_copy_from_iter_pmem(addr, bytes, i);
+	return default_copy_from_iter_pmem(addr, bytes, i);
+}
+
+/**
+ * clear_pmem - zero a PMEM memory range
+ * @addr:	virtual start address
+ * @size:	number of bytes to zero
+ *
+ * Write zeros into the memory range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with a wmb_pmem() call.
+ */
+static inline void clear_pmem(void __pmem *addr, size_t size)
+{
+	if (arch_has_pmem_api())
+		arch_clear_pmem(addr, size);
+	else
+		default_clear_pmem(addr, size);
+}
 #endif /* __PMEM_H__ */
-- 
cgit v1.2.3-70-g09d2


From e2e05394e4a3420dab96f728df4531893494e15d Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 18 Aug 2015 13:55:41 -0600
Subject: pmem, dax: have direct_access use __pmem annotation

Update the annotation for the kaddr pointer returned by direct_access()
so that it is a __pmem pointer.  This is consistent with the PMEM driver
and with how this direct_access() pointer is used in the DAX code.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/filesystems/Locking |  3 ++-
 arch/powerpc/sysdev/axonram.c     |  7 ++++---
 drivers/block/brd.c               |  4 ++--
 drivers/nvdimm/pmem.c             |  4 ++--
 drivers/s390/block/dcssblk.c      | 10 ++++++----
 fs/block_dev.c                    |  2 +-
 fs/dax.c                          | 37 ++++++++++++++++++++-----------------
 include/linux/blkdev.h            |  8 ++++----
 8 files changed, 41 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 6a34a0f4d37c..06d443450f21 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -397,7 +397,8 @@ prototypes:
 	int (*release) (struct gendisk *, fmode_t);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-	int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
+	int (*direct_access) (struct block_device *, sector_t, void __pmem **,
+				unsigned long *);
 	int (*media_changed) (struct gendisk *);
 	void (*unlock_native_capacity) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index ee90db17b097..a2be2a66dab6 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -141,13 +141,14 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-		       void **kaddr, unsigned long *pfn, long size)
+		       void __pmem **kaddr, unsigned long *pfn, long size)
 {
 	struct axon_ram_bank *bank = device->bd_disk->private_data;
 	loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
+	void *addr = (void *)(bank->ph_addr + offset);
 
-	*kaddr = (void *)(bank->ph_addr + offset);
-	*pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
+	*kaddr = (void __pmem *)addr;
+	*pfn = virt_to_phys(addr) >> PAGE_SHIFT;
 
 	return bank->size - offset;
 }
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 64ab4951e9d6..c96402fd1560 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -371,7 +371,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-			void **kaddr, unsigned long *pfn, long size)
+			void __pmem **kaddr, unsigned long *pfn, long size)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
 	struct page *page;
@@ -381,7 +381,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
 	page = brd_insert_page(brd, sector);
 	if (!page)
 		return -ENOSPC;
-	*kaddr = page_address(page);
+	*kaddr = (void __pmem *)page_address(page);
 	*pfn = page_to_pfn(page);
 
 	/*
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index eb7552d939e1..f3b629779266 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -92,7 +92,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-			      void **kaddr, unsigned long *pfn, long size)
+		      void __pmem **kaddr, unsigned long *pfn, long size)
 {
 	struct pmem_device *pmem = bdev->bd_disk->private_data;
 	size_t offset = sector << 9;
@@ -101,7 +101,7 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector,
 		return -ENODEV;
 
 	/* FIXME convert DAX to comprehend that this mapping has a lifetime */
-	*kaddr = (void __force *) pmem->virt_addr + offset;
+	*kaddr = pmem->virt_addr + offset;
 	*pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
 
 	return pmem->size - offset;
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index da212813f2d5..2c5a397b9f3e 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -29,7 +29,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-				 void **kaddr, unsigned long *pfn, long size);
+			 void __pmem **kaddr, unsigned long *pfn, long size);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -879,18 +879,20 @@ fail:
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-			void **kaddr, unsigned long *pfn, long size)
+			void __pmem **kaddr, unsigned long *pfn, long size)
 {
 	struct dcssblk_dev_info *dev_info;
 	unsigned long offset, dev_sz;
+	void *addr;
 
 	dev_info = bdev->bd_disk->private_data;
 	if (!dev_info)
 		return -ENODEV;
 	dev_sz = dev_info->end - dev_info->start;
 	offset = secnum * 512;
-	*kaddr = (void *) (dev_info->start + offset);
-	*pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
+	addr = (void *) (dev_info->start + offset);
+	*pfn = virt_to_phys(addr) >> PAGE_SHIFT;
+	*kaddr = (void __pmem *) addr;
 
 	return dev_sz - offset;
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 198243717da5..2345a9870e2c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
  * accessible at this address.
  */
 long bdev_direct_access(struct block_device *bdev, sector_t sector,
-			void **addr, unsigned long *pfn, long size)
+			void __pmem **addr, unsigned long *pfn, long size)
 {
 	long avail;
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
diff --git a/fs/dax.c b/fs/dax.c
index e07fecc93f80..7c634ac797b1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -35,7 +35,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
 
 	might_sleep();
 	do {
-		void *addr;
+		void __pmem *addr;
 		unsigned long pfn;
 		long count;
 
@@ -47,7 +47,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
 			unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
 			if (pgsz > count)
 				pgsz = count;
-			clear_pmem((void __pmem *)addr, pgsz);
+			clear_pmem(addr, pgsz);
 			addr += pgsz;
 			size -= pgsz;
 			count -= pgsz;
@@ -62,7 +62,8 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
 }
 EXPORT_SYMBOL_GPL(dax_clear_blocks);
 
-static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
+		unsigned blkbits)
 {
 	unsigned long pfn;
 	sector_t sector = bh->b_blocknr << (blkbits - 9);
@@ -70,15 +71,15 @@ static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
 }
 
 /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
-			loff_t end)
+static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
+		loff_t pos, loff_t end)
 {
 	loff_t final = end - pos + first; /* The final byte of the buffer */
 
 	if (first > 0)
-		clear_pmem((void __pmem *)addr, first);
+		clear_pmem(addr, first);
 	if (final < size)
-		clear_pmem((void __pmem *)addr + final, size - final);
+		clear_pmem(addr + final, size - final);
 }
 
 static bool buffer_written(struct buffer_head *bh)
@@ -106,7 +107,7 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 	loff_t pos = start;
 	loff_t max = start;
 	loff_t bh_max = start;
-	void *addr;
+	void __pmem *addr;
 	bool hole = false;
 	bool need_wmb = false;
 
@@ -158,11 +159,11 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 		}
 
 		if (iov_iter_rw(iter) == WRITE) {
-			len = copy_from_iter_pmem((void __pmem *)addr,
-					max - pos, iter);
+			len = copy_from_iter_pmem(addr, max - pos, iter);
 			need_wmb = true;
 		} else if (!hole)
-			len = copy_to_iter(addr, max - pos, iter);
+			len = copy_to_iter((void __force *)addr, max - pos,
+					iter);
 		else
 			len = iov_iter_zero(max - pos, iter);
 
@@ -268,11 +269,13 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
 static int copy_user_bh(struct page *to, struct buffer_head *bh,
 			unsigned blkbits, unsigned long vaddr)
 {
-	void *vfrom, *vto;
+	void __pmem *vfrom;
+	void *vto;
+
 	if (dax_get_addr(bh, &vfrom, blkbits) < 0)
 		return -EIO;
 	vto = kmap_atomic(to);
-	copy_user_page(vto, vfrom, vaddr, to);
+	copy_user_page(vto, (void __force *)vfrom, vaddr, to);
 	kunmap_atomic(vto);
 	return 0;
 }
@@ -283,7 +286,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	struct address_space *mapping = inode->i_mapping;
 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	void *addr;
+	void __pmem *addr;
 	unsigned long pfn;
 	pgoff_t size;
 	int error;
@@ -312,7 +315,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	}
 
 	if (buffer_unwritten(bh) || buffer_new(bh)) {
-		clear_pmem((void __pmem *)addr, PAGE_SIZE);
+		clear_pmem(addr, PAGE_SIZE);
 		wmb_pmem();
 	}
 
@@ -548,11 +551,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	if (err < 0)
 		return err;
 	if (buffer_written(&bh)) {
-		void *addr;
+		void __pmem *addr;
 		err = dax_get_addr(&bh, &addr, inode->i_blkbits);
 		if (err < 0)
 			return err;
-		clear_pmem((void __pmem *)addr + offset, length);
+		clear_pmem(addr + offset, length);
 		wmb_pmem();
 	}
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d4068c17d0df..c401ecdff9cb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1555,8 +1555,8 @@ struct block_device_operations {
 	int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-	long (*direct_access)(struct block_device *, sector_t,
-					void **, unsigned long *pfn, long size);
+	long (*direct_access)(struct block_device *, sector_t, void __pmem **,
+			unsigned long *pfn, long size);
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1574,8 +1574,8 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
-extern long bdev_direct_access(struct block_device *, sector_t, void **addr,
-						unsigned long *pfn, long size);
+extern long bdev_direct_access(struct block_device *, sector_t,
+		void __pmem **addr, unsigned long *pfn, long size);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
-- 
cgit v1.2.3-70-g09d2


From 44f636da4e71e0c73d6e29d0319a8954ce3f247a Mon Sep 17 00:00:00 2001
From: Leilk Liu <leilk.liu@mediatek.com>
Date: Thu, 20 Aug 2015 17:19:06 +0800
Subject: spi: mediatek: fix spi incorrect endian usage

TX_ENDIAN/RX_ENDIAN bits define whether to reverse the endian
order of the data DMA from/to memory. The endian order should
keep the same with cpu endian.

Signed-off-by: Leilk Liu <leilk.liu@mediatek.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-mt65xx.c                 | 38 ++++++++++++++------------------
 include/linux/platform_data/spi-mt65xx.h |  2 --
 2 files changed, 16 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c
index 546d70cc1e21..2eda2d1782f3 100644
--- a/drivers/spi/spi-mt65xx.c
+++ b/drivers/spi/spi-mt65xx.c
@@ -122,8 +122,6 @@ static const struct mtk_spi_compatible mt8173_compat = {
 static const struct mtk_chip_config mtk_default_chip_info = {
 	.rx_mlsb = 1,
 	.tx_mlsb = 1,
-	.tx_endian = 0,
-	.rx_endian = 0,
 };
 
 static const struct of_device_id mtk_spi_of_match[] = {
@@ -161,9 +159,13 @@ static void mtk_spi_config(struct mtk_spi *mdata,
 	reg_val |= (chip_config->rx_mlsb << SPI_CMD_RXMSBF_OFFSET);
 
 	/* set the tx/rx endian */
-	reg_val &= ~(SPI_CMD_TX_ENDIAN | SPI_CMD_RX_ENDIAN);
-	reg_val |= (chip_config->tx_endian << SPI_CMD_TX_ENDIAN_OFFSET);
-	reg_val |= (chip_config->rx_endian << SPI_CMD_RX_ENDIAN_OFFSET);
+#ifdef __LITTLE_ENDIAN
+	reg_val &= ~SPI_CMD_TX_ENDIAN;
+	reg_val &= ~SPI_CMD_RX_ENDIAN;
+#else
+	reg_val |= SPI_CMD_TX_ENDIAN;
+	reg_val |= SPI_CMD_RX_ENDIAN;
+#endif
 
 	/* set finish and pause interrupt always enable */
 	reg_val |= SPI_CMD_FINISH_IE | SPI_CMD_PAUSE_EN;
@@ -352,7 +354,7 @@ static int mtk_spi_fifo_transfer(struct spi_master *master,
 				 struct spi_device *spi,
 				 struct spi_transfer *xfer)
 {
-	int cnt, i;
+	int cnt;
 	struct mtk_spi *mdata = spi_master_get_devdata(master);
 
 	mdata->cur_transfer = xfer;
@@ -364,10 +366,7 @@ static int mtk_spi_fifo_transfer(struct spi_master *master,
 		cnt = xfer->len / 4 + 1;
 	else
 		cnt = xfer->len / 4;
-
-	for (i = 0; i < cnt; i++)
-		writel(*((u32 *)xfer->tx_buf + i),
-		       mdata->base + SPI_TX_DATA_REG);
+	iowrite32_rep(mdata->base + SPI_TX_DATA_REG, xfer->tx_buf, cnt);
 
 	mtk_spi_enable_transfer(master);
 
@@ -437,7 +436,7 @@ static bool mtk_spi_can_dma(struct spi_master *master,
 
 static irqreturn_t mtk_spi_interrupt(int irq, void *dev_id)
 {
-	u32 cmd, reg_val, i;
+	u32 cmd, reg_val, cnt;
 	struct spi_master *master = dev_id;
 	struct mtk_spi *mdata = spi_master_get_devdata(master);
 	struct spi_transfer *trans = mdata->cur_transfer;
@@ -449,18 +448,13 @@ static irqreturn_t mtk_spi_interrupt(int irq, void *dev_id)
 		mdata->state = MTK_SPI_IDLE;
 
 	if (!master->can_dma(master, master->cur_msg->spi, trans)) {
-		/* xfer len is not N*4 bytes every time in a transfer,
-		 * but SPI_RX_DATA_REG must reads 4 bytes once,
-		 * so rx buffer byte by byte.
-		 */
 		if (trans->rx_buf) {
-			for (i = 0; i < mdata->xfer_len; i++) {
-				if (i % 4 == 0)
-					reg_val =
-					readl(mdata->base + SPI_RX_DATA_REG);
-				*((u8 *)(trans->rx_buf + i)) =
-					(reg_val >> ((i % 4) * 8)) & 0xff;
-			}
+			if (mdata->xfer_len % 4)
+				cnt = mdata->xfer_len / 4 + 1;
+			else
+				cnt = mdata->xfer_len / 4;
+			ioread32_rep(mdata->base + SPI_RX_DATA_REG,
+				     trans->rx_buf, cnt);
 		}
 		spi_finalize_current_transfer(master);
 		return IRQ_HANDLED;
diff --git a/include/linux/platform_data/spi-mt65xx.h b/include/linux/platform_data/spi-mt65xx.h
index 751225569d27..54b04483976c 100644
--- a/include/linux/platform_data/spi-mt65xx.h
+++ b/include/linux/platform_data/spi-mt65xx.h
@@ -16,7 +16,5 @@
 struct mtk_chip_config {
 	u32 tx_mlsb;
 	u32 rx_mlsb;
-	u32 tx_endian;
-	u32 rx_endian;
 };
 #endif
-- 
cgit v1.2.3-70-g09d2


From f4e774f55fe0bb568a0877b2eb9e1b4b5a6f5cbc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Aug 2015 09:46:22 +0200
Subject: average: remove out-of-line implementation

Since all users are now converted to the inline implementation,
remove the out-of-line implementation entirely.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/average.h | 24 -------------------
 lib/Kconfig             | 10 --------
 lib/average.c           | 64 -------------------------------------------------
 3 files changed, 98 deletions(-)
 delete mode 100644 lib/average.c

(limited to 'include/linux')

diff --git a/include/linux/average.h b/include/linux/average.h
index 60f8226c5652..d04aa58280de 100644
--- a/include/linux/average.h
+++ b/include/linux/average.h
@@ -3,30 +3,6 @@
 
 /* Exponentially weighted moving average (EWMA) */
 
-/* For more documentation see lib/average.c */
-
-struct ewma {
-	unsigned long internal;
-	unsigned long factor;
-	unsigned long weight;
-};
-
-extern void ewma_init(struct ewma *avg, unsigned long factor,
-		      unsigned long weight);
-
-extern struct ewma *ewma_add(struct ewma *avg, unsigned long val);
-
-/**
- * ewma_read() - Get average value
- * @avg: Average structure
- *
- * Returns the average value held in @avg.
- */
-static inline unsigned long ewma_read(const struct ewma *avg)
-{
-	return avg->internal >> avg->factor;
-}
-
 #define DECLARE_EWMA(name, _factor, _weight)				\
 	struct ewma_##name {						\
 		unsigned long internal;					\
diff --git a/lib/Kconfig b/lib/Kconfig
index 3a2ef67db6c7..278890dd1049 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -460,16 +460,6 @@ config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 config LRU_CACHE
 	tristate
 
-config AVERAGE
-	bool "Averaging functions"
-	help
-	  This option is provided for the case where no in-kernel-tree
-	  modules require averaging functions, but a module built outside
-	  the kernel tree does. Such modules that use library averaging
-	  functions require Y here.
-
-	  If unsure, say N.
-
 config CLZ_TAB
 	bool
 
diff --git a/lib/average.c b/lib/average.c
deleted file mode 100644
index 114d1beae0c7..000000000000
--- a/lib/average.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * lib/average.c
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2.  See the file COPYING for more details.
- */
-
-#include <linux/export.h>
-#include <linux/average.h>
-#include <linux/kernel.h>
-#include <linux/bug.h>
-#include <linux/log2.h>
-
-/**
- * DOC: Exponentially Weighted Moving Average (EWMA)
- *
- * These are generic functions for calculating Exponentially Weighted Moving
- * Averages (EWMA). We keep a structure with the EWMA parameters and a scaled
- * up internal representation of the average value to prevent rounding errors.
- * The factor for scaling up and the exponential weight (or decay rate) have to
- * be specified thru the init fuction. The structure should not be accessed
- * directly but only thru the helper functions.
- */
-
-/**
- * ewma_init() - Initialize EWMA parameters
- * @avg: Average structure
- * @factor: Factor to use for the scaled up internal value. The maximum value
- *	of averages can be ULONG_MAX/(factor*weight). For performance reasons
- *	factor has to be a power of 2.
- * @weight: Exponential weight, or decay rate. This defines how fast the
- *	influence of older values decreases. For performance reasons weight has
- *	to be a power of 2.
- *
- * Initialize the EWMA parameters for a given struct ewma @avg.
- */
-void ewma_init(struct ewma *avg, unsigned long factor, unsigned long weight)
-{
-	WARN_ON(!is_power_of_2(weight) || !is_power_of_2(factor));
-
-	avg->weight = ilog2(weight);
-	avg->factor = ilog2(factor);
-	avg->internal = 0;
-}
-EXPORT_SYMBOL(ewma_init);
-
-/**
- * ewma_add() - Exponentially weighted moving average (EWMA)
- * @avg: Average structure
- * @val: Current value
- *
- * Add a sample to the average.
- */
-struct ewma *ewma_add(struct ewma *avg, unsigned long val)
-{
-	unsigned long internal = ACCESS_ONCE(avg->internal);
-
-	ACCESS_ONCE(avg->internal) = internal ?
-		(((internal << avg->weight) - internal) +
-			(val << avg->factor)) >> avg->weight :
-		(val << avg->factor);
-	return avg;
-}
-EXPORT_SYMBOL(ewma_add);
-- 
cgit v1.2.3-70-g09d2


From c5f58f2d700ef484fc2fbaa9c624c6076109f989 Mon Sep 17 00:00:00 2001
From: Markus Pargmann <mpa@pengutronix.de>
Date: Fri, 21 Aug 2015 10:26:40 +0200
Subject: regmap: Add missing comments about struct regmap_bus

There are some fields of this struct undocumented or old. This patch
updates the missing comments.

Signed-off-by: Markus Pargmann <mpa@pengutronix.de>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 59c55ea0f0b5..73fc34d0c4c2 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -296,8 +296,12 @@ typedef void (*regmap_hw_free_context)(void *context);
  *                if not implemented  on a given device.
  * @async_write: Write operation which completes asynchronously, optional and
  *               must serialise with respect to non-async I/O.
+ * @reg_write: Write a single register value to the given register address. This
+ *             write operation has to complete when returning from the function.
  * @read: Read operation.  Data is returned in the buffer used to transmit
  *         data.
+ * @reg_read: Read a single register value from a given register address.
+ * @free_context: Free context.
  * @async_alloc: Allocate a regmap_async() structure.
  * @read_flag_mask: Mask to be set in the top byte of the register when doing
  *                  a read.
@@ -307,7 +311,6 @@ typedef void (*regmap_hw_free_context)(void *context);
  * @val_format_endian_default: Default endianness for formatted register
  *     values. Used when the regmap_config specifies DEFAULT. If this is
  *     DEFAULT, BIG is assumed.
- * @async_size: Size of struct used for async work.
  */
 struct regmap_bus {
 	bool fast_io;
-- 
cgit v1.2.3-70-g09d2


From 2f064f3485cd29633ad1b3cfb00cc519509a3d72 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 21 Aug 2015 14:11:51 -0700
Subject: mm: make page pfmemalloc check more robust

Commit c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") added
checks for page->pfmemalloc to __skb_fill_page_desc():

        if (page->pfmemalloc && !page->mapping)
                skb->pfmemalloc = true;

It assumes page->mapping == NULL implies that page->pfmemalloc can be
trusted.  However, __delete_from_page_cache() can set set page->mapping
to NULL and leave page->index value alone.  Due to being in union, a
non-zero page->index will be interpreted as true page->pfmemalloc.

So the assumption is invalid if the networking code can see such a page.
And it seems it can.  We have encountered this with a NFS over loopback
setup when such a page is attached to a new skbuf.  There is no copying
going on in this case so the page confuses __skb_fill_page_desc which
interprets the index as pfmemalloc flag and the network stack drops
packets that have been allocated using the reserves unless they are to
be queued on sockets handling the swapping which is the case here and
that leads to hangs when the nfs client waits for a response from the
server which has been dropped and thus never arrive.

The struct page is already heavily packed so rather than finding another
hole to put it in, let's do a trick instead.  We can reuse the index
again but define it to an impossible value (-1UL).  This is the page
index so it should never see the value that large.  Replace all direct
users of page->pfmemalloc by page_is_pfmemalloc which will hide this
nastiness from unspoiled eyes.

The information will get lost if somebody wants to use page->index
obviously but that was the case before and the original code expected
that the information should be persisted somewhere else if that is
really needed (e.g.  what SLAB and SLUB do).

[akpm@linux-foundation.org: fix blooper in slub]
Fixes: c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Debugged-by: Vlastimil Babka <vbabka@suse.com>
Debugged-by: Jiri Bohac <jbohac@suse.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Miller <davem@davemloft.net>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: <stable@vger.kernel.org>	[3.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/ethernet/intel/fm10k/fm10k_main.c     |  2 +-
 drivers/net/ethernet/intel/igb/igb_main.c         |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  2 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  2 +-
 include/linux/mm.h                                | 28 +++++++++++++++++++++++
 include/linux/mm_types.h                          |  9 --------
 include/linux/skbuff.h                            | 14 ++++--------
 mm/page_alloc.c                                   |  9 +++++---
 mm/slab.c                                         |  4 ++--
 mm/slub.c                                         |  2 +-
 net/core/skbuff.c                                 |  2 +-
 11 files changed, 47 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 982fdcdc795b..b5b2925103ec 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -216,7 +216,7 @@ static void fm10k_reuse_rx_page(struct fm10k_ring *rx_ring,
 
 static inline bool fm10k_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 2f70a9b152bd..830466c49987 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6566,7 +6566,7 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring,
 
 static inline bool igb_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 9aa6104e34ea..ae21e0b06c3a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1832,7 +1832,7 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring,
 
 static inline bool ixgbe_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index e71cdde9cb01..1d7b00b038a2 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -765,7 +765,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring,
 
 static inline bool ixgbevf_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 /**
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e872f92dbac..bf6f117fcf4d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1002,6 +1002,34 @@ static inline int page_mapped(struct page *page)
 	return atomic_read(&(page)->_mapcount) >= 0;
 }
 
+/*
+ * Return true only if the page has been allocated with
+ * ALLOC_NO_WATERMARKS and the low watermark was not
+ * met implying that the system is under some pressure.
+ */
+static inline bool page_is_pfmemalloc(struct page *page)
+{
+	/*
+	 * Page index cannot be this large so this must be
+	 * a pfmemalloc page.
+	 */
+	return page->index == -1UL;
+}
+
+/*
+ * Only to be called by the page allocator on a freshly allocated
+ * page.
+ */
+static inline void set_page_pfmemalloc(struct page *page)
+{
+	page->index = -1UL;
+}
+
+static inline void clear_page_pfmemalloc(struct page *page)
+{
+	page->index = 0;
+}
+
 /*
  * Different kinds of faults, as returned by handle_mm_fault().
  * Used to decide whether a process gets delivered SIGBUS or
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0038ac7466fd..15549578d559 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -63,15 +63,6 @@ struct page {
 		union {
 			pgoff_t index;		/* Our offset within mapping. */
 			void *freelist;		/* sl[aou]b first free object */
-			bool pfmemalloc;	/* If set by the page allocator,
-						 * ALLOC_NO_WATERMARKS was set
-						 * and the low watermark was not
-						 * met implying that the system
-						 * is under some pressure. The
-						 * caller should try ensure
-						 * this page is only used to
-						 * free other pages.
-						 */
 		};
 
 		union {
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 22b6d9ca1654..9b88536487e6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1602,20 +1602,16 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 	/*
-	 * Propagate page->pfmemalloc to the skb if we can. The problem is
-	 * that not all callers have unique ownership of the page. If
-	 * pfmemalloc is set, we check the mapping as a mapping implies
-	 * page->index is set (index and pfmemalloc share space).
-	 * If it's a valid mapping, we cannot use page->pfmemalloc but we
-	 * do not lose pfmemalloc information as the pages would not be
-	 * allocated using __GFP_MEMALLOC.
+	 * Propagate page pfmemalloc to the skb if we can. The problem is
+	 * that not all callers have unique ownership of the page but rely
+	 * on page_is_pfmemalloc doing the right thing(tm).
 	 */
 	frag->page.p		  = page;
 	frag->page_offset	  = off;
 	skb_frag_size_set(frag, size);
 
 	page = compound_head(page);
-	if (page->pfmemalloc && !page->mapping)
+	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc	= true;
 }
 
@@ -2263,7 +2259,7 @@ static inline struct page *dev_alloc_page(void)
 static inline void skb_propagate_pfmemalloc(struct page *page,
 					     struct sk_buff *skb)
 {
-	if (page && page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df959b7d6085..5b5240b7f642 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1343,12 +1343,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 	set_page_owner(page, order, gfp_flags);
 
 	/*
-	 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
+	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
 	 * allocate the page. The expectation is that the caller is taking
 	 * steps that will free more memory. The caller should avoid the page
 	 * being used for !PFMEMALLOC purposes.
 	 */
-	page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+	if (alloc_flags & ALLOC_NO_WATERMARKS)
+		set_page_pfmemalloc(page);
+	else
+		clear_page_pfmemalloc(page);
 
 	return 0;
 }
@@ -3345,7 +3348,7 @@ refill:
 		atomic_add(size - 1, &page->_count);
 
 		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page->pfmemalloc;
+		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = size;
 		nc->offset = size;
 	}
diff --git a/mm/slab.c b/mm/slab.c
index 200e22412a16..bbd0b47dc6a9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 	}
 
 	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
-	if (unlikely(page->pfmemalloc))
+	if (page_is_pfmemalloc(page))
 		pfmemalloc_active = true;
 
 	nr_pages = (1 << cachep->gfporder);
@@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 		add_zone_page_state(page_zone(page),
 			NR_SLAB_UNRECLAIMABLE, nr_pages);
 	__SetPageSlab(page);
-	if (page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
 	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
diff --git a/mm/slub.c b/mm/slub.c
index 816df0016555..f68c0e50f3c0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab_cache = s;
 	__SetPageSlab(page);
-	if (page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
 	start = page_address(page);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bf9a5d93c2d1..7b84330e5d30 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -340,7 +340,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 
 	if (skb && frag_size) {
 		skb->head_frag = 1;
-		if (virt_to_head_page(data)->pfmemalloc)
+		if (page_is_pfmemalloc(virt_to_head_page(data)))
 			skb->pfmemalloc = 1;
 	}
 	return skb;
-- 
cgit v1.2.3-70-g09d2


From c031f6a904975d5fa84c541333cce444a21ca713 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 19 Aug 2015 19:02:02 +0800
Subject: f2fs: add annotation for space utilization of regular/inline dentry

Add annotation to let us know more clearly about space utilization
information of regular dentry and inline dentry.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 920408a21ffd..25c6324a0dd0 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -417,15 +417,25 @@ typedef __le32	f2fs_hash_t;
 
 #define GET_DENTRY_SLOTS(x)	((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)
 
-/* the number of dentry in a block */
-#define NR_DENTRY_IN_BLOCK	214
-
 /* MAX level for dir lookup */
 #define MAX_DIR_HASH_DEPTH	63
 
 /* MAX buckets in one level of dir */
 #define MAX_DIR_BUCKETS		(1 << ((MAX_DIR_HASH_DEPTH / 2) - 1))
 
+/*
+ * space utilization of regular dentry and inline dentry
+ *		regular dentry			inline dentry
+ * bitmap	1 * 27 = 27			1 * 23 = 23
+ * reserved	1 * 3 = 3			1 * 7 = 7
+ * dentry	11 * 214 = 2354			11 * 182 = 2002
+ * filename	8 * 214 = 1712			8 * 182 = 1456
+ * total	4096				3488
+ *
+ * Note: there are more reserved space in inline dentry than in regular
+ * dentry, when converting inline dentry we should handle this carefully.
+ */
+#define NR_DENTRY_IN_BLOCK	214	/* the number of dentry in a block */
 #define SIZE_OF_DIR_ENTRY	11	/* by byte */
 #define SIZE_OF_DENTRY_BITMAP	((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \
 					BITS_PER_BYTE)
-- 
cgit v1.2.3-70-g09d2


From 920e277e17f12870188f4564887a95ae9ac03e31 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Date: Thu, 13 Aug 2015 08:37:23 +0300
Subject: x86/kasan: Define KASAN_SHADOW_OFFSET per architecture

Current definition of  KASAN_SHADOW_OFFSET in
include/linux/kasan.h will not work for upcomming arm64, so move
it to the arch header.

Signed-off-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexey Klimov <klimov.linux@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Keitel <dkeitel@codeaurora.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yury <yury.norov@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1439444244-26057-2-git-send-email-ryabinin.a.a@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/kasan.h | 3 +++
 include/linux/kasan.h        | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 74a2a8dc9908..1410b567ecde 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -1,6 +1,9 @@
 #ifndef _ASM_X86_KASAN_H
 #define _ASM_X86_KASAN_H
 
+#include <linux/const.h>
+#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+
 /*
  * Compiler uses shadow offset assuming that addresses start
  * from 0. Kernel addresses don't start from 0, so shadow
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 5486d777b706..6fb1c7d4292c 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -10,7 +10,6 @@ struct vm_struct;
 #ifdef CONFIG_KASAN
 
 #define KASAN_SHADOW_SCALE_SHIFT 3
-#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 
 #include <asm/kasan.h>
 #include <linux/sched.h>
-- 
cgit v1.2.3-70-g09d2


From 69786cdb379bbc6eab14cf2393c1abd879316e85 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Date: Thu, 13 Aug 2015 08:37:24 +0300
Subject: x86/kasan, mm: Introduce generic kasan_populate_zero_shadow()

Introduce generic kasan_populate_zero_shadow(shadow_start,
shadow_end). This function maps kasan_zero_page to the
[shadow_start, shadow_end] addresses.

This replaces x86_64 specific populate_zero_shadow() and will
be used for ARM64 in follow on patches.

The main changes from original version are:

 * Use p?d_populate*() instead of set_p?d()
 * Use memblock allocator directly instead of vmemmap_alloc_block()
 * __pa() instead of __pa_nodebug(). __pa() causes troubles
   iff we use it before kasan_early_init(). kasan_populate_zero_shadow()
   will be used later, so we ok with __pa() here.

Signed-off-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexey Klimov <klimov.linux@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: David Keitel <dkeitel@codeaurora.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yury <yury.norov@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1439444244-26057-3-git-send-email-ryabinin.a.a@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/kasan_init_64.c | 123 ++---------------------------------
 include/linux/kasan.h       |   9 +++
 mm/kasan/Makefile           |   2 +-
 mm/kasan/kasan_init.c       | 152 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 167 insertions(+), 119 deletions(-)
 create mode 100644 mm/kasan/kasan_init.c

(limited to 'include/linux')

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index e1840f3db5b5..9ce5da27b136 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,20 +12,6 @@
 extern pgd_t early_level4_pgt[PTRS_PER_PGD];
 extern struct range pfn_mapped[E820_X_MAX];
 
-static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
-static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
-static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
-
-/*
- * This page used as early shadow. We don't use empty_zero_page
- * at early stages, stack instrumentation could write some garbage
- * to this page.
- * Latter we reuse it as zero shadow for large ranges of memory
- * that allowed to access, but not instrumented by kasan
- * (vmalloc/vmemmap ...).
- */
-static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
-
 static int __init map_range(struct range *range)
 {
 	unsigned long start;
@@ -62,106 +48,6 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
 	}
 }
 
-static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
-				unsigned long end)
-{
-	pte_t *pte = pte_offset_kernel(pmd, addr);
-
-	while (addr + PAGE_SIZE <= end) {
-		WARN_ON(!pte_none(*pte));
-		set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
-					| __PAGE_KERNEL_RO));
-		addr += PAGE_SIZE;
-		pte = pte_offset_kernel(pmd, addr);
-	}
-	return 0;
-}
-
-static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
-				unsigned long end)
-{
-	int ret = 0;
-	pmd_t *pmd = pmd_offset(pud, addr);
-
-	while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
-		WARN_ON(!pmd_none(*pmd));
-		set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
-					| _KERNPG_TABLE));
-		addr += PMD_SIZE;
-		pmd = pmd_offset(pud, addr);
-	}
-	if (addr < end) {
-		if (pmd_none(*pmd)) {
-			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
-			if (!p)
-				return -ENOMEM;
-			set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
-		}
-		ret = zero_pte_populate(pmd, addr, end);
-	}
-	return ret;
-}
-
-
-static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
-				unsigned long end)
-{
-	int ret = 0;
-	pud_t *pud = pud_offset(pgd, addr);
-
-	while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
-		WARN_ON(!pud_none(*pud));
-		set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
-					| _KERNPG_TABLE));
-		addr += PUD_SIZE;
-		pud = pud_offset(pgd, addr);
-	}
-
-	if (addr < end) {
-		if (pud_none(*pud)) {
-			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
-			if (!p)
-				return -ENOMEM;
-			set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
-		}
-		ret = zero_pmd_populate(pud, addr, end);
-	}
-	return ret;
-}
-
-static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
-{
-	int ret = 0;
-	pgd_t *pgd = pgd_offset_k(addr);
-
-	while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
-		WARN_ON(!pgd_none(*pgd));
-		set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
-					| _KERNPG_TABLE));
-		addr += PGDIR_SIZE;
-		pgd = pgd_offset_k(addr);
-	}
-
-	if (addr < end) {
-		if (pgd_none(*pgd)) {
-			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
-			if (!p)
-				return -ENOMEM;
-			set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
-		}
-		ret = zero_pud_populate(pgd, addr, end);
-	}
-	return ret;
-}
-
-
-static void __init populate_zero_shadow(const void *start, const void *end)
-{
-	if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
-		panic("kasan: unable to map zero shadow!");
-}
-
-
 #ifdef CONFIG_KASAN_INLINE
 static int kasan_die_handler(struct notifier_block *self,
 			     unsigned long val,
@@ -213,7 +99,7 @@ void __init kasan_init(void)
 
 	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
 
-	populate_zero_shadow((void *)KASAN_SHADOW_START,
+	kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
 			kasan_mem_to_shadow((void *)PAGE_OFFSET));
 
 	for (i = 0; i < E820_X_MAX; i++) {
@@ -223,14 +109,15 @@ void __init kasan_init(void)
 		if (map_range(&pfn_mapped[i]))
 			panic("kasan: unable to allocate shadow!");
 	}
-	populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
-			kasan_mem_to_shadow((void *)__START_KERNEL_map));
+	kasan_populate_zero_shadow(
+		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+		kasan_mem_to_shadow((void *)__START_KERNEL_map));
 
 	vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
 			(unsigned long)kasan_mem_to_shadow(_end),
 			NUMA_NO_NODE);
 
-	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
 			(void *)KASAN_SHADOW_END);
 
 	memset(kasan_zero_page, 0, PAGE_SIZE);
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 6fb1c7d4292c..4b9f85c963d0 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -12,8 +12,17 @@ struct vm_struct;
 #define KASAN_SHADOW_SCALE_SHIFT 3
 
 #include <asm/kasan.h>
+#include <asm/pgtable.h>
 #include <linux/sched.h>
 
+extern unsigned char kasan_zero_page[PAGE_SIZE];
+extern pte_t kasan_zero_pte[PTRS_PER_PTE];
+extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
+extern pud_t kasan_zero_pud[PTRS_PER_PUD];
+
+void kasan_populate_zero_shadow(const void *shadow_start,
+				const void *shadow_end);
+
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
 	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index bd837b8c2f41..64710148941e 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -5,4 +5,4 @@ CFLAGS_REMOVE_kasan.o = -pg
 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
 
-obj-y := kasan.o report.o
+obj-y := kasan.o report.o kasan_init.o
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
new file mode 100644
index 000000000000..3f9a41cf0ac6
--- /dev/null
+++ b/mm/kasan/kasan_init.c
@@ -0,0 +1,152 @@
+/*
+ * This file contains some kasan initialization code.
+ *
+ * Copyright (c) 2015 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/pfn.h>
+
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+
+/*
+ * This page serves two purposes:
+ *   - It used as early shadow memory. The entire shadow region populated
+ *     with this page, before we will be able to setup normal shadow memory.
+ *   - Latter it reused it as zero shadow to cover large ranges of memory
+ *     that allowed to access, but not handled by kasan (vmalloc/vmemmap ...).
+ */
+unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
+
+#if CONFIG_PGTABLE_LEVELS > 3
+pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+#endif
+#if CONFIG_PGTABLE_LEVELS > 2
+pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+#endif
+pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+
+static __init void *early_alloc(size_t size, int node)
+{
+	return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
+					BOOTMEM_ALLOC_ACCESSIBLE, node);
+}
+
+static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+				unsigned long end)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+	pte_t zero_pte;
+
+	zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL);
+	zero_pte = pte_wrprotect(zero_pte);
+
+	while (addr + PAGE_SIZE <= end) {
+		set_pte_at(&init_mm, addr, pte, zero_pte);
+		addr += PAGE_SIZE;
+		pte = pte_offset_kernel(pmd, addr);
+	}
+}
+
+static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+				unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+	unsigned long next;
+
+	do {
+		next = pmd_addr_end(addr, end);
+
+		if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
+			pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+			continue;
+		}
+
+		if (pmd_none(*pmd)) {
+			pmd_populate_kernel(&init_mm, pmd,
+					early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+		}
+		zero_pte_populate(pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+				unsigned long end)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+	unsigned long next;
+
+	do {
+		next = pud_addr_end(addr, end);
+		if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
+			pmd_t *pmd;
+
+			pud_populate(&init_mm, pud, kasan_zero_pmd);
+			pmd = pmd_offset(pud, addr);
+			pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+			continue;
+		}
+
+		if (pud_none(*pud)) {
+			pud_populate(&init_mm, pud,
+				early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+		}
+		zero_pmd_populate(pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+}
+
+/**
+ * kasan_populate_zero_shadow - populate shadow memory region with
+ *                               kasan_zero_page
+ * @shadow_start - start of the memory range to populate
+ * @shadow_end   - end of the memory range to populate
+ */
+void __init kasan_populate_zero_shadow(const void *shadow_start,
+				const void *shadow_end)
+{
+	unsigned long addr = (unsigned long)shadow_start;
+	unsigned long end = (unsigned long)shadow_end;
+	pgd_t *pgd = pgd_offset_k(addr);
+	unsigned long next;
+
+	do {
+		next = pgd_addr_end(addr, end);
+
+		if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) {
+			pud_t *pud;
+			pmd_t *pmd;
+
+			/*
+			 * kasan_zero_pud should be populated with pmds
+			 * at this moment.
+			 * [pud,pmd]_populate*() below needed only for
+			 * 3,2 - level page tables where we don't have
+			 * puds,pmds, so pgd_populate(), pud_populate()
+			 * is noops.
+			 */
+			pgd_populate(&init_mm, pgd, kasan_zero_pud);
+			pud = pud_offset(pgd, addr);
+			pud_populate(&init_mm, pud, kasan_zero_pmd);
+			pmd = pmd_offset(pud, addr);
+			pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+			continue;
+		}
+
+		if (pgd_none(*pgd)) {
+			pgd_populate(&init_mm, pgd,
+				early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+		}
+		zero_pud_populate(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
-- 
cgit v1.2.3-70-g09d2


From b7fe10e5ebac2a3f37e95535e616494b65fa020f Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 19 Aug 2015 17:07:32 -0700
Subject: gro: Fix remcsum offload to deal with frags in GRO

The remote checksum offload GRO did not consider the case that frag0
might be in use. This patch fixes that by accessing headers using the
skb_gro functions and not saving offsets relative to skb->head.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 23 +++++++++--------------
 include/linux/netdevice.h | 44 ++++++++++++++++++++++++++++++++------------
 net/ipv4/fou.c            | 28 ++++++++++++----------------
 3 files changed, 53 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 54615bb9d916..64fcd2402562 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -519,10 +519,10 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 					  u32 data, struct gro_remcsum *grc,
 					  bool nopartial)
 {
-	size_t start, offset, plen;
+	size_t start, offset;
 
 	if (skb->remcsum_offload)
-		return NULL;
+		return vh;
 
 	if (!NAPI_GRO_CB(skb)->csum_valid)
 		return NULL;
@@ -532,17 +532,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 			  offsetof(struct udphdr, check) :
 			  offsetof(struct tcphdr, check));
 
-	plen = hdrlen + offset + sizeof(u16);
-
-	/* Pull checksum that will be written */
-	if (skb_gro_header_hard(skb, off + plen)) {
-		vh = skb_gro_header_slow(skb, off + plen, off);
-		if (!vh)
-			return NULL;
-	}
-
-	skb_gro_remcsum_process(skb, (void *)vh + hdrlen,
-				start, offset, grc, nopartial);
+	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
+				     start, offset, grc, nopartial);
 
 	skb->remcsum_offload = 1;
 
@@ -573,7 +564,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
-	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
 	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));
 
 	flags = ntohl(vh->vx_flags);
@@ -588,6 +578,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
+	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
+
 	flush = 0;
 
 	for (p = *head; p; p = p->next) {
@@ -1110,6 +1102,9 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 {
 	size_t start, offset, plen;
 
+	if (skb->remcsum_offload)
+		return vh;
+
 	start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
 	offset = start + ((data & VXLAN_RCO_UDP) ?
 			  offsetof(struct udphdr, check) :
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4bd177faa90c..6abe0d6f1e1d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2311,8 +2311,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
 
 static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
 {
-	return (NAPI_GRO_CB(skb)->gro_remcsum_start - skb_headroom(skb) ==
-		skb_gro_offset(skb));
+	return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb));
 }
 
 static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
@@ -2408,37 +2407,58 @@ static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
 	grc->delta = 0;
 }
 
-static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
-					   int start, int offset,
-					   struct gro_remcsum *grc,
-					   bool nopartial)
+static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
+					    unsigned int off, size_t hdrlen,
+					    int start, int offset,
+					    struct gro_remcsum *grc,
+					    bool nopartial)
 {
 	__wsum delta;
+	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
 
 	BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
 
 	if (!nopartial) {
-		NAPI_GRO_CB(skb)->gro_remcsum_start =
-		    ((unsigned char *)ptr + start) - skb->head;
-		return;
+		NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
+		return ptr;
+	}
+
+	ptr = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, off + plen)) {
+		ptr = skb_gro_header_slow(skb, off + plen, off);
+		if (!ptr)
+			return NULL;
 	}
 
-	delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset);
+	delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
+			       start, offset);
 
 	/* Adjust skb->csum since we changed the packet */
 	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
 
-	grc->offset = (ptr + offset) - (void *)skb->head;
+	grc->offset = off + hdrlen + offset;
 	grc->delta = delta;
+
+	return ptr;
 }
 
 static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 					   struct gro_remcsum *grc)
 {
+	void *ptr;
+	size_t plen = grc->offset + sizeof(u16);
+
 	if (!grc->delta)
 		return;
 
-	remcsum_unadjust((__sum16 *)(skb->head + grc->offset), grc->delta);
+	ptr = skb_gro_header_fast(skb, grc->offset);
+	if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) {
+		ptr = skb_gro_header_slow(skb, plen, grc->offset);
+		if (!ptr)
+			return;
+	}
+
+	remcsum_unadjust((__sum16 *)ptr, grc->delta);
 }
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 34968cd5c146..eb11f9506894 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -79,7 +79,11 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
 	__be16 *pd = data;
 	size_t start = ntohs(pd[0]);
 	size_t offset = ntohs(pd[1]);
-	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+	size_t plen = sizeof(struct udphdr) + hdrlen +
+	    max_t(size_t, offset + sizeof(u16), start);
+
+	if (skb->remcsum_offload)
+		return guehdr;
 
 	if (!pskb_may_pull(skb, plen))
 		return NULL;
@@ -221,29 +225,21 @@ out_unlock:
 
 static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 				      struct guehdr *guehdr, void *data,
-				      size_t hdrlen, u8 ipproto,
-				      struct gro_remcsum *grc, bool nopartial)
+				      size_t hdrlen, struct gro_remcsum *grc,
+				      bool nopartial)
 {
 	__be16 *pd = data;
 	size_t start = ntohs(pd[0]);
 	size_t offset = ntohs(pd[1]);
-	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
 
 	if (skb->remcsum_offload)
-		return NULL;
+		return guehdr;
 
 	if (!NAPI_GRO_CB(skb)->csum_valid)
 		return NULL;
 
-	/* Pull checksum that will be written */
-	if (skb_gro_header_hard(skb, off + plen)) {
-		guehdr = skb_gro_header_slow(skb, off + plen, off);
-		if (!guehdr)
-			return NULL;
-	}
-
-	skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
-				start, offset, grc, nopartial);
+	guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen,
+					 start, offset, grc, nopartial);
 
 	skb->remcsum_offload = 1;
 
@@ -307,10 +303,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 
 		if (flags & GUE_PFLAG_REMCSUM) {
 			guehdr = gue_gro_remcsum(skb, off, guehdr,
-						 data + doffset, hdrlen,
-						 guehdr->proto_ctype, &grc,
+						 data + doffset, hdrlen, &grc,
 						 !!(fou->flags &
 						    FOU_F_REMCSUM_NOPARTIAL));
+
 			if (!guehdr)
 				goto out;
 
-- 
cgit v1.2.3-70-g09d2


From c5ebb387f4e6b2dd7c74d71caf7b696834d0c887 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Tue, 19 May 2015 17:44:31 +0200
Subject: i2c: add a flag to mark clients as slaves

And update indentation with one more tab, sigh...

Tested-by: Andrey Danin <danindrey@mail.ru>
Acked-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e2c859b74f8b..5aea07137218 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -550,11 +550,12 @@ void i2c_lock_adapter(struct i2c_adapter *);
 void i2c_unlock_adapter(struct i2c_adapter *);
 
 /*flags for the client struct: */
-#define I2C_CLIENT_PEC	0x04		/* Use Packet Error Checking */
-#define I2C_CLIENT_TEN	0x10		/* we have a ten bit chip address */
+#define I2C_CLIENT_PEC		0x04	/* Use Packet Error Checking */
+#define I2C_CLIENT_TEN		0x10	/* we have a ten bit chip address */
 					/* Must equal I2C_M_TEN below */
-#define I2C_CLIENT_WAKE	0x80		/* for board_info; true iff can wake */
-#define I2C_CLIENT_SCCB	0x9000		/* Use Omnivision SCCB protocol */
+#define I2C_CLIENT_SLAVE	0x20	/* we are the slave */
+#define I2C_CLIENT_WAKE		0x80	/* for board_info; true iff can wake */
+#define I2C_CLIENT_SCCB		0x9000	/* Use Omnivision SCCB protocol */
 					/* Must match I2C_M_STOP|IGNORE_NAK */
 
 /* i2c adapter classes (bitmask) */
-- 
cgit v1.2.3-70-g09d2


From b3fdd32799d834e2626fae087906e886037350c6 Mon Sep 17 00:00:00 2001
From: York Sun <yorksun@freescale.com>
Date: Mon, 17 Aug 2015 11:53:48 -0700
Subject: i2c: mux: Add register-based mux i2c-mux-reg

Based on i2c-mux-gpio driver, similarly the register-based mux
switch from one bus to another by setting a single register.
The register can be on PCIe bus, local bus, or any memory-mapped
address. The endianness of such register can be specified in device
tree if used, or in platform data.

Signed-off-by: York Sun <yorksun@freescale.com>
Acked-by: Alexander Sverdlin <alexander.sverdlin@nokia.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 .../devicetree/bindings/i2c/i2c-mux-reg.txt        |  74 ++++++
 drivers/i2c/muxes/Kconfig                          |  11 +
 drivers/i2c/muxes/Makefile                         |   1 +
 drivers/i2c/muxes/i2c-mux-reg.c                    | 294 +++++++++++++++++++++
 include/linux/platform_data/i2c-mux-reg.h          |  44 +++
 5 files changed, 424 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/i2c/i2c-mux-reg.txt
 create mode 100644 drivers/i2c/muxes/i2c-mux-reg.c
 create mode 100644 include/linux/platform_data/i2c-mux-reg.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/i2c/i2c-mux-reg.txt b/Documentation/devicetree/bindings/i2c/i2c-mux-reg.txt
new file mode 100644
index 000000000000..688783fbe696
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/i2c-mux-reg.txt
@@ -0,0 +1,74 @@
+Register-based I2C Bus Mux
+
+This binding describes an I2C bus multiplexer that uses a single register
+to route the I2C signals.
+
+Required properties:
+- compatible: i2c-mux-reg
+- i2c-parent: The phandle of the I2C bus that this multiplexer's master-side
+  port is connected to.
+* Standard I2C mux properties. See mux.txt in this directory.
+* I2C child bus nodes. See mux.txt in this directory.
+
+Optional properties:
+- reg: this pair of <offset size> specifies the register to control the mux.
+  The <offset size> depends on its parent node. It can be any memory-mapped
+  address. The size must be either 1, 2, or 4 bytes. If reg is omitted, the
+  resource of this device will be used.
+- little-endian: The existence indicates the register is in little endian.
+- big-endian: The existence indicates the register is in big endian.
+  If both little-endian and big-endian are omitted, the endianness of the
+  CPU will be used.
+- write-only: The existence indicates the register is write-only.
+- idle-state: value to set the muxer to when idle. When no value is
+  given, it defaults to the last value used.
+
+Whenever an access is made to a device on a child bus, the value set
+in the revelant node's reg property will be output to the register.
+
+If an idle state is defined, using the idle-state (optional) property,
+whenever an access is not being made to a device on a child bus, the
+register will be set according to the idle value.
+
+If an idle state is not defined, the most recently used value will be
+left programmed into the register.
+
+Example of a mux on PCIe card, the host is a powerpc SoC (big endian):
+
+	i2c-mux {
+		/* the <offset size> depends on the address translation
+		 * of the parent device. If omitted, device resource
+		 * will be used instead. The size is to determine
+		 * whether iowrite32, iowrite16, or iowrite8 will be used.
+		 */
+		reg = <0x6028 0x4>;
+		little-endian;		/* little endian register on PCIe */
+		compatible = "i2c-mux-reg";
+		#address-cells = <1>;
+		#size-cells = <0>;
+		i2c-parent = <&i2c1>;
+		i2c@0 {
+			reg = <0>;
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			si5338: clock-generator@70 {
+				compatible = "silabs,si5338";
+				reg = <0x70>;
+				/* other stuff */
+			};
+		};
+
+		i2c@1 {
+			/* data is written using iowrite32 */
+			reg = <1>;
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			si5338: clock-generator@70 {
+				compatible = "silabs,si5338";
+				reg = <0x70>;
+				/* other stuff */
+			};
+		};
+	};
diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig
index fdd0769c84a3..f06b0e24673b 100644
--- a/drivers/i2c/muxes/Kconfig
+++ b/drivers/i2c/muxes/Kconfig
@@ -61,4 +61,15 @@ config I2C_MUX_PINCTRL
 	  This driver can also be built as a module. If so, the module will be
 	  called pinctrl-i2cmux.
 
+config I2C_MUX_REG
+	tristate "Register-based I2C multiplexer"
+	help
+	  If you say yes to this option, support will be included for a
+	  register based I2C multiplexer. This driver provides access to
+	  I2C busses connected through a MUX, which is controlled
+	  by a single register.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called i2c-mux-reg.
+
 endmenu
diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile
index 465778b5d5dc..e89799b76a92 100644
--- a/drivers/i2c/muxes/Makefile
+++ b/drivers/i2c/muxes/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_I2C_MUX_GPIO)	+= i2c-mux-gpio.o
 obj-$(CONFIG_I2C_MUX_PCA9541)	+= i2c-mux-pca9541.o
 obj-$(CONFIG_I2C_MUX_PCA954x)	+= i2c-mux-pca954x.o
 obj-$(CONFIG_I2C_MUX_PINCTRL)	+= i2c-mux-pinctrl.o
+obj-$(CONFIG_I2C_MUX_REG)	+= i2c-mux-reg.o
 
 ccflags-$(CONFIG_I2C_DEBUG_BUS) := -DDEBUG
diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c
new file mode 100644
index 000000000000..86d41d36a783
--- /dev/null
+++ b/drivers/i2c/muxes/i2c-mux-reg.c
@@ -0,0 +1,294 @@
+/*
+ * I2C multiplexer using a single register
+ *
+ * Copyright 2015 Freescale Semiconductor
+ * York Sun  <yorksun@freescale.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/platform_data/i2c-mux-reg.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+struct regmux {
+	struct i2c_adapter *parent;
+	struct i2c_adapter **adap; /* child busses */
+	struct i2c_mux_reg_platform_data data;
+};
+
+static int i2c_mux_reg_set(const struct regmux *mux, unsigned int chan_id)
+{
+	if (!mux->data.reg)
+		return -EINVAL;
+
+	switch (mux->data.reg_size) {
+	case 4:
+		if (mux->data.little_endian) {
+			iowrite32(chan_id, mux->data.reg);
+			if (!mux->data.write_only)
+				ioread32(mux->data.reg);
+		} else {
+			iowrite32be(chan_id, mux->data.reg);
+			if (!mux->data.write_only)
+				ioread32(mux->data.reg);
+		}
+		break;
+	case 2:
+		if (mux->data.little_endian) {
+			iowrite16(chan_id, mux->data.reg);
+			if (!mux->data.write_only)
+				ioread16(mux->data.reg);
+		} else {
+			iowrite16be(chan_id, mux->data.reg);
+			if (!mux->data.write_only)
+				ioread16be(mux->data.reg);
+		}
+		break;
+	case 1:
+		iowrite8(chan_id, mux->data.reg);
+		if (!mux->data.write_only)
+			ioread8(mux->data.reg);
+		break;
+	default:
+		pr_err("Invalid register size\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int i2c_mux_reg_select(struct i2c_adapter *adap, void *data,
+			      unsigned int chan)
+{
+	struct regmux *mux = data;
+
+	return i2c_mux_reg_set(mux, chan);
+}
+
+static int i2c_mux_reg_deselect(struct i2c_adapter *adap, void *data,
+				unsigned int chan)
+{
+	struct regmux *mux = data;
+
+	if (mux->data.idle_in_use)
+		return i2c_mux_reg_set(mux, mux->data.idle);
+
+	return 0;
+}
+
+#ifdef CONFIG_OF
+static int i2c_mux_reg_probe_dt(struct regmux *mux,
+					struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	struct device_node *adapter_np, *child;
+	struct i2c_adapter *adapter;
+	struct resource res;
+	unsigned *values;
+	int i = 0;
+
+	if (!np)
+		return -ENODEV;
+
+	adapter_np = of_parse_phandle(np, "i2c-parent", 0);
+	if (!adapter_np) {
+		dev_err(&pdev->dev, "Cannot parse i2c-parent\n");
+		return -ENODEV;
+	}
+	adapter = of_find_i2c_adapter_by_node(adapter_np);
+	if (!adapter)
+		return -EPROBE_DEFER;
+
+	mux->parent = adapter;
+	mux->data.parent = i2c_adapter_id(adapter);
+	put_device(&adapter->dev);
+
+	mux->data.n_values = of_get_child_count(np);
+	if (of_find_property(np, "little-endian", NULL)) {
+		mux->data.little_endian = true;
+	} else if (of_find_property(np, "big-endian", NULL)) {
+		mux->data.little_endian = false;
+	} else {
+#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : \
+	defined(__LITTLE_ENDIAN)
+		mux->data.little_endian = true;
+#elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : \
+	defined(__BIG_ENDIAN)
+		mux->data.little_endian = false;
+#else
+#error Endianness not defined?
+#endif
+	}
+	if (of_find_property(np, "write-only", NULL))
+		mux->data.write_only = true;
+	else
+		mux->data.write_only = false;
+
+	values = devm_kzalloc(&pdev->dev,
+			      sizeof(*mux->data.values) * mux->data.n_values,
+			      GFP_KERNEL);
+	if (!values) {
+		dev_err(&pdev->dev, "Cannot allocate values array");
+		return -ENOMEM;
+	}
+
+	for_each_child_of_node(np, child) {
+		of_property_read_u32(child, "reg", values + i);
+		i++;
+	}
+	mux->data.values = values;
+
+	if (!of_property_read_u32(np, "idle-state", &mux->data.idle))
+		mux->data.idle_in_use = true;
+
+	/* map address from "reg" if exists */
+	if (of_address_to_resource(np, 0, &res)) {
+		mux->data.reg_size = resource_size(&res);
+		if (mux->data.reg_size > 4) {
+			dev_err(&pdev->dev, "Invalid address size\n");
+			return -EINVAL;
+		}
+		mux->data.reg = devm_ioremap_resource(&pdev->dev, &res);
+		if (IS_ERR(mux->data.reg))
+			return PTR_ERR(mux->data.reg);
+	}
+
+	return 0;
+}
+#else
+static int i2c_mux_reg_probe_dt(struct gpiomux *mux,
+					struct platform_device *pdev)
+{
+	return 0;
+}
+#endif
+
+static int i2c_mux_reg_probe(struct platform_device *pdev)
+{
+	struct regmux *mux;
+	struct i2c_adapter *parent;
+	struct resource *res;
+	int (*deselect)(struct i2c_adapter *, void *, u32);
+	unsigned int class;
+	int i, ret, nr;
+
+	mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL);
+	if (!mux)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, mux);
+
+	if (dev_get_platdata(&pdev->dev)) {
+		memcpy(&mux->data, dev_get_platdata(&pdev->dev),
+			sizeof(mux->data));
+
+		parent = i2c_get_adapter(mux->data.parent);
+		if (!parent)
+			return -EPROBE_DEFER;
+
+		mux->parent = parent;
+	} else {
+		ret = i2c_mux_reg_probe_dt(mux, pdev);
+		if (ret < 0) {
+			dev_err(&pdev->dev, "Error parsing device tree");
+			return ret;
+		}
+	}
+
+	if (!mux->data.reg) {
+		dev_info(&pdev->dev,
+			"Register not set, using platform resource\n");
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+		mux->data.reg_size = resource_size(res);
+		if (mux->data.reg_size > 4) {
+			dev_err(&pdev->dev, "Invalid resource size\n");
+			return -EINVAL;
+		}
+		mux->data.reg = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(mux->data.reg))
+			return PTR_ERR(mux->data.reg);
+	}
+
+	mux->adap = devm_kzalloc(&pdev->dev,
+				 sizeof(*mux->adap) * mux->data.n_values,
+				 GFP_KERNEL);
+	if (!mux->adap) {
+		dev_err(&pdev->dev, "Cannot allocate i2c_adapter structure");
+		return -ENOMEM;
+	}
+
+	if (mux->data.idle_in_use)
+		deselect = i2c_mux_reg_deselect;
+	else
+		deselect = NULL;
+
+	for (i = 0; i < mux->data.n_values; i++) {
+		nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0;
+		class = mux->data.classes ? mux->data.classes[i] : 0;
+
+		mux->adap[i] = i2c_add_mux_adapter(mux->parent, &pdev->dev, mux,
+						   nr, mux->data.values[i],
+						   class, i2c_mux_reg_select,
+						   deselect);
+		if (!mux->adap[i]) {
+			ret = -ENODEV;
+			dev_err(&pdev->dev, "Failed to add adapter %d\n", i);
+			goto add_adapter_failed;
+		}
+	}
+
+	dev_dbg(&pdev->dev, "%d port mux on %s adapter\n",
+		 mux->data.n_values, mux->parent->name);
+
+	return 0;
+
+add_adapter_failed:
+	for (; i > 0; i--)
+		i2c_del_mux_adapter(mux->adap[i - 1]);
+
+	return ret;
+}
+
+static int i2c_mux_reg_remove(struct platform_device *pdev)
+{
+	struct regmux *mux = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < mux->data.n_values; i++)
+		i2c_del_mux_adapter(mux->adap[i]);
+
+	i2c_put_adapter(mux->parent);
+
+	return 0;
+}
+
+static const struct of_device_id i2c_mux_reg_of_match[] = {
+	{ .compatible = "i2c-mux-reg", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, i2c_mux_reg_of_match);
+
+static struct platform_driver i2c_mux_reg_driver = {
+	.probe	= i2c_mux_reg_probe,
+	.remove	= i2c_mux_reg_remove,
+	.driver	= {
+		.name	= "i2c-mux-reg",
+	},
+};
+
+module_platform_driver(i2c_mux_reg_driver);
+
+MODULE_DESCRIPTION("Register-based I2C multiplexer driver");
+MODULE_AUTHOR("York Sun <yorksun@freescale.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:i2c-mux-reg");
diff --git a/include/linux/platform_data/i2c-mux-reg.h b/include/linux/platform_data/i2c-mux-reg.h
new file mode 100644
index 000000000000..c68712aadf43
--- /dev/null
+++ b/include/linux/platform_data/i2c-mux-reg.h
@@ -0,0 +1,44 @@
+/*
+ * I2C multiplexer using a single register
+ *
+ * Copyright 2015 Freescale Semiconductor
+ * York Sun <yorksun@freescale.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#ifndef __LINUX_PLATFORM_DATA_I2C_MUX_REG_H
+#define __LINUX_PLATFORM_DATA_I2C_MUX_REG_H
+
+/**
+ * struct i2c_mux_reg_platform_data - Platform-dependent data for i2c-mux-reg
+ * @parent: Parent I2C bus adapter number
+ * @base_nr: Base I2C bus number to number adapters from or zero for dynamic
+ * @values: Array of value for each channel
+ * @n_values: Number of multiplexer channels
+ * @little_endian: Indicating if the register is in little endian
+ * @write_only: Reading the register is not allowed by hardware
+ * @classes: Optional I2C auto-detection classes
+ * @idle: Value to write to mux when idle
+ * @idle_in_use: indicate if idle value is in use
+ * @reg: Virtual address of the register to switch channel
+ * @reg_size: register size in bytes
+ */
+struct i2c_mux_reg_platform_data {
+	int parent;
+	int base_nr;
+	const unsigned int *values;
+	int n_values;
+	bool little_endian;
+	bool write_only;
+	const unsigned int *classes;
+	u32 idle;
+	bool idle_in_use;
+	void __iomem *reg;
+	resource_size_t reg_size;
+};
+
+#endif	/* __LINUX_PLATFORM_DATA_I2C_MUX_REG_H */
-- 
cgit v1.2.3-70-g09d2


From 01eef96e37d77cd89156e5f51aab81a9d5c96539 Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Wed, 12 Aug 2015 17:31:33 +0300
Subject: i2c: core: Add support for best effort block read emulation

There are devices that need to handle block transactions
regardless of the capabilities exported by the adapter.
For performance reasons, they need to use i2c read blocks
if available, otherwise emulate the block transaction with word
or byte transactions.

Add support for a helper function that would read a data block
using the best transfer available: I2C_FUNC_SMBUS_READ_I2C_BLOCK,
I2C_FUNC_SMBUS_READ_WORD_DATA or I2C_FUNC_SMBUS_READ_BYTE_DATA.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c.h    |  3 +++
 2 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index a6780289c61d..98f6c75b1d18 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -3007,6 +3007,63 @@ trace:
 }
 EXPORT_SYMBOL(i2c_smbus_xfer);
 
+/**
+ * i2c_smbus_read_i2c_block_data_or_emulated - read block or emulate
+ * @client: Handle to slave device
+ * @command: Byte interpreted by slave
+ * @length: Size of data block; SMBus allows at most I2C_SMBUS_BLOCK_MAX bytes
+ * @values: Byte array into which data will be read; big enough to hold
+ *	the data returned by the slave.  SMBus allows at most
+ *	I2C_SMBUS_BLOCK_MAX bytes.
+ *
+ * This executes the SMBus "block read" protocol if supported by the adapter.
+ * If block read is not supported, it emulates it using either word or byte
+ * read protocols depending on availability.
+ *
+ * The addresses of the I2C slave device that are accessed with this function
+ * must be mapped to a linear region, so that a block read will have the same
+ * effect as a byte read. Before using this function you must double-check
+ * if the I2C slave does support exchanging a block transfer with a byte
+ * transfer.
+ */
+s32 i2c_smbus_read_i2c_block_data_or_emulated(const struct i2c_client *client,
+					      u8 command, u8 length, u8 *values)
+{
+	u8 i = 0;
+	int status;
+
+	if (length > I2C_SMBUS_BLOCK_MAX)
+		length = I2C_SMBUS_BLOCK_MAX;
+
+	if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_I2C_BLOCK))
+		return i2c_smbus_read_i2c_block_data(client, command, length, values);
+
+	if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_BYTE_DATA))
+		return -EOPNOTSUPP;
+
+	if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_WORD_DATA)) {
+		while ((i + 2) <= length) {
+			status = i2c_smbus_read_word_data(client, command + i);
+			if (status < 0)
+				return status;
+			values[i] = status & 0xff;
+			values[i + 1] = status >> 8;
+			i += 2;
+		}
+	}
+
+	while (i < length) {
+		status = i2c_smbus_read_byte_data(client, command + i);
+		if (status < 0)
+			return status;
+		values[i] = status;
+		i++;
+	}
+
+	return i;
+}
+EXPORT_SYMBOL(i2c_smbus_read_i2c_block_data_or_emulated);
+
 #if IS_ENABLED(CONFIG_I2C_SLAVE)
 int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 5aea07137218..768063baafbf 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -121,6 +121,9 @@ extern s32 i2c_smbus_read_i2c_block_data(const struct i2c_client *client,
 extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client,
 					  u8 command, u8 length,
 					  const u8 *values);
+extern s32
+i2c_smbus_read_i2c_block_data_or_emulated(const struct i2c_client *client,
+					  u8 command, u8 length, u8 *values);
 #endif /* I2C */
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 27d868b5e6cfaee4fec66b388e4085ff94050fa7 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 24 Aug 2015 08:48:16 -0500
Subject: PCI: Set MPS to match upstream bridge

Firmware typically configures the PCIe fabric with a consistent Max Payload
Size setting based on the devices present at boot.  A hot-added device
typically has the power-on default MPS setting (128 bytes), which may not
match the fabric.

The previous Linux default, in the absence of any "pci=pcie_bus_*" options,
was PCIE_BUS_TUNE_OFF, in which we never touch MPS, even for hot-added
devices.

Add a new default setting, PCIE_BUS_DEFAULT, in which we make sure every
device's MPS setting matches the upstream bridge.  This makes it more
likely that a hot-added device will work in a system with optimized MPS
configuration.

Note that if we hot-add a device that only supports 128-byte MPS, it still
likely won't work because we don't reconfigure the rest of the fabric.
Booting with "pci=pcie_bus_peer2peer" is a workaround for this because it
sets MPS to 128 for everything.

[bhelgaas: changelog, new default, rework for pci_configure_device() path]
Tested-by: Keith Busch <keith.busch@intel.com>
Tested-by: Jordan Hargrave <jharg93@gmail.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
---
 drivers/pci/pci.c    |  2 +-
 drivers/pci/probe.c  | 22 ++++++++++++++++++++--
 drivers/pci/quirks.c |  3 ++-
 include/linux/pci.h  |  9 +++++----
 4 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 0008c950452c..b96b4ccc2819 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -81,7 +81,7 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
 unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
 unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
 
-enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_TUNE_OFF;
+enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_DEFAULT;
 
 /*
  * The default CLS is used if arch didn't set CLS explicitly and not
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index eb32395d5897..eebabe3a814e 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1278,7 +1278,7 @@ int pci_setup_device(struct pci_dev *dev)
 static void pci_configure_mps(struct pci_dev *dev)
 {
 	struct pci_dev *bridge = pci_upstream_bridge(dev);
-	int mps, p_mps;
+	int mps, p_mps, rc;
 
 	if (!pci_is_pcie(dev) || !bridge || !pci_is_pcie(bridge))
 		return;
@@ -1294,6 +1294,23 @@ static void pci_configure_mps(struct pci_dev *dev)
 			 mps, pci_name(bridge), p_mps);
 		return;
 	}
+
+	/*
+	 * Fancier MPS configuration is done later by
+	 * pcie_bus_configure_settings()
+	 */
+	if (pcie_bus_config != PCIE_BUS_DEFAULT)
+		return;
+
+	rc = pcie_set_mps(dev, p_mps);
+	if (rc) {
+		dev_warn(&dev->dev, "can't set Max Payload Size to %d; if necessary, use \"pci=pcie_bus_safe\" and report a bug\n",
+			 p_mps);
+		return;
+	}
+
+	dev_info(&dev->dev, "Max Payload Size set to %d (was %d, max %d)\n",
+		 p_mps, mps, 128 << dev->pcie_mpss);
 }
 
 static struct hpp_type0 pci_default_type0 = {
@@ -1821,7 +1838,8 @@ static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
 	if (!pci_is_pcie(dev))
 		return 0;
 
-	if (pcie_bus_config == PCIE_BUS_TUNE_OFF)
+	if (pcie_bus_config == PCIE_BUS_TUNE_OFF ||
+	    pcie_bus_config == PCIE_BUS_DEFAULT)
 		return 0;
 
 	mps = 128 << *(u8 *)data;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index e9fd0e90fa3b..55f2c208bd0e 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -2862,7 +2862,8 @@ static void quirk_intel_mc_errata(struct pci_dev *dev)
 	int err;
 	u16 rcc;
 
-	if (pcie_bus_config == PCIE_BUS_TUNE_OFF)
+	if (pcie_bus_config == PCIE_BUS_TUNE_OFF ||
+	    pcie_bus_config == PCIE_BUS_DEFAULT)
 		return;
 
 	/* Intel errata specifies bits to change but does not say what they are.
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4d4f9d29fcc9..0007942923a7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -738,10 +738,11 @@ struct pci_driver {
 void pcie_bus_configure_settings(struct pci_bus *bus);
 
 enum pcie_bus_config_types {
-	PCIE_BUS_TUNE_OFF,
-	PCIE_BUS_SAFE,
-	PCIE_BUS_PERFORMANCE,
-	PCIE_BUS_PEER2PEER,
+	PCIE_BUS_TUNE_OFF,	/* don't touch MPS at all */
+	PCIE_BUS_DEFAULT,	/* ensure MPS matches upstream bridge */
+	PCIE_BUS_SAFE,		/* use largest MPS boot-time devices support */
+	PCIE_BUS_PERFORMANCE,	/* use MPS and MRRS for best performance */
+	PCIE_BUS_PEER2PEER,	/* set MPS = 128 for all devices */
 };
 
 extern enum pcie_bus_config_types pcie_bus_config;
-- 
cgit v1.2.3-70-g09d2


From f8bcbe62acd0e1ce9004b83e98a4af87ae385dcf Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Sat, 8 Aug 2015 10:44:10 +0200
Subject: lib: scatterlist: add sg splitting function

Sometimes a scatter-gather has to be split into several chunks, or sub
scatter lists. This happens for example if a scatter list will be
handled by multiple DMA channels, each one filling a part of it.

A concrete example comes with the media V4L2 API, where the scatter list
is allocated from userspace to hold an image, regardless of the
knowledge of how many DMAs will fill it :
 - in a simple RGB565 case, one DMA will pump data from the camera ISP
   to memory
 - in the trickier YUV422 case, 3 DMAs will pump data from the camera
   ISP pipes, one for pipe Y, one for pipe U and one for pipe V

For these cases, it is necessary to split the original scatter list into
multiple scatter lists, which is the purpose of this patch.

The guarantees that are required for this patch are :
 - the intersection of spans of any couple of resulting scatter lists is
   empty.
 - the union of spans of all resulting scatter lists is a subrange of
   the span of the original scatter list.
 - streaming DMA API operations (mapping, unmapping) should not happen
   both on both the resulting and the original scatter list. It's either
   the first or the later ones.
 - the caller is reponsible to call kfree() on the resulting
   scatterlists.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/scatterlist.h |   5 ++
 lib/Kconfig                 |   7 ++
 lib/Makefile                |   1 +
 lib/sg_split.c              | 202 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 215 insertions(+)
 create mode 100644 lib/sg_split.c

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 698e906ca730..556ec1ea2574 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -247,6 +247,11 @@ struct scatterlist *sg_next(struct scatterlist *);
 struct scatterlist *sg_last(struct scatterlist *s, unsigned int);
 void sg_init_table(struct scatterlist *, unsigned int);
 void sg_init_one(struct scatterlist *, const void *, unsigned int);
+int sg_split(struct scatterlist *in, const int in_mapped_nents,
+	     const off_t skip, const int nb_splits,
+	     const size_t *split_sizes,
+	     struct scatterlist **out, int *out_mapped_nents,
+	     gfp_t gfp_mask);
 
 typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t);
 typedef void (sg_free_fn)(struct scatterlist *, unsigned int);
diff --git a/lib/Kconfig b/lib/Kconfig
index 3a2ef67db6c7..dc516164415a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -521,6 +521,13 @@ config UCS2_STRING
 
 source "lib/fonts/Kconfig"
 
+config SG_SPLIT
+	def_bool n
+	help
+	 Provides a heler to split scatterlists into chunks, each chunk being a
+	 scatterlist. This should be selected by a driver or an API which
+	 whishes to split a scatterlist amongst multiple DMA channel.
+
 #
 # sg chaining option
 #
diff --git a/lib/Makefile b/lib/Makefile
index 6897b527581a..2ee6ea2e9b08 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -160,6 +160,7 @@ obj-$(CONFIG_GENERIC_STRNLEN_USER) += strnlen_user.o
 
 obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o
 
+obj-$(CONFIG_SG_SPLIT) += sg_split.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
 
 libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
diff --git a/lib/sg_split.c b/lib/sg_split.c
new file mode 100644
index 000000000000..b063410c3593
--- /dev/null
+++ b/lib/sg_split.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2015 Robert Jarzmik <robert.jarzmik@free.fr>
+ *
+ * Scatterlist splitting helpers.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+
+struct sg_splitter {
+	struct scatterlist *in_sg0;
+	int nents;
+	off_t skip_sg0;
+	unsigned int length_last_sg;
+
+	struct scatterlist *out_sg;
+};
+
+static int sg_calculate_split(struct scatterlist *in, int nents, int nb_splits,
+			      off_t skip, const size_t *sizes,
+			      struct sg_splitter *splitters, bool mapped)
+{
+	int i;
+	unsigned int sglen;
+	size_t size = sizes[0], len;
+	struct sg_splitter *curr = splitters;
+	struct scatterlist *sg;
+
+	for (i = 0; i < nb_splits; i++) {
+		splitters[i].in_sg0 = NULL;
+		splitters[i].nents = 0;
+	}
+
+	for_each_sg(in, sg, nents, i) {
+		sglen = mapped ? sg_dma_len(sg) : sg->length;
+		if (skip > sglen) {
+			skip -= sglen;
+			continue;
+		}
+
+		len = min_t(size_t, size, sglen - skip);
+		if (!curr->in_sg0) {
+			curr->in_sg0 = sg;
+			curr->skip_sg0 = skip;
+		}
+		size -= len;
+		curr->nents++;
+		curr->length_last_sg = len;
+
+		while (!size && (skip + len < sglen) && (--nb_splits > 0)) {
+			curr++;
+			size = *(++sizes);
+			skip += len;
+			len = min_t(size_t, size, sglen - skip);
+
+			curr->in_sg0 = sg;
+			curr->skip_sg0 = skip;
+			curr->nents = 1;
+			curr->length_last_sg = len;
+			size -= len;
+		}
+		skip = 0;
+
+		if (!size && --nb_splits > 0) {
+			curr++;
+			size = *(++sizes);
+		}
+
+		if (!nb_splits)
+			break;
+	}
+
+	return (size || !splitters[0].in_sg0) ? -EINVAL : 0;
+}
+
+static void sg_split_phys(struct sg_splitter *splitters, const int nb_splits)
+{
+	int i, j;
+	struct scatterlist *in_sg, *out_sg;
+	struct sg_splitter *split;
+
+	for (i = 0, split = splitters; i < nb_splits; i++, split++) {
+		in_sg = split->in_sg0;
+		out_sg = split->out_sg;
+		for (j = 0; j < split->nents; j++, out_sg++) {
+			*out_sg = *in_sg;
+			if (!j) {
+				out_sg->offset += split->skip_sg0;
+				out_sg->length -= split->skip_sg0;
+			} else {
+				out_sg->offset = 0;
+			}
+			sg_dma_address(out_sg) = 0;
+			sg_dma_len(out_sg) = 0;
+			in_sg = sg_next(in_sg);
+		}
+		out_sg[-1].length = split->length_last_sg;
+		sg_mark_end(out_sg - 1);
+	}
+}
+
+static void sg_split_mapped(struct sg_splitter *splitters, const int nb_splits)
+{
+	int i, j;
+	struct scatterlist *in_sg, *out_sg;
+	struct sg_splitter *split;
+
+	for (i = 0, split = splitters; i < nb_splits; i++, split++) {
+		in_sg = split->in_sg0;
+		out_sg = split->out_sg;
+		for (j = 0; j < split->nents; j++, out_sg++) {
+			sg_dma_address(out_sg) = sg_dma_address(in_sg);
+			sg_dma_len(out_sg) = sg_dma_len(in_sg);
+			if (!j) {
+				sg_dma_address(out_sg) += split->skip_sg0;
+				sg_dma_len(out_sg) -= split->skip_sg0;
+			}
+			in_sg = sg_next(in_sg);
+		}
+		sg_dma_len(--out_sg) = split->length_last_sg;
+	}
+}
+
+/**
+ * sg_split - split a scatterlist into several scatterlists
+ * @in: the input sg list
+ * @in_mapped_nents: the result of a dma_map_sg(in, ...), or 0 if not mapped.
+ * @skip: the number of bytes to skip in the input sg list
+ * @nb_splits: the number of desired sg outputs
+ * @split_sizes: the respective size of each output sg list in bytes
+ * @out: an array where to store the allocated output sg lists
+ * @out_mapped_nents: the resulting sg lists mapped number of sg entries. Might
+ *                    be NULL if sglist not already mapped (in_mapped_nents = 0)
+ * @gfp_mask: the allocation flag
+ *
+ * This function splits the input sg list into nb_splits sg lists, which are
+ * allocated and stored into out.
+ * The @in is split into :
+ *  - @out[0], which covers bytes [@skip .. @skip + @split_sizes[0] - 1] of @in
+ *  - @out[1], which covers bytes [@skip + split_sizes[0] ..
+ *                                 @skip + @split_sizes[0] + @split_sizes[1] -1]
+ * etc ...
+ * It will be the caller's duty to kfree() out array members.
+ *
+ * Returns 0 upon success, or error code
+ */
+int sg_split(struct scatterlist *in, const int in_mapped_nents,
+	     const off_t skip, const int nb_splits,
+	     const size_t *split_sizes,
+	     struct scatterlist **out, int *out_mapped_nents,
+	     gfp_t gfp_mask)
+{
+	int i, ret;
+	struct sg_splitter *splitters;
+
+	splitters = kcalloc(nb_splits, sizeof(*splitters), gfp_mask);
+	if (!splitters)
+		return -ENOMEM;
+
+	ret = sg_calculate_split(in, sg_nents(in), nb_splits, skip, split_sizes,
+			   splitters, false);
+	if (ret < 0)
+		goto err;
+
+	ret = -ENOMEM;
+	for (i = 0; i < nb_splits; i++) {
+		splitters[i].out_sg = kmalloc_array(splitters[i].nents,
+						    sizeof(struct scatterlist),
+						    gfp_mask);
+		if (!splitters[i].out_sg)
+			goto err;
+	}
+
+	/*
+	 * The order of these 3 calls is important and should be kept.
+	 */
+	sg_split_phys(splitters, nb_splits);
+	ret = sg_calculate_split(in, in_mapped_nents, nb_splits, skip,
+				 split_sizes, splitters, true);
+	if (ret < 0)
+		goto err;
+	sg_split_mapped(splitters, nb_splits);
+
+	for (i = 0; i < nb_splits; i++) {
+		out[i] = splitters[i].out_sg;
+		if (out_mapped_nents)
+			out_mapped_nents[i] = splitters[i].nents;
+	}
+
+	kfree(splitters);
+	return 0;
+
+err:
+	for (i = 0; i < nb_splits; i++)
+		kfree(splitters[i].out_sg);
+	kfree(splitters);
+	return ret;
+}
+EXPORT_SYMBOL(sg_split);
-- 
cgit v1.2.3-70-g09d2


From 1a9c069cb2d28bb72fefee509e0d26f92d7f7166 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 25 Jun 2015 15:55:14 -0700
Subject: clk: Add clk_hw_*() APIs for use by clk providers

clk providers shouldn't need to use the consumer APIs (clk.h).
Add provider APIs to replace the __clk_*() APIs that take a
struct clk_hw as their first argument instead of a struct clk.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk.c            | 61 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/clk-provider.h |  9 +++++++
 2 files changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 4912c2e55e2c..a30fd30f4948 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -278,6 +278,12 @@ const char *__clk_get_name(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_name);
 
+const char *clk_hw_get_name(struct clk_hw *hw)
+{
+	return hw->core->name;
+}
+EXPORT_SYMBOL_GPL(clk_hw_get_name);
+
 struct clk_hw *__clk_get_hw(struct clk *clk)
 {
 	return !clk ? NULL : clk->core->hw;
@@ -290,6 +296,12 @@ u8 __clk_get_num_parents(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_num_parents);
 
+unsigned int clk_hw_get_num_parents(struct clk_hw *hw)
+{
+	return hw->core->num_parents;
+}
+EXPORT_SYMBOL_GPL(clk_hw_get_num_parents);
+
 struct clk *__clk_get_parent(struct clk *clk)
 {
 	if (!clk)
@@ -300,6 +312,12 @@ struct clk *__clk_get_parent(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_parent);
 
+struct clk_hw *clk_hw_get_parent(struct clk_hw *hw)
+{
+	return hw->core->parent ? hw->core->parent->hw : NULL;
+}
+EXPORT_SYMBOL_GPL(clk_hw_get_parent);
+
 static struct clk_core *__clk_lookup_subtree(const char *name,
 					     struct clk_core *core)
 {
@@ -370,6 +388,16 @@ struct clk *clk_get_parent_by_index(struct clk *clk, u8 index)
 }
 EXPORT_SYMBOL_GPL(clk_get_parent_by_index);
 
+struct clk_hw *clk_hw_get_parent_by_index(struct clk_hw *hw, unsigned int index)
+{
+	struct clk_core *parent;
+
+	parent = clk_core_get_parent_by_index(hw->core, index);
+
+	return !parent ? NULL : parent->hw;
+}
+EXPORT_SYMBOL_GPL(clk_hw_get_parent_by_index);
+
 unsigned int __clk_get_enable_count(struct clk *clk)
 {
 	return !clk ? 0 : clk->core->enable_count;
@@ -405,6 +433,12 @@ unsigned long __clk_get_rate(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_rate);
 
+unsigned long clk_hw_get_rate(struct clk_hw *hw)
+{
+	return clk_core_get_rate_nolock(hw->core);
+}
+EXPORT_SYMBOL_GPL(clk_hw_get_rate);
+
 static unsigned long __clk_get_accuracy(struct clk_core *core)
 {
 	if (!core)
@@ -419,6 +453,12 @@ unsigned long __clk_get_flags(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_flags);
 
+unsigned long clk_hw_get_flags(struct clk_hw *hw)
+{
+	return hw->core->flags;
+}
+EXPORT_SYMBOL_GPL(clk_hw_get_flags);
+
 bool __clk_is_prepared(struct clk *clk)
 {
 	if (!clk)
@@ -427,6 +467,11 @@ bool __clk_is_prepared(struct clk *clk)
 	return clk_core_is_prepared(clk->core);
 }
 
+bool clk_hw_is_prepared(struct clk_hw *hw)
+{
+	return clk_core_is_prepared(hw->core);
+}
+
 bool __clk_is_enabled(struct clk *clk)
 {
 	if (!clk)
@@ -861,6 +906,22 @@ unsigned long __clk_round_rate(struct clk *clk, unsigned long rate)
 }
 EXPORT_SYMBOL_GPL(__clk_round_rate);
 
+unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate)
+{
+	int ret;
+	struct clk_rate_request req;
+
+	clk_core_get_boundaries(hw->core, &req.min_rate, &req.max_rate);
+	req.rate = rate;
+
+	ret = clk_core_round_rate_nolock(hw->core, &req);
+	if (ret)
+		return 0;
+
+	return req.rate;
+}
+EXPORT_SYMBOL_GPL(clk_hw_round_rate);
+
 /**
  * clk_round_rate - round the given rate for a clk
  * @clk: the clk for which we are rounding a rate
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 06a56e55cfaf..be88dae0c3eb 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -607,14 +607,22 @@ void devm_clk_unregister(struct device *dev, struct clk *clk);
 
 /* helper functions */
 const char *__clk_get_name(struct clk *clk);
+const char *clk_hw_get_name(struct clk_hw *hw);
 struct clk_hw *__clk_get_hw(struct clk *clk);
 u8 __clk_get_num_parents(struct clk *clk);
+unsigned int clk_hw_get_num_parents(struct clk_hw *hw);
 struct clk *__clk_get_parent(struct clk *clk);
+struct clk_hw *clk_hw_get_parent(struct clk_hw *hw);
 struct clk *clk_get_parent_by_index(struct clk *clk, u8 index);
+struct clk_hw *clk_hw_get_parent_by_index(struct clk_hw *hw,
+					  unsigned int index);
 unsigned int __clk_get_enable_count(struct clk *clk);
 unsigned long __clk_get_rate(struct clk *clk);
+unsigned long clk_hw_get_rate(struct clk_hw *hw);
 unsigned long __clk_get_flags(struct clk *clk);
+unsigned long clk_hw_get_flags(struct clk_hw *hw);
 bool __clk_is_prepared(struct clk *clk);
+bool clk_hw_is_prepared(struct clk_hw *hw);
 bool __clk_is_enabled(struct clk *clk);
 struct clk *__clk_lookup(const char *name);
 int __clk_mux_determine_rate(struct clk_hw *hw,
@@ -636,6 +644,7 @@ static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src)
  * FIXME clock api without lock protection
  */
 unsigned long __clk_round_rate(struct clk *clk, unsigned long rate);
+unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate);
 
 struct of_device_id;
 
-- 
cgit v1.2.3-70-g09d2


From fc4a05d4b0eb1a0110ef11201bf563cd4b53fbce Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 25 Jun 2015 17:24:15 -0700
Subject: clk: Remove unused provider APIs

Remove these APIs now that we've converted all users to the
replacement struct clk_hw based versions.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk.c            | 92 ++++++++------------------------------------
 include/linux/clk-provider.h |  6 ---
 2 files changed, 16 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index a30fd30f4948..128ad748b682 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -290,28 +290,12 @@ struct clk_hw *__clk_get_hw(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_hw);
 
-u8 __clk_get_num_parents(struct clk *clk)
-{
-	return !clk ? 0 : clk->core->num_parents;
-}
-EXPORT_SYMBOL_GPL(__clk_get_num_parents);
-
 unsigned int clk_hw_get_num_parents(struct clk_hw *hw)
 {
 	return hw->core->num_parents;
 }
 EXPORT_SYMBOL_GPL(clk_hw_get_num_parents);
 
-struct clk *__clk_get_parent(struct clk *clk)
-{
-	if (!clk)
-		return NULL;
-
-	/* TODO: Create a per-user clk and change callers to call clk_put */
-	return !clk->core->parent ? NULL : clk->core->parent->hw->clk;
-}
-EXPORT_SYMBOL_GPL(__clk_get_parent);
-
 struct clk_hw *clk_hw_get_parent(struct clk_hw *hw)
 {
 	return hw->core->parent ? hw->core->parent->hw : NULL;
@@ -375,19 +359,6 @@ static struct clk_core *clk_core_get_parent_by_index(struct clk_core *core,
 		return core->parents[index];
 }
 
-struct clk *clk_get_parent_by_index(struct clk *clk, u8 index)
-{
-	struct clk_core *parent;
-
-	if (!clk)
-		return NULL;
-
-	parent = clk_core_get_parent_by_index(clk->core, index);
-
-	return !parent ? NULL : parent->hw->clk;
-}
-EXPORT_SYMBOL_GPL(clk_get_parent_by_index);
-
 struct clk_hw *clk_hw_get_parent_by_index(struct clk_hw *hw, unsigned int index)
 {
 	struct clk_core *parent;
@@ -424,15 +395,6 @@ out:
 	return ret;
 }
 
-unsigned long __clk_get_rate(struct clk *clk)
-{
-	if (!clk)
-		return 0;
-
-	return clk_core_get_rate_nolock(clk->core);
-}
-EXPORT_SYMBOL_GPL(__clk_get_rate);
-
 unsigned long clk_hw_get_rate(struct clk_hw *hw)
 {
 	return clk_core_get_rate_nolock(hw->core);
@@ -459,14 +421,6 @@ unsigned long clk_hw_get_flags(struct clk_hw *hw)
 }
 EXPORT_SYMBOL_GPL(clk_hw_get_flags);
 
-bool __clk_is_prepared(struct clk *clk)
-{
-	if (!clk)
-		return false;
-
-	return clk_core_is_prepared(clk->core);
-}
-
 bool clk_hw_is_prepared(struct clk_hw *hw)
 {
 	return clk_core_is_prepared(hw->core);
@@ -880,32 +834,6 @@ int __clk_determine_rate(struct clk_hw *hw, struct clk_rate_request *req)
 }
 EXPORT_SYMBOL_GPL(__clk_determine_rate);
 
-/**
- * __clk_round_rate - round the given rate for a clk
- * @clk: round the rate of this clock
- * @rate: the rate which is to be rounded
- *
- * Useful for clk_ops such as .set_rate
- */
-unsigned long __clk_round_rate(struct clk *clk, unsigned long rate)
-{
-	struct clk_rate_request req;
-	int ret;
-
-	if (!clk)
-		return 0;
-
-	clk_core_get_boundaries(clk->core, &req.min_rate, &req.max_rate);
-	req.rate = rate;
-
-	ret = clk_core_round_rate_nolock(clk->core, &req);
-	if (ret)
-		return 0;
-
-	return req.rate;
-}
-EXPORT_SYMBOL_GPL(__clk_round_rate);
-
 unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate)
 {
 	int ret;
@@ -933,16 +861,24 @@ EXPORT_SYMBOL_GPL(clk_hw_round_rate);
  */
 long clk_round_rate(struct clk *clk, unsigned long rate)
 {
-	unsigned long ret;
+	struct clk_rate_request req;
+	int ret;
 
 	if (!clk)
 		return 0;
 
 	clk_prepare_lock();
-	ret = __clk_round_rate(clk, rate);
+
+	clk_core_get_boundaries(clk->core, &req.min_rate, &req.max_rate);
+	req.rate = rate;
+
+	ret = clk_core_round_rate_nolock(clk->core, &req);
 	clk_prepare_unlock();
 
-	return ret;
+	if (ret)
+		return ret;
+
+	return req.rate;
 }
 EXPORT_SYMBOL_GPL(clk_round_rate);
 
@@ -1711,8 +1647,12 @@ struct clk *clk_get_parent(struct clk *clk)
 {
 	struct clk *parent;
 
+	if (!clk)
+		return NULL;
+
 	clk_prepare_lock();
-	parent = __clk_get_parent(clk);
+	/* TODO: Create a per-user clk and change callers to call clk_put */
+	parent = !clk->core->parent ? NULL : clk->core->parent->hw->clk;
 	clk_prepare_unlock();
 
 	return parent;
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index be88dae0c3eb..0d3128fbc14e 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -609,19 +609,14 @@ void devm_clk_unregister(struct device *dev, struct clk *clk);
 const char *__clk_get_name(struct clk *clk);
 const char *clk_hw_get_name(struct clk_hw *hw);
 struct clk_hw *__clk_get_hw(struct clk *clk);
-u8 __clk_get_num_parents(struct clk *clk);
 unsigned int clk_hw_get_num_parents(struct clk_hw *hw);
-struct clk *__clk_get_parent(struct clk *clk);
 struct clk_hw *clk_hw_get_parent(struct clk_hw *hw);
-struct clk *clk_get_parent_by_index(struct clk *clk, u8 index);
 struct clk_hw *clk_hw_get_parent_by_index(struct clk_hw *hw,
 					  unsigned int index);
 unsigned int __clk_get_enable_count(struct clk *clk);
-unsigned long __clk_get_rate(struct clk *clk);
 unsigned long clk_hw_get_rate(struct clk_hw *hw);
 unsigned long __clk_get_flags(struct clk *clk);
 unsigned long clk_hw_get_flags(struct clk_hw *hw);
-bool __clk_is_prepared(struct clk *clk);
 bool clk_hw_is_prepared(struct clk_hw *hw);
 bool __clk_is_enabled(struct clk *clk);
 struct clk *__clk_lookup(const char *name);
@@ -643,7 +638,6 @@ static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src)
 /*
  * FIXME clock api without lock protection
  */
-unsigned long __clk_round_rate(struct clk *clk, unsigned long rate);
 unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate);
 
 struct of_device_id;
-- 
cgit v1.2.3-70-g09d2


From 0f350f063eb62212a701a512f74e63ae4714441c Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 30 Jul 2015 15:19:12 +0200
Subject: clk: ux500: delete the non-DT U8500 clock implementation

This code is unused and not coming back. Let's kill it off.

Cc: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/ux500/Makefile              |   1 -
 drivers/clk/ux500/u8500_clk.c           | 525 --------------------------------
 include/linux/platform_data/clk-ux500.h |   2 -
 3 files changed, 528 deletions(-)
 delete mode 100644 drivers/clk/ux500/u8500_clk.c

(limited to 'include/linux')

diff --git a/drivers/clk/ux500/Makefile b/drivers/clk/ux500/Makefile
index 521483f0ba33..f3baef29859c 100644
--- a/drivers/clk/ux500/Makefile
+++ b/drivers/clk/ux500/Makefile
@@ -9,7 +9,6 @@ obj-y += clk-sysctrl.o
 
 # Clock definitions
 obj-y += u8500_of_clk.o
-obj-y += u8500_clk.o
 obj-y += u9540_clk.o
 obj-y += u8540_clk.o
 
diff --git a/drivers/clk/ux500/u8500_clk.c b/drivers/clk/ux500/u8500_clk.c
deleted file mode 100644
index 1c7b639d9222..000000000000
--- a/drivers/clk/ux500/u8500_clk.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * Clock definitions for u8500 platform.
- *
- * Copyright (C) 2012 ST-Ericsson SA
- * Author: Ulf Hansson <ulf.hansson@linaro.org>
- *
- * License terms: GNU General Public License (GPL) version 2
- */
-
-#include <linux/clkdev.h>
-#include <linux/clk-provider.h>
-#include <linux/mfd/dbx500-prcmu.h>
-#include <linux/platform_data/clk-ux500.h>
-#include "clk.h"
-
-void u8500_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-		    u32 clkrst5_base, u32 clkrst6_base)
-{
-	struct prcmu_fw_version *fw_version;
-	const char *sgaclk_parent = NULL;
-	struct clk *clk;
-
-	/* Clock sources */
-	clk = clk_reg_prcmu_gate("soc0_pll", NULL, PRCMU_PLLSOC0,
-				CLK_IS_ROOT|CLK_IGNORE_UNUSED);
-	clk_register_clkdev(clk, "soc0_pll", NULL);
-
-	clk = clk_reg_prcmu_gate("soc1_pll", NULL, PRCMU_PLLSOC1,
-				CLK_IS_ROOT|CLK_IGNORE_UNUSED);
-	clk_register_clkdev(clk, "soc1_pll", NULL);
-
-	clk = clk_reg_prcmu_gate("ddr_pll", NULL, PRCMU_PLLDDR,
-				CLK_IS_ROOT|CLK_IGNORE_UNUSED);
-	clk_register_clkdev(clk, "ddr_pll", NULL);
-
-	/* FIXME: Add sys, ulp and int clocks here. */
-
-	clk = clk_register_fixed_rate(NULL, "rtc32k", "NULL",
-				CLK_IS_ROOT|CLK_IGNORE_UNUSED,
-				32768);
-	clk_register_clkdev(clk, "clk32k", NULL);
-	clk_register_clkdev(clk, "apb_pclk", "rtc-pl031");
-
-	/* PRCMU clocks */
-	fw_version = prcmu_get_fw_version();
-	if (fw_version != NULL) {
-		switch (fw_version->project) {
-		case PRCMU_FW_PROJECT_U8500_C2:
-		case PRCMU_FW_PROJECT_U8520:
-		case PRCMU_FW_PROJECT_U8420:
-			sgaclk_parent = "soc0_pll";
-			break;
-		default:
-			break;
-		}
-	}
-
-	if (sgaclk_parent)
-		clk = clk_reg_prcmu_gate("sgclk", sgaclk_parent,
-					PRCMU_SGACLK, 0);
-	else
-		clk = clk_reg_prcmu_gate("sgclk", NULL,
-					PRCMU_SGACLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "mali");
-
-	clk = clk_reg_prcmu_gate("uartclk", NULL, PRCMU_UARTCLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "UART");
-
-	clk = clk_reg_prcmu_gate("msp02clk", NULL, PRCMU_MSP02CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "MSP02");
-
-	clk = clk_reg_prcmu_gate("msp1clk", NULL, PRCMU_MSP1CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "MSP1");
-
-	clk = clk_reg_prcmu_gate("i2cclk", NULL, PRCMU_I2CCLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "I2C");
-
-	clk = clk_reg_prcmu_gate("slimclk", NULL, PRCMU_SLIMCLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "slim");
-
-	clk = clk_reg_prcmu_gate("per1clk", NULL, PRCMU_PER1CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "PERIPH1");
-
-	clk = clk_reg_prcmu_gate("per2clk", NULL, PRCMU_PER2CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "PERIPH2");
-
-	clk = clk_reg_prcmu_gate("per3clk", NULL, PRCMU_PER3CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "PERIPH3");
-
-	clk = clk_reg_prcmu_gate("per5clk", NULL, PRCMU_PER5CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "PERIPH5");
-
-	clk = clk_reg_prcmu_gate("per6clk", NULL, PRCMU_PER6CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "PERIPH6");
-
-	clk = clk_reg_prcmu_gate("per7clk", NULL, PRCMU_PER7CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "PERIPH7");
-
-	clk = clk_reg_prcmu_scalable("lcdclk", NULL, PRCMU_LCDCLK, 0,
-				CLK_IS_ROOT|CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "lcd");
-	clk_register_clkdev(clk, "lcd", "mcde");
-
-	clk = clk_reg_prcmu_opp_gate("bmlclk", NULL, PRCMU_BMLCLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "bml");
-
-	clk = clk_reg_prcmu_scalable("hsitxclk", NULL, PRCMU_HSITXCLK, 0,
-				CLK_IS_ROOT|CLK_SET_RATE_GATE);
-
-	clk = clk_reg_prcmu_scalable("hsirxclk", NULL, PRCMU_HSIRXCLK, 0,
-				CLK_IS_ROOT|CLK_SET_RATE_GATE);
-
-	clk = clk_reg_prcmu_scalable("hdmiclk", NULL, PRCMU_HDMICLK, 0,
-				CLK_IS_ROOT|CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "hdmi");
-	clk_register_clkdev(clk, "hdmi", "mcde");
-
-	clk = clk_reg_prcmu_scalable("apeatclk", NULL, PRCMU_APEATCLK, 0,
-				     CLK_IS_ROOT|CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "apeat");
-
-	clk = clk_reg_prcmu_scalable("apetraceclk", NULL, PRCMU_APETRACECLK, 0,
-				CLK_IS_ROOT|CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "apetrace");
-
-	clk = clk_reg_prcmu_gate("mcdeclk", NULL, PRCMU_MCDECLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "mcde");
-	clk_register_clkdev(clk, "mcde", "mcde");
-	clk_register_clkdev(clk, "dsisys", "dsilink.0");
-	clk_register_clkdev(clk, "dsisys", "dsilink.1");
-	clk_register_clkdev(clk, "dsisys", "dsilink.2");
-
-	clk = clk_reg_prcmu_opp_gate("ipi2cclk", NULL, PRCMU_IPI2CCLK,
-				CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "ipi2");
-
-	clk = clk_reg_prcmu_gate("dsialtclk", NULL, PRCMU_DSIALTCLK,
-				CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "dsialt");
-
-	clk = clk_reg_prcmu_gate("dmaclk", NULL, PRCMU_DMACLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "dma40.0");
-
-	clk = clk_reg_prcmu_gate("b2r2clk", NULL, PRCMU_B2R2CLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "b2r2");
-	clk_register_clkdev(clk, NULL, "b2r2_core");
-	clk_register_clkdev(clk, NULL, "U8500-B2R2.0");
-
-	clk = clk_reg_prcmu_scalable("tvclk", NULL, PRCMU_TVCLK, 0,
-				CLK_IS_ROOT|CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "tv");
-	clk_register_clkdev(clk, "tv", "mcde");
-
-	clk = clk_reg_prcmu_gate("sspclk", NULL, PRCMU_SSPCLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "SSP");
-
-	clk = clk_reg_prcmu_gate("rngclk", NULL, PRCMU_RNGCLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "rngclk");
-
-	clk = clk_reg_prcmu_gate("uiccclk", NULL, PRCMU_UICCCLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "uicc");
-
-	clk = clk_reg_prcmu_gate("timclk", NULL, PRCMU_TIMCLK, CLK_IS_ROOT);
-	clk_register_clkdev(clk, NULL, "mtu0");
-	clk_register_clkdev(clk, NULL, "mtu1");
-
-	clk = clk_reg_prcmu_opp_volt_scalable("sdmmcclk", NULL, PRCMU_SDMMCCLK,
-					100000000,
-					CLK_IS_ROOT|CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "sdmmc");
-
-	clk = clk_reg_prcmu_scalable("dsi_pll", "hdmiclk",
-				PRCMU_PLLDSI, 0, CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, "dsihs2", "mcde");
-	clk_register_clkdev(clk, "dsihs2", "dsilink.2");
-
-
-	clk = clk_reg_prcmu_scalable("dsi0clk", "dsi_pll",
-				PRCMU_DSI0CLK, 0, CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, "dsihs0", "mcde");
-	clk_register_clkdev(clk, "dsihs0", "dsilink.0");
-
-	clk = clk_reg_prcmu_scalable("dsi1clk", "dsi_pll",
-				PRCMU_DSI1CLK, 0, CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, "dsihs1", "mcde");
-	clk_register_clkdev(clk, "dsihs1", "dsilink.1");
-
-	clk = clk_reg_prcmu_scalable("dsi0escclk", "tvclk",
-				PRCMU_DSI0ESCCLK, 0, CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, "dsilp0", "dsilink.0");
-	clk_register_clkdev(clk, "dsilp0", "mcde");
-
-	clk = clk_reg_prcmu_scalable("dsi1escclk", "tvclk",
-				PRCMU_DSI1ESCCLK, 0, CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, "dsilp1", "dsilink.1");
-	clk_register_clkdev(clk, "dsilp1", "mcde");
-
-	clk = clk_reg_prcmu_scalable("dsi2escclk", "tvclk",
-				PRCMU_DSI2ESCCLK, 0, CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, "dsilp2", "dsilink.2");
-	clk_register_clkdev(clk, "dsilp2", "mcde");
-
-	clk = clk_reg_prcmu_scalable_rate("armss", NULL,
-				PRCMU_ARMSS, 0, CLK_IS_ROOT|CLK_IGNORE_UNUSED);
-	clk_register_clkdev(clk, "armss", NULL);
-
-	clk = clk_register_fixed_factor(NULL, "smp_twd", "armss",
-				CLK_IGNORE_UNUSED, 1, 2);
-	clk_register_clkdev(clk, NULL, "smp_twd");
-
-	/*
-	 * FIXME: Add special handled PRCMU clocks here:
-	 * 1. clkout0yuv, use PRCMU as parent + need regulator + pinctrl.
-	 * 2. ab9540_clkout1yuv, see clkout0yuv
-	 */
-
-	/* PRCC P-clocks */
-	clk = clk_reg_prcc_pclk("p1_pclk0", "per1clk", clkrst1_base,
-				BIT(0), 0);
-	clk_register_clkdev(clk, "apb_pclk", "uart0");
-
-	clk = clk_reg_prcc_pclk("p1_pclk1", "per1clk", clkrst1_base,
-				BIT(1), 0);
-	clk_register_clkdev(clk, "apb_pclk", "uart1");
-
-	clk = clk_reg_prcc_pclk("p1_pclk2", "per1clk", clkrst1_base,
-				BIT(2), 0);
-	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.1");
-
-	clk = clk_reg_prcc_pclk("p1_pclk3", "per1clk", clkrst1_base,
-				BIT(3), 0);
-	clk_register_clkdev(clk, "apb_pclk", "msp0");
-	clk_register_clkdev(clk, "apb_pclk", "ux500-msp-i2s.0");
-
-	clk = clk_reg_prcc_pclk("p1_pclk4", "per1clk", clkrst1_base,
-				BIT(4), 0);
-	clk_register_clkdev(clk, "apb_pclk", "msp1");
-	clk_register_clkdev(clk, "apb_pclk", "ux500-msp-i2s.1");
-
-	clk = clk_reg_prcc_pclk("p1_pclk5", "per1clk", clkrst1_base,
-				BIT(5), 0);
-	clk_register_clkdev(clk, "apb_pclk", "sdi0");
-
-	clk = clk_reg_prcc_pclk("p1_pclk6", "per1clk", clkrst1_base,
-				BIT(6), 0);
-	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.2");
-
-	clk = clk_reg_prcc_pclk("p1_pclk7", "per1clk", clkrst1_base,
-				BIT(7), 0);
-	clk_register_clkdev(clk, NULL, "spi3");
-
-	clk = clk_reg_prcc_pclk("p1_pclk8", "per1clk", clkrst1_base,
-				BIT(8), 0);
-	clk_register_clkdev(clk, "apb_pclk", "slimbus0");
-
-	clk = clk_reg_prcc_pclk("p1_pclk9", "per1clk", clkrst1_base,
-				BIT(9), 0);
-	clk_register_clkdev(clk, NULL, "gpio.0");
-	clk_register_clkdev(clk, NULL, "gpio.1");
-	clk_register_clkdev(clk, NULL, "gpioblock0");
-
-	clk = clk_reg_prcc_pclk("p1_pclk10", "per1clk", clkrst1_base,
-				BIT(10), 0);
-	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.4");
-
-	clk = clk_reg_prcc_pclk("p1_pclk11", "per1clk", clkrst1_base,
-				BIT(11), 0);
-	clk_register_clkdev(clk, "apb_pclk", "msp3");
-	clk_register_clkdev(clk, "apb_pclk", "ux500-msp-i2s.3");
-
-	clk = clk_reg_prcc_pclk("p2_pclk0", "per2clk", clkrst2_base,
-				BIT(0), 0);
-	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.3");
-
-	clk = clk_reg_prcc_pclk("p2_pclk1", "per2clk", clkrst2_base,
-				BIT(1), 0);
-	clk_register_clkdev(clk, NULL, "spi2");
-
-	clk = clk_reg_prcc_pclk("p2_pclk2", "per2clk", clkrst2_base,
-				BIT(2), 0);
-	clk_register_clkdev(clk, NULL, "spi1");
-
-	clk = clk_reg_prcc_pclk("p2_pclk3", "per2clk", clkrst2_base,
-				BIT(3), 0);
-	clk_register_clkdev(clk, NULL, "pwl");
-
-	clk = clk_reg_prcc_pclk("p2_pclk4", "per2clk", clkrst2_base,
-				BIT(4), 0);
-	clk_register_clkdev(clk, "apb_pclk", "sdi4");
-
-	clk = clk_reg_prcc_pclk("p2_pclk5", "per2clk", clkrst2_base,
-				BIT(5), 0);
-	clk_register_clkdev(clk, "apb_pclk", "msp2");
-	clk_register_clkdev(clk, "apb_pclk", "ux500-msp-i2s.2");
-
-	clk = clk_reg_prcc_pclk("p2_pclk6", "per2clk", clkrst2_base,
-				BIT(6), 0);
-	clk_register_clkdev(clk, "apb_pclk", "sdi1");
-
-	clk = clk_reg_prcc_pclk("p2_pclk7", "per2clk", clkrst2_base,
-				BIT(7), 0);
-	clk_register_clkdev(clk, "apb_pclk", "sdi3");
-
-	clk = clk_reg_prcc_pclk("p2_pclk8", "per2clk", clkrst2_base,
-				BIT(8), 0);
-	clk_register_clkdev(clk, NULL, "spi0");
-
-	clk = clk_reg_prcc_pclk("p2_pclk9", "per2clk", clkrst2_base,
-				BIT(9), 0);
-	clk_register_clkdev(clk, "hsir_hclk", "ste_hsi.0");
-
-	clk = clk_reg_prcc_pclk("p2_pclk10", "per2clk", clkrst2_base,
-				BIT(10), 0);
-	clk_register_clkdev(clk, "hsit_hclk", "ste_hsi.0");
-
-	clk = clk_reg_prcc_pclk("p2_pclk11", "per2clk", clkrst2_base,
-				BIT(11), 0);
-	clk_register_clkdev(clk, NULL, "gpio.6");
-	clk_register_clkdev(clk, NULL, "gpio.7");
-	clk_register_clkdev(clk, NULL, "gpioblock1");
-
-	clk = clk_reg_prcc_pclk("p2_pclk12", "per2clk", clkrst2_base,
-				BIT(12), 0);
-
-	clk = clk_reg_prcc_pclk("p3_pclk0", "per3clk", clkrst3_base,
-				BIT(0), 0);
-	clk_register_clkdev(clk, "fsmc", NULL);
-	clk_register_clkdev(clk, NULL, "smsc911x.0");
-
-	clk = clk_reg_prcc_pclk("p3_pclk1", "per3clk", clkrst3_base,
-				BIT(1), 0);
-	clk_register_clkdev(clk, "apb_pclk", "ssp0");
-
-	clk = clk_reg_prcc_pclk("p3_pclk2", "per3clk", clkrst3_base,
-				BIT(2), 0);
-	clk_register_clkdev(clk, "apb_pclk", "ssp1");
-
-	clk = clk_reg_prcc_pclk("p3_pclk3", "per3clk", clkrst3_base,
-				BIT(3), 0);
-	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.0");
-
-	clk = clk_reg_prcc_pclk("p3_pclk4", "per3clk", clkrst3_base,
-				BIT(4), 0);
-	clk_register_clkdev(clk, "apb_pclk", "sdi2");
-
-	clk = clk_reg_prcc_pclk("p3_pclk5", "per3clk", clkrst3_base,
-				BIT(5), 0);
-	clk_register_clkdev(clk, "apb_pclk", "ske");
-	clk_register_clkdev(clk, "apb_pclk", "nmk-ske-keypad");
-
-	clk = clk_reg_prcc_pclk("p3_pclk6", "per3clk", clkrst3_base,
-				BIT(6), 0);
-	clk_register_clkdev(clk, "apb_pclk", "uart2");
-
-	clk = clk_reg_prcc_pclk("p3_pclk7", "per3clk", clkrst3_base,
-				BIT(7), 0);
-	clk_register_clkdev(clk, "apb_pclk", "sdi5");
-
-	clk = clk_reg_prcc_pclk("p3_pclk8", "per3clk", clkrst3_base,
-				BIT(8), 0);
-	clk_register_clkdev(clk, NULL, "gpio.2");
-	clk_register_clkdev(clk, NULL, "gpio.3");
-	clk_register_clkdev(clk, NULL, "gpio.4");
-	clk_register_clkdev(clk, NULL, "gpio.5");
-	clk_register_clkdev(clk, NULL, "gpioblock2");
-
-	clk = clk_reg_prcc_pclk("p5_pclk0", "per5clk", clkrst5_base,
-				BIT(0), 0);
-	clk_register_clkdev(clk, "usb", "musb-ux500.0");
-
-	clk = clk_reg_prcc_pclk("p5_pclk1", "per5clk", clkrst5_base,
-				BIT(1), 0);
-	clk_register_clkdev(clk, NULL, "gpio.8");
-	clk_register_clkdev(clk, NULL, "gpioblock3");
-
-	clk = clk_reg_prcc_pclk("p6_pclk0", "per6clk", clkrst6_base,
-				BIT(0), 0);
-	clk_register_clkdev(clk, "apb_pclk", "rng");
-
-	clk = clk_reg_prcc_pclk("p6_pclk1", "per6clk", clkrst6_base,
-				BIT(1), 0);
-	clk_register_clkdev(clk, NULL, "cryp0");
-	clk_register_clkdev(clk, NULL, "cryp1");
-
-	clk = clk_reg_prcc_pclk("p6_pclk2", "per6clk", clkrst6_base,
-				BIT(2), 0);
-	clk_register_clkdev(clk, NULL, "hash0");
-
-	clk = clk_reg_prcc_pclk("p6_pclk3", "per6clk", clkrst6_base,
-				BIT(3), 0);
-	clk_register_clkdev(clk, NULL, "pka");
-
-	clk = clk_reg_prcc_pclk("p6_pclk4", "per6clk", clkrst6_base,
-				BIT(4), 0);
-	clk_register_clkdev(clk, NULL, "hash1");
-
-	clk = clk_reg_prcc_pclk("p6_pclk5", "per6clk", clkrst6_base,
-				BIT(5), 0);
-	clk_register_clkdev(clk, NULL, "cfgreg");
-
-	clk = clk_reg_prcc_pclk("p6_pclk6", "per6clk", clkrst6_base,
-				BIT(6), 0);
-	clk_register_clkdev(clk, "apb_pclk", "mtu0");
-
-	clk = clk_reg_prcc_pclk("p6_pclk7", "per6clk", clkrst6_base,
-				BIT(7), 0);
-	clk_register_clkdev(clk, "apb_pclk", "mtu1");
-
-	/* PRCC K-clocks
-	 *
-	 * FIXME: Some drivers requires PERPIH[n| to be automatically enabled
-	 * by enabling just the K-clock, even if it is not a valid parent to
-	 * the K-clock. Until drivers get fixed we might need some kind of
-	 * "parent muxed join".
-	 */
-
-	/* Periph1 */
-	clk = clk_reg_prcc_kclk("p1_uart0_kclk", "uartclk",
-			clkrst1_base, BIT(0), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "uart0");
-
-	clk = clk_reg_prcc_kclk("p1_uart1_kclk", "uartclk",
-			clkrst1_base, BIT(1), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "uart1");
-
-	clk = clk_reg_prcc_kclk("p1_i2c1_kclk", "i2cclk",
-			clkrst1_base, BIT(2), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "nmk-i2c.1");
-
-	clk = clk_reg_prcc_kclk("p1_msp0_kclk", "msp02clk",
-			clkrst1_base, BIT(3), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "msp0");
-	clk_register_clkdev(clk, NULL, "ux500-msp-i2s.0");
-
-	clk = clk_reg_prcc_kclk("p1_msp1_kclk", "msp1clk",
-			clkrst1_base, BIT(4), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "msp1");
-	clk_register_clkdev(clk, NULL, "ux500-msp-i2s.1");
-
-	clk = clk_reg_prcc_kclk("p1_sdi0_kclk", "sdmmcclk",
-			clkrst1_base, BIT(5), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "sdi0");
-
-	clk = clk_reg_prcc_kclk("p1_i2c2_kclk", "i2cclk",
-			clkrst1_base, BIT(6), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "nmk-i2c.2");
-
-	clk = clk_reg_prcc_kclk("p1_slimbus0_kclk", "slimclk",
-			clkrst1_base, BIT(8), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "slimbus0");
-
-	clk = clk_reg_prcc_kclk("p1_i2c4_kclk", "i2cclk",
-			clkrst1_base, BIT(9), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "nmk-i2c.4");
-
-	clk = clk_reg_prcc_kclk("p1_msp3_kclk", "msp1clk",
-			clkrst1_base, BIT(10), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "msp3");
-	clk_register_clkdev(clk, NULL, "ux500-msp-i2s.3");
-
-	/* Periph2 */
-	clk = clk_reg_prcc_kclk("p2_i2c3_kclk", "i2cclk",
-			clkrst2_base, BIT(0), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "nmk-i2c.3");
-
-	clk = clk_reg_prcc_kclk("p2_sdi4_kclk", "sdmmcclk",
-			clkrst2_base, BIT(2), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "sdi4");
-
-	clk = clk_reg_prcc_kclk("p2_msp2_kclk", "msp02clk",
-			clkrst2_base, BIT(3), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "msp2");
-	clk_register_clkdev(clk, NULL, "ux500-msp-i2s.2");
-
-	clk = clk_reg_prcc_kclk("p2_sdi1_kclk", "sdmmcclk",
-			clkrst2_base, BIT(4), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "sdi1");
-
-	clk = clk_reg_prcc_kclk("p2_sdi3_kclk", "sdmmcclk",
-			clkrst2_base, BIT(5), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "sdi3");
-
-	/* Note that rate is received from parent. */
-	clk = clk_reg_prcc_kclk("p2_ssirx_kclk", "hsirxclk",
-			clkrst2_base, BIT(6),
-			CLK_SET_RATE_GATE|CLK_SET_RATE_PARENT);
-	clk = clk_reg_prcc_kclk("p2_ssitx_kclk", "hsitxclk",
-			clkrst2_base, BIT(7),
-			CLK_SET_RATE_GATE|CLK_SET_RATE_PARENT);
-
-	/* Periph3 */
-	clk = clk_reg_prcc_kclk("p3_ssp0_kclk", "sspclk",
-			clkrst3_base, BIT(1), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "ssp0");
-
-	clk = clk_reg_prcc_kclk("p3_ssp1_kclk", "sspclk",
-			clkrst3_base, BIT(2), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "ssp1");
-
-	clk = clk_reg_prcc_kclk("p3_i2c0_kclk", "i2cclk",
-			clkrst3_base, BIT(3), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "nmk-i2c.0");
-
-	clk = clk_reg_prcc_kclk("p3_sdi2_kclk", "sdmmcclk",
-			clkrst3_base, BIT(4), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "sdi2");
-
-	clk = clk_reg_prcc_kclk("p3_ske_kclk", "rtc32k",
-			clkrst3_base, BIT(5), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "ske");
-	clk_register_clkdev(clk, NULL, "nmk-ske-keypad");
-
-	clk = clk_reg_prcc_kclk("p3_uart2_kclk", "uartclk",
-			clkrst3_base, BIT(6), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "uart2");
-
-	clk = clk_reg_prcc_kclk("p3_sdi5_kclk", "sdmmcclk",
-			clkrst3_base, BIT(7), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "sdi5");
-
-	/* Periph6 */
-	clk = clk_reg_prcc_kclk("p3_rng_kclk", "rngclk",
-			clkrst6_base, BIT(0), CLK_SET_RATE_GATE);
-	clk_register_clkdev(clk, NULL, "rng");
-}
diff --git a/include/linux/platform_data/clk-ux500.h b/include/linux/platform_data/clk-ux500.h
index 97baf831e071..0058edb24391 100644
--- a/include/linux/platform_data/clk-ux500.h
+++ b/include/linux/platform_data/clk-ux500.h
@@ -13,8 +13,6 @@
 void u8500_of_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 		       u32 clkrst5_base, u32 clkrst6_base);
 
-void u8500_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-		    u32 clkrst5_base, u32 clkrst6_base);
 void u9540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 		    u32 clkrst5_base, u32 clkrst6_base);
 void u8540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-- 
cgit v1.2.3-70-g09d2


From e7df6f6e21883d7e8b3ad4641c911da8314ef283 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Wed, 12 Aug 2015 13:04:56 -0700
Subject: clk: Constify clk_hw argument to provider APIs

We don't modify the clk_hw argument in these functions, so it's
safe to mark it as const.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk.c            | 15 ++++++++-------
 include/linux/clk-provider.h | 14 +++++++-------
 2 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 128ad748b682..8e6688d1ecbd 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -278,7 +278,7 @@ const char *__clk_get_name(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_name);
 
-const char *clk_hw_get_name(struct clk_hw *hw)
+const char *clk_hw_get_name(const struct clk_hw *hw)
 {
 	return hw->core->name;
 }
@@ -290,13 +290,13 @@ struct clk_hw *__clk_get_hw(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_hw);
 
-unsigned int clk_hw_get_num_parents(struct clk_hw *hw)
+unsigned int clk_hw_get_num_parents(const struct clk_hw *hw)
 {
 	return hw->core->num_parents;
 }
 EXPORT_SYMBOL_GPL(clk_hw_get_num_parents);
 
-struct clk_hw *clk_hw_get_parent(struct clk_hw *hw)
+struct clk_hw *clk_hw_get_parent(const struct clk_hw *hw)
 {
 	return hw->core->parent ? hw->core->parent->hw : NULL;
 }
@@ -359,7 +359,8 @@ static struct clk_core *clk_core_get_parent_by_index(struct clk_core *core,
 		return core->parents[index];
 }
 
-struct clk_hw *clk_hw_get_parent_by_index(struct clk_hw *hw, unsigned int index)
+struct clk_hw *
+clk_hw_get_parent_by_index(const struct clk_hw *hw, unsigned int index)
 {
 	struct clk_core *parent;
 
@@ -395,7 +396,7 @@ out:
 	return ret;
 }
 
-unsigned long clk_hw_get_rate(struct clk_hw *hw)
+unsigned long clk_hw_get_rate(const struct clk_hw *hw)
 {
 	return clk_core_get_rate_nolock(hw->core);
 }
@@ -415,13 +416,13 @@ unsigned long __clk_get_flags(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(__clk_get_flags);
 
-unsigned long clk_hw_get_flags(struct clk_hw *hw)
+unsigned long clk_hw_get_flags(const struct clk_hw *hw)
 {
 	return hw->core->flags;
 }
 EXPORT_SYMBOL_GPL(clk_hw_get_flags);
 
-bool clk_hw_is_prepared(struct clk_hw *hw)
+bool clk_hw_is_prepared(const struct clk_hw *hw)
 {
 	return clk_core_is_prepared(hw->core);
 }
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 0d3128fbc14e..3ecc07d0da77 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -607,17 +607,17 @@ void devm_clk_unregister(struct device *dev, struct clk *clk);
 
 /* helper functions */
 const char *__clk_get_name(struct clk *clk);
-const char *clk_hw_get_name(struct clk_hw *hw);
+const char *clk_hw_get_name(const struct clk_hw *hw);
 struct clk_hw *__clk_get_hw(struct clk *clk);
-unsigned int clk_hw_get_num_parents(struct clk_hw *hw);
-struct clk_hw *clk_hw_get_parent(struct clk_hw *hw);
-struct clk_hw *clk_hw_get_parent_by_index(struct clk_hw *hw,
+unsigned int clk_hw_get_num_parents(const struct clk_hw *hw);
+struct clk_hw *clk_hw_get_parent(const struct clk_hw *hw);
+struct clk_hw *clk_hw_get_parent_by_index(const struct clk_hw *hw,
 					  unsigned int index);
 unsigned int __clk_get_enable_count(struct clk *clk);
-unsigned long clk_hw_get_rate(struct clk_hw *hw);
+unsigned long clk_hw_get_rate(const struct clk_hw *hw);
 unsigned long __clk_get_flags(struct clk *clk);
-unsigned long clk_hw_get_flags(struct clk_hw *hw);
-bool clk_hw_is_prepared(struct clk_hw *hw);
+unsigned long clk_hw_get_flags(const struct clk_hw *hw);
+bool clk_hw_is_prepared(const struct clk_hw *hw);
 bool __clk_is_enabled(struct clk *clk);
 struct clk *__clk_lookup(const char *name);
 int __clk_mux_determine_rate(struct clk_hw *hw,
-- 
cgit v1.2.3-70-g09d2


From 5dc0fe199b358966021b015c71ca4049d0f42aa6 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 30 Jul 2015 15:19:25 +0200
Subject: clk/ARM: move Ux500 PRCC bases to the device tree

The base addresses for the Ux500 PRCC controllers are hardcoded,
let's move them to the clock node in the device tree and delete
the constants.

Cc: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Olof Johansson <olof@lixom.net>
Acked-by: Michael Turquette <mturquette@baylibre.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 arch/arm/boot/dts/ste-dbx5x0.dtsi       |   7 ++
 arch/arm/mach-ux500/cpu.c               |  21 +---
 drivers/clk/ux500/u8500_of_clk.c        | 163 ++++++++++++++------------
 drivers/clk/ux500/u8540_clk.c           | 197 +++++++++++++++++++-------------
 drivers/clk/ux500/u9540_clk.c           |   3 +-
 include/linux/platform_data/clk-ux500.h |  10 +-
 6 files changed, 225 insertions(+), 176 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/boot/dts/ste-dbx5x0.dtsi b/arch/arm/boot/dts/ste-dbx5x0.dtsi
index 853684ad7773..a56bf890afaf 100644
--- a/arch/arm/boot/dts/ste-dbx5x0.dtsi
+++ b/arch/arm/boot/dts/ste-dbx5x0.dtsi
@@ -219,6 +219,13 @@
 
 		clocks {
 			compatible = "stericsson,u8500-clks";
+			/*
+			 * Registers for the CLKRST block on peripheral
+			 * groups 1, 2, 3, 5, 6,
+			 */
+			reg = <0x8012f000 0x1000>, <0x8011f000 0x1000>,
+			    <0x8000f000 0x1000>, <0xa03ff000 0x1000>,
+			    <0xa03cf000 0x1000>;
 
 			prcmu_clk: prcmu-clock {
 				#clock-cells = <1>;
diff --git a/arch/arm/mach-ux500/cpu.c b/arch/arm/mach-ux500/cpu.c
index e31d3d61c998..b316e18a76aa 100644
--- a/arch/arm/mach-ux500/cpu.c
+++ b/arch/arm/mach-ux500/cpu.c
@@ -72,21 +72,12 @@ void __init ux500_init_irq(void)
 	 * Init clocks here so that they are available for system timer
 	 * initialization.
 	 */
-	if (cpu_is_u8500_family()) {
-		u8500_of_clk_init(U8500_CLKRST1_BASE,
-				  U8500_CLKRST2_BASE,
-				  U8500_CLKRST3_BASE,
-				  U8500_CLKRST5_BASE,
-				  U8500_CLKRST6_BASE);
-	} else if (cpu_is_u9540()) {
-		u9540_clk_init(U8500_CLKRST1_BASE, U8500_CLKRST2_BASE,
-			       U8500_CLKRST3_BASE, U8500_CLKRST5_BASE,
-			       U8500_CLKRST6_BASE);
-	} else if (cpu_is_u8540()) {
-		u8540_clk_init(U8500_CLKRST1_BASE, U8500_CLKRST2_BASE,
-			       U8500_CLKRST3_BASE, U8500_CLKRST5_BASE,
-			       U8500_CLKRST6_BASE);
-	}
+	if (cpu_is_u8500_family())
+		u8500_clk_init();
+	else if (cpu_is_u9540())
+		u9540_clk_init();
+	else if (cpu_is_u8540())
+		u8540_clk_init();
 }
 
 static const char * __init ux500_get_machine(void)
diff --git a/drivers/clk/ux500/u8500_of_clk.c b/drivers/clk/ux500/u8500_of_clk.c
index c3e3b20e4b43..271c09644652 100644
--- a/drivers/clk/ux500/u8500_of_clk.c
+++ b/drivers/clk/ux500/u8500_of_clk.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/clk-provider.h>
 #include <linux/mfd/dbx500-prcmu.h>
 #include <linux/platform_data/clk-ux500.h>
@@ -52,14 +53,25 @@ static const struct of_device_id u8500_clk_of_match[] = {
 	{ },
 };
 
-void u8500_of_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-		       u32 clkrst5_base, u32 clkrst6_base)
+/* CLKRST4 is missing making it hard to index things */
+enum clkrst_index {
+	CLKRST1_INDEX = 0,
+	CLKRST2_INDEX,
+	CLKRST3_INDEX,
+	CLKRST5_INDEX,
+	CLKRST6_INDEX,
+	CLKRST_MAX,
+};
+
+void u8500_clk_init(void)
 {
 	struct prcmu_fw_version *fw_version;
 	struct device_node *np = NULL;
 	struct device_node *child = NULL;
 	const char *sgaclk_parent = NULL;
 	struct clk *clk, *rtc_clk, *twd_clk;
+	u32 bases[CLKRST_MAX];
+	int i;
 
 	if (of_have_populated_dt())
 		np = of_find_matching_node(NULL, u8500_clk_of_match);
@@ -67,6 +79,15 @@ void u8500_of_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 		pr_err("Either DT or U8500 Clock node not found\n");
 		return;
 	}
+	for (i = 0; i < ARRAY_SIZE(bases); i++) {
+		struct resource r;
+
+		if (of_address_to_resource(np, i, &r))
+			/* Not much choice but to continue */
+			pr_err("failed to get CLKRST %d base address\n",
+			       i + 1);
+		bases[i] = r.start;
+	}
 
 	/* Clock sources */
 	clk = clk_reg_prcmu_gate("soc0_pll", NULL, PRCMU_PLLSOC0,
@@ -244,179 +265,179 @@ void u8500_of_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 	 */
 
 	/* PRCC P-clocks */
-	clk = clk_reg_prcc_pclk("p1_pclk0", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk0", "per1clk", bases[CLKRST1_INDEX],
 				BIT(0), 0);
 	PRCC_PCLK_STORE(clk, 1, 0);
 
-	clk = clk_reg_prcc_pclk("p1_pclk1", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk1", "per1clk", bases[CLKRST1_INDEX],
 				BIT(1), 0);
 	PRCC_PCLK_STORE(clk, 1, 1);
 
-	clk = clk_reg_prcc_pclk("p1_pclk2", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk2", "per1clk", bases[CLKRST1_INDEX],
 				BIT(2), 0);
 	PRCC_PCLK_STORE(clk, 1, 2);
 
-	clk = clk_reg_prcc_pclk("p1_pclk3", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk3", "per1clk", bases[CLKRST1_INDEX],
 				BIT(3), 0);
 	PRCC_PCLK_STORE(clk, 1, 3);
 
-	clk = clk_reg_prcc_pclk("p1_pclk4", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk4", "per1clk", bases[CLKRST1_INDEX],
 				BIT(4), 0);
 	PRCC_PCLK_STORE(clk, 1, 4);
 
-	clk = clk_reg_prcc_pclk("p1_pclk5", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk5", "per1clk", bases[CLKRST1_INDEX],
 				BIT(5), 0);
 	PRCC_PCLK_STORE(clk, 1, 5);
 
-	clk = clk_reg_prcc_pclk("p1_pclk6", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk6", "per1clk", bases[CLKRST1_INDEX],
 				BIT(6), 0);
 	PRCC_PCLK_STORE(clk, 1, 6);
 
-	clk = clk_reg_prcc_pclk("p1_pclk7", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk7", "per1clk", bases[CLKRST1_INDEX],
 				BIT(7), 0);
 	PRCC_PCLK_STORE(clk, 1, 7);
 
-	clk = clk_reg_prcc_pclk("p1_pclk8", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk8", "per1clk", bases[CLKRST1_INDEX],
 				BIT(8), 0);
 	PRCC_PCLK_STORE(clk, 1, 8);
 
-	clk = clk_reg_prcc_pclk("p1_pclk9", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk9", "per1clk", bases[CLKRST1_INDEX],
 				BIT(9), 0);
 	PRCC_PCLK_STORE(clk, 1, 9);
 
-	clk = clk_reg_prcc_pclk("p1_pclk10", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk10", "per1clk", bases[CLKRST1_INDEX],
 				BIT(10), 0);
 	PRCC_PCLK_STORE(clk, 1, 10);
 
-	clk = clk_reg_prcc_pclk("p1_pclk11", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk11", "per1clk", bases[CLKRST1_INDEX],
 				BIT(11), 0);
 	PRCC_PCLK_STORE(clk, 1, 11);
 
-	clk = clk_reg_prcc_pclk("p2_pclk0", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk0", "per2clk", bases[CLKRST2_INDEX],
 				BIT(0), 0);
 	PRCC_PCLK_STORE(clk, 2, 0);
 
-	clk = clk_reg_prcc_pclk("p2_pclk1", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk1", "per2clk", bases[CLKRST2_INDEX],
 				BIT(1), 0);
 	PRCC_PCLK_STORE(clk, 2, 1);
 
-	clk = clk_reg_prcc_pclk("p2_pclk2", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk2", "per2clk", bases[CLKRST2_INDEX],
 				BIT(2), 0);
 	PRCC_PCLK_STORE(clk, 2, 2);
 
-	clk = clk_reg_prcc_pclk("p2_pclk3", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk3", "per2clk", bases[CLKRST2_INDEX],
 				BIT(3), 0);
 	PRCC_PCLK_STORE(clk, 2, 3);
 
-	clk = clk_reg_prcc_pclk("p2_pclk4", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk4", "per2clk", bases[CLKRST2_INDEX],
 				BIT(4), 0);
 	PRCC_PCLK_STORE(clk, 2, 4);
 
-	clk = clk_reg_prcc_pclk("p2_pclk5", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk5", "per2clk", bases[CLKRST2_INDEX],
 				BIT(5), 0);
 	PRCC_PCLK_STORE(clk, 2, 5);
 
-	clk = clk_reg_prcc_pclk("p2_pclk6", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk6", "per2clk", bases[CLKRST2_INDEX],
 				BIT(6), 0);
 	PRCC_PCLK_STORE(clk, 2, 6);
 
-	clk = clk_reg_prcc_pclk("p2_pclk7", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk7", "per2clk", bases[CLKRST2_INDEX],
 				BIT(7), 0);
 	PRCC_PCLK_STORE(clk, 2, 7);
 
-	clk = clk_reg_prcc_pclk("p2_pclk8", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk8", "per2clk", bases[CLKRST2_INDEX],
 				BIT(8), 0);
 	PRCC_PCLK_STORE(clk, 2, 8);
 
-	clk = clk_reg_prcc_pclk("p2_pclk9", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk9", "per2clk", bases[CLKRST2_INDEX],
 				BIT(9), 0);
 	PRCC_PCLK_STORE(clk, 2, 9);
 
-	clk = clk_reg_prcc_pclk("p2_pclk10", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk10", "per2clk", bases[CLKRST2_INDEX],
 				BIT(10), 0);
 	PRCC_PCLK_STORE(clk, 2, 10);
 
-	clk = clk_reg_prcc_pclk("p2_pclk11", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk11", "per2clk", bases[CLKRST2_INDEX],
 				BIT(11), 0);
 	PRCC_PCLK_STORE(clk, 2, 11);
 
-	clk = clk_reg_prcc_pclk("p2_pclk12", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk12", "per2clk", bases[CLKRST2_INDEX],
 				BIT(12), 0);
 	PRCC_PCLK_STORE(clk, 2, 12);
 
-	clk = clk_reg_prcc_pclk("p3_pclk0", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk0", "per3clk", bases[CLKRST3_INDEX],
 				BIT(0), 0);
 	PRCC_PCLK_STORE(clk, 3, 0);
 
-	clk = clk_reg_prcc_pclk("p3_pclk1", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk1", "per3clk", bases[CLKRST3_INDEX],
 				BIT(1), 0);
 	PRCC_PCLK_STORE(clk, 3, 1);
 
-	clk = clk_reg_prcc_pclk("p3_pclk2", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk2", "per3clk", bases[CLKRST3_INDEX],
 				BIT(2), 0);
 	PRCC_PCLK_STORE(clk, 3, 2);
 
-	clk = clk_reg_prcc_pclk("p3_pclk3", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk3", "per3clk", bases[CLKRST3_INDEX],
 				BIT(3), 0);
 	PRCC_PCLK_STORE(clk, 3, 3);
 
-	clk = clk_reg_prcc_pclk("p3_pclk4", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk4", "per3clk", bases[CLKRST3_INDEX],
 				BIT(4), 0);
 	PRCC_PCLK_STORE(clk, 3, 4);
 
-	clk = clk_reg_prcc_pclk("p3_pclk5", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk5", "per3clk", bases[CLKRST3_INDEX],
 				BIT(5), 0);
 	PRCC_PCLK_STORE(clk, 3, 5);
 
-	clk = clk_reg_prcc_pclk("p3_pclk6", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk6", "per3clk", bases[CLKRST3_INDEX],
 				BIT(6), 0);
 	PRCC_PCLK_STORE(clk, 3, 6);
 
-	clk = clk_reg_prcc_pclk("p3_pclk7", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk7", "per3clk", bases[CLKRST3_INDEX],
 				BIT(7), 0);
 	PRCC_PCLK_STORE(clk, 3, 7);
 
-	clk = clk_reg_prcc_pclk("p3_pclk8", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk8", "per3clk", bases[CLKRST3_INDEX],
 				BIT(8), 0);
 	PRCC_PCLK_STORE(clk, 3, 8);
 
-	clk = clk_reg_prcc_pclk("p5_pclk0", "per5clk", clkrst5_base,
+	clk = clk_reg_prcc_pclk("p5_pclk0", "per5clk", bases[CLKRST5_INDEX],
 				BIT(0), 0);
 	PRCC_PCLK_STORE(clk, 5, 0);
 
-	clk = clk_reg_prcc_pclk("p5_pclk1", "per5clk", clkrst5_base,
+	clk = clk_reg_prcc_pclk("p5_pclk1", "per5clk", bases[CLKRST5_INDEX],
 				BIT(1), 0);
 	PRCC_PCLK_STORE(clk, 5, 1);
 
-	clk = clk_reg_prcc_pclk("p6_pclk0", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk0", "per6clk", bases[CLKRST6_INDEX],
 				BIT(0), 0);
 	PRCC_PCLK_STORE(clk, 6, 0);
 
-	clk = clk_reg_prcc_pclk("p6_pclk1", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk1", "per6clk", bases[CLKRST6_INDEX],
 				BIT(1), 0);
 	PRCC_PCLK_STORE(clk, 6, 1);
 
-	clk = clk_reg_prcc_pclk("p6_pclk2", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk2", "per6clk", bases[CLKRST6_INDEX],
 				BIT(2), 0);
 	PRCC_PCLK_STORE(clk, 6, 2);
 
-	clk = clk_reg_prcc_pclk("p6_pclk3", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk3", "per6clk", bases[CLKRST6_INDEX],
 				BIT(3), 0);
 	PRCC_PCLK_STORE(clk, 6, 3);
 
-	clk = clk_reg_prcc_pclk("p6_pclk4", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk4", "per6clk", bases[CLKRST6_INDEX],
 				BIT(4), 0);
 	PRCC_PCLK_STORE(clk, 6, 4);
 
-	clk = clk_reg_prcc_pclk("p6_pclk5", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk5", "per6clk", bases[CLKRST6_INDEX],
 				BIT(5), 0);
 	PRCC_PCLK_STORE(clk, 6, 5);
 
-	clk = clk_reg_prcc_pclk("p6_pclk6", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk6", "per6clk", bases[CLKRST6_INDEX],
 				BIT(6), 0);
 	PRCC_PCLK_STORE(clk, 6, 6);
 
-	clk = clk_reg_prcc_pclk("p6_pclk7", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk7", "per6clk", bases[CLKRST6_INDEX],
 				BIT(7), 0);
 	PRCC_PCLK_STORE(clk, 6, 7);
 
@@ -430,109 +451,109 @@ void u8500_of_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 
 	/* Periph1 */
 	clk = clk_reg_prcc_kclk("p1_uart0_kclk", "uartclk",
-			clkrst1_base, BIT(0), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(0), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 0);
 
 	clk = clk_reg_prcc_kclk("p1_uart1_kclk", "uartclk",
-			clkrst1_base, BIT(1), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(1), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 1);
 
 	clk = clk_reg_prcc_kclk("p1_i2c1_kclk", "i2cclk",
-			clkrst1_base, BIT(2), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(2), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 2);
 
 	clk = clk_reg_prcc_kclk("p1_msp0_kclk", "msp02clk",
-			clkrst1_base, BIT(3), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(3), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 3);
 
 	clk = clk_reg_prcc_kclk("p1_msp1_kclk", "msp1clk",
-			clkrst1_base, BIT(4), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(4), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 4);
 
 	clk = clk_reg_prcc_kclk("p1_sdi0_kclk", "sdmmcclk",
-			clkrst1_base, BIT(5), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(5), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 5);
 
 	clk = clk_reg_prcc_kclk("p1_i2c2_kclk", "i2cclk",
-			clkrst1_base, BIT(6), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(6), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 6);
 
 	clk = clk_reg_prcc_kclk("p1_slimbus0_kclk", "slimclk",
-			clkrst1_base, BIT(8), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(8), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 8);
 
 	clk = clk_reg_prcc_kclk("p1_i2c4_kclk", "i2cclk",
-			clkrst1_base, BIT(9), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(9), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 9);
 
 	clk = clk_reg_prcc_kclk("p1_msp3_kclk", "msp1clk",
-			clkrst1_base, BIT(10), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(10), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 1, 10);
 
 	/* Periph2 */
 	clk = clk_reg_prcc_kclk("p2_i2c3_kclk", "i2cclk",
-			clkrst2_base, BIT(0), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(0), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 2, 0);
 
 	clk = clk_reg_prcc_kclk("p2_sdi4_kclk", "sdmmcclk",
-			clkrst2_base, BIT(2), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(2), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 2, 2);
 
 	clk = clk_reg_prcc_kclk("p2_msp2_kclk", "msp02clk",
-			clkrst2_base, BIT(3), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(3), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 2, 3);
 
 	clk = clk_reg_prcc_kclk("p2_sdi1_kclk", "sdmmcclk",
-			clkrst2_base, BIT(4), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(4), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 2, 4);
 
 	clk = clk_reg_prcc_kclk("p2_sdi3_kclk", "sdmmcclk",
-			clkrst2_base, BIT(5), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(5), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 2, 5);
 
 	/* Note that rate is received from parent. */
 	clk = clk_reg_prcc_kclk("p2_ssirx_kclk", "hsirxclk",
-			clkrst2_base, BIT(6),
+			bases[CLKRST2_INDEX], BIT(6),
 			CLK_SET_RATE_GATE|CLK_SET_RATE_PARENT);
 	PRCC_KCLK_STORE(clk, 2, 6);
 
 	clk = clk_reg_prcc_kclk("p2_ssitx_kclk", "hsitxclk",
-			clkrst2_base, BIT(7),
+			bases[CLKRST2_INDEX], BIT(7),
 			CLK_SET_RATE_GATE|CLK_SET_RATE_PARENT);
 	PRCC_KCLK_STORE(clk, 2, 7);
 
 	/* Periph3 */
 	clk = clk_reg_prcc_kclk("p3_ssp0_kclk", "sspclk",
-			clkrst3_base, BIT(1), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(1), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 3, 1);
 
 	clk = clk_reg_prcc_kclk("p3_ssp1_kclk", "sspclk",
-			clkrst3_base, BIT(2), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(2), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 3, 2);
 
 	clk = clk_reg_prcc_kclk("p3_i2c0_kclk", "i2cclk",
-			clkrst3_base, BIT(3), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(3), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 3, 3);
 
 	clk = clk_reg_prcc_kclk("p3_sdi2_kclk", "sdmmcclk",
-			clkrst3_base, BIT(4), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(4), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 3, 4);
 
 	clk = clk_reg_prcc_kclk("p3_ske_kclk", "rtc32k",
-			clkrst3_base, BIT(5), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(5), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 3, 5);
 
 	clk = clk_reg_prcc_kclk("p3_uart2_kclk", "uartclk",
-			clkrst3_base, BIT(6), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(6), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 3, 6);
 
 	clk = clk_reg_prcc_kclk("p3_sdi5_kclk", "sdmmcclk",
-			clkrst3_base, BIT(7), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(7), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 3, 7);
 
 	/* Periph6 */
 	clk = clk_reg_prcc_kclk("p3_rng_kclk", "rngclk",
-			clkrst6_base, BIT(0), CLK_SET_RATE_GATE);
+			bases[CLKRST6_INDEX], BIT(0), CLK_SET_RATE_GATE);
 	PRCC_KCLK_STORE(clk, 6, 0);
 
 	for_each_child_of_node(np, child) {
diff --git a/drivers/clk/ux500/u8540_clk.c b/drivers/clk/ux500/u8540_clk.c
index d0de335ea1e9..d7bcb7a86615 100644
--- a/drivers/clk/ux500/u8540_clk.c
+++ b/drivers/clk/ux500/u8540_clk.c
@@ -7,16 +7,51 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/clkdev.h>
 #include <linux/clk-provider.h>
 #include <linux/mfd/dbx500-prcmu.h>
 #include <linux/platform_data/clk-ux500.h>
 #include "clk.h"
 
-void u8540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-		    u32 clkrst5_base, u32 clkrst6_base)
+static const struct of_device_id u8540_clk_of_match[] = {
+	{ .compatible = "stericsson,u8540-clks", },
+	{ }
+};
+
+/* CLKRST4 is missing making it hard to index things */
+enum clkrst_index {
+	CLKRST1_INDEX = 0,
+	CLKRST2_INDEX,
+	CLKRST3_INDEX,
+	CLKRST5_INDEX,
+	CLKRST6_INDEX,
+	CLKRST_MAX,
+};
+
+void u8540_clk_init(void)
 {
 	struct clk *clk;
+	struct device_node *np = NULL;
+	u32 bases[CLKRST_MAX];
+	int i;
+
+	if (of_have_populated_dt())
+		np = of_find_matching_node(NULL, u8540_clk_of_match);
+	if (!np) {
+		pr_err("Either DT or U8540 Clock node not found\n");
+		return;
+	}
+	for (i = 0; i < ARRAY_SIZE(bases); i++) {
+		struct resource r;
+
+		if (of_address_to_resource(np, i, &r))
+			/* Not much choice but to continue */
+			pr_err("failed to get CLKRST %d base address\n",
+			       i + 1);
+		bases[i] = r.start;
+	}
 
 	/* Clock sources. */
 	/* Fixed ClockGen */
@@ -218,151 +253,151 @@ void u8540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 
 	/* PRCC P-clocks */
 	/* Peripheral 1 : PRCC P-clocks */
-	clk = clk_reg_prcc_pclk("p1_pclk0", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk0", "per1clk", bases[CLKRST1_INDEX],
 				BIT(0), 0);
 	clk_register_clkdev(clk, "apb_pclk", "uart0");
 
-	clk = clk_reg_prcc_pclk("p1_pclk1", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk1", "per1clk", bases[CLKRST1_INDEX],
 				BIT(1), 0);
 	clk_register_clkdev(clk, "apb_pclk", "uart1");
 
-	clk = clk_reg_prcc_pclk("p1_pclk2", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk2", "per1clk", bases[CLKRST1_INDEX],
 				BIT(2), 0);
 	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.1");
 
-	clk = clk_reg_prcc_pclk("p1_pclk3", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk3", "per1clk", bases[CLKRST1_INDEX],
 				BIT(3), 0);
 	clk_register_clkdev(clk, "apb_pclk", "msp0");
 	clk_register_clkdev(clk, "apb_pclk", "dbx5x0-msp-i2s.0");
 
-	clk = clk_reg_prcc_pclk("p1_pclk4", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk4", "per1clk", bases[CLKRST1_INDEX],
 				BIT(4), 0);
 	clk_register_clkdev(clk, "apb_pclk", "msp1");
 	clk_register_clkdev(clk, "apb_pclk", "dbx5x0-msp-i2s.1");
 
-	clk = clk_reg_prcc_pclk("p1_pclk5", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk5", "per1clk", bases[CLKRST1_INDEX],
 				BIT(5), 0);
 	clk_register_clkdev(clk, "apb_pclk", "sdi0");
 
-	clk = clk_reg_prcc_pclk("p1_pclk6", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk6", "per1clk", bases[CLKRST1_INDEX],
 				BIT(6), 0);
 	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.2");
 
-	clk = clk_reg_prcc_pclk("p1_pclk7", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk7", "per1clk", bases[CLKRST1_INDEX],
 				BIT(7), 0);
 	clk_register_clkdev(clk, NULL, "spi3");
 
-	clk = clk_reg_prcc_pclk("p1_pclk8", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk8", "per1clk", bases[CLKRST1_INDEX],
 				BIT(8), 0);
 	clk_register_clkdev(clk, "apb_pclk", "slimbus0");
 
-	clk = clk_reg_prcc_pclk("p1_pclk9", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk9", "per1clk", bases[CLKRST1_INDEX],
 				BIT(9), 0);
 	clk_register_clkdev(clk, NULL, "gpio.0");
 	clk_register_clkdev(clk, NULL, "gpio.1");
 	clk_register_clkdev(clk, NULL, "gpioblock0");
 	clk_register_clkdev(clk, "apb_pclk", "ab85xx-codec.0");
 
-	clk = clk_reg_prcc_pclk("p1_pclk10", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk10", "per1clk", bases[CLKRST1_INDEX],
 				BIT(10), 0);
 	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.4");
 
-	clk = clk_reg_prcc_pclk("p1_pclk11", "per1clk", clkrst1_base,
+	clk = clk_reg_prcc_pclk("p1_pclk11", "per1clk", bases[CLKRST1_INDEX],
 				BIT(11), 0);
 	clk_register_clkdev(clk, "apb_pclk", "msp3");
 	clk_register_clkdev(clk, "apb_pclk", "dbx5x0-msp-i2s.3");
 
 	/* Peripheral 2 : PRCC P-clocks */
-	clk = clk_reg_prcc_pclk("p2_pclk0", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk0", "per2clk", bases[CLKRST2_INDEX],
 				BIT(0), 0);
 	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.3");
 
-	clk = clk_reg_prcc_pclk("p2_pclk1", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk1", "per2clk", bases[CLKRST2_INDEX],
 				BIT(1), 0);
 	clk_register_clkdev(clk, NULL, "spi2");
 
-	clk = clk_reg_prcc_pclk("p2_pclk2", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk2", "per2clk", bases[CLKRST2_INDEX],
 				BIT(2), 0);
 	clk_register_clkdev(clk, NULL, "spi1");
 
-	clk = clk_reg_prcc_pclk("p2_pclk3", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk3", "per2clk", bases[CLKRST2_INDEX],
 				BIT(3), 0);
 	clk_register_clkdev(clk, NULL, "pwl");
 
-	clk = clk_reg_prcc_pclk("p2_pclk4", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk4", "per2clk", bases[CLKRST2_INDEX],
 				BIT(4), 0);
 	clk_register_clkdev(clk, "apb_pclk", "sdi4");
 
-	clk = clk_reg_prcc_pclk("p2_pclk5", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk5", "per2clk", bases[CLKRST2_INDEX],
 				BIT(5), 0);
 	clk_register_clkdev(clk, "apb_pclk", "msp2");
 	clk_register_clkdev(clk, "apb_pclk", "dbx5x0-msp-i2s.2");
 
-	clk = clk_reg_prcc_pclk("p2_pclk6", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk6", "per2clk", bases[CLKRST2_INDEX],
 				BIT(6), 0);
 	clk_register_clkdev(clk, "apb_pclk", "sdi1");
 
-	clk = clk_reg_prcc_pclk("p2_pclk7", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk7", "per2clk", bases[CLKRST2_INDEX],
 				BIT(7), 0);
 	clk_register_clkdev(clk, "apb_pclk", "sdi3");
 
-	clk = clk_reg_prcc_pclk("p2_pclk8", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk8", "per2clk", bases[CLKRST2_INDEX],
 				BIT(8), 0);
 	clk_register_clkdev(clk, NULL, "spi0");
 
-	clk = clk_reg_prcc_pclk("p2_pclk9", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk9", "per2clk", bases[CLKRST2_INDEX],
 				BIT(9), 0);
 	clk_register_clkdev(clk, "hsir_hclk", "ste_hsi.0");
 
-	clk = clk_reg_prcc_pclk("p2_pclk10", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk10", "per2clk", bases[CLKRST2_INDEX],
 				BIT(10), 0);
 	clk_register_clkdev(clk, "hsit_hclk", "ste_hsi.0");
 
-	clk = clk_reg_prcc_pclk("p2_pclk11", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk11", "per2clk", bases[CLKRST2_INDEX],
 				BIT(11), 0);
 	clk_register_clkdev(clk, NULL, "gpio.6");
 	clk_register_clkdev(clk, NULL, "gpio.7");
 	clk_register_clkdev(clk, NULL, "gpioblock1");
 
-	clk = clk_reg_prcc_pclk("p2_pclk12", "per2clk", clkrst2_base,
+	clk = clk_reg_prcc_pclk("p2_pclk12", "per2clk", bases[CLKRST2_INDEX],
 				BIT(12), 0);
 	clk_register_clkdev(clk, "msp4-pclk", "ab85xx-codec.0");
 
 	/* Peripheral 3 : PRCC P-clocks */
-	clk = clk_reg_prcc_pclk("p3_pclk0", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk0", "per3clk", bases[CLKRST3_INDEX],
 				BIT(0), 0);
 	clk_register_clkdev(clk, NULL, "fsmc");
 
-	clk = clk_reg_prcc_pclk("p3_pclk1", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk1", "per3clk", bases[CLKRST3_INDEX],
 				BIT(1), 0);
 	clk_register_clkdev(clk, "apb_pclk", "ssp0");
 
-	clk = clk_reg_prcc_pclk("p3_pclk2", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk2", "per3clk", bases[CLKRST3_INDEX],
 				BIT(2), 0);
 	clk_register_clkdev(clk, "apb_pclk", "ssp1");
 
-	clk = clk_reg_prcc_pclk("p3_pclk3", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk3", "per3clk", bases[CLKRST3_INDEX],
 				BIT(3), 0);
 	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.0");
 
-	clk = clk_reg_prcc_pclk("p3_pclk4", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk4", "per3clk", bases[CLKRST3_INDEX],
 				BIT(4), 0);
 	clk_register_clkdev(clk, "apb_pclk", "sdi2");
 
-	clk = clk_reg_prcc_pclk("p3_pclk5", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk5", "per3clk", bases[CLKRST3_INDEX],
 				BIT(5), 0);
 	clk_register_clkdev(clk, "apb_pclk", "ske");
 	clk_register_clkdev(clk, "apb_pclk", "nmk-ske-keypad");
 
-	clk = clk_reg_prcc_pclk("p3_pclk6", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk6", "per3clk", bases[CLKRST3_INDEX],
 				BIT(6), 0);
 	clk_register_clkdev(clk, "apb_pclk", "uart2");
 
-	clk = clk_reg_prcc_pclk("p3_pclk7", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk7", "per3clk", bases[CLKRST3_INDEX],
 				BIT(7), 0);
 	clk_register_clkdev(clk, "apb_pclk", "sdi5");
 
-	clk = clk_reg_prcc_pclk("p3_pclk8", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk8", "per3clk", bases[CLKRST3_INDEX],
 				BIT(8), 0);
 	clk_register_clkdev(clk, NULL, "gpio.2");
 	clk_register_clkdev(clk, NULL, "gpio.3");
@@ -370,64 +405,64 @@ void u8540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 	clk_register_clkdev(clk, NULL, "gpio.5");
 	clk_register_clkdev(clk, NULL, "gpioblock2");
 
-	clk = clk_reg_prcc_pclk("p3_pclk9", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk9", "per3clk", bases[CLKRST3_INDEX],
 				BIT(9), 0);
 	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.5");
 
-	clk = clk_reg_prcc_pclk("p3_pclk10", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk10", "per3clk", bases[CLKRST3_INDEX],
 				BIT(10), 0);
 	clk_register_clkdev(clk, "apb_pclk", "nmk-i2c.6");
 
-	clk = clk_reg_prcc_pclk("p3_pclk11", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk11", "per3clk", bases[CLKRST3_INDEX],
 				BIT(11), 0);
 	clk_register_clkdev(clk, "apb_pclk", "uart3");
 
-	clk = clk_reg_prcc_pclk("p3_pclk12", "per3clk", clkrst3_base,
+	clk = clk_reg_prcc_pclk("p3_pclk12", "per3clk", bases[CLKRST3_INDEX],
 				BIT(12), 0);
 	clk_register_clkdev(clk, "apb_pclk", "uart4");
 
 	/* Peripheral 5 : PRCC P-clocks */
-	clk = clk_reg_prcc_pclk("p5_pclk0", "per5clk", clkrst5_base,
+	clk = clk_reg_prcc_pclk("p5_pclk0", "per5clk", bases[CLKRST5_INDEX],
 				BIT(0), 0);
 	clk_register_clkdev(clk, "usb", "musb-ux500.0");
 	clk_register_clkdev(clk, "usbclk", "ab-iddet.0");
 
-	clk = clk_reg_prcc_pclk("p5_pclk1", "per5clk", clkrst5_base,
+	clk = clk_reg_prcc_pclk("p5_pclk1", "per5clk", bases[CLKRST5_INDEX],
 				BIT(1), 0);
 	clk_register_clkdev(clk, NULL, "gpio.8");
 	clk_register_clkdev(clk, NULL, "gpioblock3");
 
 	/* Peripheral 6 : PRCC P-clocks */
-	clk = clk_reg_prcc_pclk("p6_pclk0", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk0", "per6clk", bases[CLKRST6_INDEX],
 				BIT(0), 0);
 	clk_register_clkdev(clk, "apb_pclk", "rng");
 
-	clk = clk_reg_prcc_pclk("p6_pclk1", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk1", "per6clk", bases[CLKRST6_INDEX],
 				BIT(1), 0);
 	clk_register_clkdev(clk, NULL, "cryp0");
 	clk_register_clkdev(clk, NULL, "cryp1");
 
-	clk = clk_reg_prcc_pclk("p6_pclk2", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk2", "per6clk", bases[CLKRST6_INDEX],
 				BIT(2), 0);
 	clk_register_clkdev(clk, NULL, "hash0");
 
-	clk = clk_reg_prcc_pclk("p6_pclk3", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk3", "per6clk", bases[CLKRST6_INDEX],
 				BIT(3), 0);
 	clk_register_clkdev(clk, NULL, "pka");
 
-	clk = clk_reg_prcc_pclk("p6_pclk4", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk4", "per6clk", bases[CLKRST6_INDEX],
 				BIT(4), 0);
 	clk_register_clkdev(clk, NULL, "db8540-hash1");
 
-	clk = clk_reg_prcc_pclk("p6_pclk5", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk5", "per6clk", bases[CLKRST6_INDEX],
 				BIT(5), 0);
 	clk_register_clkdev(clk, NULL, "cfgreg");
 
-	clk = clk_reg_prcc_pclk("p6_pclk6", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk6", "per6clk", bases[CLKRST6_INDEX],
 				BIT(6), 0);
 	clk_register_clkdev(clk, "apb_pclk", "mtu0");
 
-	clk = clk_reg_prcc_pclk("p6_pclk7", "per6clk", clkrst6_base,
+	clk = clk_reg_prcc_pclk("p6_pclk7", "per6clk", bases[CLKRST6_INDEX],
 				BIT(7), 0);
 	clk_register_clkdev(clk, "apb_pclk", "mtu1");
 
@@ -441,138 +476,138 @@ void u8540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 
 	/* Peripheral 1 : PRCC K-clocks */
 	clk = clk_reg_prcc_kclk("p1_uart0_kclk", "uartclk",
-			clkrst1_base, BIT(0), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(0), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "uart0");
 
 	clk = clk_reg_prcc_kclk("p1_uart1_kclk", "uartclk",
-			clkrst1_base, BIT(1), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(1), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "uart1");
 
 	clk = clk_reg_prcc_kclk("p1_i2c1_kclk", "i2cclk",
-			clkrst1_base, BIT(2), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(2), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "nmk-i2c.1");
 
 	clk = clk_reg_prcc_kclk("p1_msp0_kclk", "msp02clk",
-			clkrst1_base, BIT(3), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(3), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "msp0");
 	clk_register_clkdev(clk, NULL, "dbx5x0-msp-i2s.0");
 
 	clk = clk_reg_prcc_kclk("p1_msp1_kclk", "msp1clk",
-			clkrst1_base, BIT(4), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(4), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "msp1");
 	clk_register_clkdev(clk, NULL, "dbx5x0-msp-i2s.1");
 
 	clk = clk_reg_prcc_kclk("p1_sdi0_kclk", "sdmmchclk",
-			clkrst1_base, BIT(5), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(5), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "sdi0");
 
 	clk = clk_reg_prcc_kclk("p1_i2c2_kclk", "i2cclk",
-			clkrst1_base, BIT(6), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(6), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "nmk-i2c.2");
 
 	clk = clk_reg_prcc_kclk("p1_slimbus0_kclk", "slimclk",
-			clkrst1_base, BIT(8), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(8), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "slimbus0");
 
 	clk = clk_reg_prcc_kclk("p1_i2c4_kclk", "i2cclk",
-			clkrst1_base, BIT(9), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(9), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "nmk-i2c.4");
 
 	clk = clk_reg_prcc_kclk("p1_msp3_kclk", "msp1clk",
-			clkrst1_base, BIT(10), CLK_SET_RATE_GATE);
+			bases[CLKRST1_INDEX], BIT(10), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "msp3");
 	clk_register_clkdev(clk, NULL, "dbx5x0-msp-i2s.3");
 
 	/* Peripheral 2 : PRCC K-clocks */
 	clk = clk_reg_prcc_kclk("p2_i2c3_kclk", "i2cclk",
-			clkrst2_base, BIT(0), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(0), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "nmk-i2c.3");
 
 	clk = clk_reg_prcc_kclk("p2_pwl_kclk", "rtc32k",
-			clkrst2_base, BIT(1), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(1), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "pwl");
 
 	clk = clk_reg_prcc_kclk("p2_sdi4_kclk", "sdmmchclk",
-			clkrst2_base, BIT(2), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(2), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "sdi4");
 
 	clk = clk_reg_prcc_kclk("p2_msp2_kclk", "msp02clk",
-			clkrst2_base, BIT(3), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(3), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "msp2");
 	clk_register_clkdev(clk, NULL, "dbx5x0-msp-i2s.2");
 
 	clk = clk_reg_prcc_kclk("p2_sdi1_kclk", "sdmmchclk",
-			clkrst2_base, BIT(4), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(4), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "sdi1");
 
 	clk = clk_reg_prcc_kclk("p2_sdi3_kclk", "sdmmcclk",
-			clkrst2_base, BIT(5), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(5), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "sdi3");
 
 	clk = clk_reg_prcc_kclk("p2_ssirx_kclk", "hsirxclk",
-			clkrst2_base, BIT(6),
+			bases[CLKRST2_INDEX], BIT(6),
 			CLK_SET_RATE_GATE|CLK_SET_RATE_PARENT);
 	clk_register_clkdev(clk, "hsir_hsirxclk", "ste_hsi.0");
 
 	clk = clk_reg_prcc_kclk("p2_ssitx_kclk", "hsitxclk",
-			clkrst2_base, BIT(7),
+			bases[CLKRST2_INDEX], BIT(7),
 			CLK_SET_RATE_GATE|CLK_SET_RATE_PARENT);
 	clk_register_clkdev(clk, "hsit_hsitxclk", "ste_hsi.0");
 
 	/* Should only be 9540, but might be added for 85xx as well */
 	clk = clk_reg_prcc_kclk("p2_msp4_kclk", "msp02clk",
-			clkrst2_base, BIT(9), CLK_SET_RATE_GATE);
+			bases[CLKRST2_INDEX], BIT(9), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "msp4");
 	clk_register_clkdev(clk, "msp4", "ab85xx-codec.0");
 
 	/* Peripheral 3 : PRCC K-clocks */
 	clk = clk_reg_prcc_kclk("p3_ssp0_kclk", "sspclk",
-			clkrst3_base, BIT(1), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(1), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "ssp0");
 
 	clk = clk_reg_prcc_kclk("p3_ssp1_kclk", "sspclk",
-			clkrst3_base, BIT(2), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(2), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "ssp1");
 
 	clk = clk_reg_prcc_kclk("p3_i2c0_kclk", "i2cclk",
-			clkrst3_base, BIT(3), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(3), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "nmk-i2c.0");
 
 	clk = clk_reg_prcc_kclk("p3_sdi2_kclk", "sdmmchclk",
-			clkrst3_base, BIT(4), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(4), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "sdi2");
 
 	clk = clk_reg_prcc_kclk("p3_ske_kclk", "rtc32k",
-			clkrst3_base, BIT(5), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(5), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "ske");
 	clk_register_clkdev(clk, NULL, "nmk-ske-keypad");
 
 	clk = clk_reg_prcc_kclk("p3_uart2_kclk", "uartclk",
-			clkrst3_base, BIT(6), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(6), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "uart2");
 
 	clk = clk_reg_prcc_kclk("p3_sdi5_kclk", "sdmmcclk",
-			clkrst3_base, BIT(7), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(7), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "sdi5");
 
 	clk = clk_reg_prcc_kclk("p3_i2c5_kclk", "i2cclk",
-			clkrst3_base, BIT(8), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(8), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "nmk-i2c.5");
 
 	clk = clk_reg_prcc_kclk("p3_i2c6_kclk", "i2cclk",
-			clkrst3_base, BIT(9), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(9), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "nmk-i2c.6");
 
 	clk = clk_reg_prcc_kclk("p3_uart3_kclk", "uartclk",
-			clkrst3_base, BIT(10), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(10), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "uart3");
 
 	clk = clk_reg_prcc_kclk("p3_uart4_kclk", "uartclk",
-			clkrst3_base, BIT(11), CLK_SET_RATE_GATE);
+			bases[CLKRST3_INDEX], BIT(11), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "uart4");
 
 	/* Peripheral 6 : PRCC K-clocks */
 	clk = clk_reg_prcc_kclk("p6_rng_kclk", "rngclk",
-			clkrst6_base, BIT(0), CLK_SET_RATE_GATE);
+			bases[CLKRST6_INDEX], BIT(0), CLK_SET_RATE_GATE);
 	clk_register_clkdev(clk, NULL, "rng");
 }
diff --git a/drivers/clk/ux500/u9540_clk.c b/drivers/clk/ux500/u9540_clk.c
index 179bd3871b34..2138a4c8cbca 100644
--- a/drivers/clk/ux500/u9540_clk.c
+++ b/drivers/clk/ux500/u9540_clk.c
@@ -12,8 +12,7 @@
 #include <linux/platform_data/clk-ux500.h>
 #include "clk.h"
 
-void u9540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-		    u32 clkrst5_base, u32 clkrst6_base)
+void u9540_clk_init(void)
 {
 	/* register clocks here */
 }
diff --git a/include/linux/platform_data/clk-ux500.h b/include/linux/platform_data/clk-ux500.h
index 0058edb24391..3af0da1f3be5 100644
--- a/include/linux/platform_data/clk-ux500.h
+++ b/include/linux/platform_data/clk-ux500.h
@@ -10,12 +10,8 @@
 #ifndef __CLK_UX500_H
 #define __CLK_UX500_H
 
-void u8500_of_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-		       u32 clkrst5_base, u32 clkrst6_base);
-
-void u9540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-		    u32 clkrst5_base, u32 clkrst6_base);
-void u8540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
-		    u32 clkrst5_base, u32 clkrst6_base);
+void u8500_clk_init(void);
+void u9540_clk_init(void);
+void u8540_clk_init(void);
 
 #endif /* __CLK_UX500_H */
-- 
cgit v1.2.3-70-g09d2


From fe009175ae3ec3724c1414440e22a1d32d806ec5 Mon Sep 17 00:00:00 2001
From: Milo Kim <milo.kim@ti.com>
Date: Mon, 20 Jul 2015 15:45:38 +0900
Subject: backlight: lp855x: Use private data for regulator control

LP855x backlight device can be enabled by external VDD input. The
'supply' data is used for this purpose. It's kind of private data
which runs internally, so there is no reason to expose to the
platform data.

And devm_regulator_get() is moved from _parse_dt() to _probe().
Regulator consumer(lp855x) can control regulator not only from DT
but also from platform data configuration in a source file such
like board-*.c.

Signed-off-by: Milo Kim <milo.kim@ti.com>
Acked-by: Sean Paul <seanpaul@chromium.org>
Acked-by: Jingoo Han <jingoohan1@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/video/backlight/lp855x_bl.c  | 23 ++++++++++++-----------
 include/linux/platform_data/lp855x.h |  2 --
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/lp855x_bl.c b/drivers/video/backlight/lp855x_bl.c
index 88116b493f3b..f88df9ec08d0 100644
--- a/drivers/video/backlight/lp855x_bl.c
+++ b/drivers/video/backlight/lp855x_bl.c
@@ -73,6 +73,7 @@ struct lp855x {
 	struct device *dev;
 	struct lp855x_platform_data *pdata;
 	struct pwm_device *pwm;
+	struct regulator *supply;	/* regulator for VDD input */
 };
 
 static int lp855x_write_byte(struct lp855x *lp, u8 reg, u8 data)
@@ -378,13 +379,6 @@ static int lp855x_parse_dt(struct lp855x *lp)
 		pdata->rom_data = &rom[0];
 	}
 
-	pdata->supply = devm_regulator_get(dev, "power");
-	if (IS_ERR(pdata->supply)) {
-		if (PTR_ERR(pdata->supply) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
-		pdata->supply = NULL;
-	}
-
 	lp->pdata = pdata;
 
 	return 0;
@@ -425,8 +419,15 @@ static int lp855x_probe(struct i2c_client *cl, const struct i2c_device_id *id)
 	else
 		lp->mode = REGISTER_BASED;
 
-	if (lp->pdata->supply) {
-		ret = regulator_enable(lp->pdata->supply);
+	lp->supply = devm_regulator_get(lp->dev, "power");
+	if (IS_ERR(lp->supply)) {
+		if (PTR_ERR(lp->supply) == -EPROBE_DEFER)
+			return -EPROBE_DEFER;
+		lp->supply = NULL;
+	}
+
+	if (lp->supply) {
+		ret = regulator_enable(lp->supply);
 		if (ret < 0) {
 			dev_err(&cl->dev, "failed to enable supply: %d\n", ret);
 			return ret;
@@ -464,8 +465,8 @@ static int lp855x_remove(struct i2c_client *cl)
 
 	lp->bl->props.brightness = 0;
 	backlight_update_status(lp->bl);
-	if (lp->pdata->supply)
-		regulator_disable(lp->pdata->supply);
+	if (lp->supply)
+		regulator_disable(lp->supply);
 	sysfs_remove_group(&lp->dev->kobj, &lp855x_attr_group);
 
 	return 0;
diff --git a/include/linux/platform_data/lp855x.h b/include/linux/platform_data/lp855x.h
index 9c7fd1efe495..1b2ba24e4e03 100644
--- a/include/linux/platform_data/lp855x.h
+++ b/include/linux/platform_data/lp855x.h
@@ -136,7 +136,6 @@ struct lp855x_rom_data {
 		Only valid when mode is PWM_BASED.
  * @size_program : total size of lp855x_rom_data
  * @rom_data : list of new eeprom/eprom registers
- * @supply : regulator that supplies 3V input
  */
 struct lp855x_platform_data {
 	const char *name;
@@ -145,7 +144,6 @@ struct lp855x_platform_data {
 	unsigned int period_ns;
 	int size_program;
 	struct lp855x_rom_data *rom_data;
-	struct regulator *supply;
 };
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From c43996f4001de629af4a4d6713782e883677e5b9 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Mon, 24 Aug 2015 12:13:23 -0700
Subject: PCI: Add pci_ioremap_wc_bar()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This lets drivers take advantage of PAT when available. It
should help with the transition of converting video drivers over
to ioremap_wc() to help with the goal of eventually using
_PAGE_CACHE_UC over _PAGE_CACHE_UC_MINUS on x86 on
ioremap_nocache(), see:

  de33c442ed2a ("x86 PAT: fix performance drop for glx, use UC minus for ioremap(), ioremap_nocache() and pci_mmap_page_range()")

Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Cc: <syrjala@sci.fi>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Antonino Daplas <adaplas@gmail.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomi Valkeinen <tomi.valkeinen@ti.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Ville Syrjälä <syrjala@sci.fi>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: airlied@linux.ie
Cc: benh@kernel.crashing.org
Cc: dan.j.williams@intel.com
Cc: konrad.wilk@oracle.com
Cc: linux-fbdev@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Cc: mst@redhat.com
Cc: vinod.koul@intel.com
Cc: xen-devel@lists.xensource.com
Link: http://lkml.kernel.org/r/1440443613-13696-2-git-send-email-mcgrof@do-not-panic.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/pci/pci.c   | 14 ++++++++++++++
 include/linux/pci.h |  1 +
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 0008c950452c..fdae37b473f8 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -138,6 +138,20 @@ void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar)
 	return ioremap_nocache(res->start, resource_size(res));
 }
 EXPORT_SYMBOL_GPL(pci_ioremap_bar);
+
+void __iomem *pci_ioremap_wc_bar(struct pci_dev *pdev, int bar)
+{
+	/*
+	 * Make sure the BAR is actually a memory resource, not an IO resource
+	 */
+	if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	return ioremap_wc(pci_resource_start(pdev, bar),
+			  pci_resource_len(pdev, bar));
+}
+EXPORT_SYMBOL_GPL(pci_ioremap_wc_bar);
 #endif
 
 #define PCI_FIND_CAP_TTL	48
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a0321a8fb59..985dd392e5f2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1661,6 +1661,7 @@ static inline void pci_mmcfg_late_init(void) { }
 int pci_ext_cfg_avail(void);
 
 void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
+void __iomem *pci_ioremap_wc_bar(struct pci_dev *pdev, int bar);
 
 #ifdef CONFIG_PCI_IOV
 int pci_iov_virtfn_bus(struct pci_dev *dev, int id);
-- 
cgit v1.2.3-70-g09d2


From 43443ad692cf1d41a90cac2ed7066a10cd67a9c6 Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Sun, 2 Aug 2015 19:44:43 +0200
Subject: of/platform: add function to populate default bus

When a default bus like the simple-bus should be used someone had to
call of_platform_populate() with the default match table. This match
table was not exported, so it is impossible for code build as a module
to use this. Instead of exporting of_default_bus_match_table, add a new
function which uses this default match table and populates the bus.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/platform.c       | 9 +++++++++
 include/linux/of_platform.h | 9 +++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index ddf8e42c9367..918f01f26d4b 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -456,6 +456,15 @@ int of_platform_populate(struct device_node *root,
 }
 EXPORT_SYMBOL_GPL(of_platform_populate);
 
+int of_platform_default_populate(struct device_node *root,
+				 const struct of_dev_auxdata *lookup,
+				 struct device *parent)
+{
+	return of_platform_populate(root, of_default_bus_match_table, lookup,
+				    parent);
+}
+EXPORT_SYMBOL_GPL(of_platform_default_populate);
+
 static int of_platform_device_destroy(struct device *dev, void *data)
 {
 	/* Do not touch devices not populated from the device tree */
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 611a691145c4..956a1006aefc 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -72,6 +72,9 @@ extern int of_platform_populate(struct device_node *root,
 				const struct of_device_id *matches,
 				const struct of_dev_auxdata *lookup,
 				struct device *parent);
+extern int of_platform_default_populate(struct device_node *root,
+					const struct of_dev_auxdata *lookup,
+					struct device *parent);
 extern void of_platform_depopulate(struct device *parent);
 #else
 static inline int of_platform_populate(struct device_node *root,
@@ -81,6 +84,12 @@ static inline int of_platform_populate(struct device_node *root,
 {
 	return -ENODEV;
 }
+static inline int of_platform_default_populate(struct device_node *root,
+					       const struct of_dev_auxdata *lookup,
+					       struct device *parent)
+{
+	return -ENODEV;
+}
 static inline void of_platform_depopulate(struct device *parent) { }
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From a8cfe8cfd0da4502f5fa924f47c7ba6c7047c722 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Tue, 18 Aug 2015 15:16:45 +0800
Subject: clk: Add missing header for 'bool' definition to clk-conf.h

of_clk_set_defaults uses the type 'bool', but clk-conf.h does not
include its definition.

This results in a compile error when only clk-conf.h is used.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 include/linux/clk/clk-conf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk/clk-conf.h b/include/linux/clk/clk-conf.h
index f3050e15f833..e0c362363c38 100644
--- a/include/linux/clk/clk-conf.h
+++ b/include/linux/clk/clk-conf.h
@@ -7,6 +7,8 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/types.h>
+
 struct device_node;
 
 #if defined(CONFIG_OF) && defined(CONFIG_COMMON_CLK)
-- 
cgit v1.2.3-70-g09d2


From 4bf011815f2e093c7f60004f4f5683cf40b905b9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 10 Aug 2015 19:56:46 +0300
Subject: device property: check fwnode type in to_of_node()

Potentially one of platform can support both ACPI and OF. In that case when we
call to_of_node() for non-OF fwnode types we will get non-NULL result, which is
wrong. Check for the type and return a correspondent result.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/of.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index edc068d19c79..2194b8ca41f9 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -136,7 +136,8 @@ static inline bool is_of_node(struct fwnode_handle *fwnode)
 
 static inline struct device_node *to_of_node(struct fwnode_handle *fwnode)
 {
-	return fwnode ? container_of(fwnode, struct device_node, fwnode) : NULL;
+	return is_of_node(fwnode) ?
+		container_of(fwnode, struct device_node, fwnode) : NULL;
 }
 
 static inline bool of_have_populated_dt(void)
-- 
cgit v1.2.3-70-g09d2


From 41d6bb4c890c8db01248b1bdd512a18e7bd29ca3 Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Mon, 17 Aug 2015 15:35:24 +0300
Subject: gpiolib: add description for gpio irqchip fields in struct gpio_chip

Add missed description for GPIO irqchip fields in struct gpio_chip.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 0c7004dab437..1aed31c5ffba 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -65,6 +65,17 @@ struct seq_file;
  *	registers.
  * @irq_not_threaded: flag must be set if @can_sleep is set but the
  *	IRQs don't need to be threaded
+ * @irqchip: GPIO IRQ chip impl, provided by GPIO driver
+ * @irqdomain: Interrupt translation domain; responsible for mapping
+ *	between GPIO hwirq number and linux irq number
+ * @irq_base: first linux IRQ number assigned to GPIO IRQ chip (deprecated)
+ * @irq_handler: the irq handler to use (often a predefined irq core function)
+ *	for GPIO IRQs, provided by GPIO driver
+ * @irq_default_type: default IRQ triggering type applied during GPIO driver
+ *	initialization, provided by GPIO driver
+ * @irq_parent: GPIO IRQ chip parent/bank linux irq number,
+ *	provided by GPIO driver
+ * @lock_key: per GPIO IRQ chip lockdep class
  *
  * A gpio_chip can help platforms abstract various sources of GPIOs so
  * they can all be accessed through a common programing interface.
-- 
cgit v1.2.3-70-g09d2


From 22b6839b914bbe5d94de11bbb83931952090719c Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com>
Date: Mon, 24 Aug 2015 22:42:46 +1000
Subject: PCI: Make pci_msi_setup_pci_dev() non-static for use by arch code

Commit 1851617cd2da ("PCI/MSI: Disable MSI at enumeration even if kernel
doesn't support MSI") changed the location of the code that initialises
dev->msi_cap/msix_cap and then disables MSI/MSI-X interrupts at PCI
probe time in devices that have this flag set. It moved the code from
pci_msi_init_pci_dev() to a new function named pci_msi_setup_pci_dev(),
called by pci_setup_device().

The pseries PCI probing code does not call pci_setup_device(), so since
the aforementioned commit the function pci_msi_setup_pci_dev() is not
called and MSI/MSI-X interrupts are left enabled. Additionally because
dev->msi_cap/msix_cap are not initialised no driver can ever enable
MSI/MSI-X.

To fix this, the pseries PCI probe should manually call
pci_msi_setup_pci_dev(), so this patch makes it non-static.

Fixes: 1851617cd2da ("PCI/MSI: Disable MSI at enumeration even if kernel doesn't support MSI")
[mpe: Update change log to mention dev->msi_cap/msix_cap]
Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/pci/probe.c | 2 +-
 include/linux/pci.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index b978bbfe044c..f6ae0d0052eb 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1108,7 +1108,7 @@ int pci_cfg_space_size(struct pci_dev *dev)
 
 #define LEGACY_IO_RESOURCE	(IORESOURCE_IO | IORESOURCE_PCI_FIXED)
 
-static void pci_msi_setup_pci_dev(struct pci_dev *dev)
+void pci_msi_setup_pci_dev(struct pci_dev *dev)
 {
 	/*
 	 * Disable the MSI hardware to avoid screaming interrupts
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a0321a8fb59..860c751810fc 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1202,6 +1202,7 @@ struct msix_entry {
 	u16	entry;	/* driver uses to specify entry, OS writes */
 };
 
+void pci_msi_setup_pci_dev(struct pci_dev *dev);
 
 #ifdef CONFIG_PCI_MSI
 int pci_msi_vec_count(struct pci_dev *dev);
-- 
cgit v1.2.3-70-g09d2


From c3f57f02e3a275d8b5c6dc692adb21525ccb392c Mon Sep 17 00:00:00 2001
From: Markos Chandras <markos.chandras@imgtec.com>
Date: Tue, 14 Jul 2015 10:26:09 +0100
Subject: IRQCHIP: irq-mips-gic: Extend GIC accessors for 64-bit CMs

Previously, the GIC accessors were only accessing u32 registers but
newer CMs may actually be 64-bit on MIPS64 cores. As a result of which,
extended these accessors to support 64-bit reads and writes.

Signed-off-by: Markos Chandras <markos.chandras@imgtec.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Andrew Bresticker <abrestic@chromium.org>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/10709/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 drivers/irqchip/irq-mips-gic.c   | 121 ++++++++++++++++++++++++---------------
 include/linux/irqchip/mips-gic.h |  10 +++-
 2 files changed, 84 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index ff4be0515a0d..d8db854afded 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -42,20 +42,46 @@ static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller;
 
 static void __gic_irq_dispatch(void);
 
-static inline unsigned int gic_read(unsigned int reg)
+static inline u32 gic_read32(unsigned int reg)
 {
 	return __raw_readl(gic_base + reg);
 }
 
-static inline void gic_write(unsigned int reg, unsigned int val)
+static inline u64 gic_read64(unsigned int reg)
 {
-	__raw_writel(val, gic_base + reg);
+	return __raw_readq(gic_base + reg);
 }
 
-static inline void gic_update_bits(unsigned int reg, unsigned int mask,
-				   unsigned int val)
+static inline unsigned long gic_read(unsigned int reg)
 {
-	unsigned int regval;
+	if (!mips_cm_is64)
+		return gic_read32(reg);
+	else
+		return gic_read64(reg);
+}
+
+static inline void gic_write32(unsigned int reg, u32 val)
+{
+	return __raw_writel(val, gic_base + reg);
+}
+
+static inline void gic_write64(unsigned int reg, u64 val)
+{
+	return __raw_writeq(val, gic_base + reg);
+}
+
+static inline void gic_write(unsigned int reg, unsigned long val)
+{
+	if (!mips_cm_is64)
+		return gic_write32(reg, (u32)val);
+	else
+		return gic_write64(reg, (u64)val);
+}
+
+static inline void gic_update_bits(unsigned int reg, unsigned long mask,
+				   unsigned long val)
+{
+	unsigned long regval;
 
 	regval = gic_read(reg);
 	regval &= ~mask;
@@ -66,40 +92,40 @@ static inline void gic_update_bits(unsigned int reg, unsigned int mask,
 static inline void gic_reset_mask(unsigned int intr)
 {
 	gic_write(GIC_REG(SHARED, GIC_SH_RMASK) + GIC_INTR_OFS(intr),
-		  1 << GIC_INTR_BIT(intr));
+		  1ul << GIC_INTR_BIT(intr));
 }
 
 static inline void gic_set_mask(unsigned int intr)
 {
 	gic_write(GIC_REG(SHARED, GIC_SH_SMASK) + GIC_INTR_OFS(intr),
-		  1 << GIC_INTR_BIT(intr));
+		  1ul << GIC_INTR_BIT(intr));
 }
 
 static inline void gic_set_polarity(unsigned int intr, unsigned int pol)
 {
 	gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_POLARITY) +
-			GIC_INTR_OFS(intr), 1 << GIC_INTR_BIT(intr),
-			pol << GIC_INTR_BIT(intr));
+			GIC_INTR_OFS(intr), 1ul << GIC_INTR_BIT(intr),
+			(unsigned long)pol << GIC_INTR_BIT(intr));
 }
 
 static inline void gic_set_trigger(unsigned int intr, unsigned int trig)
 {
 	gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_TRIGGER) +
-			GIC_INTR_OFS(intr), 1 << GIC_INTR_BIT(intr),
-			trig << GIC_INTR_BIT(intr));
+			GIC_INTR_OFS(intr), 1ul << GIC_INTR_BIT(intr),
+			(unsigned long)trig << GIC_INTR_BIT(intr));
 }
 
 static inline void gic_set_dual_edge(unsigned int intr, unsigned int dual)
 {
 	gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_DUAL) + GIC_INTR_OFS(intr),
-			1 << GIC_INTR_BIT(intr),
-			dual << GIC_INTR_BIT(intr));
+			1ul << GIC_INTR_BIT(intr),
+			(unsigned long)dual << GIC_INTR_BIT(intr));
 }
 
 static inline void gic_map_to_pin(unsigned int intr, unsigned int pin)
 {
-	gic_write(GIC_REG(SHARED, GIC_SH_INTR_MAP_TO_PIN_BASE) +
-		  GIC_SH_MAP_TO_PIN(intr), GIC_MAP_TO_PIN_MSK | pin);
+	gic_write32(GIC_REG(SHARED, GIC_SH_INTR_MAP_TO_PIN_BASE) +
+		    GIC_SH_MAP_TO_PIN(intr), GIC_MAP_TO_PIN_MSK | pin);
 }
 
 static inline void gic_map_to_vpe(unsigned int intr, unsigned int vpe)
@@ -115,9 +141,9 @@ cycle_t gic_read_count(void)
 	unsigned int hi, hi2, lo;
 
 	do {
-		hi = gic_read(GIC_REG(SHARED, GIC_SH_COUNTER_63_32));
-		lo = gic_read(GIC_REG(SHARED, GIC_SH_COUNTER_31_00));
-		hi2 = gic_read(GIC_REG(SHARED, GIC_SH_COUNTER_63_32));
+		hi = gic_read32(GIC_REG(SHARED, GIC_SH_COUNTER_63_32));
+		lo = gic_read32(GIC_REG(SHARED, GIC_SH_COUNTER_31_00));
+		hi2 = gic_read32(GIC_REG(SHARED, GIC_SH_COUNTER_63_32));
 	} while (hi2 != hi);
 
 	return (((cycle_t) hi) << 32) + lo;
@@ -136,9 +162,9 @@ unsigned int gic_get_count_width(void)
 
 void gic_write_compare(cycle_t cnt)
 {
-	gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI),
+	gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI),
 				(int)(cnt >> 32));
-	gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO),
+	gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO),
 				(int)(cnt & 0xffffffff));
 }
 
@@ -148,10 +174,10 @@ void gic_write_cpu_compare(cycle_t cnt, int cpu)
 
 	local_irq_save(flags);
 
-	gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), cpu);
-	gic_write(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_HI),
+	gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), cpu);
+	gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_HI),
 				(int)(cnt >> 32));
-	gic_write(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_LO),
+	gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_LO),
 				(int)(cnt & 0xffffffff));
 
 	local_irq_restore(flags);
@@ -161,8 +187,8 @@ cycle_t gic_read_compare(void)
 {
 	unsigned int hi, lo;
 
-	hi = gic_read(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI));
-	lo = gic_read(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO));
+	hi = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI));
+	lo = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO));
 
 	return (((cycle_t) hi) << 32) + lo;
 }
@@ -197,7 +223,7 @@ static bool gic_local_irq_is_routable(int intr)
 	if (cpu_has_veic)
 		return true;
 
-	vpe_ctl = gic_read(GIC_REG(VPE_LOCAL, GIC_VPE_CTL));
+	vpe_ctl = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_CTL));
 	switch (intr) {
 	case GIC_LOCAL_INT_TIMER:
 		return vpe_ctl & GIC_VPE_CTL_TIMER_RTBL_MSK;
@@ -263,7 +289,7 @@ int gic_get_c0_fdc_int(void)
 
 static void gic_handle_shared_int(bool chained)
 {
-	unsigned int i, intr, virq;
+	unsigned int i, intr, virq, gic_reg_step = mips_cm_is64 ? 8 : 4;
 	unsigned long *pcpu_mask;
 	unsigned long pending_reg, intrmask_reg;
 	DECLARE_BITMAP(pending, GIC_MAX_INTRS);
@@ -278,8 +304,8 @@ static void gic_handle_shared_int(bool chained)
 	for (i = 0; i < BITS_TO_LONGS(gic_shared_intrs); i++) {
 		pending[i] = gic_read(pending_reg);
 		intrmask[i] = gic_read(intrmask_reg);
-		pending_reg += 0x4;
-		intrmask_reg += 0x4;
+		pending_reg += gic_reg_step;
+		intrmask_reg += gic_reg_step;
 	}
 
 	bitmap_and(pending, pending, intrmask, gic_shared_intrs);
@@ -429,8 +455,8 @@ static void gic_handle_local_int(bool chained)
 	unsigned long pending, masked;
 	unsigned int intr, virq;
 
-	pending = gic_read(GIC_REG(VPE_LOCAL, GIC_VPE_PEND));
-	masked = gic_read(GIC_REG(VPE_LOCAL, GIC_VPE_MASK));
+	pending = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_PEND));
+	masked = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_MASK));
 
 	bitmap_and(&pending, &pending, &masked, GIC_NUM_LOCAL_INTRS);
 
@@ -453,14 +479,14 @@ static void gic_mask_local_irq(struct irq_data *d)
 {
 	int intr = GIC_HWIRQ_TO_LOCAL(d->hwirq);
 
-	gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_RMASK), 1 << intr);
+	gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_RMASK), 1 << intr);
 }
 
 static void gic_unmask_local_irq(struct irq_data *d)
 {
 	int intr = GIC_HWIRQ_TO_LOCAL(d->hwirq);
 
-	gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_SMASK), 1 << intr);
+	gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_SMASK), 1 << intr);
 }
 
 static struct irq_chip gic_local_irq_controller = {
@@ -478,7 +504,7 @@ static void gic_mask_local_irq_all_vpes(struct irq_data *d)
 	spin_lock_irqsave(&gic_lock, flags);
 	for (i = 0; i < gic_vpes; i++) {
 		gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), i);
-		gic_write(GIC_REG(VPE_OTHER, GIC_VPE_RMASK), 1 << intr);
+		gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_RMASK), 1 << intr);
 	}
 	spin_unlock_irqrestore(&gic_lock, flags);
 }
@@ -492,7 +518,7 @@ static void gic_unmask_local_irq_all_vpes(struct irq_data *d)
 	spin_lock_irqsave(&gic_lock, flags);
 	for (i = 0; i < gic_vpes; i++) {
 		gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), i);
-		gic_write(GIC_REG(VPE_OTHER, GIC_VPE_SMASK), 1 << intr);
+		gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_SMASK), 1 << intr);
 	}
 	spin_unlock_irqrestore(&gic_lock, flags);
 }
@@ -612,7 +638,7 @@ static void __init gic_basic_init(void)
 		for (j = 0; j < GIC_NUM_LOCAL_INTRS; j++) {
 			if (!gic_local_irq_is_routable(j))
 				continue;
-			gic_write(GIC_REG(VPE_OTHER, GIC_VPE_RMASK), 1 << j);
+			gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_RMASK), 1 << j);
 		}
 	}
 }
@@ -657,27 +683,32 @@ static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq,
 
 		switch (intr) {
 		case GIC_LOCAL_INT_WD:
-			gic_write(GIC_REG(VPE_OTHER, GIC_VPE_WD_MAP), val);
+			gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_WD_MAP), val);
 			break;
 		case GIC_LOCAL_INT_COMPARE:
-			gic_write(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_MAP), val);
+			gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_MAP),
+				    val);
 			break;
 		case GIC_LOCAL_INT_TIMER:
 			/* CONFIG_MIPS_CMP workaround (see __gic_init) */
 			val = GIC_MAP_TO_PIN_MSK | timer_cpu_pin;
-			gic_write(GIC_REG(VPE_OTHER, GIC_VPE_TIMER_MAP), val);
+			gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_TIMER_MAP),
+				    val);
 			break;
 		case GIC_LOCAL_INT_PERFCTR:
-			gic_write(GIC_REG(VPE_OTHER, GIC_VPE_PERFCTR_MAP), val);
+			gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_PERFCTR_MAP),
+				    val);
 			break;
 		case GIC_LOCAL_INT_SWINT0:
-			gic_write(GIC_REG(VPE_OTHER, GIC_VPE_SWINT0_MAP), val);
+			gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_SWINT0_MAP),
+				    val);
 			break;
 		case GIC_LOCAL_INT_SWINT1:
-			gic_write(GIC_REG(VPE_OTHER, GIC_VPE_SWINT1_MAP), val);
+			gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_SWINT1_MAP),
+				    val);
 			break;
 		case GIC_LOCAL_INT_FDC:
-			gic_write(GIC_REG(VPE_OTHER, GIC_VPE_FDC_MAP), val);
+			gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_FDC_MAP), val);
 			break;
 		default:
 			pr_err("Invalid local IRQ %d\n", intr);
@@ -782,7 +813,7 @@ static void __init __gic_init(unsigned long gic_base_addr,
 		 */
 		if (IS_ENABLED(CONFIG_MIPS_CMP) &&
 		    gic_local_irq_is_routable(GIC_LOCAL_INT_TIMER)) {
-			timer_cpu_pin = gic_read(GIC_REG(VPE_LOCAL,
+			timer_cpu_pin = gic_read32(GIC_REG(VPE_LOCAL,
 							 GIC_VPE_TIMER_MAP)) &
 					GIC_MAP_MSK;
 			irq_set_chained_handler(MIPS_CPU_IRQ_BASE +
diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h
index 9b1ad3734911..10e4a9073019 100644
--- a/include/linux/irqchip/mips-gic.h
+++ b/include/linux/irqchip/mips-gic.h
@@ -45,8 +45,14 @@
 #define GIC_SH_REVISIONID_OFS		0x0020
 
 /* Convert an interrupt number to a byte offset/bit for multi-word registers */
-#define GIC_INTR_OFS(intr)		(((intr) / 32) * 4)
-#define GIC_INTR_BIT(intr)		((intr) % 32)
+#define GIC_INTR_OFS(intr) ({				\
+	unsigned bits = mips_cm_is64 ? 64 : 32;		\
+	unsigned reg_idx = (intr) / bits;		\
+	unsigned reg_width = bits / 8;			\
+							\
+	reg_idx * reg_width;				\
+})
+#define GIC_INTR_BIT(intr)		((intr) % (mips_cm_is64 ? 64 : 32))
 
 /* Polarity : Reset Value is always 0 */
 #define GIC_SH_SET_POLARITY_OFS		0x0100
-- 
cgit v1.2.3-70-g09d2


From 6f50c83529ac1fa3444ff4be5f5b0bf3d76db678 Mon Sep 17 00:00:00 2001
From: Markos Chandras <markos.chandras@imgtec.com>
Date: Thu, 9 Jul 2015 10:40:49 +0100
Subject: IRQCHIP: irq-mips-gic: Add support for CM3 64-bit timer irqs

CM3 uses a 64-bit counter and compare registers so add support for
them in the GIC counter interrupt.

Signed-off-by: Markos Chandras <markos.chandras@imgtec.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Andrew Bresticker <abrestic@chromium.org>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/10648/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 drivers/irqchip/irq-mips-gic.c   | 33 ++++++++++++++++++++++++---------
 include/linux/irqchip/mips-gic.h |  4 ++++
 2 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index d8db854afded..7d4616963b5a 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -140,6 +140,9 @@ cycle_t gic_read_count(void)
 {
 	unsigned int hi, hi2, lo;
 
+	if (mips_cm_is64)
+		return (cycle_t)gic_read(GIC_REG(SHARED, GIC_SH_COUNTER));
+
 	do {
 		hi = gic_read32(GIC_REG(SHARED, GIC_SH_COUNTER_63_32));
 		lo = gic_read32(GIC_REG(SHARED, GIC_SH_COUNTER_31_00));
@@ -162,10 +165,14 @@ unsigned int gic_get_count_width(void)
 
 void gic_write_compare(cycle_t cnt)
 {
-	gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI),
-				(int)(cnt >> 32));
-	gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO),
-				(int)(cnt & 0xffffffff));
+	if (mips_cm_is64) {
+		gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE), cnt);
+	} else {
+		gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI),
+					(int)(cnt >> 32));
+		gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO),
+					(int)(cnt & 0xffffffff));
+	}
 }
 
 void gic_write_cpu_compare(cycle_t cnt, int cpu)
@@ -174,11 +181,16 @@ void gic_write_cpu_compare(cycle_t cnt, int cpu)
 
 	local_irq_save(flags);
 
-	gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), cpu);
-	gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_HI),
-				(int)(cnt >> 32));
-	gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_LO),
-				(int)(cnt & 0xffffffff));
+	gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), cpu);
+
+	if (mips_cm_is64) {
+		gic_write(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE), cnt);
+	} else {
+		gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_HI),
+					(int)(cnt >> 32));
+		gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_LO),
+					(int)(cnt & 0xffffffff));
+	}
 
 	local_irq_restore(flags);
 }
@@ -187,6 +199,9 @@ cycle_t gic_read_compare(void)
 {
 	unsigned int hi, lo;
 
+	if (mips_cm_is64)
+		return (cycle_t)gic_read(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE));
+
 	hi = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI));
 	lo = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO));
 
diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h
index 10e4a9073019..4e6861605050 100644
--- a/include/linux/irqchip/mips-gic.h
+++ b/include/linux/irqchip/mips-gic.h
@@ -41,6 +41,8 @@
 
 /* Shared Global Counter */
 #define GIC_SH_COUNTER_31_00_OFS	0x0010
+/* 64-bit counter register for CM3 */
+#define GIC_SH_COUNTER_OFS		GIC_SH_COUNTER_31_00_OFS
 #define GIC_SH_COUNTER_63_32_OFS	0x0014
 #define GIC_SH_REVISIONID_OFS		0x0020
 
@@ -104,6 +106,8 @@
 #define GIC_VPE_WD_COUNT0_OFS		0x0094
 #define GIC_VPE_WD_INITIAL0_OFS		0x0098
 #define GIC_VPE_COMPARE_LO_OFS		0x00a0
+/* 64-bit Compare register on CM3 */
+#define GIC_VPE_COMPARE_OFS		GIC_VPE_COMPARE_LO_OFS
 #define GIC_VPE_COMPARE_HI_OFS		0x00a4
 
 #define GIC_VPE_EIC_SHADOW_SET_BASE_OFS	0x0100
-- 
cgit v1.2.3-70-g09d2


From 5d0ddfebb93069061880fc57ee4ba7246bd1e1ee Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Fri, 21 Aug 2015 15:36:23 +0800
Subject: ACPI, PCI: Penalize legacy IRQ used by ACPI SCI

Nick Meier reported a regression with HyperV that "
  After rebooting the VM, the following messages are logged in syslog
  when trying to load the tulip driver:
    tulip: Linux Tulip drivers version 1.1.15 (Feb 27, 2007)
    tulip: 0000:00:0a.0: PCI INT A: failed to register GSI
    tulip: Cannot enable tulip board #0, aborting
    tulip: probe of 0000:00:0a.0 failed with error -16
  Errors occur in 3.19.0 kernel
  Works in 3.17 kernel.
"

According to the ACPI dump file posted by Nick at
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1440072

The ACPI MADT table includes an interrupt source overridden entry for
ACPI SCI:
[236h 0566  1]                Subtable Type : 02 <Interrupt Source Override>
[237h 0567  1]                       Length : 0A
[238h 0568  1]                          Bus : 00
[239h 0569  1]                       Source : 09
[23Ah 0570  4]                    Interrupt : 00000009
[23Eh 0574  2]        Flags (decoded below) : 000D
                                   Polarity : 1
                               Trigger Mode : 3

And in DSDT table, we have _PRT method to define PCI interrupts, which
eventually goes to:
        Name (PRSA, ResourceTemplate ()
        {
            IRQ (Level, ActiveLow, Shared, )
                {3,4,5,7,9,10,11,12,14,15}
        })
        Name (PRSB, ResourceTemplate ()
        {
            IRQ (Level, ActiveLow, Shared, )
                {3,4,5,7,9,10,11,12,14,15}
        })
        Name (PRSC, ResourceTemplate ()
        {
            IRQ (Level, ActiveLow, Shared, )
                {3,4,5,7,9,10,11,12,14,15}
        })
        Name (PRSD, ResourceTemplate ()
        {
            IRQ (Level, ActiveLow, Shared, )
                {3,4,5,7,9,10,11,12,14,15}
        })

According to the MADT and DSDT tables, IRQ 9 may be used for:
 1) ACPI SCI in level, high mode
 2) PCI legacy IRQ in level, low mode
So there's a conflict in polarity setting for IRQ 9.

Prior to commit cd68f6bd53cf ("x86, irq, acpi: Get rid of special
handling of GSI for ACPI SCI"), ACPI SCI is handled specially and
there's no check for conflicts between ACPI SCI and PCI legagy IRQ.
And it seems that the HyperV hypervisor doesn't make use of the
polarity configuration in IOAPIC entry, so it just works.

Commit cd68f6bd53cf gets rid of the specially handling of ACPI SCI,
and then the pin attribute checking code discloses the conflicts
between ACPI SCI and PCI legacy IRQ on HyperV virtual machine,
and rejects the request to assign IRQ9 to PCI devices.

So penalize legacy IRQ used by ACPI SCI and mark it unusable if ACPI
SCI attributes conflict with PCI IRQ attributes.

Please refer to following links for more information:
https://bugzilla.kernel.org/show_bug.cgi?id=101301
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1440072

Fixes: cd68f6bd53cf ("x86, irq, acpi: Get rid of special handling of GSI for ACPI SCI")
Reported-and-tested-by: Nick Meier <nmeier@microsoft.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: 3.19+ <stable@vger.kernel.org> # 3.19+
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/kernel/acpi/boot.c |  1 +
 drivers/acpi/pci_link.c     | 16 ++++++++++++++++
 include/linux/acpi.h        |  2 +-
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e49ee24da85e..9393896717d0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -445,6 +445,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
 		polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
 
 	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+	acpi_penalize_sci_irq(bus_irq, trigger, polarity);
 
 	/*
 	 * stash over-ride to indicate we've been here
diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index cfd7581cc19f..b09ad554430a 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -825,6 +825,22 @@ void acpi_penalize_isa_irq(int irq, int active)
 	}
 }
 
+/*
+ * Penalize IRQ used by ACPI SCI. If ACPI SCI pin attributes conflict with
+ * PCI IRQ attributes, mark ACPI SCI as ISA_ALWAYS so it won't be use for
+ * PCI IRQs.
+ */
+void acpi_penalize_sci_irq(int irq, int trigger, int polarity)
+{
+	if (irq >= 0 && irq < ARRAY_SIZE(acpi_irq_penalty)) {
+		if (trigger != ACPI_MADT_TRIGGER_LEVEL ||
+		    polarity != ACPI_MADT_POLARITY_ACTIVE_LOW)
+			acpi_irq_penalty[irq] += PIRQ_PENALTY_ISA_ALWAYS;
+		else
+			acpi_irq_penalty[irq] += PIRQ_PENALTY_PCI_USING;
+	}
+}
+
 /*
  * Over-ride default table to reserve additional IRQs for use by ISA
  * e.g. acpi_irq_isa=5
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index d2445fa9999f..0b2394f61af4 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -221,7 +221,7 @@ struct pci_dev;
 
 int acpi_pci_irq_enable (struct pci_dev *dev);
 void acpi_penalize_isa_irq(int irq, int active);
-
+void acpi_penalize_sci_irq(int irq, int trigger, int polarity);
 void acpi_pci_irq_disable (struct pci_dev *dev);
 
 extern int ec_read(u8 addr, u8 *val);
-- 
cgit v1.2.3-70-g09d2


From d407e30ba614b1542c8ac032f8fb2332b8071efe Mon Sep 17 00:00:00 2001
From: Haibo Chen <haibo.chen@freescale.com>
Date: Tue, 11 Aug 2015 19:38:27 +0800
Subject: mmc: sdhci-esdhc-imx: add tuning-step setting support

tuning-step is the delay cell steps in tuning procedure. The default value
of tuning-step is 1. Some boards or cards need another value to pass the
tuning procedure. For example, imx7d-sdb board need the tuning-step value
as 2, otherwise it can't pass the tuning procedure.

So this patch add the tuning-step setting in driver, so that user can set
the tuning-step value in dts.

Signed-off-by: Haibo Chen <haibo.chen@freescale.com>
Acked-by: Dong Aisheng <aisheng.dong@freescale.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci-esdhc-imx.c          | 9 +++++++++
 include/linux/platform_data/mmc-esdhc-imx.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index b8b7e8842ed0..298551d00fa4 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -75,6 +75,7 @@
 #define ESDHC_STD_TUNING_EN		(1 << 24)
 /* NOTE: the minimum valid tuning start tap for mx6sl is 1 */
 #define ESDHC_TUNING_START_TAP		0x1
+#define ESDHC_TUNING_STEP_SHIFT		16
 
 /* pinctrl state */
 #define ESDHC_PINCTRL_STATE_100MHZ	"state_100mhz"
@@ -474,6 +475,7 @@ static void esdhc_writew_le(struct sdhci_host *host, u16 val, int reg)
 		} else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) {
 			u32 v = readl(host->ioaddr + SDHCI_ACMD12_ERR);
 			u32 m = readl(host->ioaddr + ESDHC_MIX_CTRL);
+			u32 tuning_ctrl;
 			if (val & SDHCI_CTRL_TUNED_CLK) {
 				v |= ESDHC_MIX_CTRL_SMPCLK_SEL;
 			} else {
@@ -484,6 +486,11 @@ static void esdhc_writew_le(struct sdhci_host *host, u16 val, int reg)
 			if (val & SDHCI_CTRL_EXEC_TUNING) {
 				v |= ESDHC_MIX_CTRL_EXE_TUNE;
 				m |= ESDHC_MIX_CTRL_FBCLK_SEL;
+				tuning_ctrl = readl(host->ioaddr + ESDHC_TUNING_CTRL);
+				tuning_ctrl |= ESDHC_STD_TUNING_EN | ESDHC_TUNING_START_TAP;
+				if (imx_data->boarddata.tuning_step)
+					tuning_ctrl |= imx_data->boarddata.tuning_step << ESDHC_TUNING_STEP_SHIFT;
+					writel(tuning_ctrl, host->ioaddr + ESDHC_TUNING_CTRL);
 			} else {
 				v &= ~ESDHC_MIX_CTRL_EXE_TUNE;
 			}
@@ -963,6 +970,8 @@ sdhci_esdhc_imx_probe_dt(struct platform_device *pdev,
 	if (gpio_is_valid(boarddata->wp_gpio))
 		boarddata->wp_type = ESDHC_WP_GPIO;
 
+	of_property_read_u32(np, "fsl,tuning-step", &boarddata->tuning_step);
+
 	if (of_find_property(np, "no-1-8-v", NULL))
 		boarddata->support_vsel = false;
 	else
diff --git a/include/linux/platform_data/mmc-esdhc-imx.h b/include/linux/platform_data/mmc-esdhc-imx.h
index e1571efa3f2b..95ccab3f454a 100644
--- a/include/linux/platform_data/mmc-esdhc-imx.h
+++ b/include/linux/platform_data/mmc-esdhc-imx.h
@@ -45,5 +45,6 @@ struct esdhc_platform_data {
 	int max_bus_width;
 	bool support_vsel;
 	unsigned int delay_line;
+	unsigned int tuning_step;       /* The delay cell steps in tuning procedure */
 };
 #endif /* __ASM_ARCH_IMX_ESDHC_H */
-- 
cgit v1.2.3-70-g09d2


From b5b4ff0a633910b2b9dca7915fd6ab17aa10dc3e Mon Sep 17 00:00:00 2001
From: Shawn Lin <shawn.lin@rock-chips.com>
Date: Wed, 12 Aug 2015 13:08:32 +0800
Subject: mmc: block: skip trim for some kingston eMMCs

For some mass production of kingston eMMCs which adopt Phison's
firmware will meet an unrecoverable data conrruption occasionally
if performing trim due to a firmware bug confirmed by vendor. We
found it on Intel-C3230RK platform. So we add fixup of broken trim
for it.

Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/card/block.c | 10 ++++++++++
 drivers/mmc/core/core.c  |  3 ++-
 include/linux/mmc/card.h |  2 ++
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index a58287e574cc..c742cfd7674e 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -2389,6 +2389,7 @@ force_ro_fail:
 #define CID_MANFID_TOSHIBA	0x11
 #define CID_MANFID_MICRON	0x13
 #define CID_MANFID_SAMSUNG	0x15
+#define CID_MANFID_KINGSTON	0x70
 
 static const struct mmc_fixup blk_fixups[] =
 {
@@ -2451,6 +2452,15 @@ static const struct mmc_fixup blk_fixups[] =
 	MMC_FIXUP("VZL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc,
 		  MMC_QUIRK_SEC_ERASE_TRIM_BROKEN),
 
+	/*
+	 *  On Some Kingston eMMCs, performing trim can result in
+	 *  unrecoverable data conrruption occasionally due to a firmware bug.
+	 */
+	MMC_FIXUP("V10008", CID_MANFID_KINGSTON, CID_OEMID_ANY, add_quirk_mmc,
+		  MMC_QUIRK_TRIM_BROKEN),
+	MMC_FIXUP("V10016", CID_MANFID_KINGSTON, CID_OEMID_ANY, add_quirk_mmc,
+		  MMC_QUIRK_TRIM_BROKEN),
+
 	END_FIXUP
 };
 
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 57edb2a9bb04..664b61729fa9 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -2250,7 +2250,8 @@ EXPORT_SYMBOL(mmc_can_erase);
 
 int mmc_can_trim(struct mmc_card *card)
 {
-	if (card->ext_csd.sec_feature_support & EXT_CSD_SEC_GB_CL_EN)
+	if ((card->ext_csd.sec_feature_support & EXT_CSD_SEC_GB_CL_EN) &&
+	    (!(card->quirks & MMC_QUIRK_TRIM_BROKEN)))
 		return 1;
 	return 0;
 }
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 8fcbcd13218f..fdd0779ccdfa 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -279,6 +279,8 @@ struct mmc_card {
 #define MMC_QUIRK_LONG_READ_TIME (1<<9)		/* Data read time > CSD says */
 #define MMC_QUIRK_SEC_ERASE_TRIM_BROKEN (1<<10)	/* Skip secure for erase/trim */
 #define MMC_QUIRK_BROKEN_IRQ_POLLING	(1<<11)	/* Polling SDIO_CCCR_INTx could create a fake interrupt */
+#define MMC_QUIRK_TRIM_BROKEN	(1<<12)		/* Skip trim */
+
 
 	unsigned int		erase_size;	/* erase size in sectors */
  	unsigned int		erase_shift;	/* if erase unit is power 2 */
-- 
cgit v1.2.3-70-g09d2


From 0b6a3da9617a08e13afc09cb7e148470ed0eb280 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Wed, 26 Aug 2015 17:00:42 +0100
Subject: irqchip/GICv3: Convert to EOImode == 1

So far, GICv3 has been used in with EOImode == 0. The effect of this
mode is to perform the priority drop and the deactivation of the
interrupt at the same time.

While this works perfectly for Linux (we only have a single priority),
it causes issues when an interrupt is forwarded to a guest, and when
we want the guest to perform the EOI itself.

For this case, the GIC architecture provides EOImode == 1, where:
- A write to ICC_EOIR1_EL1 drops the priority of the interrupt and
  leaves it active. Other interrupts at the same priority level can
  now be taken, but the active interrupt cannot be taken again
- A write to ICC_DIR_EL1 marks the interrupt as inactive, meaning
  it can now be taken again.

This patch converts the driver to be able to use this new mode,
depending on whether or not the kernel can behave as a hypervisor.
No feature change.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-and-tested-by: Eric Auger <eric.auger@linaro.org>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: kvmarm@lists.cs.columbia.edu
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1440604845-28229-2-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/irq-gic-v3.c       | 68 ++++++++++++++++++++++++++++++++++----
 include/linux/irqchip/arm-gic-v3.h |  9 +++++
 2 files changed, 71 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index e406bc5f13e4..5c31cc9353d5 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -31,6 +31,7 @@
 #include <asm/cputype.h>
 #include <asm/exception.h>
 #include <asm/smp_plat.h>
+#include <asm/virt.h>
 
 #include "irq-gic-common.h"
 
@@ -50,6 +51,7 @@ struct gic_chip_data {
 };
 
 static struct gic_chip_data gic_data __read_mostly;
+static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
 
 #define gic_data_rdist()		(this_cpu_ptr(gic_data.rdists.rdist))
 #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
@@ -231,6 +233,11 @@ static void gic_mask_irq(struct irq_data *d)
 	gic_poke_irq(d, GICD_ICENABLER);
 }
 
+static void gic_eoimode1_mask_irq(struct irq_data *d)
+{
+	gic_mask_irq(d);
+}
+
 static void gic_unmask_irq(struct irq_data *d)
 {
 	gic_poke_irq(d, GICD_ISENABLER);
@@ -296,6 +303,16 @@ static void gic_eoi_irq(struct irq_data *d)
 	gic_write_eoir(gic_irq(d));
 }
 
+static void gic_eoimode1_eoi_irq(struct irq_data *d)
+{
+	/*
+	 * No need to deactivate an LPI.
+	 */
+	if (gic_irq(d) >= 8192)
+		return;
+	gic_write_dir(gic_irq(d));
+}
+
 static int gic_set_type(struct irq_data *d, unsigned int type)
 {
 	unsigned int irq = gic_irq(d);
@@ -343,15 +360,26 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs
 
 		if (likely(irqnr > 15 && irqnr < 1020) || irqnr >= 8192) {
 			int err;
+
+			if (static_key_true(&supports_deactivate))
+				gic_write_eoir(irqnr);
+
 			err = handle_domain_irq(gic_data.domain, irqnr, regs);
 			if (err) {
 				WARN_ONCE(true, "Unexpected interrupt received!\n");
-				gic_write_eoir(irqnr);
+				if (static_key_true(&supports_deactivate)) {
+					if (irqnr < 8192)
+						gic_write_dir(irqnr);
+				} else {
+					gic_write_eoir(irqnr);
+				}
 			}
 			continue;
 		}
 		if (irqnr < 16) {
 			gic_write_eoir(irqnr);
+			if (static_key_true(&supports_deactivate))
+				gic_write_dir(irqnr);
 #ifdef CONFIG_SMP
 			handle_IPI(irqnr, regs);
 #else
@@ -451,8 +479,13 @@ static void gic_cpu_sys_reg_init(void)
 	/* Set priority mask register */
 	gic_write_pmr(DEFAULT_PMR_VALUE);
 
-	/* EOI deactivates interrupt too (mode 0) */
-	gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir);
+	if (static_key_true(&supports_deactivate)) {
+		/* EOI drops priority only (mode 1) */
+		gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop);
+	} else {
+		/* EOI deactivates interrupt too (mode 0) */
+		gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir);
+	}
 
 	/* ... and let's hit the road... */
 	gic_write_grpen1(1);
@@ -661,11 +694,28 @@ static struct irq_chip gic_chip = {
 	.flags			= IRQCHIP_SET_TYPE_MASKED,
 };
 
+static struct irq_chip gic_eoimode1_chip = {
+	.name			= "GICv3",
+	.irq_mask		= gic_eoimode1_mask_irq,
+	.irq_unmask		= gic_unmask_irq,
+	.irq_eoi		= gic_eoimode1_eoi_irq,
+	.irq_set_type		= gic_set_type,
+	.irq_set_affinity	= gic_set_affinity,
+	.irq_get_irqchip_state	= gic_irq_get_irqchip_state,
+	.irq_set_irqchip_state	= gic_irq_set_irqchip_state,
+	.flags			= IRQCHIP_SET_TYPE_MASKED,
+};
+
 #define GIC_ID_NR		(1U << gic_data.rdists.id_bits)
 
 static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
 			      irq_hw_number_t hw)
 {
+	struct irq_chip *chip = &gic_chip;
+
+	if (static_key_true(&supports_deactivate))
+		chip = &gic_eoimode1_chip;
+
 	/* SGIs are private to the core kernel */
 	if (hw < 16)
 		return -EPERM;
@@ -679,13 +729,13 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
 	/* PPIs */
 	if (hw < 32) {
 		irq_set_percpu_devid(irq);
-		irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+		irq_domain_set_info(d, irq, hw, chip, d->host_data,
 				    handle_percpu_devid_irq, NULL, NULL);
 		set_irq_flags(irq, IRQF_VALID | IRQF_NOAUTOEN);
 	}
 	/* SPIs */
 	if (hw >= 32 && hw < gic_data.irq_nr) {
-		irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+		irq_domain_set_info(d, irq, hw, chip, d->host_data,
 				    handle_fasteoi_irq, NULL, NULL);
 		set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
 	}
@@ -693,7 +743,7 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
 	if (hw >= 8192 && hw < GIC_ID_NR) {
 		if (!gic_dist_supports_lpis())
 			return -EPERM;
-		irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+		irq_domain_set_info(d, irq, hw, chip, d->host_data,
 				    handle_fasteoi_irq, NULL, NULL);
 		set_irq_flags(irq, IRQF_VALID);
 	}
@@ -820,6 +870,12 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
 	if (of_property_read_u64(node, "redistributor-stride", &redist_stride))
 		redist_stride = 0;
 
+	if (!is_hyp_mode_available())
+		static_key_slow_dec(&supports_deactivate);
+
+	if (static_key_true(&supports_deactivate))
+		pr_info("GIC: Using split EOI/Deactivate mode\n");
+
 	gic_data.dist_base = dist_base;
 	gic_data.redist_regions = rdist_regs;
 	gic_data.nr_redist_regions = nr_redist_regions;
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index bf982e021fbd..71e4faf33091 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -104,6 +104,8 @@
 #define GICR_SYNCR			0x00C0
 #define GICR_MOVLPIR			0x0100
 #define GICR_MOVALLR			0x0110
+#define GICR_ISACTIVER			GICD_ISACTIVER
+#define GICR_ICACTIVER			GICD_ICACTIVER
 #define GICR_IDREGS			GICD_IDREGS
 #define GICR_PIDR2			GICD_PIDR2
 
@@ -288,6 +290,7 @@
 #define ICH_VMCR_PMR_MASK		(0xffUL << ICH_VMCR_PMR_SHIFT)
 
 #define ICC_EOIR1_EL1			sys_reg(3, 0, 12, 12, 1)
+#define ICC_DIR_EL1			sys_reg(3, 0, 12, 11, 1)
 #define ICC_IAR1_EL1			sys_reg(3, 0, 12, 12, 0)
 #define ICC_SGI1R_EL1			sys_reg(3, 0, 12, 11, 5)
 #define ICC_PMR_EL1			sys_reg(3, 0, 4, 6, 0)
@@ -385,6 +388,12 @@ static inline void gic_write_eoir(u64 irq)
 	isb();
 }
 
+static inline void gic_write_dir(u64 irq)
+{
+	asm volatile("msr_s " __stringify(ICC_DIR_EL1) ", %0" : : "r" (irq));
+	isb();
+}
+
 struct irq_domain;
 int its_cpu_init(void);
 int its_init(struct device_node *node, struct rdists *rdists,
-- 
cgit v1.2.3-70-g09d2


From 0b996fd35957a30568cddbce05b917c1897966e0 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Wed, 26 Aug 2015 17:00:44 +0100
Subject: irqchip/GIC: Convert to EOImode == 1

So far, GICv2 has been used with EOImode == 0. The effect of this
mode is to perform the priority drop and the deactivation of the
interrupt at the same time.

While this works perfectly for Linux (we only have a single priority),
it causes issues when an interrupt is forwarded to a guest, and when
we want the guest to perform the EOI itself.

For this case, the GIC architecture provides EOImode == 1, where:
- A write to the EOI register drops the priority of the interrupt
  and leaves it active. Other interrupts at the same priority level
  can now be taken, but the active interrupt cannot be taken again
- A write to the DIR marks the interrupt as inactive, meaning it can
  now be taken again.

We only enable this feature when booted in HYP mode and that
the device-tree reported a suitable CPU interface. Observable behaviour
should remain unchanged.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-and-tested-by: Eric Auger <eric.auger@linaro.org>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: kvmarm@lists.cs.columbia.edu
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1440604845-28229-4-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/irq-gic.c       | 71 +++++++++++++++++++++++++++++++++++++++--
 include/linux/irqchip/arm-gic.h |  4 +++
 2 files changed, 72 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index aa3e7b8a69c4..c835f4c60d21 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -47,6 +47,7 @@
 #include <asm/irq.h>
 #include <asm/exception.h>
 #include <asm/smp_plat.h>
+#include <asm/virt.h>
 
 #include "irq-gic-common.h"
 
@@ -82,6 +83,8 @@ static DEFINE_RAW_SPINLOCK(irq_controller_lock);
 #define NR_GIC_CPU_IF 8
 static u8 gic_cpu_map[NR_GIC_CPU_IF] __read_mostly;
 
+static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
+
 #ifndef MAX_GIC_NR
 #define MAX_GIC_NR	1
 #endif
@@ -157,6 +160,11 @@ static void gic_mask_irq(struct irq_data *d)
 	gic_poke_irq(d, GIC_DIST_ENABLE_CLEAR);
 }
 
+static void gic_eoimode1_mask_irq(struct irq_data *d)
+{
+	gic_mask_irq(d);
+}
+
 static void gic_unmask_irq(struct irq_data *d)
 {
 	gic_poke_irq(d, GIC_DIST_ENABLE_SET);
@@ -167,6 +175,11 @@ static void gic_eoi_irq(struct irq_data *d)
 	writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_EOI);
 }
 
+static void gic_eoimode1_eoi_irq(struct irq_data *d)
+{
+	writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_DEACTIVATE);
+}
+
 static int gic_irq_set_irqchip_state(struct irq_data *d,
 				     enum irqchip_irq_state which, bool val)
 {
@@ -272,11 +285,15 @@ static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs)
 		irqnr = irqstat & GICC_IAR_INT_ID_MASK;
 
 		if (likely(irqnr > 15 && irqnr < 1021)) {
+			if (static_key_true(&supports_deactivate))
+				writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
 			handle_domain_irq(gic->domain, irqnr, regs);
 			continue;
 		}
 		if (irqnr < 16) {
 			writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
+			if (static_key_true(&supports_deactivate))
+				writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
 #ifdef CONFIG_SMP
 			handle_IPI(irqnr, regs);
 #endif
@@ -329,6 +346,22 @@ static struct irq_chip gic_chip = {
 				  IRQCHIP_MASK_ON_SUSPEND,
 };
 
+static struct irq_chip gic_eoimode1_chip = {
+	.name			= "GICv2",
+	.irq_mask		= gic_eoimode1_mask_irq,
+	.irq_unmask		= gic_unmask_irq,
+	.irq_eoi		= gic_eoimode1_eoi_irq,
+	.irq_set_type		= gic_set_type,
+#ifdef CONFIG_SMP
+	.irq_set_affinity	= gic_set_affinity,
+#endif
+	.irq_get_irqchip_state	= gic_irq_get_irqchip_state,
+	.irq_set_irqchip_state	= gic_irq_set_irqchip_state,
+	.flags			= IRQCHIP_SET_TYPE_MASKED |
+				  IRQCHIP_SKIP_SET_WAKE |
+				  IRQCHIP_MASK_ON_SUSPEND,
+};
+
 void __init gic_cascade_irq(unsigned int gic_nr, unsigned int irq)
 {
 	if (gic_nr >= MAX_GIC_NR)
@@ -360,6 +393,10 @@ static void gic_cpu_if_up(struct gic_chip_data *gic)
 {
 	void __iomem *cpu_base = gic_data_cpu_base(gic);
 	u32 bypass = 0;
+	u32 mode = 0;
+
+	if (static_key_true(&supports_deactivate))
+		mode = GIC_CPU_CTRL_EOImodeNS;
 
 	/*
 	* Preserve bypass disable bits to be written back later
@@ -367,7 +404,7 @@ static void gic_cpu_if_up(struct gic_chip_data *gic)
 	bypass = readl(cpu_base + GIC_CPU_CTRL);
 	bypass &= GICC_DIS_BYPASS_MASK;
 
-	writel_relaxed(bypass | GICC_ENABLE, cpu_base + GIC_CPU_CTRL);
+	writel_relaxed(bypass | mode | GICC_ENABLE, cpu_base + GIC_CPU_CTRL);
 }
 
 
@@ -803,13 +840,20 @@ void __init gic_init_physaddr(struct device_node *node)
 static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
 				irq_hw_number_t hw)
 {
+	struct irq_chip *chip = &gic_chip;
+
+	if (static_key_true(&supports_deactivate)) {
+		if (d->host_data == (void *)&gic_data[0])
+			chip = &gic_eoimode1_chip;
+	}
+
 	if (hw < 32) {
 		irq_set_percpu_devid(irq);
-		irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+		irq_domain_set_info(d, irq, hw, chip, d->host_data,
 				    handle_percpu_devid_irq, NULL, NULL);
 		set_irq_flags(irq, IRQF_VALID | IRQF_NOAUTOEN);
 	} else {
-		irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+		irq_domain_set_info(d, irq, hw, chip, d->host_data,
 				    handle_fasteoi_irq, NULL, NULL);
 		set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
 	}
@@ -995,6 +1039,8 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start,
 		register_cpu_notifier(&gic_cpu_notifier);
 #endif
 		set_handle_irq(gic_handle_irq);
+		if (static_key_true(&supports_deactivate))
+			pr_info("GIC: Using split EOI/Deactivate mode\n");
 	}
 
 	gic_dist_init(gic);
@@ -1010,6 +1056,7 @@ gic_of_init(struct device_node *node, struct device_node *parent)
 {
 	void __iomem *cpu_base;
 	void __iomem *dist_base;
+	struct resource cpu_res;
 	u32 percpu_offset;
 	int irq;
 
@@ -1022,6 +1069,16 @@ gic_of_init(struct device_node *node, struct device_node *parent)
 	cpu_base = of_iomap(node, 1);
 	WARN(!cpu_base, "unable to map gic cpu registers\n");
 
+	of_address_to_resource(node, 1, &cpu_res);
+
+	/*
+	 * Disable split EOI/Deactivate if either HYP is not available
+	 * or the CPU interface is too small.
+	 */
+	if (gic_cnt == 0 && (!is_hyp_mode_available() ||
+			     resource_size(&cpu_res) < SZ_8K))
+		static_key_slow_dec(&supports_deactivate);
+
 	if (of_property_read_u32(node, "cpu-offset", &percpu_offset))
 		percpu_offset = 0;
 
@@ -1140,6 +1197,14 @@ gic_v2_acpi_init(struct acpi_table_header *table)
 		return -ENOMEM;
 	}
 
+	/*
+	 * Disable split EOI/Deactivate if HYP is not available. ACPI
+	 * guarantees that we'll always have a GICv2, so the CPU
+	 * interface will always be the right size.
+	 */
+	if (!is_hyp_mode_available())
+		static_key_slow_dec(&supports_deactivate);
+
 	/*
 	 * Initialize zero GIC instance (no multi-GIC support). Also, set GIC
 	 * as default IRQ domain to allow for GSI registration and GSI to IRQ
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 65da435d01c1..af3d29f70781 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -20,9 +20,13 @@
 #define GIC_CPU_ALIAS_BINPOINT		0x1c
 #define GIC_CPU_ACTIVEPRIO		0xd0
 #define GIC_CPU_IDENT			0xfc
+#define GIC_CPU_DEACTIVATE		0x1000
 
 #define GICC_ENABLE			0x1
 #define GICC_INT_PRI_THRESHOLD		0xf0
+
+#define GIC_CPU_CTRL_EOImodeNS		(1 << 9)
+
 #define GICC_IAR_INT_ID_MASK		0x3ff
 #define GICC_INT_SPURIOUS		1023
 #define GICC_DIS_BYPASS_MASK		0x1e0
-- 
cgit v1.2.3-70-g09d2


From 0e4ead9d7b3655d76371604abb9b0dcc4e79bb7d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Aug 2015 09:31:18 +0200
Subject: net: introduce change upper device notifier change info

Add info that is passed along with NETDEV_CHANGEUPPER event.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 +++++++
 net/core/dev.c            | 16 ++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6abe0d6f1e1d..39f30daac483 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2127,6 +2127,13 @@ struct netdev_notifier_change_info {
 	unsigned int flags_changed;
 };
 
+struct netdev_notifier_changeupper_info {
+	struct netdev_notifier_info info; /* must be first */
+	struct net_device *upper_dev; /* new upper dev */
+	bool master; /* is upper dev master */
+	bool linking; /* is the nofication for link or unlink */
+};
+
 static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
 					     struct net_device *dev)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 7bb24f1879b8..a8e6cf4298d3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5311,6 +5311,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master,
 				   void *private)
 {
+	struct netdev_notifier_changeupper_info changeupper_info;
 	struct netdev_adjacent *i, *j, *to_i, *to_j;
 	int ret = 0;
 
@@ -5329,6 +5330,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (master && netdev_master_upper_dev_get(dev))
 		return -EBUSY;
 
+	changeupper_info.upper_dev = upper_dev;
+	changeupper_info.master = master;
+	changeupper_info.linking = true;
+
 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
 						   master);
 	if (ret)
@@ -5367,7 +5372,8 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 			goto rollback_lower_mesh;
 	}
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 	return 0;
 
 rollback_lower_mesh:
@@ -5462,9 +5468,14 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
+	struct netdev_notifier_changeupper_info changeupper_info;
 	struct netdev_adjacent *i, *j;
 	ASSERT_RTNL();
 
+	changeupper_info.upper_dev = upper_dev;
+	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
+	changeupper_info.linking = false;
+
 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
 	/* Here is the tricky part. We must remove all dev's lower
@@ -5484,7 +5495,8 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
 		__netdev_adjacent_dev_unlink(dev, i->dev);
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-- 
cgit v1.2.3-70-g09d2


From 0894ae3f0a587bda9733ec4a4b67af7ded3a9498 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Aug 2015 09:31:19 +0200
Subject: net: add netif_is_bridge_master helper

Add this helper so code can easily figure out if netdev is a bridge.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 39f30daac483..be625f4754b9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3848,6 +3848,11 @@ static inline bool netif_is_vrf(const struct net_device *dev)
 	return dev->priv_flags & IFF_VRF_MASTER;
 }
 
+static inline bool netif_is_bridge_master(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_EBRIDGE;
+}
+
 static inline bool netif_index_is_vrf(struct net *net, int ifindex)
 {
 	bool rc = false;
-- 
cgit v1.2.3-70-g09d2


From 35d4e1725202e6656fcfa8b88447327ad3ae0c0c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Aug 2015 09:31:20 +0200
Subject: net: add netif_is_ovs_master helper with IFF_OPENVSWITCH private flag

Add this helper so code can easily figure out if netdev is openswitch.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h            | 8 ++++++++
 net/openvswitch/vport-internal_dev.c | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index be625f4754b9..6656a43c072d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1264,6 +1264,7 @@ struct net_device_ops {
  * @IFF_MACVLAN: Macvlan device
  * @IFF_VRF_MASTER: device is a VRF master
  * @IFF_NO_QUEUE: device can run without qdisc attached
+ * @IFF_OPENVSWITCH: device is a Open vSwitch master
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1293,6 +1294,7 @@ enum netdev_priv_flags {
 	IFF_IPVLAN_SLAVE		= 1<<24,
 	IFF_VRF_MASTER			= 1<<25,
 	IFF_NO_QUEUE			= 1<<26,
+	IFF_OPENVSWITCH			= 1<<27,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1322,6 +1324,7 @@ enum netdev_priv_flags {
 #define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
 #define IFF_VRF_MASTER			IFF_VRF_MASTER
 #define IFF_NO_QUEUE			IFF_NO_QUEUE
+#define IFF_OPENVSWITCH			IFF_OPENVSWITCH
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -3853,6 +3856,11 @@ static inline bool netif_is_bridge_master(const struct net_device *dev)
 	return dev->priv_flags & IFF_EBRIDGE;
 }
 
+static inline bool netif_is_ovs_master(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_OPENVSWITCH;
+}
+
 static inline bool netif_index_is_vrf(struct net *net, int ifindex)
 {
 	bool rc = false;
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index c058bbf876c3..80b3e12ec882 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -135,7 +135,7 @@ static void do_setup(struct net_device *netdev)
 	netdev->netdev_ops = &internal_dev_netdev_ops;
 
 	netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
-	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
 	netdev->destructor = internal_dev_destructor;
 	netdev->ethtool_ops = &internal_dev_ethtool_ops;
 	netdev->rtnl_link_ops = &internal_dev_link_ops;
-- 
cgit v1.2.3-70-g09d2


From 0dc1549bfd67053181415a3f7544628a6bcd2a08 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Aug 2015 09:31:21 +0200
Subject: net: kill long time unused bonding private flags

We don't use them for years, just kill them now.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 57 +++++++++++++++++------------------------------
 1 file changed, 21 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6656a43c072d..88a00694eda5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1240,13 +1240,8 @@ struct net_device_ops {
  *
  * @IFF_802_1Q_VLAN: 802.1Q VLAN device
  * @IFF_EBRIDGE: Ethernet bridging device
- * @IFF_SLAVE_INACTIVE: bonding slave not the curr. active
- * @IFF_MASTER_8023AD: bonding master, 802.3ad
- * @IFF_MASTER_ALB: bonding master, balance-alb
  * @IFF_BONDING: bonding master or slave
- * @IFF_SLAVE_NEEDARP: need ARPs for validation
  * @IFF_ISATAP: ISATAP interface (RFC4214)
- * @IFF_MASTER_ARPMON: bonding master, ARP mon in use
  * @IFF_WAN_HDLC: WAN HDLC device
  * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
  *	release skb->dst
@@ -1269,43 +1264,33 @@ struct net_device_ops {
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
 	IFF_EBRIDGE			= 1<<1,
-	IFF_SLAVE_INACTIVE		= 1<<2,
-	IFF_MASTER_8023AD		= 1<<3,
-	IFF_MASTER_ALB			= 1<<4,
-	IFF_BONDING			= 1<<5,
-	IFF_SLAVE_NEEDARP		= 1<<6,
-	IFF_ISATAP			= 1<<7,
-	IFF_MASTER_ARPMON		= 1<<8,
-	IFF_WAN_HDLC			= 1<<9,
-	IFF_XMIT_DST_RELEASE		= 1<<10,
-	IFF_DONT_BRIDGE			= 1<<11,
-	IFF_DISABLE_NETPOLL		= 1<<12,
-	IFF_MACVLAN_PORT		= 1<<13,
-	IFF_BRIDGE_PORT			= 1<<14,
-	IFF_OVS_DATAPATH		= 1<<15,
-	IFF_TX_SKB_SHARING		= 1<<16,
-	IFF_UNICAST_FLT			= 1<<17,
-	IFF_TEAM_PORT			= 1<<18,
-	IFF_SUPP_NOFCS			= 1<<19,
-	IFF_LIVE_ADDR_CHANGE		= 1<<20,
-	IFF_MACVLAN			= 1<<21,
-	IFF_XMIT_DST_RELEASE_PERM	= 1<<22,
-	IFF_IPVLAN_MASTER		= 1<<23,
-	IFF_IPVLAN_SLAVE		= 1<<24,
-	IFF_VRF_MASTER			= 1<<25,
-	IFF_NO_QUEUE			= 1<<26,
-	IFF_OPENVSWITCH			= 1<<27,
+	IFF_BONDING			= 1<<2,
+	IFF_ISATAP			= 1<<3,
+	IFF_WAN_HDLC			= 1<<4,
+	IFF_XMIT_DST_RELEASE		= 1<<5,
+	IFF_DONT_BRIDGE			= 1<<6,
+	IFF_DISABLE_NETPOLL		= 1<<7,
+	IFF_MACVLAN_PORT		= 1<<8,
+	IFF_BRIDGE_PORT			= 1<<9,
+	IFF_OVS_DATAPATH		= 1<<10,
+	IFF_TX_SKB_SHARING		= 1<<11,
+	IFF_UNICAST_FLT			= 1<<12,
+	IFF_TEAM_PORT			= 1<<13,
+	IFF_SUPP_NOFCS			= 1<<14,
+	IFF_LIVE_ADDR_CHANGE		= 1<<15,
+	IFF_MACVLAN			= 1<<16,
+	IFF_XMIT_DST_RELEASE_PERM	= 1<<17,
+	IFF_IPVLAN_MASTER		= 1<<18,
+	IFF_IPVLAN_SLAVE		= 1<<19,
+	IFF_VRF_MASTER			= 1<<20,
+	IFF_NO_QUEUE			= 1<<21,
+	IFF_OPENVSWITCH			= 1<<22,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
 #define IFF_EBRIDGE			IFF_EBRIDGE
-#define IFF_SLAVE_INACTIVE		IFF_SLAVE_INACTIVE
-#define IFF_MASTER_8023AD		IFF_MASTER_8023AD
-#define IFF_MASTER_ALB			IFF_MASTER_ALB
 #define IFF_BONDING			IFF_BONDING
-#define IFF_SLAVE_NEEDARP		IFF_SLAVE_NEEDARP
 #define IFF_ISATAP			IFF_ISATAP
-#define IFF_MASTER_ARPMON		IFF_MASTER_ARPMON
 #define IFF_WAN_HDLC			IFF_WAN_HDLC
 #define IFF_XMIT_DST_RELEASE		IFF_XMIT_DST_RELEASE
 #define IFF_DONT_BRIDGE			IFF_DONT_BRIDGE
-- 
cgit v1.2.3-70-g09d2


From 2e4cfae2a8e3f9ce3925c9b6e9e865fe8476fc4f Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Thu, 27 Aug 2015 15:25:45 -0700
Subject: netfilter: Define v6ops in !CONFIG_NETFILTER case.

When CONFIG_OPENVSWITCH is set, and CONFIG_NETFILTER is not set, the
openvswitch IPv6 fragmentation handling cannot refer to ipv6_ops because
it isn't defined. Add a dummy version to avoid #ifdefs in source files.

Fixes: 7f8a436 "openvswitch: Add conntrack action"
Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 8b7d28f3aada..771574677e83 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -9,15 +9,6 @@
 
 #include <uapi/linux/netfilter_ipv6.h>
 
-
-#ifdef CONFIG_NETFILTER
-int ip6_route_me_harder(struct sk_buff *skb);
-__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
-			unsigned int dataoff, u_int8_t protocol);
-
-int ipv6_netfilter_init(void);
-void ipv6_netfilter_fini(void);
-
 /*
  * Hook functions for ipv6 to allow xt_* modules to be built-in even
  * if IPv6 is a module.
@@ -30,6 +21,14 @@ struct nf_ipv6_ops {
 			int (*output)(struct sock *, struct sk_buff *));
 };
 
+#ifdef CONFIG_NETFILTER
+int ip6_route_me_harder(struct sk_buff *skb);
+__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+			unsigned int dataoff, u_int8_t protocol);
+
+int ipv6_netfilter_init(void);
+void ipv6_netfilter_fini(void);
+
 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
 static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void)
 {
@@ -39,6 +38,7 @@ static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void)
 #else /* CONFIG_NETFILTER */
 static inline int ipv6_netfilter_init(void) { return 0; }
 static inline void ipv6_netfilter_fini(void) { return; }
+static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) { return NULL; }
 #endif /* CONFIG_NETFILTER */
 
 #endif /*__LINUX_IP6_NETFILTER_H*/
-- 
cgit v1.2.3-70-g09d2


From cb389b9c0e00c30c9daf20287f7d91e2466edbb1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 7 Aug 2015 17:41:00 -0400
Subject: dax: drop size parameter to ->direct_access()

None of the implementations currently use it.  The common
bdev_direct_access() entry point handles all the size checks before
calling ->direct_access().

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/powerpc/sysdev/axonram.c | 2 +-
 drivers/block/brd.c           | 6 +-----
 drivers/nvdimm/pmem.c         | 2 +-
 drivers/s390/block/dcssblk.c  | 4 ++--
 fs/block_dev.c                | 2 +-
 include/linux/blkdev.h        | 2 +-
 6 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index a2be2a66dab6..4419c84ac15a 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -141,7 +141,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-		       void __pmem **kaddr, unsigned long *pfn, long size)
+		       void __pmem **kaddr, unsigned long *pfn)
 {
 	struct axon_ram_bank *bank = device->bd_disk->private_data;
 	loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c96402fd1560..03c45c41bdfa 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -371,7 +371,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-			void __pmem **kaddr, unsigned long *pfn, long size)
+			void __pmem **kaddr, unsigned long *pfn)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
 	struct page *page;
@@ -384,10 +384,6 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
 	*kaddr = (void __pmem *)page_address(page);
 	*pfn = page_to_pfn(page);
 
-	/*
-	 * TODO: If size > PAGE_SIZE, we could look to see if the next page in
-	 * the file happens to be mapped to the next page of physical RAM.
-	 */
 	return PAGE_SIZE;
 }
 #else
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index f3b629779266..3b5b9cb758b6 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -92,7 +92,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-		      void __pmem **kaddr, unsigned long *pfn, long size)
+		      void __pmem **kaddr, unsigned long *pfn)
 {
 	struct pmem_device *pmem = bdev->bd_disk->private_data;
 	size_t offset = sector << 9;
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 2c5a397b9f3e..8c027a9e4e8a 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -29,7 +29,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-			 void __pmem **kaddr, unsigned long *pfn, long size);
+			 void __pmem **kaddr, unsigned long *pfn);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -879,7 +879,7 @@ fail:
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-			void __pmem **kaddr, unsigned long *pfn, long size)
+			void __pmem **kaddr, unsigned long *pfn)
 {
 	struct dcssblk_dev_info *dev_info;
 	unsigned long offset, dev_sz;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2345a9870e2c..3831e5691b32 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -462,7 +462,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
 	sector += get_start_sect(bdev);
 	if (sector % (PAGE_SIZE / 512))
 		return -EINVAL;
-	avail = ops->direct_access(bdev, sector, addr, pfn, size);
+	avail = ops->direct_access(bdev, sector, addr, pfn);
 	if (!avail)
 		return -ERANGE;
 	return min(avail, size);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c401ecdff9cb..c22064f326b2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1556,7 +1556,7 @@ struct block_device_operations {
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	long (*direct_access)(struct block_device *, sector_t, void __pmem **,
-			unsigned long *pfn, long size);
+			unsigned long *pfn);
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
-- 
cgit v1.2.3-70-g09d2


From 033fbae988fcb67e5077203512181890848b8e90 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 9 Aug 2015 15:29:06 -0400
Subject: mm: ZONE_DEVICE for "device memory"

While pmem is usable as a block device or via DAX mappings to userspace
there are several usage scenarios that can not target pmem due to its
lack of struct page coverage. In preparation for "hot plugging" pmem
into the vmemmap add ZONE_DEVICE as a new zone to tag these pages
separately from the ones that are subject to standard page allocations.
Importantly "device memory" can be removed at will by userspace
unbinding the driver of the device.

Having a separate zone prevents allocation and otherwise marks these
pages that are distinct from typical uniform memory.  Device memory has
different lifetime and performance characteristics than RAM.  However,
since we have run out of ZONES_SHIFT bits this functionality currently
depends on sacrificing ZONE_DMA.

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Jerome Glisse <j.glisse@gmail.com>
[hch: various simplifications in the arch interface]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/ia64/mm/init.c            |  4 ++--
 arch/powerpc/mm/mem.c          |  4 ++--
 arch/s390/mm/init.c            |  2 +-
 arch/sh/mm/init.c              |  5 +++--
 arch/tile/mm/init.c            |  2 +-
 arch/x86/mm/init_32.c          |  4 ++--
 arch/x86/mm/init_64.c          |  4 ++--
 include/linux/memory_hotplug.h |  5 +++--
 include/linux/mmzone.h         | 23 +++++++++++++++++++++++
 mm/Kconfig                     | 17 +++++++++++++++++
 mm/memory_hotplug.c            | 14 +++++++++++---
 mm/page_alloc.c                |  3 +++
 12 files changed, 70 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 97e48b0eefc7..1841ef69183d 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -645,7 +645,7 @@ mem_init (void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	pg_data_t *pgdat;
 	struct zone *zone;
@@ -656,7 +656,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
 	pgdat = NODE_DATA(nid);
 
 	zone = pgdat->node_zones +
-		zone_for_memory(nid, start, size, ZONE_NORMAL);
+		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
 	ret = __add_pages(nid, zone, start_pfn, nr_pages);
 
 	if (ret)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0f11819d8f1d..6571cfb05668 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start)
 }
 #endif
 
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdata;
 	struct zone *zone;
@@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	/* this should work for most non-highmem platforms */
 	zone = pgdata->node_zones +
-		zone_for_memory(nid, start, size, 0);
+		zone_for_memory(nid, start, size, 0, for_device);
 
 	return __add_pages(nid, zone, start_pfn, nr_pages);
 }
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 76e873748b56..48ee78be88ba 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -168,7 +168,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 #endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
 	unsigned long start_pfn = PFN_DOWN(start);
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 2790b6a64157..c1490096b863 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 #endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	pg_data_t *pgdat;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
@@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	/* We only have ZONE_NORMAL, so this is easy.. */
 	ret = __add_pages(nid, pgdat->node_zones +
-			zone_for_memory(nid, start, size, ZONE_NORMAL),
+			zone_for_memory(nid, start, size, ZONE_NORMAL,
+			for_device),
 			start_pfn, nr_pages);
 	if (unlikely(ret))
 		printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 5bd252e3fdc5..d4e1fc41d06d 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -863,7 +863,7 @@ void __init mem_init(void)
  * memory to the highmem for now.
  */
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-int arch_add_memory(u64 start, u64 size)
+int arch_add_memory(u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdata = &contig_page_data;
 	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8340e45c891a..2a9237d20a70 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -822,11 +822,11 @@ void __init mem_init(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdata = NODE_DATA(nid);
 	struct zone *zone = pgdata->node_zones +
-		zone_for_memory(nid, start, size, ZONE_HIGHMEM);
+		zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3fba623e3ba5..30564e2752d3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -687,11 +687,11 @@ static void  update_end_of_memory_vars(u64 start, u64 size)
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct zone *zone = pgdat->node_zones +
-		zone_for_memory(nid, start, size, ZONE_NORMAL);
+		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 6ffa0ac7f7d6..8f60e899b33c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -266,8 +266,9 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
 extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 		void *arg, int (*func)(struct memory_block *, void *));
 extern int add_memory(int nid, u64 start, u64 size);
-extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default);
-extern int arch_add_memory(int nid, u64 start, u64 size);
+extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
+		bool for_device);
+extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern void remove_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 754c25966a0a..9217fd93c25b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -319,7 +319,11 @@ enum zone_type {
 	ZONE_HIGHMEM,
 #endif
 	ZONE_MOVABLE,
+#ifdef CONFIG_ZONE_DEVICE
+	ZONE_DEVICE,
+#endif
 	__MAX_NR_ZONES
+
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -794,6 +798,25 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
 	return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
 }
 
+static inline int zone_id(const struct zone *zone)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	return zone - pgdat->node_zones;
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool is_dev_zone(const struct zone *zone)
+{
+	return zone_id(zone) == ZONE_DEVICE;
+}
+#else
+static inline bool is_dev_zone(const struct zone *zone)
+{
+	return false;
+}
+#endif
+
 #include <linux/memory_hotplug.h>
 
 extern struct mutex zonelists_mutex;
diff --git a/mm/Kconfig b/mm/Kconfig
index e79de2bd12cd..a0cd086df16b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -654,3 +654,20 @@ config DEFERRED_STRUCT_PAGE_INIT
 	  when kswapd starts. This has a potential performance impact on
 	  processes running early in the lifetime of the systemm until kswapd
 	  finishes the initialisation.
+
+config ZONE_DEVICE
+	bool "Device memory (pmem, etc...) hotplug support" if EXPERT
+	default !ZONE_DMA
+	depends on !ZONE_DMA
+	depends on MEMORY_HOTPLUG
+	depends on MEMORY_HOTREMOVE
+	depends on X86_64 #arch_add_memory() comprehends device memory
+
+	help
+	  Device memory hotplug support allows for establishing pmem,
+	  or other device driver discovered memory regions, in the
+	  memmap. This allows pfn_to_page() lookups of otherwise
+	  "device-physical" addresses which is needed for using a DAX
+	  mapping in an O_DIRECT operation, among other things.
+
+	  If FS_DAX is enabled, then say Y.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 26fbba7d888f..24e4c76c951b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -770,7 +770,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 
 	start = phys_start_pfn << PAGE_SHIFT;
 	size = nr_pages * PAGE_SIZE;
-	ret = release_mem_region_adjustable(&iomem_resource, start, size);
+
+	/* in the ZONE_DEVICE case device driver owns the memory region */
+	if (!is_dev_zone(zone))
+		ret = release_mem_region_adjustable(&iomem_resource, start, size);
 	if (ret) {
 		resource_size_t endres = start + size - 1;
 
@@ -1207,8 +1210,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size)
 	return 0;
 }
 
-int zone_for_memory(int nid, u64 start, u64 size, int zone_default)
+int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
+		bool for_device)
 {
+#ifdef CONFIG_ZONE_DEVICE
+	if (for_device)
+		return ZONE_DEVICE;
+#endif
 	if (should_add_memory_movable(nid, start, size))
 		return ZONE_MOVABLE;
 
@@ -1249,7 +1257,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	}
 
 	/* call arch's memory hotadd */
-	ret = arch_add_memory(nid, start, size);
+	ret = arch_add_memory(nid, start, size, false);
 
 	if (ret < 0)
 		goto error;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef19f22b2b7d..0f19b4e18233 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -207,6 +207,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
 	 "HighMem",
 #endif
 	 "Movable",
+#ifdef CONFIG_ZONE_DEVICE
+	 "Device",
+#endif
 };
 
 int min_free_kbytes = 1024;
-- 
cgit v1.2.3-70-g09d2


From 41e94a851304f7acac840adec4004f8aeee53ad4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Aug 2015 16:00:35 +0200
Subject: add devm_memremap_pages

This behaves like devm_memremap except that it ensures we have page
structures available that can back the region.

Signed-off-by: Christoph Hellwig <hch@lst.de>
[djbw: catch attempts to remap RAM, drop flags]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/io.h | 20 ++++++++++++++++++++
 kernel/memremap.c  | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io.h b/include/linux/io.h
index d8d749abd665..de64c1e53612 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -20,10 +20,13 @@
 
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/bug.h>
+#include <linux/err.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
 struct device;
+struct resource;
 
 __visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count);
 void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
@@ -84,6 +87,23 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
 		size_t size, unsigned long flags);
 void devm_memunmap(struct device *dev, void *addr);
 
+void *__devm_memremap_pages(struct device *dev, struct resource *res);
+
+#ifdef CONFIG_ZONE_DEVICE
+void *devm_memremap_pages(struct device *dev, struct resource *res);
+#else
+static inline void *devm_memremap_pages(struct device *dev, struct resource *res)
+{
+	/*
+	 * Fail attempts to call devm_memremap_pages() without
+	 * ZONE_DEVICE support enabled, this requires callers to fall
+	 * back to plain devm_memremap() based on config
+	 */
+	WARN_ON_ONCE(1);
+	return ERR_PTR(-ENXIO);
+}
+#endif
+
 /*
  * Some systems do not have legacy ISA devices.
  * /dev/port is not a valid interface on these systems.
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 5c9b55eaf121..72b0c66628b6 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/memory_hotplug.h>
 
 #ifndef ioremap_cache
 /* temporary while we convert existing ioremap_cache users to memremap */
@@ -135,3 +136,55 @@ void devm_memunmap(struct device *dev, void *addr)
 	memunmap(addr);
 }
 EXPORT_SYMBOL(devm_memunmap);
+
+#ifdef CONFIG_ZONE_DEVICE
+struct page_map {
+	struct resource res;
+};
+
+static void devm_memremap_pages_release(struct device *dev, void *res)
+{
+	struct page_map *page_map = res;
+
+	/* pages are dead and unused, undo the arch mapping */
+	arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+}
+
+void *devm_memremap_pages(struct device *dev, struct resource *res)
+{
+	int is_ram = region_intersects(res->start, resource_size(res),
+			"System RAM");
+	struct page_map *page_map;
+	int error, nid;
+
+	if (is_ram == REGION_MIXED) {
+		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+				__func__, res);
+		return ERR_PTR(-ENXIO);
+	}
+
+	if (is_ram == REGION_INTERSECTS)
+		return __va(res->start);
+
+	page_map = devres_alloc(devm_memremap_pages_release,
+			sizeof(*page_map), GFP_KERNEL);
+	if (!page_map)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&page_map->res, res, sizeof(*res));
+
+	nid = dev_to_node(dev);
+	if (nid < 0)
+		nid = 0;
+
+	error = arch_add_memory(nid, res->start, resource_size(res), true);
+	if (error) {
+		devres_free(page_map);
+		return ERR_PTR(error);
+	}
+
+	devres_add(dev, page_map);
+	return __va(res->start);
+}
+EXPORT_SYMBOL(devm_memremap_pages);
+#endif /* CONFIG_ZONE_DEVICE */
-- 
cgit v1.2.3-70-g09d2


From 96601adb745186ccbcf5b078d4756f13381ec2af Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 24 Aug 2015 18:29:38 -0400
Subject: x86, pmem: clarify that ARCH_HAS_PMEM_API implies PMEM mapped WB

Given that a write-back (WB) mapping plus non-temporal stores is
expected to be the most efficient way to access PMEM, update the
definition of ARCH_HAS_PMEM_API to imply arch support for
WB-mapped-PMEM.  This is needed as a pre-requisite for adding PMEM to
the direct map and mapping it with struct page.

The above clarification for X86_64 means that memcpy_to_pmem() is
permitted to use the non-temporal arch_memcpy_to_pmem() rather than
needlessly fall back to default_memcpy_to_pmem() when the pcommit
instruction is not available.  When arch_memcpy_to_pmem() is not
guaranteed to flush writes out of cache, i.e. on older X86_32
implementations where non-temporal stores may just dirty cache,
ARCH_HAS_PMEM_API is simply disabled.

The default fall back for persistent memory handling remains.  Namely,
map it with the WT (write-through) cache-type and hope for the best.

arch_has_pmem_api() is updated to only indicate whether the arch
provides the proper helpers to meet the minimum "writes are visible
outside the cache hierarchy after memcpy_to_pmem() + wmb_pmem()".  Code
that cares whether wmb_pmem() actually flushes writes to pmem must now
call arch_has_wmb_pmem() directly.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
[hch: set ARCH_HAS_PMEM_API=n on x86_32]
Reviewed-by: Christoph Hellwig <hch@lst.de>
[toshi: x86_32 compile fixes]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/Kconfig            |  2 +-
 arch/x86/include/asm/pmem.h |  9 +--------
 drivers/acpi/nfit.c         |  3 ++-
 drivers/nvdimm/pmem.c       |  2 +-
 include/linux/pmem.h        | 36 ++++++++++++++++++++++--------------
 5 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 03ab6122325a..ef4c6bbb3af1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,7 +27,7 @@ config X86
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
-	select ARCH_HAS_PMEM_API
+	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index bb026c5adf8a..d8ce3ec816ab 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -18,8 +18,6 @@
 #include <asm/cpufeature.h>
 #include <asm/special_insns.h>
 
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
-
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 /**
  * arch_memcpy_to_pmem - copy data to persistent memory
@@ -143,18 +141,13 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 	__arch_wb_cache_pmem(vaddr, size);
 }
 
-static inline bool arch_has_wmb_pmem(void)
+static inline bool __arch_has_wmb_pmem(void)
 {
-#ifdef CONFIG_X86_64
 	/*
 	 * We require that wmb() be an 'sfence', that is only guaranteed on
 	 * 64-bit builds
 	 */
 	return static_cpu_has(X86_FEATURE_PCOMMIT);
-#else
-	return false;
-#endif
 }
 #endif /* CONFIG_ARCH_HAS_PMEM_API */
-
 #endif /* __ASM_X86_PMEM_H__ */
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 56fff0141636..f61e69fa2ad1 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -20,6 +20,7 @@
 #include <linux/sort.h>
 #include <linux/pmem.h>
 #include <linux/io.h>
+#include <asm/cacheflush.h>
 #include "nfit.h"
 
 /*
@@ -1371,7 +1372,7 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
 			return -ENOMEM;
 	}
 
-	if (!arch_has_pmem_api() && !nfit_blk->nvdimm_flush)
+	if (!arch_has_wmb_pmem() && !nfit_blk->nvdimm_flush)
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
 	if (mmio->line_size == 0)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3b5b9cb758b6..20bf122328da 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -125,7 +125,7 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
-	if (!arch_has_pmem_api())
+	if (!arch_has_wmb_pmem())
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
 	if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index a9d84bf335ee..85f810b33917 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -17,16 +17,23 @@
 #include <linux/uio.h>
 
 #ifdef CONFIG_ARCH_HAS_PMEM_API
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
 #include <asm/pmem.h>
 #else
-static inline void arch_wmb_pmem(void)
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
+/*
+ * These are simply here to enable compilation, all call sites gate
+ * calling these symbols with arch_has_pmem_api() and redirect to the
+ * implementation in asm/pmem.h.
+ */
+static inline bool __arch_has_wmb_pmem(void)
 {
-	BUG();
+	return false;
 }
 
-static inline bool arch_has_wmb_pmem(void)
+static inline void arch_wmb_pmem(void)
 {
-	return false;
+	BUG();
 }
 
 static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
@@ -53,7 +60,6 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
  * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
  * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
  */
-
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
 {
 	memcpy(dst, (void __force const *) src, size);
@@ -64,8 +70,13 @@ static inline void memunmap_pmem(struct device *dev, void __pmem *addr)
 	devm_memunmap(dev, (void __force *) addr);
 }
 
+static inline bool arch_has_pmem_api(void)
+{
+	return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API);
+}
+
 /**
- * arch_has_pmem_api - true if wmb_pmem() ensures durability
+ * arch_has_wmb_pmem - true if wmb_pmem() ensures durability
  *
  * For a given cpu implementation within an architecture it is possible
  * that wmb_pmem() resolves to a nop.  In the case this returns
@@ -73,9 +84,9 @@ static inline void memunmap_pmem(struct device *dev, void __pmem *addr)
  * fall back to a different data consistency model, or otherwise notify
  * the user.
  */
-static inline bool arch_has_pmem_api(void)
+static inline bool arch_has_wmb_pmem(void)
 {
-	return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem();
+	return arch_has_pmem_api() && __arch_has_wmb_pmem();
 }
 
 /*
@@ -120,13 +131,8 @@ static inline void default_clear_pmem(void __pmem *addr, size_t size)
 static inline void __pmem *memremap_pmem(struct device *dev,
 		resource_size_t offset, unsigned long size)
 {
-#ifdef ARCH_MEMREMAP_PMEM
 	return (void __pmem *) devm_memremap(dev, offset, size,
 			ARCH_MEMREMAP_PMEM);
-#else
-	return (void __pmem *) devm_memremap(dev, offset, size,
-			MEMREMAP_WT);
-#endif
 }
 
 /**
@@ -158,8 +164,10 @@ static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n)
  */
 static inline void wmb_pmem(void)
 {
-	if (arch_has_pmem_api())
+	if (arch_has_wmb_pmem())
 		arch_wmb_pmem();
+	else
+		wmb();
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 8c61282ff61c28d5a12bb53f0eaa221d30fd3ae1 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 26 Aug 2015 21:12:58 +0800
Subject: NFS: Get suppattr_exclcreat when getting server capabilities

Create file with attributs as NFS4_CREATE_EXCLUSIVE4_1 mode
depends on suppattr_exclcreat attribut.

v3, same as v2.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c         | 14 +++++++++++++-
 fs/nfs/nfs4xdr.c          | 26 +++++++++++++++++++++-----
 include/linux/nfs_fs_sb.h |  5 +++++
 include/linux/nfs_xdr.h   |  2 ++
 4 files changed, 41 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4687661bfbdc..a6a28d45cca4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2893,8 +2893,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
+	u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
 	struct nfs4_server_caps_arg args = {
 		.fhandle = fhandle,
+		.bitmask = bitmask,
 	};
 	struct nfs4_server_caps_res res = {};
 	struct rpc_message msg = {
@@ -2904,10 +2906,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	};
 	int status;
 
+	bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+		     FATTR4_WORD0_FH_EXPIRE_TYPE |
+		     FATTR4_WORD0_LINK_SUPPORT |
+		     FATTR4_WORD0_SYMLINK_SUPPORT |
+		     FATTR4_WORD0_ACLSUPPORT;
+	if (minorversion)
+		bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (status == 0) {
 		/* Sanity check the server answers */
-		switch (server->nfs_client->cl_minorversion) {
+		switch (minorversion) {
 		case 0:
 			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
 			res.attr_bitmask[2] = 0;
@@ -2960,6 +2970,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->cache_consistency_bitmask[2] = 0;
+		memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+			sizeof(server->exclcreat_bitmask));
 		server->acl_bitmask = res.acl_bitmask;
 		server->fh_expire_type = res.fh_expire_type;
 	}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c42459e45f62..ad8dde12f23b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2582,6 +2582,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
 				     struct xdr_stream *xdr,
 				     struct nfs4_server_caps_arg *args)
 {
+	const u32 *bitmask = args->bitmask;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
@@ -2589,11 +2590,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fhandle, &hdr);
-	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
-			   FATTR4_WORD0_FH_EXPIRE_TYPE|
-			   FATTR4_WORD0_LINK_SUPPORT|
-			   FATTR4_WORD0_SYMLINK_SUPPORT|
-			   FATTR4_WORD0_ACLSUPPORT, &hdr);
+	encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
 	encode_nops(&hdr);
 }
 
@@ -3370,6 +3367,22 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+				 uint32_t *bitmap, uint32_t *bitmask)
+{
+	if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
+		int ret;
+		ret = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(ret < 0))
+			return ret;
+		bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+	} else
+		bitmask[0] = bitmask[1] = bitmask[2] = 0;
+	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+		bitmask[0], bitmask[1], bitmask[2]);
+	return 0;
+}
+
 static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
 {
 	__be32 *p;
@@ -4323,6 +4336,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
 		goto xdr_error;
 	if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
 		goto xdr_error;
+	if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
+				res->exclcreat_bitmask)) != 0)
+		goto xdr_error;
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
 	dprintk("%s: xdr returned %d!\n", __func__, -status);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 20bc8e51b161..570a7df2775b 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -173,6 +173,11 @@ struct nfs_server {
 						   set of attributes supported
 						   on this filesystem excluding
 						   the label support bit. */
+	u32			exclcreat_bitmask[3];
+						/* V4 bitmask representing the
+						   set of attributes supported
+						   on this filesystem for the
+						   exclusive create. */
 	u32			cache_consistency_bitmask[3];
 						/* V4 bitmask representing the subset
 						   of change attribute, size, ctime
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b9b530409ff7..0d7c832ec415 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1057,11 +1057,13 @@ struct nfs4_statfs_res {
 struct nfs4_server_caps_arg {
 	struct nfs4_sequence_args	seq_args;
 	struct nfs_fh		       *fhandle;
+	const u32 *			bitmask;
 };
 
 struct nfs4_server_caps_res {
 	struct nfs4_sequence_res	seq_res;
 	u32				attr_bitmask[3];
+	u32				exclcreat_bitmask[3];
 	u32				acl_bitmask;
 	u32				has_links;
 	u32				has_symlinks;
-- 
cgit v1.2.3-70-g09d2


From 5334c5bdac926c5f8d89729beccb46fe88eda9e7 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 26 Aug 2015 21:13:37 +0800
Subject: NFS: Send attributes in OPEN request for NFS4_CREATE_EXCLUSIVE4_1

Client sends a SETATTR request after OPEN for updating attributes.
For create file with S_ISGID is set, the S_ISGID in SETATTR will be
ignored at nfs server as chmod of no PERMISSION.

v3, same as v2.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 18 ++++++++++++++----
 fs/nfs/nfs4xdr.c        | 26 ++++++++++++++++++--------
 include/linux/nfs_xdr.h |  2 +-
 3 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a6a28d45cca4..2923abf2fc0c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2307,15 +2307,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
  * fields corresponding to attributes that were used to store the verifier.
  * Make sure we clobber those fields in the later setattr call
  */
-static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+				struct iattr *sattr, struct nfs4_label **label)
 {
-	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+	const u32 *attrset = opendata->o_res.attrset;
+
+	if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
 	    !(sattr->ia_valid & ATTR_ATIME_SET))
 		sattr->ia_valid |= ATTR_ATIME;
 
-	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+	if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
 	    !(sattr->ia_valid & ATTR_MTIME_SET))
 		sattr->ia_valid |= ATTR_MTIME;
+
+	/* Except MODE, it seems harmless of setting twice. */
+	if ((attrset[1] & FATTR4_WORD1_MODE))
+		sattr->ia_valid &= ~ATTR_MODE;
+
+	if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
+		*label = NULL;
 }
 
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2440,7 +2450,7 @@ static int _nfs4_do_open(struct inode *dir,
 
 	if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
 	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
-		nfs4_exclusive_attrset(opendata, sattr);
+		nfs4_exclusive_attrset(opendata, sattr, &label);
 
 		nfs_fattr_init(opendata->o_res.f_attr);
 		status = nfs4_do_setattr(state->inode, cred,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ad8dde12f23b..a7be571c1666 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1001,7 +1001,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
 
 static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
 				const struct nfs4_label *label,
-				const struct nfs_server *server)
+				const struct nfs_server *server,
+				bool excl_check)
 {
 	char owner_name[IDMAP_NAMESZ];
 	char owner_group[IDMAP_NAMESZ];
@@ -1067,6 +1068,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
 		bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
 		len += 4;
 	}
+
+	if (excl_check) {
+		const u32 *excl_bmval = server->exclcreat_bitmask;
+		bmval[0] &= excl_bmval[0];
+		bmval[1] &= excl_bmval[1];
+		bmval[2] &= excl_bmval[2];
+
+		if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
+			label = NULL;
+	}
+
 	if (label) {
 		len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
 		bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -1170,7 +1182,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	}
 
 	encode_string(xdr, create->name->len, create->name->name);
-	encode_attrs(xdr, create->attrs, create->label, create->server);
+	encode_attrs(xdr, create->attrs, create->label, create->server, false);
 }
 
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1384,18 +1396,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
-	struct iattr dummy;
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
 	switch(arg->createmode) {
 	case NFS4_CREATE_UNCHECKED:
 		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
-		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
 		break;
 	case NFS4_CREATE_GUARDED:
 		*p = cpu_to_be32(NFS4_CREATE_GUARDED);
-		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
 		break;
 	case NFS4_CREATE_EXCLUSIVE:
 		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1404,8 +1415,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	case NFS4_CREATE_EXCLUSIVE4_1:
 		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
-		dummy.ia_valid = 0;
-		encode_attrs(xdr, &dummy, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
 	}
 }
 
@@ -1661,7 +1671,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
 	encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
 	encode_nfs4_stateid(xdr, &arg->stateid);
-	encode_attrs(xdr, arg->iap, arg->label, server);
+	encode_attrs(xdr, arg->iap, arg->label, server, false);
 }
 
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0d7c832ec415..b4392d86d157 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -379,7 +379,7 @@ struct nfs_openargs {
 	struct stateowner_id	id;
 	union {
 		struct {
-			struct iattr *  attrs;    /* UNCHECKED, GUARDED */
+			struct iattr *  attrs;    /* UNCHECKED, GUARDED, EXCLUSIVE4_1 */
 			nfs4_verifier   verifier; /* EXCLUSIVE */
 		};
 		nfs4_stateid	delegation;		/* CLAIM_DELEGATE_CUR */
-- 
cgit v1.2.3-70-g09d2


From f7fafd083ccc340502448903aaddc76f10785c8c Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vdonnefort@gmail.com>
Date: Thu, 2 Jul 2015 19:56:40 +0200
Subject: leds: leds-ns2: move LED modes mapping outside of the driver

On the board n090401 (Seagate NAS 4-Bay), the LED mode mapping (GPIO
values to LED mode) is different from the one used on other boards
supported by the leds-ns2 driver.

With this patch the hardcoded mapping is removed from leds-ns2. Now,
it must be defined either in the platform data (if an old-fashion board
setup file is used) or in the DT node. In order to allow the later, this
patch also introduces a modes-map property for the leds-ns2 DT binding.

Signed-off-by: Vincent Donnefort <vdonnefort@gmail.com>
Signed-off-by: Jacek Anaszewski <j.anaszewski@samsung.com>
---
 .../devicetree/bindings/leds/leds-ns2.txt          |   9 ++
 drivers/leds/leds-ns2.c                            | 102 +++++++++++----------
 include/dt-bindings/leds/leds-ns2.h                |   8 ++
 include/linux/platform_data/leds-kirkwood-ns2.h    |  14 +++
 4 files changed, 85 insertions(+), 48 deletions(-)
 create mode 100644 include/dt-bindings/leds/leds-ns2.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/leds/leds-ns2.txt b/Documentation/devicetree/bindings/leds/leds-ns2.txt
index aef3aca34d2d..9f81258a5b6e 100644
--- a/Documentation/devicetree/bindings/leds/leds-ns2.txt
+++ b/Documentation/devicetree/bindings/leds/leds-ns2.txt
@@ -8,6 +8,9 @@ Each LED is represented as a sub-node of the ns2-leds device.
 Required sub-node properties:
 - cmd-gpio: Command LED GPIO. See OF device-tree GPIO specification.
 - slow-gpio: Slow LED GPIO. See OF device-tree GPIO specification.
+- modes-map: A mapping between LED modes (off, on or SATA activity blinking) and
+  the corresponding cmd-gpio/slow-gpio values. All the GPIO values combinations
+  should be given in order to avoid having an unknown mode at driver probe time.
 
 Optional sub-node properties:
 - label: Name for this LED. If omitted, the label is taken from the node name.
@@ -15,6 +18,8 @@ Optional sub-node properties:
 
 Example:
 
+#include <dt-bindings/leds/leds-ns2.h>
+
 ns2-leds {
 	compatible = "lacie,ns2-leds";
 
@@ -22,5 +27,9 @@ ns2-leds {
 		label = "ns2:blue:sata";
 		slow-gpio = <&gpio0 29 0>;
 		cmd-gpio = <&gpio0 30 0>;
+		modes-map = <NS_V2_LED_OFF  0 1
+			     NS_V2_LED_ON   1 0
+			     NS_V2_LED_ON   0 0
+			     NS_V2_LED_SATA 1 1>;
 	};
 };
diff --git a/drivers/leds/leds-ns2.c b/drivers/leds/leds-ns2.c
index 1fd6adbb43b7..b0bc03539dbb 100644
--- a/drivers/leds/leds-ns2.c
+++ b/drivers/leds/leds-ns2.c
@@ -33,46 +33,20 @@
 #include <linux/of_gpio.h>
 
 /*
- * The Network Space v2 dual-GPIO LED is wired to a CPLD and can blink in
- * relation with the SATA activity. This capability is exposed through the
- * "sata" sysfs attribute.
- *
- * The following array detail the different LED registers and the combination
- * of their possible values:
- *
- *  cmd_led   |  slow_led  | /SATA active | LED state
- *            |            |              |
- *     1      |     0      |      x       |  off
- *     -      |     1      |      x       |  on
- *     0      |     0      |      1       |  on
- *     0      |     0      |      0       |  blink (rate 300ms)
+ * The Network Space v2 dual-GPIO LED is wired to a CPLD. Three different LED
+ * modes are available: off, on and SATA activity blinking. The LED modes are
+ * controlled through two GPIOs (command and slow): each combination of values
+ * for the command/slow GPIOs corresponds to a LED mode.
  */
 
-enum ns2_led_modes {
-	NS_V2_LED_OFF,
-	NS_V2_LED_ON,
-	NS_V2_LED_SATA,
-};
-
-struct ns2_led_mode_value {
-	enum ns2_led_modes	mode;
-	int			cmd_level;
-	int			slow_level;
-};
-
-static struct ns2_led_mode_value ns2_led_modval[] = {
-	{ NS_V2_LED_OFF	, 1, 0 },
-	{ NS_V2_LED_ON	, 0, 1 },
-	{ NS_V2_LED_ON	, 1, 1 },
-	{ NS_V2_LED_SATA, 0, 0 },
-};
-
 struct ns2_led_data {
 	struct led_classdev	cdev;
 	unsigned		cmd;
 	unsigned		slow;
 	unsigned char		sata; /* True when SATA mode active. */
 	rwlock_t		rw_lock; /* Lock GPIOs. */
+	int			num_modes;
+	struct ns2_led_modval	*modval;
 };
 
 static int ns2_led_get_mode(struct ns2_led_data *led_dat,
@@ -88,10 +62,10 @@ static int ns2_led_get_mode(struct ns2_led_data *led_dat,
 	cmd_level = gpio_get_value(led_dat->cmd);
 	slow_level = gpio_get_value(led_dat->slow);
 
-	for (i = 0; i < ARRAY_SIZE(ns2_led_modval); i++) {
-		if (cmd_level == ns2_led_modval[i].cmd_level &&
-		    slow_level == ns2_led_modval[i].slow_level) {
-			*mode = ns2_led_modval[i].mode;
+	for (i = 0; i < led_dat->num_modes; i++) {
+		if (cmd_level == led_dat->modval[i].cmd_level &&
+		    slow_level == led_dat->modval[i].slow_level) {
+			*mode = led_dat->modval[i].mode;
 			ret = 0;
 			break;
 		}
@@ -110,12 +84,12 @@ static void ns2_led_set_mode(struct ns2_led_data *led_dat,
 
 	write_lock_irqsave(&led_dat->rw_lock, flags);
 
-	for (i = 0; i < ARRAY_SIZE(ns2_led_modval); i++) {
-		if (mode == ns2_led_modval[i].mode) {
+	for (i = 0; i < led_dat->num_modes; i++) {
+		if (mode == led_dat->modval[i].mode) {
 			gpio_set_value(led_dat->cmd,
-				       ns2_led_modval[i].cmd_level);
+				       led_dat->modval[i].cmd_level);
 			gpio_set_value(led_dat->slow,
-				       ns2_led_modval[i].slow_level);
+				       led_dat->modval[i].slow_level);
 		}
 	}
 
@@ -228,6 +202,8 @@ create_ns2_led(struct platform_device *pdev, struct ns2_led_data *led_dat,
 	led_dat->cdev.groups = ns2_led_groups;
 	led_dat->cmd = template->cmd;
 	led_dat->slow = template->slow;
+	led_dat->modval = template->modval;
+	led_dat->num_modes = template->num_modes;
 
 	ret = ns2_led_get_mode(led_dat, &mode);
 	if (ret < 0)
@@ -259,9 +235,8 @@ ns2_leds_get_of_pdata(struct device *dev, struct ns2_led_platform_data *pdata)
 {
 	struct device_node *np = dev->of_node;
 	struct device_node *child;
-	struct ns2_led *leds;
+	struct ns2_led *led, *leds;
 	int num_leds = 0;
-	int i = 0;
 
 	num_leds = of_get_child_count(np);
 	if (!num_leds)
@@ -272,26 +247,57 @@ ns2_leds_get_of_pdata(struct device *dev, struct ns2_led_platform_data *pdata)
 	if (!leds)
 		return -ENOMEM;
 
+	led = leds;
 	for_each_child_of_node(np, child) {
 		const char *string;
-		int ret;
+		int ret, i, num_modes;
+		struct ns2_led_modval *modval;
 
 		ret = of_get_named_gpio(child, "cmd-gpio", 0);
 		if (ret < 0)
 			return ret;
-		leds[i].cmd = ret;
+		led->cmd = ret;
 		ret = of_get_named_gpio(child, "slow-gpio", 0);
 		if (ret < 0)
 			return ret;
-		leds[i].slow = ret;
+		led->slow = ret;
 		ret = of_property_read_string(child, "label", &string);
-		leds[i].name = (ret == 0) ? string : child->name;
+		led->name = (ret == 0) ? string : child->name;
 		ret = of_property_read_string(child, "linux,default-trigger",
 					      &string);
 		if (ret == 0)
-			leds[i].default_trigger = string;
+			led->default_trigger = string;
+
+		ret = of_property_count_u32_elems(child, "modes-map");
+		if (ret < 0 || ret % 3) {
+			dev_err(dev,
+				"Missing or malformed modes-map property\n");
+			return -EINVAL;
+		}
+
+		num_modes = ret / 3;
+		modval = devm_kzalloc(dev,
+				      num_modes * sizeof(struct ns2_led_modval),
+				      GFP_KERNEL);
+		if (!modval)
+			return -ENOMEM;
+
+		for (i = 0; i < num_modes; i++) {
+			of_property_read_u32_index(child,
+						"modes-map", 3 * i,
+						(u32 *) &modval[i].mode);
+			of_property_read_u32_index(child,
+						"modes-map", 3 * i + 1,
+						(u32 *) &modval[i].cmd_level);
+			of_property_read_u32_index(child,
+						"modes-map", 3 * i + 2,
+						(u32 *) &modval[i].slow_level);
+		}
+
+		led->num_modes = num_modes;
+		led->modval = modval;
 
-		i++;
+		led++;
 	}
 
 	pdata->leds = leds;
diff --git a/include/dt-bindings/leds/leds-ns2.h b/include/dt-bindings/leds/leds-ns2.h
new file mode 100644
index 000000000000..491c5f974a92
--- /dev/null
+++ b/include/dt-bindings/leds/leds-ns2.h
@@ -0,0 +1,8 @@
+#ifndef _DT_BINDINGS_LEDS_NS2_H
+#define _DT_BINDINGS_LEDS_NS2_H
+
+#define NS_V2_LED_OFF	0
+#define NS_V2_LED_ON	1
+#define NS_V2_LED_SATA	2
+
+#endif
diff --git a/include/linux/platform_data/leds-kirkwood-ns2.h b/include/linux/platform_data/leds-kirkwood-ns2.h
index 6a9fed57f346..eb8a6860e816 100644
--- a/include/linux/platform_data/leds-kirkwood-ns2.h
+++ b/include/linux/platform_data/leds-kirkwood-ns2.h
@@ -9,11 +9,25 @@
 #ifndef __LEDS_KIRKWOOD_NS2_H
 #define __LEDS_KIRKWOOD_NS2_H
 
+enum ns2_led_modes {
+	NS_V2_LED_OFF,
+	NS_V2_LED_ON,
+	NS_V2_LED_SATA,
+};
+
+struct ns2_led_modval {
+	enum ns2_led_modes	mode;
+	int			cmd_level;
+	int			slow_level;
+};
+
 struct ns2_led {
 	const char	*name;
 	const char	*default_trigger;
 	unsigned	cmd;
 	unsigned	slow;
+	int		num_modes;
+	struct ns2_led_modval *modval;
 };
 
 struct ns2_led_platform_data {
-- 
cgit v1.2.3-70-g09d2


From ba6a860d41ed3a377d61d59d7c7b08dd7455c686 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <pang.xunlei@linaro.org>
Date: Tue, 4 Aug 2015 13:48:55 +0800
Subject: cpuidle/coupled: Remove cpuidle_device::safe_state_index

cpuidle_device::safe_state_index need to be initialized before
use, it should be the same as cpuidle_driver::safe_state_index.

We tackled this issue by removing the safe_state_index from the
cpuidle_device structure and use the one in the cpuidle_driver
structure instead.

Suggested-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/coupled.c | 4 ++--
 include/linux/cpuidle.h   | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 7936dce4b878..6493e4055abe 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -473,7 +473,7 @@ int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
 			return entered_state;
 		}
 		entered_state = cpuidle_enter_state(dev, drv,
-			dev->safe_state_index);
+			drv->safe_state_index);
 		local_irq_disable();
 	}
 
@@ -521,7 +521,7 @@ retry:
 		}
 
 		entered_state = cpuidle_enter_state(dev, drv,
-			dev->safe_state_index);
+			drv->safe_state_index);
 		local_irq_disable();
 	}
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index d075d34279df..786ad32631a6 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -84,7 +84,6 @@ struct cpuidle_device {
 	struct list_head 	device_list;
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
-	int			safe_state_index;
 	cpumask_t		coupled_cpus;
 	struct cpuidle_coupled	*coupled;
 #endif
-- 
cgit v1.2.3-70-g09d2


From df2cf4a78e488d26728590cb3c6b4fe4c4862c77 Mon Sep 17 00:00:00 2001
From: Philip Downey <pdowney@brocade.com>
Date: Thu, 27 Aug 2015 16:46:26 +0100
Subject: IGMP: Inhibit reports for local multicast groups

The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is
reserved for the use of routing protocols and other low-level topology
discovery or maintenance protocols, such as gateway discovery and
group membership reporting.  Multicast routers should not forward any
multicast datagram with destination addresses in this range,
regardless of its TTL.

Currently, IGMP reports are generated for this reserved range of
addresses even though a router will ignore this information since it
has no purpose.  However, the presence of reserved group addresses in
an IGMP membership report uses up network bandwidth and can also
obscure addresses of interest when inspecting membership reports using
packet inspection or debug messages.

Although the RFCs for the various version of IGMP (e.g.RFC 3376 for
v3) do not specify that the reserved addresses be excluded from
membership reports, it should do no harm in doing so.  In particular
there should be no adverse effect in any IGMP snooping functionality
since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD
Snooping Switches Considerations) section 2.1.2. Data Forwarding
Rules:

    2) Packets with a destination IP (DIP) address in the 224.0.0.X
       range which are not IGMP must be forwarded on all ports.

IGMP reports for local multicast groups can now be optionally
inhibited by means of a system control variable (by setting the value
to zero) e.g.:
    echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports

To retain backwards compatibility the previous behaviour is retained
by default on system boot or reverted by setting the value back to
non-zero e.g.:
    echo 1 >  /proc/sys/net/ipv4/igmp_link_local_mcast_reports

Signed-off-by: Philip Downey <pdowney@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  1 +
 net/ipv4/igmp.c            | 26 +++++++++++++++++++++++++-
 net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 193ad488d3e2..908429216d9f 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -37,6 +37,7 @@ static inline struct igmpv3_query *
 	return (struct igmpv3_query *)skb_transport_header(skb);
 }
 
+extern int sysctl_igmp_llm_reports;
 extern int sysctl_igmp_max_memberships;
 extern int sysctl_igmp_max_msf;
 extern int sysctl_igmp_qrv;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 9fdfd9deac11..d38b8b61eaee 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -110,6 +110,9 @@
 #define IP_MAX_MEMBERSHIPS	20
 #define IP_MAX_MSF		10
 
+/* IGMP reports for link-local multicast groups are enabled by default */
+int sysctl_igmp_llm_reports __read_mostly = 1;
+
 #ifdef CONFIG_IP_MULTICAST
 /* Parameter names and values are taken from igmp-v2-06 draft */
 
@@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 
 	if (pmc->multiaddr == IGMP_ALL_HOSTS)
 		return skb;
+	if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
+		return skb;
 
 	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
 		  type == IGMPV3_MODE_IS_EXCLUDE;
@@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
 		for_each_pmc_rcu(in_dev, pmc) {
 			if (pmc->multiaddr == IGMP_ALL_HOSTS)
 				continue;
+			if (ipv4_is_local_multicast(pmc->multiaddr) &&
+			     !sysctl_igmp_llm_reports)
+				continue;
 			spin_lock_bh(&pmc->lock);
 			if (pmc->sfcount[MCAST_EXCLUDE])
 				type = IGMPV3_MODE_IS_EXCLUDE;
@@ -678,7 +686,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 
 	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
 		return igmpv3_send_report(in_dev, pmc);
-	else if (type == IGMP_HOST_LEAVE_MESSAGE)
+
+	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+		return 0;
+
+	if (type == IGMP_HOST_LEAVE_MESSAGE)
 		dst = IGMP_ALL_ROUTER;
 	else
 		dst = group;
@@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
 
 	if (group == IGMP_ALL_HOSTS)
 		return false;
+	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+		return false;
 
 	rcu_read_lock();
 	for_each_pmc_rcu(in_dev, im) {
@@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			continue;
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
+		if (ipv4_is_local_multicast(im->multiaddr) &&
+		    !sysctl_igmp_llm_reports)
+			continue;
 		spin_lock_bh(&im->lock);
 		if (im->tm_running)
 			im->gsquery = im->gsquery && mark;
@@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 #ifdef CONFIG_IP_MULTICAST
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
+	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+		return;
 
 	reporter = im->reporter;
 	igmp_stop_timer(im);
@@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im)
 #ifdef CONFIG_IP_MULTICAST
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
+	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+		return;
 
 	if (in_dev->dead)
 		return;
@@ -1518,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
 	for_each_pmc_rtnl(in_dev, im) {
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
+		if (ipv4_is_local_multicast(im->multiaddr) &&
+		    !sysctl_igmp_llm_reports)
+			continue;
 
 		/* a failover is happening and switches
 		 * must be notified immediately
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 879bdc5c95b1..894da3a70aff 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -929,6 +929,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "igmp_link_local_mcast_reports",
+		.data		= &sysctl_igmp_llm_reports,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
-- 
cgit v1.2.3-70-g09d2


From 1a6877b9c0c2ad901d4335d909432d3bb6d3a330 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 28 Aug 2015 15:56:22 -0700
Subject: lib: introduce strncpy_from_unsafe()

generalize FETCH_FUNC_NAME(memory, string) into
strncpy_from_unsafe() and fix sparse warnings that were
present in original implementation.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/uaccess.h     |  2 ++
 kernel/trace/trace_kprobe.c | 20 ++++----------------
 lib/strncpy_from_user.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ae572c138607..d6f2c2c5b043 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -129,4 +129,6 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size);
 extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
 extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
 
+extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
+
 #endif		/* __LINUX_UACCESS_H__ */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
 static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
 					    void *addr, void *dest)
 {
-	long ret;
 	int maxlen = get_rloc_len(*(u32 *)dest);
 	u8 *dst = get_rloc_data(dest);
-	u8 *src = addr;
-	mm_segment_t old_fs = get_fs();
+	long ret;
 
 	if (!maxlen)
 		return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
 	 * Try to get string again, since the string can be changed while
 	 * probing.
 	 */
-	set_fs(KERNEL_DS);
-	pagefault_disable();
-
-	do
-		ret = __copy_from_user_inatomic(dst++, src++, 1);
-	while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
-
-	dst[-1] = '\0';
-	pagefault_enable();
-	set_fs(old_fs);
+	ret = strncpy_from_unsafe(dst, addr, maxlen);
 
 	if (ret < 0) {	/* Failed to fetch string */
-		((u8 *)get_rloc_data(dest))[0] = '\0';
+		dst[0] = '\0';
 		*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
 	} else {
-		*(u32 *)dest = make_data_rloc(src - (u8 *)addr,
-					      get_rloc_offs(*(u32 *)dest));
+		*(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
 	}
 }
 NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index e0af6ff73d14..ead8c4a068d1 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -112,3 +112,44 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 	return -EFAULT;
 }
 EXPORT_SYMBOL(strncpy_from_user);
+
+/**
+ * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
+ * @dst:   Destination address, in kernel space.  This buffer must be at
+ *         least @count bytes long.
+ * @src:   Unsafe address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+{
+	mm_segment_t old_fs = get_fs();
+	const void *src = unsafe_addr;
+	long ret;
+
+	if (unlikely(count <= 0))
+		return 0;
+
+	set_fs(KERNEL_DS);
+	pagefault_disable();
+
+	do {
+		ret = __copy_from_user_inatomic(dst++,
+						(const void __user __force *)src++, 1);
+	} while (dst[-1] && ret == 0 && src - unsafe_addr < count);
+
+	dst[-1] = '\0';
+	pagefault_enable();
+	set_fs(old_fs);
+
+	return ret < 0 ? ret : src - unsafe_addr;
+}
-- 
cgit v1.2.3-70-g09d2


From ad440bf40e2846966da44e885bb7d8a1f8384fa6 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 18 Aug 2015 13:38:02 +0200
Subject: PM / Domains: Remove unusable governor dummies

The governor dummies for the !CONFIG_PM_GENERIC_DOMAINS case are
unusable, as a governors is always referred to by taking its address,
which you can't do with a literal NULL pointer.

I.e.

	pm_genpd_init(genpd, &simple_qos_governor, false);

fails to compile with:

	error: lvalue required as unary '&' operand

Hence just remove the governor dummies.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_domain.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index b2725e6e8e7b..b1cf7e797892 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -221,8 +221,6 @@ static inline int pm_genpd_name_poweron(const char *domain_name)
 	return -ENOSYS;
 }
 static inline void pm_genpd_poweroff_unused(void) {}
-#define simple_qos_governor NULL
-#define pm_domain_always_on_gov NULL
 #endif
 
 static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
-- 
cgit v1.2.3-70-g09d2


From a3c874200cbcd95ed914ba84f33f571a0ef7adfa Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Mon, 20 Jul 2015 19:54:36 +0300
Subject: mlx5: Fix missing device local_dma_lkey

The mlx5 driver exposes device capability IB_DEVICE_LOCAL_DMA_LKEY
but does not set the the device local_dma_lkey. This breaks
rpcrdma drivers.

Query and set this lkey when creating the device resources.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/main.c            |  9 ++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/fw.c | 22 ++++++++++++++++++++++
 include/linux/mlx5/device.h                  | 11 +++++++++++
 include/linux/mlx5/driver.h                  |  1 +
 4 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 085c24b4b603..5ab5bb3feb97 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1256,10 +1256,18 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
 	struct ib_srq_init_attr attr;
 	struct mlx5_ib_dev *dev;
 	struct ib_cq_init_attr cq_attr = {.cqe = 1};
+	u32 rsvd_lkey;
 	int ret = 0;
 
 	dev = container_of(devr, struct mlx5_ib_dev, devr);
 
+	ret = mlx5_core_query_special_context(dev->mdev, &rsvd_lkey);
+	if (ret) {
+		pr_err("Failed to query special context %d\n", ret);
+		return ret;
+	}
+	dev->ib_dev.local_dma_lkey = rsvd_lkey;
+
 	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
 	if (IS_ERR(devr->p0)) {
 		ret = PTR_ERR(devr->p0);
@@ -1421,7 +1429,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
 	dev->ib_dev.owner		= THIS_MODULE;
 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
-	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
 	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
 	dev->ib_dev.phys_port_cnt     = dev->num_ports;
 	dev->ib_dev.num_comp_vectors    =
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 9335e5ae18cc..aa0d5ffe92d8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -200,3 +200,25 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev)
 
 	return err;
 }
+
+int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey)
+{
+	struct mlx5_cmd_query_special_contexts_mbox_in in;
+	struct mlx5_cmd_query_special_contexts_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	*rsvd_lkey = be32_to_cpu(out.resd_lkey);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_special_context);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index b943cd9e2097..6e4169c5ad78 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -402,6 +402,17 @@ struct mlx5_cmd_teardown_hca_mbox_out {
 	u8			rsvd[8];
 };
 
+struct mlx5_cmd_query_special_contexts_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cmd_query_special_contexts_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32                  dump_fill_mkey;
+	__be32                  resd_lkey;
+};
+
 struct mlx5_cmd_layout {
 	u8		type;
 	u8		rsvd0[3];
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5722d88c2429..1e2e48ccb3fd 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -828,6 +828,7 @@ void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol);
 int mlx5_register_interface(struct mlx5_interface *intf);
 void mlx5_unregister_interface(struct mlx5_interface *intf);
 int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
+int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey);
 
 struct mlx5_profile {
 	u64	mask;
-- 
cgit v1.2.3-70-g09d2


From bc3fe2e3769874dfa8674791e84c4a901ba9e48b Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Mon, 27 Jul 2015 18:10:12 -0500
Subject: svcrdma: Use max_sge_rd for destination read depths

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          |  1 +
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 12 +-----------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  4 ++++
 3 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index cb94ee4181d4..83211bc9219e 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -132,6 +132,7 @@ struct svcxprt_rdma {
 	struct list_head     sc_accept_q;	/* Conn. waiting accept */
 	int		     sc_ord;		/* RDMA read limit */
 	int                  sc_max_sge;
+	int                  sc_max_sge_rd;	/* max sge for read target */
 
 	int                  sc_sq_depth;	/* Depth of SQ */
 	atomic_t             sc_sq_count;	/* Number of SQ WR on queue */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2e1348bde325..cb5174284074 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	rqstp->rq_arg.tail[0].iov_len = 0;
 }
 
-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
-{
-	if (!rdma_cap_read_multi_sge(xprt->sc_cm_id->device,
-				     xprt->sc_cm_id->port_num))
-		return 1;
-	else
-		return min_t(int, sge_count, xprt->sc_max_sge);
-}
-
 /* Issue an RDMA_READ using the local lkey to map the data sink */
 int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 			struct svc_rqst *rqstp,
@@ -144,8 +135,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 
 	ctxt->direction = DMA_FROM_DEVICE;
 	ctxt->read_hdr = head;
-	pages_needed =
-		min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
+	pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd);
 	read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
 
 	for (pno = 0; pno < pages_needed; pno++) {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 6b36279e4288..fdc850ffc26c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -872,6 +872,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	 * capabilities of this particular device */
 	newxprt->sc_max_sge = min((size_t)devattr.max_sge,
 				  (size_t)RPCSVC_MAXPAGES);
+	newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+				       RPCSVC_MAXPAGES);
 	newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
 				   (size_t)svcrdma_max_requests);
 	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
@@ -1046,6 +1048,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		"    remote_ip       : %pI4\n"
 		"    remote_port     : %d\n"
 		"    max_sge         : %d\n"
+		"    max_sge_rd      : %d\n"
 		"    sq_depth        : %d\n"
 		"    max_requests    : %d\n"
 		"    ord             : %d\n",
@@ -1059,6 +1062,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
 		       route.addr.dst_addr)->sin_port),
 		newxprt->sc_max_sge,
+		newxprt->sc_max_sge_rd,
 		newxprt->sc_sq_depth,
 		newxprt->sc_max_requests,
 		newxprt->sc_ord);
-- 
cgit v1.2.3-70-g09d2


From 004f1afbe199e6ab20805b95aefd83ccd24bc5c7 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 24 Aug 2015 19:20:23 -0400
Subject: libnvdimm, pmem: direct map legacy pmem by default

The expectation is that the legacy / non-standard pmem discovery method
(e820 type-12) will only ever be used to describe small quantities of
persistent memory.  Larger capacities will be described via the ACPI
NFIT.  When "allocate struct page from pmem" support is added this default
policy can be overridden by assigning a legacy pmem namespace to a pfn
device, however this would be only be necessary if a platform used the
legacy mechanism to define a very large range.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/e820.c           |  1 +
 drivers/nvdimm/namespace_devs.c | 35 ++++++++++++++++++++++++++++++-----
 drivers/nvdimm/nd.h             |  2 ++
 drivers/nvdimm/pmem.c           | 15 ++++++++++++---
 drivers/nvdimm/region_devs.c    |  1 +
 include/linux/libnvdimm.h       |  4 ++++
 6 files changed, 50 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index 1b5743ad92db..8282db2ef99e 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -49,6 +49,7 @@ static int e820_pmem_probe(struct platform_device *pdev)
 		ndr_desc.res = p;
 		ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
 		ndr_desc.numa_node = NUMA_NO_NODE;
+		set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
 		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
 			goto err;
 	}
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 9303ca29be9b..0955b2cb10fe 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/slab.h>
+#include <linux/pmem.h>
 #include <linux/nd.h>
 #include "nd-core.h"
 #include "nd.h"
@@ -76,11 +77,32 @@ static bool is_namespace_io(struct device *dev)
 	return dev ? dev->type == &namespace_io_device_type : false;
 }
 
+bool pmem_should_map_pages(struct device *dev)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+
+	if (!IS_ENABLED(CONFIG_ZONE_DEVICE))
+		return false;
+
+	if (!test_bit(ND_REGION_PAGEMAP, &nd_region->flags))
+		return false;
+
+	if (is_nd_pfn(dev) || is_nd_btt(dev))
+		return false;
+
+#ifdef ARCH_MEMREMAP_PMEM
+	return ARCH_MEMREMAP_PMEM == MEMREMAP_WB;
+#else
+	return false;
+#endif
+}
+EXPORT_SYMBOL(pmem_should_map_pages);
+
 const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 		char *name)
 {
 	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
-	const char *suffix = "";
+	const char *suffix = NULL;
 
 	if (ndns->claim) {
 		if (is_nd_btt(ndns->claim))
@@ -93,13 +115,16 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 					dev_name(ndns->claim));
 	}
 
-	if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev))
-		sprintf(name, "pmem%d%s", nd_region->id, suffix);
-	else if (is_namespace_blk(&ndns->dev)) {
+	if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) {
+		if (!suffix && pmem_should_map_pages(&ndns->dev))
+			suffix = "m";
+		sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : "");
+	} else if (is_namespace_blk(&ndns->dev)) {
 		struct nd_namespace_blk *nsblk;
 
 		nsblk = to_nd_namespace_blk(&ndns->dev);
-		sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, suffix);
+		sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id,
+				suffix ? suffix : "");
 	} else {
 		return NULL;
 	}
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 83e5d0935012..417e521d299c 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -100,6 +100,7 @@ struct nd_region {
 	struct ida ns_ida;
 	struct ida btt_ida;
 	struct ida pfn_ida;
+	unsigned long flags;
 	struct device *ns_seed;
 	struct device *btt_seed;
 	struct device *pfn_seed;
@@ -276,4 +277,5 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
 void nd_iostat_end(struct bio *bio, unsigned long start);
 resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
 const u8 *nd_dev_to_uuid(struct device *dev);
+bool pmem_should_map_pages(struct device *dev);
 #endif /* __ND_H__ */
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 2f885e5d9c36..c5ae2e579288 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -148,9 +148,18 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 		return ERR_PTR(-EBUSY);
 	}
 
-	pmem->virt_addr = memremap_pmem(dev, pmem->phys_addr, pmem->size);
-	if (!pmem->virt_addr)
-		return ERR_PTR(-ENXIO);
+	if (pmem_should_map_pages(dev)) {
+		void *addr = devm_memremap_pages(dev, res);
+
+		if (IS_ERR(addr))
+			return addr;
+		pmem->virt_addr = (void __pmem *) addr;
+	} else {
+		pmem->virt_addr = memremap_pmem(dev, pmem->phys_addr,
+				pmem->size);
+		if (!pmem->virt_addr)
+			return ERR_PTR(-ENXIO);
+	}
 
 	return pmem;
 }
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index da4338154ad2..529f3f02e7b2 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -758,6 +758,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_region->provider_data = ndr_desc->provider_data;
 	nd_region->nd_set = ndr_desc->nd_set;
 	nd_region->num_lanes = ndr_desc->num_lanes;
+	nd_region->flags = ndr_desc->flags;
 	nd_region->ro = ro;
 	nd_region->numa_node = ndr_desc->numa_node;
 	ida_init(&nd_region->ns_ida);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 75e3af01ee32..3f021dc5da8c 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -31,6 +31,9 @@ enum {
 	ND_CMD_ARS_STATUS_MAX = SZ_4K,
 	ND_MAX_MAPPINGS = 32,
 
+	/* region flag indicating to direct-map persistent memory by default */
+	ND_REGION_PAGEMAP = 0,
+
 	/* mark newly adjusted resources as requiring a label update */
 	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -91,6 +94,7 @@ struct nd_region_desc {
 	void *provider_data;
 	int num_lanes;
 	int numa_node;
+	unsigned long flags;
 };
 
 struct nvdimm_bus;
-- 
cgit v1.2.3-70-g09d2


From adaac459759db4a1fd35baddbe47bac700095496 Mon Sep 17 00:00:00 2001
From: Markus Pargmann <mpa@pengutronix.de>
Date: Sun, 30 Aug 2015 09:33:53 +0200
Subject: regmap: Introduce max_raw_read/write for regmap_bulk_read/write

There are some buses which have a limit on the maximum number of bytes
that can be send/received. An example for this is
I2C_FUNC_SMBUS_I2C_BLOCK which does not support any reads/writes of more
than 32 bytes. The regmap_bulk operations should still be able to
utilize the full 32 bytes in this case.

Signed-off-by: Markus Pargmann <mpa@pengutronix.de>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/internal.h |  4 ++
 drivers/base/regmap/regmap.c   | 85 ++++++++++++++++++++++++++++++++++--------
 include/linux/regmap.h         |  4 ++
 3 files changed, 78 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index d744ae3926dd..fc554e357c5d 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -146,6 +146,10 @@ struct regmap {
 	/* if set, the device supports multi write mode */
 	bool can_multi_write;
 
+	/* if set, raw reads/writes are limited to this size */
+	size_t max_raw_read;
+	size_t max_raw_write;
+
 	struct rb_root range_tree;
 	void *selector_work_buf;	/* Scratch buffer used for selector */
 };
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index a6b6f7ee87ee..7cbe42680877 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -579,6 +579,8 @@ struct regmap *regmap_init(struct device *dev,
 	map->use_single_read = config->use_single_rw || !bus || !bus->read;
 	map->use_single_write = config->use_single_rw || !bus || !bus->write;
 	map->can_multi_write = config->can_multi_write && bus && bus->write;
+	map->max_raw_read = bus->max_raw_read;
+	map->max_raw_write = bus->max_raw_write;
 	map->dev = dev;
 	map->bus = bus;
 	map->bus_context = bus_context;
@@ -1674,6 +1676,7 @@ int regmap_bulk_write(struct regmap *map, unsigned int reg, const void *val,
 {
 	int ret = 0, i;
 	size_t val_bytes = map->format.val_bytes;
+	size_t total_size = val_bytes * val_count;
 
 	if (map->bus && !map->format.parse_inplace)
 		return -EINVAL;
@@ -1722,16 +1725,37 @@ int regmap_bulk_write(struct regmap *map, unsigned int reg, const void *val,
 		}
 out:
 		map->unlock(map->lock_arg);
-	} else if (map->use_single_write) {
+	} else if (map->use_single_write ||
+		   (map->max_raw_write && map->max_raw_write < total_size)) {
+		int chunk_stride = map->reg_stride;
+		size_t chunk_size = val_bytes;
+		size_t chunk_count = val_count;
+
+		if (!map->use_single_write) {
+			chunk_size = map->max_raw_write;
+			if (chunk_size % val_bytes)
+				chunk_size -= chunk_size % val_bytes;
+			chunk_count = total_size / chunk_size;
+			chunk_stride *= chunk_size / val_bytes;
+		}
+
 		map->lock(map->lock_arg);
-		for (i = 0; i < val_count; i++) {
+		/* Write as many bytes as possible with chunk_size */
+		for (i = 0; i < chunk_count; i++) {
 			ret = _regmap_raw_write(map,
-						reg + (i * map->reg_stride),
-						val + (i * val_bytes),
-						val_bytes);
+						reg + (i * chunk_stride),
+						val + (i * chunk_size),
+						chunk_size);
 			if (ret)
 				break;
 		}
+
+		/* Write remaining bytes */
+		if (!ret && chunk_size * i < total_size) {
+			ret = _regmap_raw_write(map, reg + (i * chunk_stride),
+						val + (i * chunk_size),
+						total_size - i * chunk_size);
+		}
 		map->unlock(map->lock_arg);
 	} else {
 		void *wval;
@@ -2319,20 +2343,51 @@ int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val,
 		 * Some devices does not support bulk read, for
 		 * them we have a series of single read operations.
 		 */
-		if (map->use_single_read) {
-			for (i = 0; i < val_count; i++) {
-				ret = regmap_raw_read(map,
-						reg + (i * map->reg_stride),
-						val + (i * val_bytes),
-						val_bytes);
-				if (ret != 0)
-					return ret;
-			}
-		} else {
+		size_t total_size = val_bytes * val_count;
+
+		if (!map->use_single_read &&
+		    (!map->max_raw_read || map->max_raw_read > total_size)) {
 			ret = regmap_raw_read(map, reg, val,
 					      val_bytes * val_count);
 			if (ret != 0)
 				return ret;
+		} else {
+			/*
+			 * Some devices do not support bulk read or do not
+			 * support large bulk reads, for them we have a series
+			 * of read operations.
+			 */
+			int chunk_stride = map->reg_stride;
+			size_t chunk_size = val_bytes;
+			size_t chunk_count = val_count;
+
+			if (!map->use_single_read) {
+				chunk_size = map->max_raw_read;
+				if (chunk_size % val_bytes)
+					chunk_size -= chunk_size % val_bytes;
+				chunk_count = total_size / chunk_size;
+				chunk_stride *= chunk_size / val_bytes;
+			}
+
+			/* Read bytes that fit into a multiple of chunk_size */
+			for (i = 0; i < chunk_count; i++) {
+				ret = regmap_raw_read(map,
+						      reg + (i * chunk_stride),
+						      val + (i * chunk_size),
+						      chunk_size);
+				if (ret != 0)
+					return ret;
+			}
+
+			/* Read remaining bytes */
+			if (chunk_size * i < total_size) {
+				ret = regmap_raw_read(map,
+						      reg + (i * chunk_stride),
+						      val + (i * chunk_size),
+						      total_size - i * chunk_size);
+				if (ret != 0)
+					return ret;
+			}
 		}
 
 		for (i = 0; i < val_count * val_bytes; i += val_bytes)
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 73fc34d0c4c2..327b8f291d3f 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -311,6 +311,8 @@ typedef void (*regmap_hw_free_context)(void *context);
  * @val_format_endian_default: Default endianness for formatted register
  *     values. Used when the regmap_config specifies DEFAULT. If this is
  *     DEFAULT, BIG is assumed.
+ * @max_raw_read: Max raw read size that can be used on the bus.
+ * @max_raw_write: Max raw write size that can be used on the bus.
  */
 struct regmap_bus {
 	bool fast_io;
@@ -325,6 +327,8 @@ struct regmap_bus {
 	u8 read_flag_mask;
 	enum regmap_endian reg_format_endian_default;
 	enum regmap_endian val_format_endian_default;
+	size_t max_raw_read;
+	size_t max_raw_write;
 };
 
 struct regmap *regmap_init(struct device *dev,
-- 
cgit v1.2.3-70-g09d2


From f50c9eb4e9304cf555206c93152f580c0e7213b2 Mon Sep 17 00:00:00 2001
From: Markus Pargmann <mpa@pengutronix.de>
Date: Sun, 30 Aug 2015 09:33:54 +0200
Subject: regmap: regmap max_raw_read/write getter functions

Add functions to access the maximum size we can read/write using
regmap_raw_read/write().

This helps drivers that need to know how much they can write with the
raw functions without problems. There are some devices (e.g. bmc150)
that have fifos as registers which need to be read in specific chunks
otherwise samples are dropped.

Signed-off-by: Markus Pargmann <mpa@pengutronix.de>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 22 ++++++++++++++++++++++
 include/linux/regmap.h       |  2 ++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 7cbe42680877..47210101e308 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1393,6 +1393,28 @@ bool regmap_can_raw_write(struct regmap *map)
 }
 EXPORT_SYMBOL_GPL(regmap_can_raw_write);
 
+/**
+ * regmap_get_raw_read_max - Get the maximum size we can read
+ *
+ * @map: Map to check.
+ */
+size_t regmap_get_raw_read_max(struct regmap *map)
+{
+	return map->max_raw_read;
+}
+EXPORT_SYMBOL_GPL(regmap_get_raw_read_max);
+
+/**
+ * regmap_get_raw_write_max - Get the maximum size we can read
+ *
+ * @map: Map to check.
+ */
+size_t regmap_get_raw_write_max(struct regmap *map)
+{
+	return map->max_raw_write;
+}
+EXPORT_SYMBOL_GPL(regmap_get_raw_write_max);
+
 static int _regmap_bus_formatted_write(void *context, unsigned int reg,
 				       unsigned int val)
 {
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 327b8f291d3f..6724d0e3819e 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -444,6 +444,8 @@ int regmap_get_max_register(struct regmap *map);
 int regmap_get_reg_stride(struct regmap *map);
 int regmap_async_complete(struct regmap *map);
 bool regmap_can_raw_write(struct regmap *map);
+size_t regmap_get_raw_read_max(struct regmap *map);
+size_t regmap_get_raw_write_max(struct regmap *map);
 
 int regcache_sync(struct regmap *map);
 int regcache_sync_region(struct regmap *map, unsigned int min,
-- 
cgit v1.2.3-70-g09d2


From 816dd19b3d191da88bc034fb85e21ed09a3ed320 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:23 +0300
Subject: net: Add info for NETDEV_CHANGEUPPER event

Some consumers of NETDEV_CHANGEUPPER event would like to know which
upper device was linked/unlinked and what operation was carried.

Add information in the notifier info block for that purpose.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 include/linux/netdevice.h | 14 ++++++++++++++
 net/core/dev.c            | 12 ++++++++++--
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e20979dfd6a9..2b7fe4e3146a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3556,6 +3556,20 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 				    netdev_features_t features);
 
+enum netdev_changeupper_event {
+	NETDEV_CHANGEUPPER_LINK,
+	NETDEV_CHANGEUPPER_UNLINK,
+};
+
+struct netdev_changeupper_info {
+	struct netdev_notifier_info	info; /* must be first */
+	enum netdev_changeupper_event	event;
+	struct net_device		*upper;
+};
+
+void netdev_changeupper_info_change(struct net_device *dev,
+				    struct netdev_changeupper_info *info);
+
 struct netdev_bonding_info {
 	ifslave	slave;
 	ifbond	master;
diff --git a/net/core/dev.c b/net/core/dev.c
index a8e4dd430285..6e6f14e5d44f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5302,6 +5302,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 				   void *private)
 {
 	struct netdev_adjacent *i, *j, *to_i, *to_j;
+	struct netdev_changeupper_info changeupper_info;
 	int ret = 0;
 
 	ASSERT_RTNL();
@@ -5357,7 +5358,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 			goto rollback_lower_mesh;
 	}
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	changeupper_info.event = NETDEV_CHANGEUPPER_LINK;
+	changeupper_info.upper = upper_dev;
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 	return 0;
 
 rollback_lower_mesh:
@@ -5453,6 +5457,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
 	struct netdev_adjacent *i, *j;
+	struct netdev_changeupper_info changeupper_info;
 	ASSERT_RTNL();
 
 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
@@ -5474,7 +5479,10 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
 		__netdev_adjacent_dev_unlink(dev, i->dev);
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	changeupper_info.event = NETDEV_CHANGEUPPER_UNLINK;
+	changeupper_info.upper = upper_dev;
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-- 
cgit v1.2.3-70-g09d2


From 79857cd31fe70145ff007d4e968557af342c8ccd Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:28 +0300
Subject: net/mlx4: Postpone the registration of net_device

The mlx4 network driver was registered in the context of the 'add'
function of the core driver (called when HW should be registered).
This makes the netdev event NETDEV_REGISTER to be sent in a context
where the answer to get_protocol_dev() callback returns NULL. This may
be confusing to listeners of netdev events.
This patch is a preparation to the patch that implements the
get_netdev() callback in the IB/mlx4 driver.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_main.c | 36 ++++++++++++++++------------
 drivers/net/ethernet/mellanox/mlx4/intf.c    |  3 +++
 include/linux/mlx4/driver.h                  |  1 +
 3 files changed, 25 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c
index 913b716ed2e1..a946e4bf71d2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c
@@ -224,6 +224,26 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
 	kfree(mdev);
 }
 
+static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
+{
+	int i;
+	struct mlx4_en_dev *mdev = ctx;
+
+	/* Create a netdev for each port */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Activating port:%d\n", i);
+		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
+			mdev->pndev[i] = NULL;
+	}
+
+	/* register notifier */
+	mdev->nb.notifier_call = mlx4_en_netdev_event;
+	if (register_netdevice_notifier(&mdev->nb)) {
+		mdev->nb.notifier_call = NULL;
+		mlx4_err(mdev, "Failed to create notifier\n");
+	}
+}
+
 static void *mlx4_en_add(struct mlx4_dev *dev)
 {
 	struct mlx4_en_dev *mdev;
@@ -297,21 +317,6 @@ static void *mlx4_en_add(struct mlx4_dev *dev)
 	mutex_init(&mdev->state_lock);
 	mdev->device_up = true;
 
-	/* Setup ports */
-
-	/* Create a netdev for each port */
-	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
-		mlx4_info(mdev, "Activating port:%d\n", i);
-		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
-			mdev->pndev[i] = NULL;
-	}
-	/* register notifier */
-	mdev->nb.notifier_call = mlx4_en_netdev_event;
-	if (register_netdevice_notifier(&mdev->nb)) {
-		mdev->nb.notifier_call = NULL;
-		mlx4_err(mdev, "Failed to create notifier\n");
-	}
-
 	return mdev;
 
 err_mr:
@@ -335,6 +340,7 @@ static struct mlx4_interface mlx4_en_interface = {
 	.event		= mlx4_en_event,
 	.get_dev	= mlx4_en_get_netdev,
 	.protocol	= MLX4_PROT_ETH,
+	.activate	= mlx4_en_activate,
 };
 
 static void mlx4_en_verify_params(void)
diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
index 0d80aed59043..0472941af820 100644
--- a/drivers/net/ethernet/mellanox/mlx4/intf.c
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -63,8 +63,11 @@ static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
 		spin_lock_irq(&priv->ctx_lock);
 		list_add_tail(&dev_ctx->list, &priv->ctx_list);
 		spin_unlock_irq(&priv->ctx_lock);
+		if (intf->activate)
+			intf->activate(&priv->dev, dev_ctx->context);
 	} else
 		kfree(dev_ctx);
+
 }
 
 static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index 9553a73d2049..5a06d969338e 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -59,6 +59,7 @@ struct mlx4_interface {
 	void			(*event) (struct mlx4_dev *dev, void *context,
 					  enum mlx4_dev_event event, unsigned long param);
 	void *			(*get_dev)(struct mlx4_dev *dev, void *context, u8 port);
+	void			(*activate)(struct mlx4_dev *dev, void *context);
 	struct list_head	list;
 	enum mlx4_protocol	protocol;
 	int			flags;
-- 
cgit v1.2.3-70-g09d2


From e26be1bfef81a2314a075f54dd8930cf5e8656df Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Thu, 30 Jul 2015 18:33:29 +0300
Subject: IB/mlx4: Implement ib_device callbacks

get_netdev: get the net_device on the physical port of the IB transport port. In
port aggregation mode it is required to return the netdev of the active port.

modify_gid: note for a change in the RoCE gid cache. Handle this by writing to
the harsware GID table. It is possible that indexes in cahce and hardware tables
won't match so a translation is required when modifying a QP or creating an
address handle.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/cache.c      |   3 +-
 drivers/infiniband/hw/mlx4/main.c    | 236 ++++++++++++++++++++++++++++++++++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  17 +++
 include/linux/mlx4/device.h          |   3 +-
 include/rdma/ib_verbs.h              |   2 +
 5 files changed, 257 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index fc39a2fed37b..8f66c67ff0df 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -55,7 +55,8 @@ struct ib_update_work {
 	u8                 port_num;
 };
 
-static union ib_gid zgid;
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
 
 static const struct ib_gid_attr zattr;
 
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 9ab73a496d52..5286ea7bb04a 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -45,6 +45,9 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+
+#include <net/bonding.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/cmd.h>
@@ -93,8 +96,6 @@ static void init_query_mad(struct ib_smp *mad)
 	mad->method	   = IB_MGMT_METHOD_GET;
 }
 
-static union ib_gid zgid;
-
 static int check_flow_steering_support(struct mlx4_dev *dev)
 {
 	int eth_num_ports = 0;
@@ -131,6 +132,237 @@ static int num_ib_ports(struct mlx4_dev *dev)
 	return ib_ports;
 }
 
+static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
+
+	if (dev) {
+		if (mlx4_is_bonded(ibdev->dev)) {
+			struct net_device *upper = NULL;
+
+			upper = netdev_master_upper_dev_get_rcu(dev);
+			if (upper) {
+				struct net_device *active;
+
+				active = bond_option_active_slave_get_rcu(netdev_priv(upper));
+				if (active)
+					dev = active;
+			}
+		}
+	}
+	if (dev)
+		dev_hold(dev);
+
+	rcu_read_unlock();
+	return dev;
+}
+
+static int mlx4_ib_update_gids(struct gid_entry *gids,
+			       struct mlx4_ib_dev *ibdev,
+			       u8 port_num)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	struct mlx4_dev *dev = ibdev->dev;
+	int i;
+	union ib_gid *gid_tbl;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+
+	gid_tbl = mailbox->buf;
+
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+		memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid));
+
+	err = mlx4_cmd(dev, mailbox->dma,
+		       MLX4_SET_PORT_GID_TABLE << 8 | port_num,
+		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+	if (mlx4_is_bonded(dev))
+		err += mlx4_cmd(dev, mailbox->dma,
+				MLX4_SET_PORT_GID_TABLE << 8 | 2,
+				1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static int mlx4_ib_add_gid(struct ib_device *device,
+			   u8 port_num,
+			   unsigned int index,
+			   const union ib_gid *gid,
+			   const struct ib_gid_attr *attr,
+			   void **context)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	struct mlx4_port_gid_table   *port_gid_table;
+	int free = -1, found = -1;
+	int ret = 0;
+	int hw_update = 0;
+	int i;
+	struct gid_entry *gids = NULL;
+
+	if (!rdma_cap_roce_gid_table(device, port_num))
+		return -EINVAL;
+
+	if (port_num > MLX4_MAX_PORTS)
+		return -EINVAL;
+
+	if (!context)
+		return -EINVAL;
+
+	port_gid_table = &iboe->gids[port_num - 1];
+	spin_lock_bh(&iboe->lock);
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+		if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) {
+			found = i;
+			break;
+		}
+		if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid)))
+			free = i; /* HW has space */
+	}
+
+	if (found < 0) {
+		if (free < 0) {
+			ret = -ENOSPC;
+		} else {
+			port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_ATOMIC);
+			if (!port_gid_table->gids[free].ctx) {
+				ret = -ENOMEM;
+			} else {
+				*context = port_gid_table->gids[free].ctx;
+				memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid));
+				port_gid_table->gids[free].ctx->real_index = free;
+				port_gid_table->gids[free].ctx->refcount = 1;
+				hw_update = 1;
+			}
+		}
+	} else {
+		struct gid_cache_context *ctx = port_gid_table->gids[found].ctx;
+		*context = ctx;
+		ctx->refcount++;
+	}
+	if (!ret && hw_update) {
+		gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC);
+		if (!gids) {
+			ret = -ENOMEM;
+		} else {
+			for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
+				memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+		}
+	}
+	spin_unlock_bh(&iboe->lock);
+
+	if (!ret && hw_update) {
+		ret = mlx4_ib_update_gids(gids, ibdev, port_num);
+		kfree(gids);
+	}
+
+	return ret;
+}
+
+static int mlx4_ib_del_gid(struct ib_device *device,
+			   u8 port_num,
+			   unsigned int index,
+			   void **context)
+{
+	struct gid_cache_context *ctx = *context;
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	struct mlx4_port_gid_table   *port_gid_table;
+	int ret = 0;
+	int hw_update = 0;
+	struct gid_entry *gids = NULL;
+
+	if (!rdma_cap_roce_gid_table(device, port_num))
+		return -EINVAL;
+
+	if (port_num > MLX4_MAX_PORTS)
+		return -EINVAL;
+
+	port_gid_table = &iboe->gids[port_num - 1];
+	spin_lock_bh(&iboe->lock);
+	if (ctx) {
+		ctx->refcount--;
+		if (!ctx->refcount) {
+			unsigned int real_index = ctx->real_index;
+
+			memcpy(&port_gid_table->gids[real_index].gid, &zgid, sizeof(zgid));
+			kfree(port_gid_table->gids[real_index].ctx);
+			port_gid_table->gids[real_index].ctx = NULL;
+			hw_update = 1;
+		}
+	}
+	if (!ret && hw_update) {
+		int i;
+
+		gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC);
+		if (!gids) {
+			ret = -ENOMEM;
+		} else {
+			for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
+				memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+		}
+	}
+	spin_unlock_bh(&iboe->lock);
+
+	if (!ret && hw_update) {
+		ret = mlx4_ib_update_gids(gids, ibdev, port_num);
+		kfree(gids);
+	}
+	return ret;
+}
+
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+				    u8 port_num, int index)
+{
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	struct gid_cache_context *ctx = NULL;
+	union ib_gid gid;
+	struct mlx4_port_gid_table   *port_gid_table;
+	int real_index = -EINVAL;
+	int i;
+	int ret;
+	unsigned long flags;
+
+	if (port_num > MLX4_MAX_PORTS)
+		return -EINVAL;
+
+	if (mlx4_is_bonded(ibdev->dev))
+		port_num = 1;
+
+	if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num))
+		return index;
+
+	ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid);
+	if (ret)
+		return ret;
+
+	if (!memcmp(&gid, &zgid, sizeof(gid)))
+		return -EINVAL;
+
+	spin_lock_irqsave(&iboe->lock, flags);
+	port_gid_table = &iboe->gids[port_num - 1];
+
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+		if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid))) {
+			ctx = port_gid_table->gids[i].ctx;
+			break;
+		}
+	if (ctx)
+		real_index = ctx->real_index;
+	spin_unlock_irqrestore(&iboe->lock, flags);
+	return real_index;
+}
+
 static int mlx4_ib_query_device(struct ib_device *ibdev,
 				struct ib_device_attr *props,
 				struct ib_udata *uhw)
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 3b85f0475a25..87a720f10f50 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -457,6 +457,20 @@ struct mlx4_ib_sriov {
 	struct idr pv_id_table;
 };
 
+struct gid_cache_context {
+	int real_index;
+	int refcount;
+};
+
+struct gid_entry {
+	union ib_gid	gid;
+	struct gid_cache_context *ctx;
+};
+
+struct mlx4_port_gid_table {
+	struct gid_entry gids[MLX4_MAX_PORT_GIDS];
+};
+
 struct mlx4_ib_iboe {
 	spinlock_t		lock;
 	struct net_device      *netdevs[MLX4_MAX_PORTS];
@@ -466,6 +480,7 @@ struct mlx4_ib_iboe {
 	struct notifier_block	nb_inet;
 	struct notifier_block	nb_inet6;
 	union ib_gid		gid_table[MLX4_MAX_PORTS][128];
+	struct mlx4_port_gid_table gids[MLX4_MAX_PORTS];
 };
 
 struct pkey_mgt {
@@ -839,5 +854,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 			  u64 start, u64 length, u64 virt_addr,
 			  int mr_access_flags, struct ib_pd *pd,
 			  struct ib_udata *udata);
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+				    u8 port_num, int index);
 
 #endif /* MLX4_IB_H */
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index fd13c1ce3b4a..5c7687a65717 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -79,7 +79,8 @@ enum {
 
 enum {
 	MLX4_MAX_PORTS		= 2,
-	MLX4_MAX_PORT_PKEYS	= 128
+	MLX4_MAX_PORT_PKEYS	= 128,
+	MLX4_MAX_PORT_GIDS	= 128
 };
 
 /* base qkey for use in sriov tunnel-qp/proxy-qp communication.
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 2de56834a6be..5eff55c8b39d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -65,6 +65,8 @@ union ib_gid {
 	} global;
 };
 
+extern union ib_gid zgid;
+
 struct ib_gid_attr {
 	struct net_device	*ndev;
 };
-- 
cgit v1.2.3-70-g09d2


From 5a11dd7d9649149f336ca72069d56ce52b21567f Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 31 Aug 2015 15:56:46 +0200
Subject: net: phy: Allow PHY devices to identify themselves as Ethernet
 switches, etc.

Some Ethernet MAC drivers using the PHY library require the hardcoding
of link parameters when interfaced to a switch device, SFP module,
switch to switch port, etc. This has typically lead to various ad-hoc
implementations looking like this:

- using a "fixed PHY" emulated device, which will provide link
  indication towards the Ethernet MAC driver and hardware

- pretend there is no PHY and hardcode link parameters, ala mv643x_eth

Based on that, it is desireable to have the PHY drivers advertise the
correct link parameters, just like regular Ethernet PHYs towards their
CPU Ethernet MAC drivers, however, Ethernet MAC drivers should be able
to tell whether this link should be monitored or not. In the context
of an Ethernet switch, SFP module, switch to switch link, we do not
need to monitor this link since it should be always up.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/fixed_phy.c |  1 +
 include/linux/phy.h         | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 99d9bc19c94a..ce46428ff21f 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -303,6 +303,7 @@ struct phy_device *fixed_phy_register(unsigned int irq,
 
 	of_node_get(np);
 	phy->dev.of_node = np;
+	phy->is_pseudo_fixed_link = true;
 
 	ret = phy_device_register(phy);
 	if (ret) {
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e5fb1d415961..962387a192f1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -330,6 +330,7 @@ struct phy_c45_device_ids {
  * c45_ids: 802.3-c45 Device Identifers if is_c45.
  * is_c45:  Set to true if this phy uses clause 45 addressing.
  * is_internal: Set to true if this phy is internal to a MAC.
+ * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc.
  * has_fixups: Set to true if this phy has fixups/quirks.
  * suspended: Set to true if this phy has been suspended successfully.
  * state: state of the PHY for management purposes
@@ -368,6 +369,7 @@ struct phy_device {
 	struct phy_c45_device_ids c45_ids;
 	bool is_c45;
 	bool is_internal;
+	bool is_pseudo_fixed_link;
 	bool has_fixups;
 	bool suspended;
 
@@ -688,6 +690,16 @@ static inline bool phy_interface_is_rgmii(struct phy_device *phydev)
 {
 	return phydev->interface >= PHY_INTERFACE_MODE_RGMII &&
 		phydev->interface <= PHY_INTERFACE_MODE_RGMII_TXID;
+};
+
+/*
+ * phy_is_pseudo_fixed_link - Convenience function for testing if this
+ * PHY is the CPU port facing side of an Ethernet switch, or similar.
+ * @phydev: the phy_device struct
+ */
+static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev)
+{
+	return phydev->is_pseudo_fixed_link;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From a5597008dbc230876db2d344561d634f4d52ea4a Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 31 Aug 2015 15:56:53 +0200
Subject: phy: fixed_phy: Add gpio to determine link up/down.

An SFP module may have a link up/down status pin which can be
connection to a GPIO line of the host. Add support for reading such an
GPIO in the fixed_phy driver.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/fixed-link.txt         | 14 +++++++++++-
 Documentation/networking/stmmac.txt                |  2 +-
 arch/m68k/coldfire/m5272.c                         |  2 +-
 arch/mips/ar7/platform.c                           |  5 +++--
 arch/mips/bcm47xx/setup.c                          |  2 +-
 drivers/net/ethernet/broadcom/genet/bcmmii.c       |  2 +-
 drivers/net/phy/fixed_phy.c                        | 26 +++++++++++++++++++---
 drivers/of/of_mdio.c                               | 13 ++++++++---
 include/linux/phy_fixed.h                          |  8 +++++--
 9 files changed, 59 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/fixed-link.txt b/Documentation/devicetree/bindings/net/fixed-link.txt
index 82bf7e0f47b6..ec5d889fe3d8 100644
--- a/Documentation/devicetree/bindings/net/fixed-link.txt
+++ b/Documentation/devicetree/bindings/net/fixed-link.txt
@@ -17,6 +17,8 @@ properties:
   enabled.
 * 'asym-pause' (boolean, optional), to indicate that asym_pause should
   be enabled.
+* 'link-gpios' ('gpio-list', optional), to indicate if a gpio can be read
+  to determine if the link is up.
 
 Old, deprecated 'fixed-link' binding:
 
@@ -30,7 +32,7 @@ Old, deprecated 'fixed-link' binding:
   - e: asymmetric pause configuration: 0 for no asymmetric pause, 1 for
     asymmetric pause
 
-Example:
+Examples:
 
 ethernet@0 {
 	...
@@ -40,3 +42,13 @@ ethernet@0 {
 	};
 	...
 };
+
+ethernet@1 {
+	...
+	fixed-link {
+	      speed = <1000>;
+	      pause;
+	      link-gpios = <&gpio0 12 GPIO_ACTIVE_HIGH>;
+	};
+	...
+};
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 2903b1cf4d70..d64a14714236 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -254,7 +254,7 @@ static struct fixed_phy_status stmmac0_fixed_phy_status = {
 
 During the board's device_init we can configure the first
 MAC for fixed_link by calling:
-  fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status));)
+  fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status, -1);
 and the second one, with a real PHY device attached to the bus,
 by using the stmmac_mdio_bus_data structure (to provide the id, the
 reset procedure etc).
diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c
index b15219ed22bf..c525e4c08f84 100644
--- a/arch/m68k/coldfire/m5272.c
+++ b/arch/m68k/coldfire/m5272.c
@@ -126,7 +126,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = {
 static int __init init_BSP(void)
 {
 	m5272_uarts_init();
-	fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status);
+	fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status, -1);
 	return 0;
 }
 
diff --git a/arch/mips/ar7/platform.c b/arch/mips/ar7/platform.c
index be9ff1673ded..298b97715d5f 100644
--- a/arch/mips/ar7/platform.c
+++ b/arch/mips/ar7/platform.c
@@ -679,7 +679,8 @@ static int __init ar7_register_devices(void)
 	}
 
 	if (ar7_has_high_cpmac()) {
-		res = fixed_phy_add(PHY_POLL, cpmac_high.id, &fixed_phy_status);
+		res = fixed_phy_add(PHY_POLL, cpmac_high.id,
+				    &fixed_phy_status, -1);
 		if (!res) {
 			cpmac_get_mac(1, cpmac_high_data.dev_addr);
 
@@ -692,7 +693,7 @@ static int __init ar7_register_devices(void)
 	} else
 		cpmac_low_data.phy_mask = 0xffffffff;
 
-	res = fixed_phy_add(PHY_POLL, cpmac_low.id, &fixed_phy_status);
+	res = fixed_phy_add(PHY_POLL, cpmac_low.id, &fixed_phy_status, -1);
 	if (!res) {
 		cpmac_get_mac(0, cpmac_low_data.dev_addr);
 		res = platform_device_register(&cpmac_low);
diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c
index 98c075f81795..17503a05938e 100644
--- a/arch/mips/bcm47xx/setup.c
+++ b/arch/mips/bcm47xx/setup.c
@@ -263,7 +263,7 @@ static int __init bcm47xx_register_bus_complete(void)
 	bcm47xx_leds_register();
 	bcm47xx_workarounds();
 
-	fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status);
+	fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status, -1);
 	return 0;
 }
 device_initcall(bcm47xx_register_bus_complete);
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index b3679ad1c1c7..c8affad76f36 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -585,7 +585,7 @@ static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv)
 			.asym_pause = 0,
 		};
 
-		phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL);
+		phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1, NULL);
 		if (!phydev || IS_ERR(phydev)) {
 			dev_err(kdev, "failed to register fixed PHY device\n");
 			return -ENODEV;
diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 2f9457f05a2e..1bb70e3cc03e 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -22,6 +22,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/of.h>
+#include <linux/gpio.h>
 
 #define MII_REGS_NUM 29
 
@@ -38,6 +39,7 @@ struct fixed_phy {
 	struct fixed_phy_status status;
 	int (*link_update)(struct net_device *, struct fixed_phy_status *);
 	struct list_head node;
+	int link_gpio;
 };
 
 static struct platform_device *pdev;
@@ -52,6 +54,9 @@ static int fixed_phy_update_regs(struct fixed_phy *fp)
 	u16 lpagb = 0;
 	u16 lpa = 0;
 
+	if (gpio_is_valid(fp->link_gpio))
+		fp->status.link = !!gpio_get_value_cansleep(fp->link_gpio);
+
 	if (!fp->status.link)
 		goto done;
 	bmsr |= BMSR_LSTATUS | BMSR_ANEGCOMPLETE;
@@ -215,7 +220,8 @@ int fixed_phy_update_state(struct phy_device *phydev,
 EXPORT_SYMBOL(fixed_phy_update_state);
 
 int fixed_phy_add(unsigned int irq, int phy_addr,
-		  struct fixed_phy_status *status)
+		  struct fixed_phy_status *status,
+		  int link_gpio)
 {
 	int ret;
 	struct fixed_mdio_bus *fmb = &platform_fmb;
@@ -231,15 +237,26 @@ int fixed_phy_add(unsigned int irq, int phy_addr,
 
 	fp->addr = phy_addr;
 	fp->status = *status;
+	fp->link_gpio = link_gpio;
+
+	if (gpio_is_valid(fp->link_gpio)) {
+		ret = gpio_request_one(fp->link_gpio, GPIOF_DIR_IN,
+				       "fixed-link-gpio-link");
+		if (ret)
+			goto err_regs;
+	}
 
 	ret = fixed_phy_update_regs(fp);
 	if (ret)
-		goto err_regs;
+		goto err_gpio;
 
 	list_add_tail(&fp->node, &fmb->phys);
 
 	return 0;
 
+err_gpio:
+	if (gpio_is_valid(fp->link_gpio))
+		gpio_free(fp->link_gpio);
 err_regs:
 	kfree(fp);
 	return ret;
@@ -254,6 +271,8 @@ void fixed_phy_del(int phy_addr)
 	list_for_each_entry_safe(fp, tmp, &fmb->phys, node) {
 		if (fp->addr == phy_addr) {
 			list_del(&fp->node);
+			if (gpio_is_valid(fp->link_gpio))
+				gpio_free(fp->link_gpio);
 			kfree(fp);
 			return;
 		}
@@ -266,6 +285,7 @@ static DEFINE_SPINLOCK(phy_fixed_addr_lock);
 
 struct phy_device *fixed_phy_register(unsigned int irq,
 				      struct fixed_phy_status *status,
+				      int link_gpio,
 				      struct device_node *np)
 {
 	struct fixed_mdio_bus *fmb = &platform_fmb;
@@ -282,7 +302,7 @@ struct phy_device *fixed_phy_register(unsigned int irq,
 	phy_addr = phy_fixed_addr++;
 	spin_unlock(&phy_fixed_addr_lock);
 
-	ret = fixed_phy_add(PHY_POLL, phy_addr, status);
+	ret = fixed_phy_add(PHY_POLL, phy_addr, status, link_gpio);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index 7c8c23cc6896..1350fa25cdb0 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -16,6 +16,7 @@
 #include <linux/phy.h>
 #include <linux/phy_fixed.h>
 #include <linux/of.h>
+#include <linux/of_gpio.h>
 #include <linux/of_irq.h>
 #include <linux/of_mdio.h>
 #include <linux/module.h>
@@ -294,6 +295,7 @@ int of_phy_register_fixed_link(struct device_node *np)
 	struct fixed_phy_status status = {};
 	struct device_node *fixed_link_node;
 	const __be32 *fixed_link_prop;
+	int link_gpio;
 	int len, err;
 	struct phy_device *phy;
 	const char *managed;
@@ -302,7 +304,7 @@ int of_phy_register_fixed_link(struct device_node *np)
 	if (err == 0) {
 		if (strcmp(managed, "in-band-status") == 0) {
 			/* status is zeroed, namely its .link member */
-			phy = fixed_phy_register(PHY_POLL, &status, np);
+			phy = fixed_phy_register(PHY_POLL, &status, -1, np);
 			return IS_ERR(phy) ? PTR_ERR(phy) : 0;
 		}
 	}
@@ -318,8 +320,13 @@ int of_phy_register_fixed_link(struct device_node *np)
 		status.pause = of_property_read_bool(fixed_link_node, "pause");
 		status.asym_pause = of_property_read_bool(fixed_link_node,
 							  "asym-pause");
+		link_gpio = of_get_named_gpio_flags(fixed_link_node,
+						    "link-gpios", 0, NULL);
 		of_node_put(fixed_link_node);
-		phy = fixed_phy_register(PHY_POLL, &status, np);
+		if (link_gpio == -EPROBE_DEFER)
+			return -EPROBE_DEFER;
+
+		phy = fixed_phy_register(PHY_POLL, &status, link_gpio, np);
 		return IS_ERR(phy) ? PTR_ERR(phy) : 0;
 	}
 
@@ -331,7 +338,7 @@ int of_phy_register_fixed_link(struct device_node *np)
 		status.speed = be32_to_cpu(fixed_link_prop[2]);
 		status.pause = be32_to_cpu(fixed_link_prop[3]);
 		status.asym_pause = be32_to_cpu(fixed_link_prop[4]);
-		phy = fixed_phy_register(PHY_POLL, &status, np);
+		phy = fixed_phy_register(PHY_POLL, &status, -1, np);
 		return IS_ERR(phy) ? PTR_ERR(phy) : 0;
 	}
 
diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index fe5732d53eda..2400d2ea4f34 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -13,9 +13,11 @@ struct device_node;
 
 #if IS_ENABLED(CONFIG_FIXED_PHY)
 extern int fixed_phy_add(unsigned int irq, int phy_id,
-			 struct fixed_phy_status *status);
+			 struct fixed_phy_status *status,
+			 int link_gpio);
 extern struct phy_device *fixed_phy_register(unsigned int irq,
 					     struct fixed_phy_status *status,
+					     int link_gpio,
 					     struct device_node *np);
 extern void fixed_phy_del(int phy_addr);
 extern int fixed_phy_set_link_update(struct phy_device *phydev,
@@ -26,12 +28,14 @@ extern int fixed_phy_update_state(struct phy_device *phydev,
 			   const struct fixed_phy_status *changed);
 #else
 static inline int fixed_phy_add(unsigned int irq, int phy_id,
-				struct fixed_phy_status *status)
+				struct fixed_phy_status *status,
+				int link_gpio)
 {
 	return -ENODEV;
 }
 static inline struct phy_device *fixed_phy_register(unsigned int irq,
 						struct fixed_phy_status *status,
+						int gpio_link,
 						struct device_node *np)
 {
 	return ERR_PTR(-ENODEV);
-- 
cgit v1.2.3-70-g09d2


From 07f081fb5057b2ea98baeca3a47bf0eb33e94aa1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 30 Aug 2015 16:59:57 +0100
Subject: PKCS#7: Add OIDs for sha224, sha284 and sha512 hash algos and use
 them

Add OIDs for sha224, sha284 and sha512 hash algos and use them to select
the hashing algorithm.  Without this, something like the following error
might get written to dmesg:

[   31.829322] PKCS7: Unknown OID: [32] 2.16.840.1.101.3.4.2.3
[   31.829328] PKCS7: Unknown OID: [180] 2.16.840.1.101.3.4.2.3
[   31.829330] Unsupported digest algo: 55

Where the 55 on the third line is OID__NR indicating an unknown OID.

Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-By: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 crypto/asymmetric_keys/mscode_parser.c | 9 +++++++++
 crypto/asymmetric_keys/pkcs7_parser.c  | 8 ++++++++
 include/linux/oid_registry.h           | 3 +++
 3 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c
index 214a992123cd..adcef59eec0b 100644
--- a/crypto/asymmetric_keys/mscode_parser.c
+++ b/crypto/asymmetric_keys/mscode_parser.c
@@ -97,6 +97,15 @@ int mscode_note_digest_algo(void *context, size_t hdrlen,
 	case OID_sha256:
 		ctx->digest_algo = HASH_ALGO_SHA256;
 		break;
+	case OID_sha384:
+		ctx->digest_algo = HASH_ALGO_SHA384;
+		break;
+	case OID_sha512:
+		ctx->digest_algo = HASH_ALGO_SHA512;
+		break;
+	case OID_sha224:
+		ctx->digest_algo = HASH_ALGO_SHA224;
+		break;
 
 	case OID__NR:
 		sprint_oid(value, vlen, buffer, sizeof(buffer));
diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c
index e6298b7a945a..758acabf2d81 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.c
+++ b/crypto/asymmetric_keys/pkcs7_parser.c
@@ -229,6 +229,14 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen,
 	case OID_sha256:
 		ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA256;
 		break;
+	case OID_sha384:
+		ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA384;
+		break;
+	case OID_sha512:
+		ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA512;
+		break;
+	case OID_sha224:
+		ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA224;
 	default:
 		printk("Unsupported digest algo: %u\n", ctx->last_oid);
 		return -ENOPKG;
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 93e0ff92fb9b..d2fa9ca42e9a 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -63,6 +63,9 @@ enum OID {
 	OID_certAuthInfoAccess,		/* 1.3.6.1.5.5.7.1.1 */
 	OID_sha1,			/* 1.3.14.3.2.26 */
 	OID_sha256,			/* 2.16.840.1.101.3.4.2.1 */
+	OID_sha384,			/* 2.16.840.1.101.3.4.2.2 */
+	OID_sha512,			/* 2.16.840.1.101.3.4.2.3 */
+	OID_sha224,			/* 2.16.840.1.101.3.4.2.4 */
 
 	/* Distinguished Name attribute IDs [RFC 2256] */
 	OID_commonName,			/* 2.5.4.3 */
-- 
cgit v1.2.3-70-g09d2


From 5bcd0b7f3c56c616abffd89e11c841834dd1528c Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Tue, 1 Sep 2015 07:56:38 +0800
Subject: reset: Add (devm_)reset_control_get stub functions

So the drivers can be compiled with CONFIG_RESET_CONTROLLER disabled.

Signed-off-by: Axel Lin <axel.lin@ingics.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 include/linux/reset.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reset.h b/include/linux/reset.h
index da5602bd77d7..7f65f9cff951 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -74,6 +74,20 @@ static inline int device_reset_optional(struct device *dev)
 	return -ENOSYS;
 }
 
+static inline struct reset_control *__must_check reset_control_get(
+					struct device *dev, const char *id)
+{
+	WARN_ON(1);
+	return ERR_PTR(-EINVAL);
+}
+
+static inline struct reset_control *__must_check devm_reset_control_get(
+					struct device *dev, const char *id)
+{
+	WARN_ON(1);
+	return ERR_PTR(-EINVAL);
+}
+
 static inline struct reset_control *reset_control_get_optional(
 					struct device *dev, const char *id)
 {
-- 
cgit v1.2.3-70-g09d2


From 6bfb7c7434f75d29241413dc7e784295ba56de98 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 3 Aug 2015 08:36:14 +0530
Subject: cpufreq: remove redundant CPUFREQ_INCOMPATIBLE notifier event

What's being done from CPUFREQ_INCOMPATIBLE, can also be done with
CPUFREQ_ADJUST. There is nothing special with CPUFREQ_INCOMPATIBLE
notifier.

Kill CPUFREQ_INCOMPATIBLE and fix its usage sites.

This also updates the numbering of notifier events to remove holes.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/core.txt       | 7 ++-----
 drivers/acpi/processor_perflib.c      | 2 +-
 drivers/cpufreq/cpufreq.c             | 4 ----
 drivers/cpufreq/ppc_cbe_cpufreq_pmi.c | 4 ++--
 drivers/video/fbdev/pxafb.c           | 1 -
 drivers/video/fbdev/sa1100fb.c        | 1 -
 include/linux/cpufreq.h               | 9 ++++-----
 7 files changed, 9 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt
index 70933eadc308..ba78e7c2a069 100644
--- a/Documentation/cpu-freq/core.txt
+++ b/Documentation/cpu-freq/core.txt
@@ -55,16 +55,13 @@ transition notifiers.
 ----------------------------
 
 These are notified when a new policy is intended to be set. Each
-CPUFreq policy notifier is called three times for a policy transition:
+CPUFreq policy notifier is called twice for a policy transition:
 
 1.) During CPUFREQ_ADJUST all CPUFreq notifiers may change the limit if
     they see a need for this - may it be thermal considerations or
     hardware limitations.
 
-2.) During CPUFREQ_INCOMPATIBLE only changes may be done in order to avoid
-    hardware failure.
-
-3.) And during CPUFREQ_NOTIFY all notifiers are informed of the new policy
+2.) And during CPUFREQ_NOTIFY all notifiers are informed of the new policy
    - if two hardware drivers failed to agree on a new policy before this
    stage, the incompatible hardware shall be shut down, and the user
    informed of this.
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 36b6da2918a6..91941f1cdecb 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -87,7 +87,7 @@ static int acpi_processor_ppc_notifier(struct notifier_block *nb,
 	if (ignore_ppc)
 		return 0;
 
-	if (event != CPUFREQ_INCOMPATIBLE)
+	if (event != CPUFREQ_ADJUST)
 		return 0;
 
 	mutex_lock(&performance_mutex);
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 76a26609d96b..293f47b814bf 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2206,10 +2206,6 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 			CPUFREQ_ADJUST, new_policy);
 
-	/* adjust if necessary - hardware incompatibility*/
-	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
-			CPUFREQ_INCOMPATIBLE, new_policy);
-
 	/*
 	 * verify the cpu speed can be set within this limit, which might be
 	 * different to the first one
diff --git a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c
index d29e8da396a0..7969f7690498 100644
--- a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c
+++ b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c
@@ -97,8 +97,8 @@ static int pmi_notifier(struct notifier_block *nb,
 	struct cpufreq_frequency_table *cbe_freqs;
 	u8 node;
 
-	/* Should this really be called for CPUFREQ_ADJUST, CPUFREQ_INCOMPATIBLE
-	 * and CPUFREQ_NOTIFY policy events?)
+	/* Should this really be called for CPUFREQ_ADJUST and CPUFREQ_NOTIFY
+	 * policy events?)
 	 */
 	if (event == CPUFREQ_START)
 		return 0;
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c
index 7245611ec963..94813af97f09 100644
--- a/drivers/video/fbdev/pxafb.c
+++ b/drivers/video/fbdev/pxafb.c
@@ -1668,7 +1668,6 @@ pxafb_freq_policy(struct notifier_block *nb, unsigned long val, void *data)
 
 	switch (val) {
 	case CPUFREQ_ADJUST:
-	case CPUFREQ_INCOMPATIBLE:
 		pr_debug("min dma period: %d ps, "
 			"new clock %d kHz\n", pxafb_display_dma_period(var),
 			policy->max);
diff --git a/drivers/video/fbdev/sa1100fb.c b/drivers/video/fbdev/sa1100fb.c
index 89dd7e02197f..dcf774c15889 100644
--- a/drivers/video/fbdev/sa1100fb.c
+++ b/drivers/video/fbdev/sa1100fb.c
@@ -1042,7 +1042,6 @@ sa1100fb_freq_policy(struct notifier_block *nb, unsigned long val,
 
 	switch (val) {
 	case CPUFREQ_ADJUST:
-	case CPUFREQ_INCOMPATIBLE:
 		dev_dbg(fbi->dev, "min dma period: %d ps, "
 			"new clock %d kHz\n", sa1100fb_min_dma_period(fbi),
 			policy->max);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index bde1e567b3a9..bedcc90c0757 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -369,11 +369,10 @@ static inline void cpufreq_resume(void) {}
 
 /* Policy Notifiers  */
 #define CPUFREQ_ADJUST			(0)
-#define CPUFREQ_INCOMPATIBLE		(1)
-#define CPUFREQ_NOTIFY			(2)
-#define CPUFREQ_START			(3)
-#define CPUFREQ_CREATE_POLICY		(4)
-#define CPUFREQ_REMOVE_POLICY		(5)
+#define CPUFREQ_NOTIFY			(1)
+#define CPUFREQ_START			(2)
+#define CPUFREQ_CREATE_POLICY		(3)
+#define CPUFREQ_REMOVE_POLICY		(4)
 
 #ifdef CONFIG_CPU_FREQ
 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list);
-- 
cgit v1.2.3-70-g09d2


From e27f8bd248756310a6df8b67f96d41d5a693642c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 3 Aug 2015 08:36:17 +0530
Subject: cpufreq: remove redundant 'governor' field from user_policy

Its always same as policy->governor, and there is no need to keep
another copy of it. Remove it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 7 ++-----
 include/linux/cpufreq.h   | 1 -
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 8e71d8e08439..3033952391fe 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -675,7 +675,6 @@ static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
 		return ret;
 
 	policy->user_policy.policy = policy->policy;
-	policy->user_policy.governor = policy->governor;
 	return count;
 }
 
@@ -1323,10 +1322,9 @@ static int cpufreq_online(unsigned int cpu)
 		goto out_exit_policy;
 	}
 
-	if (new_policy) {
+	if (new_policy)
 		policy->user_policy.policy = policy->policy;
-		policy->user_policy.governor = policy->governor;
-	}
+
 	up_write(&policy->rwsem);
 
 	kobject_uevent(&policy->kobj, KOBJ_ADD);
@@ -2305,7 +2303,6 @@ int cpufreq_update_policy(unsigned int cpu)
 	new_policy.min = policy->user_policy.min;
 	new_policy.max = policy->user_policy.max;
 	new_policy.policy = policy->user_policy.policy;
-	new_policy.governor = policy->user_policy.governor;
 
 	/*
 	 * BIOS might change freq behind our back
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index bedcc90c0757..752bf8a5c314 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -55,7 +55,6 @@ struct cpufreq_real_policy {
 	unsigned int		min;    /* in kHz */
 	unsigned int		max;    /* in kHz */
 	unsigned int		policy; /* see above */
-	struct cpufreq_governor	*governor; /* see below */
 };
 
 struct cpufreq_policy {
-- 
cgit v1.2.3-70-g09d2


From 88dc4384958759510db248bf9158434ac783d407 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 3 Aug 2015 08:36:18 +0530
Subject: cpufreq: remove redundant 'policy' field from user_policy

Its always same as policy->policy, and there is no need to keep another
copy of it. Remove it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 10 +---------
 include/linux/cpufreq.h   |  1 -
 2 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 3033952391fe..a80bd68bbd74 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -671,11 +671,7 @@ static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
 		return -EINVAL;
 
 	ret = cpufreq_set_policy(policy, &new_policy);
-	if (ret)
-		return ret;
-
-	policy->user_policy.policy = policy->policy;
-	return count;
+	return ret ? ret : count;
 }
 
 /**
@@ -1322,9 +1318,6 @@ static int cpufreq_online(unsigned int cpu)
 		goto out_exit_policy;
 	}
 
-	if (new_policy)
-		policy->user_policy.policy = policy->policy;
-
 	up_write(&policy->rwsem);
 
 	kobject_uevent(&policy->kobj, KOBJ_ADD);
@@ -2302,7 +2295,6 @@ int cpufreq_update_policy(unsigned int cpu)
 	memcpy(&new_policy, policy, sizeof(*policy));
 	new_policy.min = policy->user_policy.min;
 	new_policy.max = policy->user_policy.max;
-	new_policy.policy = policy->user_policy.policy;
 
 	/*
 	 * BIOS might change freq behind our back
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 752bf8a5c314..54dbbff0a55e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -54,7 +54,6 @@ struct cpufreq_cpuinfo {
 struct cpufreq_real_policy {
 	unsigned int		min;    /* in kHz */
 	unsigned int		max;    /* in kHz */
-	unsigned int		policy; /* see above */
 };
 
 struct cpufreq_policy {
-- 
cgit v1.2.3-70-g09d2


From c7a7b418dd1991079dd7ef03fec7d1863ef96154 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 3 Aug 2015 08:36:19 +0530
Subject: cpufreq: rename cpufreq_real_policy as cpufreq_user_policy

Its all about caching min/max freq requested by userspace, and
the name 'cpufreq_real_policy' doesn't fit that well. Rename it to
cpufreq_user_policy.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 54dbbff0a55e..6ff6a4d95eea 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -51,7 +51,7 @@ struct cpufreq_cpuinfo {
 	unsigned int		transition_latency;
 };
 
-struct cpufreq_real_policy {
+struct cpufreq_user_policy {
 	unsigned int		min;    /* in kHz */
 	unsigned int		max;    /* in kHz */
 };
@@ -86,7 +86,7 @@ struct cpufreq_policy {
 	struct work_struct	update; /* if update_policy() needs to be
 					 * called, but you're in IRQ context */
 
-	struct cpufreq_real_policy	user_policy;
+	struct cpufreq_user_policy user_policy;
 	struct cpufreq_frequency_table	*freq_table;
 
 	struct list_head        policy_list;
-- 
cgit v1.2.3-70-g09d2


From e5276937ae6e654a811345f0716266f12e77bede Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:23 -0700
Subject: flow_dissector: Move skb related functions to skbuff.h

Move the flow dissector functions that are specific to skbuffs into
skbuff.h out of flow_dissector.h. This makes flow_dissector.h have
no dependencies on skbuff.h.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 47 +++++++++++++++++++++++++++++++++++++++++
 include/net/flow_dissector.h | 50 --------------------------------------------
 2 files changed, 47 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 989307f991db..8a697c673b58 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -945,6 +945,53 @@ skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
 	skb->hash = hash;
 }
 
+void __skb_get_hash(struct sk_buff *skb);
+u32 skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+		   const struct flow_keys *keys, int hlen);
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+			    void *data, int hlen_proto);
+
+static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
+					int thoff, u8 ip_proto)
+{
+	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
+}
+
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+			     const struct flow_dissector_key *key,
+			     unsigned int key_count);
+
+bool __skb_flow_dissect(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container,
+			void *data, __be16 proto, int nhoff, int hlen);
+
+static inline bool skb_flow_dissect(const struct sk_buff *skb,
+				    struct flow_dissector *flow_dissector,
+				    void *target_container)
+{
+	return __skb_flow_dissect(skb, flow_dissector, target_container,
+				  NULL, 0, 0, 0);
+}
+
+static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
+					      struct flow_keys *flow)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
+				  NULL, 0, 0, 0);
+}
+
+static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
+						  void *data, __be16 proto,
+						  int nhoff, int hlen)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
+				  data, proto, nhoff, hlen);
+}
+
 static inline __u32 skb_get_hash(struct sk_buff *skb)
 {
 	if (!skb->l4_hash && !skb->sw_hash)
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 1a8c22419936..6777a84c6f94 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -2,7 +2,6 @@
 #define _NET_FLOW_DISSECTOR_H
 
 #include <linux/types.h>
-#include <linux/skbuff.h>
 #include <linux/in6.h>
 #include <uapi/linux/if_ether.h>
 
@@ -134,23 +133,6 @@ struct flow_dissector {
 	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
 };
 
-void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
-			     const struct flow_dissector_key *key,
-			     unsigned int key_count);
-
-bool __skb_flow_dissect(const struct sk_buff *skb,
-			struct flow_dissector *flow_dissector,
-			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen);
-
-static inline bool skb_flow_dissect(const struct sk_buff *skb,
-				    struct flow_dissector *flow_dissector,
-				    void *target_container)
-{
-	return __skb_flow_dissect(skb, flow_dissector, target_container,
-				  NULL, 0, 0, 0);
-}
-
 struct flow_keys {
 	struct flow_dissector_key_control control;
 #define FLOW_KEYS_HASH_START_FIELD basic
@@ -170,38 +152,6 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow);
 extern struct flow_dissector flow_keys_dissector;
 extern struct flow_dissector flow_keys_buf_dissector;
 
-static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
-					      struct flow_keys *flow)
-{
-	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
-				  NULL, 0, 0, 0);
-}
-
-static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
-						  void *data, __be16 proto,
-						  int nhoff, int hlen)
-{
-	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
-				  data, proto, nhoff, hlen);
-}
-
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen_proto);
-
-static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
-					int thoff, u8 ip_proto)
-{
-	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
-}
-
-u32 flow_hash_from_keys(struct flow_keys *keys);
-void __skb_get_hash(struct sk_buff *skb);
-u32 skb_get_poff(const struct sk_buff *skb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen);
-
 /* struct flow_keys_digest:
  *
  * This structure is used to hold a digest of the full flow keys. This is a
-- 
cgit v1.2.3-70-g09d2


From bcc83839ffdb063dd2b0370cd85c4f825761fc59 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:24 -0700
Subject: skbuff: Make __skb_set_sw_hash a general function

Move __skb_set_sw_hash to skbuff.h and add __skb_set_hash which is
a common method (between __skb_set_sw_hash and skb_set_hash) to set
the hash in an skbuff.

Also, move skb_clear_hash to be closer to __skb_set_hash.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 45 ++++++++++++++++++++++++++++----------------
 include/net/flow_dissector.h |  5 +++++
 net/core/flow_dissector.c    | 18 ++++++------------
 3 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8a697c673b58..5d2c812e725b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -937,14 +937,40 @@ enum pkt_hash_types {
 	PKT_HASH_TYPE_L4,	/* Input: src_IP, dst_IP, src_port, dst_port */
 };
 
-static inline void
-skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
+static inline void skb_clear_hash(struct sk_buff *skb)
 {
-	skb->l4_hash = (type == PKT_HASH_TYPE_L4);
+	skb->hash = 0;
 	skb->sw_hash = 0;
+	skb->l4_hash = 0;
+}
+
+static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
+{
+	if (!skb->l4_hash)
+		skb_clear_hash(skb);
+}
+
+static inline void
+__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4)
+{
+	skb->l4_hash = is_l4;
+	skb->sw_hash = is_sw;
 	skb->hash = hash;
 }
 
+static inline void
+skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
+{
+	/* Used by drivers to set hash from HW */
+	__skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4);
+}
+
+static inline void
+__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
+{
+	__skb_set_hash(skb, hash, true, is_l4);
+}
+
 void __skb_get_hash(struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
@@ -1027,19 +1053,6 @@ static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
 	return skb->hash;
 }
 
-static inline void skb_clear_hash(struct sk_buff *skb)
-{
-	skb->hash = 0;
-	skb->sw_hash = 0;
-	skb->l4_hash = 0;
-}
-
-static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
-{
-	if (!skb->l4_hash)
-		skb_clear_hash(skb);
-}
-
 static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
 {
 	to->hash = from->hash;
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 6777a84c6f94..af76c496f7db 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -167,4 +167,9 @@ struct flow_keys_digest {
 void make_flow_keys_digest(struct flow_keys_digest *digest,
 			   const struct flow_keys *flow);
 
+static inline bool flow_keys_have_l4(struct flow_keys *keys)
+{
+	return (keys->ports.ports || keys->tags.flow_label);
+}
+
 #endif
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 11e6540fa386..151b6e48b81f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -590,15 +590,6 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
-static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash,
-				     struct flow_keys *keys)
-{
-	if (keys->ports.ports)
-		skb->l4_hash = 1;
-	skb->sw_hash = 1;
-	skb->hash = hash;
-}
-
 /**
  * __skb_get_hash: calculate a flow hash
  * @skb: sk_buff to calculate flow hash from
@@ -619,7 +610,8 @@ void __skb_get_hash(struct sk_buff *skb)
 	if (!hash)
 		return;
 
-	__skb_set_sw_hash(skb, hash, &keys);
+	__skb_set_sw_hash(skb, hash,
+			  flow_keys_have_l4(&keys));
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
@@ -648,7 +640,8 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 	keys.tags.flow_label = (__force u32)fl6->flowlabel;
 	keys.basic.ip_proto = fl6->flowi6_proto;
 
-	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
+			  flow_keys_have_l4(&keys));
 
 	return skb->hash;
 }
@@ -668,7 +661,8 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
 	keys.keyid.keyid = fl4->fl4_gre_key;
 	keys.basic.ip_proto = fl4->flowi4_proto;
 
-	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
+			  flow_keys_have_l4(&keys));
 
 	return skb->hash;
 }
-- 
cgit v1.2.3-70-g09d2


From c6cc1ca7f4d70cbb3ea3a5ca163c5dabaf155cdb Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:25 -0700
Subject: flowi: Abstract out functions to get flow hash based on flowi

Create __get_hash_from_flowi6 and __get_hash_from_flowi4 to get the
flow keys and hash based on flowi structures. These are called by
__skb_get_hash_flowi6 and __skb_get_hash_flowi4. Also, created
get_hash_from_flowi6 and get_hash_from_flowi4 which can be called
when just the hash value for a flowi is needed.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 16 ++++++++++++----
 include/net/flow.h           | 19 +++++++++++++++++++
 include/net/flow_dissector.h |  2 ++
 net/core/flow.c              | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5d2c812e725b..bbe41bccfc5f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1030,8 +1030,12 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6);
 
 static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 {
-	if (!skb->l4_hash && !skb->sw_hash)
-		__skb_get_hash_flowi6(skb, fl6);
+	if (!skb->l4_hash && !skb->sw_hash) {
+		struct flow_keys keys;
+
+		__skb_set_sw_hash(skb, __get_hash_from_flowi6(fl6, &keys),
+				  flow_keys_have_l4(&keys));
+	}
 
 	return skb->hash;
 }
@@ -1040,8 +1044,12 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl);
 
 static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
 {
-	if (!skb->l4_hash && !skb->sw_hash)
-		__skb_get_hash_flowi4(skb, fl4);
+	if (!skb->l4_hash && !skb->sw_hash) {
+		struct flow_keys keys;
+
+		__skb_set_sw_hash(skb, __get_hash_from_flowi4(fl4, &keys),
+				  flow_keys_have_l4(&keys));
+	}
 
 	return skb->hash;
 }
diff --git a/include/net/flow.h b/include/net/flow.h
index 9e0297c4c11d..dafe97c3c048 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -10,6 +10,7 @@
 #include <linux/socket.h>
 #include <linux/in6.h>
 #include <linux/atomic.h>
+#include <net/flow_dissector.h>
 
 /*
  * ifindex generation is per-net namespace, and loopback is
@@ -243,4 +244,22 @@ void flow_cache_flush(struct net *net);
 void flow_cache_flush_deferred(struct net *net);
 extern atomic_t flow_cache_genid;
 
+__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys);
+
+static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6)
+{
+	struct flow_keys keys;
+
+	return __get_hash_from_flowi6(fl6, &keys);
+}
+
+__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys);
+
+static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4)
+{
+	struct flow_keys keys;
+
+	return __get_hash_from_flowi4(fl4, &keys);
+}
+
 #endif
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index af76c496f7db..74fe160f0b05 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -172,4 +172,6 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys)
 	return (keys->ports.ports || keys->tags.flow_label);
 }
 
+u32 flow_hash_from_keys(struct flow_keys *keys);
+
 #endif
diff --git a/net/core/flow.c b/net/core/flow.c
index 1033725be40b..61930bb0eb59 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -22,6 +22,7 @@
 #include <linux/cpumask.h>
 #include <linux/mutex.h>
 #include <net/flow.h>
+#include <net/flow_dissector.h>
 #include <linux/atomic.h>
 #include <linux/security.h>
 #include <net/net_namespace.h>
@@ -509,3 +510,38 @@ void flow_cache_fini(struct net *net)
 	fc->percpu = NULL;
 }
 EXPORT_SYMBOL(flow_cache_fini);
+
+__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
+{
+	memset(keys, 0, sizeof(*keys));
+
+	memcpy(&keys->addrs.v6addrs.src, &fl6->saddr,
+	    sizeof(keys->addrs.v6addrs.src));
+	memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr,
+	    sizeof(keys->addrs.v6addrs.dst));
+	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	keys->ports.src = fl6->fl6_sport;
+	keys->ports.dst = fl6->fl6_dport;
+	keys->keyid.keyid = fl6->fl6_gre_key;
+	keys->tags.flow_label = (__force u32)fl6->flowlabel;
+	keys->basic.ip_proto = fl6->flowi6_proto;
+
+	return flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(__get_hash_from_flowi6);
+
+__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys)
+{
+	memset(keys, 0, sizeof(*keys));
+
+	keys->addrs.v4addrs.src = fl4->saddr;
+	keys->addrs.v4addrs.dst = fl4->daddr;
+	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	keys->ports.src = fl4->fl4_sport;
+	keys->ports.dst = fl4->fl4_dport;
+	keys->keyid.keyid = fl4->fl4_gre_key;
+	keys->basic.ip_proto = fl4->flowi4_proto;
+
+	return flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(__get_hash_from_flowi4);
-- 
cgit v1.2.3-70-g09d2


From cd79a2382aa5dcefa6e21a7c59bb1bb19e53b74d Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:27 -0700
Subject: flow_dissector: Add flags argument to skb_flow_dissector functions

The flags argument will allow control of the dissection process (for
instance whether to parse beyond L3).

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c             |  2 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.c |  2 +-
 drivers/net/hyperv/netvsc_drv.c             |  2 +-
 include/linux/skbuff.h                      | 19 +++++++++++--------
 net/core/flow_dissector.c                   |  7 ++++---
 net/ethernet/eth.c                          |  2 +-
 net/sched/cls_flow.c                        |  2 +-
 net/sched/cls_flower.c                      |  2 +-
 net/sched/sch_choke.c                       |  4 ++--
 9 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 06e2d01f0b4e..771a449d2f56 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3095,7 +3095,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	int noff, proto = -1;
 
 	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23)
-		return skb_flow_dissect_flow_keys(skb, fk);
+		return skb_flow_dissect_flow_keys(skb, fk, 0);
 
 	fk->ports.ports = 0;
 	noff = skb_network_offset(skb);
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index d106186f4f4a..3c677ed3c29e 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -177,7 +177,7 @@ int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
 	int res, i;
 
 	enic = netdev_priv(dev);
-	res = skb_flow_dissect_flow_keys(skb, &keys);
+	res = skb_flow_dissect_flow_keys(skb, &keys, 0);
 	if (!res || keys.basic.n_proto != htons(ETH_P_IP) ||
 	    (keys.basic.ip_proto != IPPROTO_TCP &&
 	     keys.basic.ip_proto != IPPROTO_UDP))
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 2990024b90f9..409b48e1e589 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -239,7 +239,7 @@ static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb)
 	struct flow_keys flow;
 	int data_len;
 
-	if (!skb_flow_dissect_flow_keys(skb, &flow) ||
+	if (!skb_flow_dissect_flow_keys(skb, &flow, 0) ||
 	    !(flow.basic.n_proto == htons(ETH_P_IP) ||
 	      flow.basic.n_proto == htons(ETH_P_IPV6)))
 		return false;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bbe41bccfc5f..9e62687c70f3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -991,31 +991,34 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 bool __skb_flow_dissect(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen);
+			void *data, __be16 proto, int nhoff, int hlen,
+			unsigned int flags);
 
 static inline bool skb_flow_dissect(const struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
-				    void *target_container)
+				    void *target_container, unsigned int flags)
 {
 	return __skb_flow_dissect(skb, flow_dissector, target_container,
-				  NULL, 0, 0, 0);
+				  NULL, 0, 0, 0, flags);
 }
 
 static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
-					      struct flow_keys *flow)
+					      struct flow_keys *flow,
+					      unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
 	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
-				  NULL, 0, 0, 0);
+				  NULL, 0, 0, 0, flags);
 }
 
 static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
 						  void *data, __be16 proto,
-						  int nhoff, int hlen)
+						  int nhoff, int hlen,
+						  unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
 	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
-				  data, proto, nhoff, hlen);
+				  data, proto, nhoff, hlen, flags);
 }
 
 static inline __u32 skb_get_hash(struct sk_buff *skb)
@@ -2046,7 +2049,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb,
 
 	if (skb_transport_header_was_set(skb))
 		return;
-	else if (skb_flow_dissect_flow_keys(skb, &keys))
+	else if (skb_flow_dissect_flow_keys(skb, &keys, 0))
 		skb_set_transport_header(skb, keys.control.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 22f3d768b459..c3d9807cb34e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -121,7 +121,8 @@ EXPORT_SYMBOL(__skb_flow_get_ports);
 bool __skb_flow_dissect(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen)
+			void *data, __be16 proto, int nhoff, int hlen,
+			unsigned int flags)
 {
 	struct flow_dissector_key_control *key_control;
 	struct flow_dissector_key_basic *key_basic;
@@ -556,7 +557,7 @@ EXPORT_SYMBOL(flow_hash_from_keys);
 static inline u32 ___skb_get_hash(const struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
-	if (!skb_flow_dissect_flow_keys(skb, keys))
+	if (!skb_flow_dissect_flow_keys(skb, keys, 0))
 		return 0;
 
 	return __flow_hash_from_keys(keys, keyval);
@@ -726,7 +727,7 @@ u32 skb_get_poff(const struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
-	if (!skb_flow_dissect_flow_keys(skb, &keys))
+	if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
 		return 0;
 
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 217127c3a3ef..d850fdc828f9 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -132,7 +132,7 @@ u32 eth_get_headlen(void *data, unsigned int len)
 
 	/* parse any remaining L2/L3 headers, check for L4 */
 	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
-					    sizeof(*eth), len))
+					    sizeof(*eth), len, 0))
 		return max_t(u32, keys.control.thoff, sizeof(*eth));
 
 	/* parse for any L4 headers */
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index bb2a0f529c1f..536838b657bf 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -301,7 +301,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 		keymask = f->keymask;
 		if (keymask & FLOW_KEYS_NEEDED)
-			skb_flow_dissect_flow_keys(skb, &flow_keys);
+			skb_flow_dissect_flow_keys(skb, &flow_keys, 0);
 
 		for (n = 0; n < f->nkeys; n++) {
 			key = ffs(keymask) - 1;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2f3d03f99487..57692947ebbe 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -129,7 +129,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	 * so do it rather here.
 	 */
 	skb_key.basic.n_proto = skb->protocol;
-	skb_flow_dissect(skb, &head->dissector, &skb_key);
+	skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
 
 	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
 
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 665bde07916b..02bfd3d1c4f0 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1,
 
 	if (!choke_skb_cb(skb1)->keys_valid) {
 		choke_skb_cb(skb1)->keys_valid = 1;
-		skb_flow_dissect_flow_keys(skb1, &temp);
+		skb_flow_dissect_flow_keys(skb1, &temp, 0);
 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
 	}
 
 	if (!choke_skb_cb(skb2)->keys_valid) {
 		choke_skb_cb(skb2)->keys_valid = 1;
-		skb_flow_dissect_flow_keys(skb2, &temp);
+		skb_flow_dissect_flow_keys(skb2, &temp, 0);
 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From de4c1f8ba302ccf4f2b3b17dc614b0a0b14d351a Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 18:11:04 -0700
Subject: flow_dissector: Fix function argument ordering dependency

Commit c6cc1ca7f4d70c ("flowi: Abstract out functions to get flow hash
based on flowi") introduced a bug in __skb_set_sw_hash where we
require a dependency on evaluating arguments in a function in order.
There is no such ordering enforced in C, so this incorrect. This
patch fixes that by splitting out the arguments. This bug was
found via a compiler warning that keys may be uninitialized.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9e62687c70f3..eabfb810bc62 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1035,9 +1035,9 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
+		__u32 hash = __get_hash_from_flowi6(fl6, &keys);
 
-		__skb_set_sw_hash(skb, __get_hash_from_flowi6(fl6, &keys),
-				  flow_keys_have_l4(&keys));
+		__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
 	}
 
 	return skb->hash;
@@ -1049,9 +1049,9 @@ static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
+		__u32 hash = __get_hash_from_flowi4(fl4, &keys);
 
-		__skb_set_sw_hash(skb, __get_hash_from_flowi4(fl4, &keys),
-				  flow_keys_have_l4(&keys));
+		__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
 	}
 
 	return skb->hash;
-- 
cgit v1.2.3-70-g09d2


From 20a17bf6c04e3eca8824c930ecc55ab832558e3b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 1 Sep 2015 21:19:17 -0700
Subject: flow_dissector: Use 'const' where possible.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  8 ++---
 include/net/flow.h        |  8 ++---
 net/core/flow_dissector.c | 79 ++++++++++++++++++++++++-----------------------
 3 files changed, 49 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eabfb810bc62..2738d355cdf9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1029,9 +1029,9 @@ static inline __u32 skb_get_hash(struct sk_buff *skb)
 	return skb->hash;
 }
 
-__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6);
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6);
 
-static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
@@ -1043,9 +1043,9 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 	return skb->hash;
 }
 
-__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl);
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl);
 
-static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
diff --git a/include/net/flow.h b/include/net/flow.h
index dafe97c3c048..acd6a096250e 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -244,18 +244,18 @@ void flow_cache_flush(struct net *net);
 void flow_cache_flush_deferred(struct net *net);
 extern atomic_t flow_cache_genid;
 
-__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys);
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys);
 
-static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6)
+static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6)
 {
 	struct flow_keys keys;
 
 	return __get_hash_from_flowi6(fl6, &keys);
 }
 
-__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys);
+__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys);
 
-static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4)
+static inline __u32 get_hash_from_flowi4(const struct flowi4 *fl4)
 {
 	struct flow_keys keys;
 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 345a0408cfe4..d79699c9d1b9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,14 +19,14 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
-static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector,
-					enum flow_dissector_key_id key_id)
+static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
+			       enum flow_dissector_key_id key_id)
 {
 	return flow_dissector->used_keys & (1 << key_id);
 }
 
-static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector,
-				       enum flow_dissector_key_id key_id)
+static void dissector_set_key(struct flow_dissector *flow_dissector,
+			      enum flow_dissector_key_id key_id)
 {
 	flow_dissector->used_keys |= (1 << key_id);
 }
@@ -51,20 +51,20 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 		 * boundaries of unsigned short.
 		 */
 		BUG_ON(key->offset > USHRT_MAX);
-		BUG_ON(skb_flow_dissector_uses_key(flow_dissector,
-						   key->key_id));
+		BUG_ON(dissector_uses_key(flow_dissector,
+					  key->key_id));
 
-		skb_flow_dissector_set_key(flow_dissector, key->key_id);
+		dissector_set_key(flow_dissector, key->key_id);
 		flow_dissector->offset[key->key_id] = key->offset;
 	}
 
 	/* Ensure that the dissector always includes control and basic key.
 	 * That way we are able to avoid handling lack of these in fast path.
 	 */
-	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
-					    FLOW_DISSECTOR_KEY_CONTROL));
-	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
-					    FLOW_DISSECTOR_KEY_BASIC));
+	BUG_ON(!dissector_uses_key(flow_dissector,
+				   FLOW_DISSECTOR_KEY_CONTROL));
+	BUG_ON(!dissector_uses_key(flow_dissector,
+				   FLOW_DISSECTOR_KEY_BASIC));
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
@@ -154,8 +154,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
-	if (skb_flow_dissector_uses_key(flow_dissector,
-					FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+	if (dissector_uses_key(flow_dissector,
+			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
 		struct flow_dissector_key_eth_addrs *key_eth_addrs;
 
@@ -178,8 +178,8 @@ ip:
 
 		ip_proto = iph->protocol;
 
-		if (!skb_flow_dissector_uses_key(flow_dissector,
-						 FLOW_DISSECTOR_KEY_IPV4_ADDRS))
+		if (!dissector_uses_key(flow_dissector,
+					FLOW_DISSECTOR_KEY_IPV4_ADDRS))
 			break;
 
 		key_addrs = skb_flow_dissector_target(flow_dissector,
@@ -218,8 +218,8 @@ ipv6:
 		ip_proto = iph->nexthdr;
 		nhoff += sizeof(struct ipv6hdr);
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
 			struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
 
 			key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
@@ -232,8 +232,8 @@ ipv6:
 
 		flow_label = ip6_flowlabel(iph);
 		if (flow_label) {
-			if (skb_flow_dissector_uses_key(flow_dissector,
-				FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
 				key_tags = skb_flow_dissector_target(flow_dissector,
 								     FLOW_DISSECTOR_KEY_FLOW_LABEL,
 								     target_container);
@@ -257,8 +257,8 @@ ipv6:
 		if (!vlan)
 			goto out_bad;
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_VLANID)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_VLANID)) {
 			key_tags = skb_flow_dissector_target(flow_dissector,
 							     FLOW_DISSECTOR_KEY_VLANID,
 							     target_container);
@@ -298,8 +298,8 @@ ipv6:
 		if (!hdr)
 			goto out_bad;
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
 			key_addrs = skb_flow_dissector_target(flow_dissector,
 							      FLOW_DISSECTOR_KEY_TIPC_ADDRS,
 							      target_container);
@@ -320,8 +320,8 @@ mpls:
 
 		if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >>
 		     MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) {
-			if (skb_flow_dissector_uses_key(flow_dissector,
-							FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
 				key_keyid = skb_flow_dissector_target(flow_dissector,
 								      FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
 								      target_container);
@@ -374,8 +374,8 @@ ip_proto_again:
 			if (!keyid)
 				goto out_bad;
 
-			if (skb_flow_dissector_uses_key(flow_dissector,
-							FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_GRE_KEYID)) {
 				key_keyid = skb_flow_dissector_target(flow_dissector,
 								      FLOW_DISSECTOR_KEY_GRE_KEYID,
 								      target_container);
@@ -470,8 +470,8 @@ ip_proto_again:
 		break;
 	}
 
-	if (skb_flow_dissector_uses_key(flow_dissector,
-					FLOW_DISSECTOR_KEY_PORTS)) {
+	if (dissector_uses_key(flow_dissector,
+			       FLOW_DISSECTOR_KEY_PORTS)) {
 		key_ports = skb_flow_dissector_target(flow_dissector,
 						      FLOW_DISSECTOR_KEY_PORTS,
 						      target_container);
@@ -497,18 +497,21 @@ static __always_inline void __flow_hash_secret_init(void)
 	net_get_random_once(&hashrnd, sizeof(hashrnd));
 }
 
-static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval)
+static __always_inline u32 __flow_hash_words(const u32 *words, u32 length,
+					     u32 keyval)
 {
 	return jhash2(words, length, keyval);
 }
 
-static inline void *flow_keys_hash_start(struct flow_keys *flow)
+static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow)
 {
+	const void *p = flow;
+
 	BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
-	return (void *)flow + FLOW_KEYS_HASH_OFFSET;
+	return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET);
 }
 
-static inline size_t flow_keys_hash_length(struct flow_keys *flow)
+static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
 {
 	size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
 	BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
@@ -598,7 +601,7 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 
 	__flow_hash_consistentify(keys);
 
-	hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys),
+	hash = __flow_hash_words(flow_keys_hash_start(keys),
 				 flow_keys_hash_length(keys), keyval);
 	if (!hash)
 		hash = 1;
@@ -677,7 +680,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
 }
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
-__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
 {
 	struct flow_keys keys;
 
@@ -701,7 +704,7 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 }
 EXPORT_SYMBOL(__skb_get_hash_flowi6);
 
-__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
 {
 	struct flow_keys keys;
 
@@ -787,7 +790,7 @@ u32 skb_get_poff(const struct sk_buff *skb)
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
 }
 
-__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 {
 	memset(keys, 0, sizeof(*keys));
 
@@ -806,7 +809,7 @@ __u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
 }
 EXPORT_SYMBOL(__get_hash_from_flowi6);
 
-__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys)
+__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys)
 {
 	memset(keys, 0, sizeof(*keys));
 
-- 
cgit v1.2.3-70-g09d2


From 62da98656b62a5ca57f22263705175af8ded5aa1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Sep 2015 01:26:07 +0200
Subject: netfilter: nf_conntrack: make nf_ct_zone_dflt built-in

Fengguang reported, that some randconfig generated the following linker
issue with nf_ct_zone_dflt object involved:

  [...]
  CC      init/version.o
  LD      init/built-in.o
  net/built-in.o: In function `ipv4_conntrack_defrag':
  nf_defrag_ipv4.c:(.text+0x93e95): undefined reference to `nf_ct_zone_dflt'
  net/built-in.o: In function `ipv6_defrag':
  nf_defrag_ipv6_hooks.c:(.text+0xe3ffe): undefined reference to `nf_ct_zone_dflt'
  make: *** [vmlinux] Error 1

Given that configurations exist where we have a built-in part, which is
accessing nf_ct_zone_dflt such as the two handlers nf_ct_defrag_user()
and nf_ct6_defrag_user(), and a part that configures nf_conntrack as a
module, we must move nf_ct_zone_dflt into a fixed, guaranteed built-in
area when netfilter is configured in general.

Therefore, split the more generic parts into a common header under
include/linux/netfilter/ and move nf_ct_zone_dflt into the built-in
section that already holds parts related to CONFIG_NF_CONNTRACK in the
netfilter core. This fixes the issue on my side.

Fixes: 308ac9143ee2 ("netfilter: nf_conntrack: push zone object into functions")
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h                          |  2 ++
 .../linux/netfilter/nf_conntrack_zones_common.h    | 23 ++++++++++++++++++++++
 include/net/netfilter/nf_conntrack_zones.h         | 19 +-----------------
 net/netfilter/core.c                               |  6 ++++++
 net/netfilter/nf_conntrack_core.c                  |  7 -------
 5 files changed, 32 insertions(+), 25 deletions(-)
 create mode 100644 include/linux/netfilter/nf_conntrack_zones_common.h

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index d788ce62d826..36a652531791 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -368,6 +368,8 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 #endif /*CONFIG_NETFILTER*/
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <linux/netfilter/nf_conntrack_zones_common.h>
+
 extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
 void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
 extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu;
diff --git a/include/linux/netfilter/nf_conntrack_zones_common.h b/include/linux/netfilter/nf_conntrack_zones_common.h
new file mode 100644
index 000000000000..5d7cf36d4766
--- /dev/null
+++ b/include/linux/netfilter/nf_conntrack_zones_common.h
@@ -0,0 +1,23 @@
+#ifndef _NF_CONNTRACK_ZONES_COMMON_H
+#define _NF_CONNTRACK_ZONES_COMMON_H
+
+#include <uapi/linux/netfilter/nf_conntrack_tuple_common.h>
+
+#define NF_CT_DEFAULT_ZONE_ID	0
+
+#define NF_CT_ZONE_DIR_ORIG	(1 << IP_CT_DIR_ORIGINAL)
+#define NF_CT_ZONE_DIR_REPL	(1 << IP_CT_DIR_REPLY)
+
+#define NF_CT_DEFAULT_ZONE_DIR	(NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
+
+#define NF_CT_FLAG_MARK		1
+
+struct nf_conntrack_zone {
+	u16	id;
+	u8	flags;
+	u8	dir;
+};
+
+extern const struct nf_conntrack_zone nf_ct_zone_dflt;
+
+#endif /* _NF_CONNTRACK_ZONES_COMMON_H */
diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h
index 5316c7b3a374..4e32512cef32 100644
--- a/include/net/netfilter/nf_conntrack_zones.h
+++ b/include/net/netfilter/nf_conntrack_zones.h
@@ -1,24 +1,7 @@
 #ifndef _NF_CONNTRACK_ZONES_H
 #define _NF_CONNTRACK_ZONES_H
 
-#include <linux/netfilter/nf_conntrack_tuple_common.h>
-
-#define NF_CT_DEFAULT_ZONE_ID	0
-
-#define NF_CT_ZONE_DIR_ORIG	(1 << IP_CT_DIR_ORIGINAL)
-#define NF_CT_ZONE_DIR_REPL	(1 << IP_CT_DIR_REPLY)
-
-#define NF_CT_DEFAULT_ZONE_DIR	(NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
-
-#define NF_CT_FLAG_MARK		1
-
-struct nf_conntrack_zone {
-	u16	id;
-	u8	flags;
-	u8	dir;
-};
-
-extern const struct nf_conntrack_zone nf_ct_zone_dflt;
+#include <linux/netfilter/nf_conntrack_zones_common.h>
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <net/netfilter/nf_conntrack_extend.h>
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0b939b7ad724..8e47f8113495 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -388,6 +388,12 @@ EXPORT_SYMBOL(nf_conntrack_destroy);
 struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
 EXPORT_SYMBOL_GPL(nfq_ct_hook);
 
+/* Built-in default zone used e.g. by modules. */
+const struct nf_conntrack_zone nf_ct_zone_dflt = {
+	.id	= NF_CT_DEFAULT_ZONE_ID,
+	.dir	= NF_CT_DEFAULT_ZONE_DIR,
+};
+EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
 #endif /* CONFIG_NF_CONNTRACK */
 
 #ifdef CONFIG_NF_NAT_NEEDED
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ac3be9b0629b..eedf0495f11f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1286,13 +1286,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
 
-/* Built-in default zone used e.g. by modules. */
-const struct nf_conntrack_zone nf_ct_zone_dflt = {
-	.id	= NF_CT_DEFAULT_ZONE_ID,
-	.dir	= NF_CT_DEFAULT_ZONE_DIR,
-};
-EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
-
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
 	.len	= sizeof(struct nf_conntrack_zone),
-- 
cgit v1.2.3-70-g09d2


From 81d02b7f8c507f06299476a0e5b2aa677c5eaecb Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard@mvista.com>
Date: Sat, 13 Jun 2015 10:34:25 -0500
Subject: ipmi: Make some data const that was only read

Several data structures were only used for reading, so make them
const.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
---
 drivers/char/ipmi/ipmi_bt_sm.c      |  2 +-
 drivers/char/ipmi/ipmi_kcs_sm.c     |  2 +-
 drivers/char/ipmi/ipmi_msghandler.c | 12 +++++-----
 drivers/char/ipmi/ipmi_si_intf.c    | 47 ++++++++++++++++++++-----------------
 drivers/char/ipmi/ipmi_si_sm.h      | 10 ++++----
 drivers/char/ipmi/ipmi_smic_sm.c    |  2 +-
 include/linux/ipmi_smi.h            |  2 +-
 7 files changed, 40 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ipmi/ipmi_bt_sm.c b/drivers/char/ipmi/ipmi_bt_sm.c
index 61e71616689b..feafdab734ae 100644
--- a/drivers/char/ipmi/ipmi_bt_sm.c
+++ b/drivers/char/ipmi/ipmi_bt_sm.c
@@ -694,7 +694,7 @@ static int bt_size(void)
 	return sizeof(struct si_sm_data);
 }
 
-struct si_sm_handlers bt_smi_handlers = {
+const struct si_sm_handlers bt_smi_handlers = {
 	.init_data		= bt_init_data,
 	.start_transaction	= bt_start_transaction,
 	.get_result		= bt_get_result,
diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c
index 8c25f596808a..1da61af7f576 100644
--- a/drivers/char/ipmi/ipmi_kcs_sm.c
+++ b/drivers/char/ipmi/ipmi_kcs_sm.c
@@ -540,7 +540,7 @@ static void kcs_cleanup(struct si_sm_data *kcs)
 {
 }
 
-struct si_sm_handlers kcs_smi_handlers = {
+const struct si_sm_handlers kcs_smi_handlers = {
 	.init_data         = init_kcs_data,
 	.start_transaction = start_kcs_transaction,
 	.get_result        = get_kcs_result,
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index ef4a418f630a..e9ea29c4ec60 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -342,7 +342,7 @@ struct ipmi_smi {
 	 * an umpreemptible region to use this.  You must fetch the
 	 * value into a local variable and make sure it is not NULL.
 	 */
-	struct ipmi_smi_handlers *handlers;
+	const struct ipmi_smi_handlers *handlers;
 	void                     *send_info;
 
 #ifdef CONFIG_PROC_FS
@@ -1015,7 +1015,7 @@ int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data)
 {
 	int           rv = 0;
 	ipmi_smi_t    intf;
-	struct ipmi_smi_handlers *handlers;
+	const struct ipmi_smi_handlers *handlers;
 
 	mutex_lock(&ipmi_interfaces_mutex);
 	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
@@ -1501,7 +1501,7 @@ static struct ipmi_smi_msg *smi_add_send_msg(ipmi_smi_t intf,
 }
 
 
-static void smi_send(ipmi_smi_t intf, struct ipmi_smi_handlers *handlers,
+static void smi_send(ipmi_smi_t intf, const struct ipmi_smi_handlers *handlers,
 		     struct ipmi_smi_msg *smi_msg, int priority)
 {
 	int run_to_completion = intf->run_to_completion;
@@ -2747,7 +2747,7 @@ void ipmi_poll_interface(ipmi_user_t user)
 }
 EXPORT_SYMBOL(ipmi_poll_interface);
 
-int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
+int ipmi_register_smi(const struct ipmi_smi_handlers *handlers,
 		      void		       *send_info,
 		      struct ipmi_device_id    *device_id,
 		      struct device            *si_dev,
@@ -4019,7 +4019,7 @@ static void check_msg_timeout(ipmi_smi_t intf, struct seq_table *ent,
 			      unsigned int *waiting_msgs)
 {
 	struct ipmi_recv_msg     *msg;
-	struct ipmi_smi_handlers *handlers;
+	const struct ipmi_smi_handlers *handlers;
 
 	if (intf->in_shutdown)
 		return;
@@ -4086,7 +4086,7 @@ static void check_msg_timeout(ipmi_smi_t intf, struct seq_table *ent,
 				ipmi_inc_stat(intf,
 					      retransmitted_ipmb_commands);
 
-			smi_send(intf, intf->handlers, smi_msg, 0);
+			smi_send(intf, handlers, smi_msg, 0);
 		} else
 			ipmi_free_smi_msg(smi_msg);
 
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 4387bd6de2ca..4a4a13dc98b3 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -163,7 +163,7 @@ struct smi_info {
 	int                    intf_num;
 	ipmi_smi_t             intf;
 	struct si_sm_data      *si_sm;
-	struct si_sm_handlers  *handlers;
+	const struct si_sm_handlers *handlers;
 	enum si_type           si_type;
 	spinlock_t             si_lock;
 	struct ipmi_smi_msg    *waiting_msg;
@@ -1254,7 +1254,7 @@ static void set_maintenance_mode(void *send_info, bool enable)
 		atomic_set(&smi_info->req_events, 0);
 }
 
-static struct ipmi_smi_handlers handlers = {
+static const struct ipmi_smi_handlers handlers = {
 	.owner                  = THIS_MODULE,
 	.start_processing       = smi_start_processing,
 	.get_smi_info		= get_smi_info,
@@ -1442,14 +1442,14 @@ static int std_irq_setup(struct smi_info *info)
 	return rv;
 }
 
-static unsigned char port_inb(struct si_sm_io *io, unsigned int offset)
+static unsigned char port_inb(const struct si_sm_io *io, unsigned int offset)
 {
 	unsigned int addr = io->addr_data;
 
 	return inb(addr + (offset * io->regspacing));
 }
 
-static void port_outb(struct si_sm_io *io, unsigned int offset,
+static void port_outb(const struct si_sm_io *io, unsigned int offset,
 		      unsigned char b)
 {
 	unsigned int addr = io->addr_data;
@@ -1457,14 +1457,14 @@ static void port_outb(struct si_sm_io *io, unsigned int offset,
 	outb(b, addr + (offset * io->regspacing));
 }
 
-static unsigned char port_inw(struct si_sm_io *io, unsigned int offset)
+static unsigned char port_inw(const struct si_sm_io *io, unsigned int offset)
 {
 	unsigned int addr = io->addr_data;
 
 	return (inw(addr + (offset * io->regspacing)) >> io->regshift) & 0xff;
 }
 
-static void port_outw(struct si_sm_io *io, unsigned int offset,
+static void port_outw(const struct si_sm_io *io, unsigned int offset,
 		      unsigned char b)
 {
 	unsigned int addr = io->addr_data;
@@ -1472,14 +1472,14 @@ static void port_outw(struct si_sm_io *io, unsigned int offset,
 	outw(b << io->regshift, addr + (offset * io->regspacing));
 }
 
-static unsigned char port_inl(struct si_sm_io *io, unsigned int offset)
+static unsigned char port_inl(const struct si_sm_io *io, unsigned int offset)
 {
 	unsigned int addr = io->addr_data;
 
 	return (inl(addr + (offset * io->regspacing)) >> io->regshift) & 0xff;
 }
 
-static void port_outl(struct si_sm_io *io, unsigned int offset,
+static void port_outl(const struct si_sm_io *io, unsigned int offset,
 		      unsigned char b)
 {
 	unsigned int addr = io->addr_data;
@@ -1552,49 +1552,52 @@ static int port_setup(struct smi_info *info)
 	return 0;
 }
 
-static unsigned char intf_mem_inb(struct si_sm_io *io, unsigned int offset)
+static unsigned char intf_mem_inb(const struct si_sm_io *io,
+				  unsigned int offset)
 {
 	return readb((io->addr)+(offset * io->regspacing));
 }
 
-static void intf_mem_outb(struct si_sm_io *io, unsigned int offset,
-		     unsigned char b)
+static void intf_mem_outb(const struct si_sm_io *io, unsigned int offset,
+			  unsigned char b)
 {
 	writeb(b, (io->addr)+(offset * io->regspacing));
 }
 
-static unsigned char intf_mem_inw(struct si_sm_io *io, unsigned int offset)
+static unsigned char intf_mem_inw(const struct si_sm_io *io,
+				  unsigned int offset)
 {
 	return (readw((io->addr)+(offset * io->regspacing)) >> io->regshift)
 		& 0xff;
 }
 
-static void intf_mem_outw(struct si_sm_io *io, unsigned int offset,
-		     unsigned char b)
+static void intf_mem_outw(const struct si_sm_io *io, unsigned int offset,
+			  unsigned char b)
 {
 	writeb(b << io->regshift, (io->addr)+(offset * io->regspacing));
 }
 
-static unsigned char intf_mem_inl(struct si_sm_io *io, unsigned int offset)
+static unsigned char intf_mem_inl(const struct si_sm_io *io,
+				  unsigned int offset)
 {
 	return (readl((io->addr)+(offset * io->regspacing)) >> io->regshift)
 		& 0xff;
 }
 
-static void intf_mem_outl(struct si_sm_io *io, unsigned int offset,
-		     unsigned char b)
+static void intf_mem_outl(const struct si_sm_io *io, unsigned int offset,
+			  unsigned char b)
 {
 	writel(b << io->regshift, (io->addr)+(offset * io->regspacing));
 }
 
 #ifdef readq
-static unsigned char mem_inq(struct si_sm_io *io, unsigned int offset)
+static unsigned char mem_inq(const struct si_sm_io *io, unsigned int offset)
 {
 	return (readq((io->addr)+(offset * io->regspacing)) >> io->regshift)
 		& 0xff;
 }
 
-static void mem_outq(struct si_sm_io *io, unsigned int offset,
+static void mem_outq(const struct si_sm_io *io, unsigned int offset,
 		     unsigned char b)
 {
 	writeq(b << io->regshift, (io->addr)+(offset * io->regspacing));
@@ -2522,7 +2525,7 @@ static void ipmi_pci_remove(struct pci_dev *pdev)
 	pci_disable_device(pdev);
 }
 
-static struct pci_device_id ipmi_pci_devices[] = {
+static const struct pci_device_id ipmi_pci_devices[] = {
 	{ PCI_DEVICE(PCI_HP_VENDOR_ID, PCI_MMC_DEVICE_ID) },
 	{ PCI_DEVICE_CLASS(PCI_ERMC_CLASSCODE, PCI_ERMC_CLASSCODE_MASK) },
 	{ 0, }
@@ -2751,7 +2754,7 @@ err_free:
 	return rv;
 }
 
-static struct acpi_device_id acpi_ipmi_match[] = {
+static const struct acpi_device_id acpi_ipmi_match[] = {
 	{ "IPI0001", 0 },
 	{ },
 };
@@ -3324,7 +3327,7 @@ static inline void wait_for_timer_and_thread(struct smi_info *smi_info)
 		del_timer_sync(&smi_info->si_timer);
 }
 
-static struct ipmi_default_vals
+static const struct ipmi_default_vals
 {
 	int type;
 	int port;
diff --git a/drivers/char/ipmi/ipmi_si_sm.h b/drivers/char/ipmi/ipmi_si_sm.h
index df89f73475fb..a705027c0493 100644
--- a/drivers/char/ipmi/ipmi_si_sm.h
+++ b/drivers/char/ipmi/ipmi_si_sm.h
@@ -46,8 +46,8 @@ struct si_sm_data;
  * this interface.
  */
 struct si_sm_io {
-	unsigned char (*inputb)(struct si_sm_io *io, unsigned int offset);
-	void (*outputb)(struct si_sm_io *io,
+	unsigned char (*inputb)(const struct si_sm_io *io, unsigned int offset);
+	void (*outputb)(const struct si_sm_io *io,
 			unsigned int  offset,
 			unsigned char b);
 
@@ -135,7 +135,7 @@ struct si_sm_handlers {
 };
 
 /* Current state machines that we can use. */
-extern struct si_sm_handlers kcs_smi_handlers;
-extern struct si_sm_handlers smic_smi_handlers;
-extern struct si_sm_handlers bt_smi_handlers;
+extern const struct si_sm_handlers kcs_smi_handlers;
+extern const struct si_sm_handlers smic_smi_handlers;
+extern const struct si_sm_handlers bt_smi_handlers;
 
diff --git a/drivers/char/ipmi/ipmi_smic_sm.c b/drivers/char/ipmi/ipmi_smic_sm.c
index c8e77afa8b96..8f7c73ff58f2 100644
--- a/drivers/char/ipmi/ipmi_smic_sm.c
+++ b/drivers/char/ipmi/ipmi_smic_sm.c
@@ -589,7 +589,7 @@ static int smic_size(void)
 	return sizeof(struct si_sm_data);
 }
 
-struct si_sm_handlers smic_smi_handlers = {
+const struct si_sm_handlers smic_smi_handlers = {
 	.init_data         = init_smic_data,
 	.start_transaction = start_smic_transaction,
 	.get_result        = smic_get_result,
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index 0b1e569f5ff5..41de0cf34c49 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -207,7 +207,7 @@ static inline int ipmi_demangle_device_id(const unsigned char *data,
    upper layer until the start_processing() function in the handlers
    is called, and the lower layer must get the interface from that
    call. */
-int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
+int ipmi_register_smi(const struct ipmi_smi_handlers *handlers,
 		      void                     *send_info,
 		      struct ipmi_device_id    *device_id,
 		      struct device            *dev,
-- 
cgit v1.2.3-70-g09d2


From 82802f968bd3118af04eaeb3814c21d9813be527 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Mon, 27 Jul 2015 14:55:16 +0900
Subject: ipmi: Don't flush messages in sender() in run-to-completion mode

When flushing queued messages in run-to-completion mode,
smi_event_handler() is recursively called.

flush_messages()
 smi_event_handler()
  handle_transaction_done()
   deliver_recv_msg()
    ipmi_smi_msg_received()
     smi_recv_tasklet()
      sender()
       flush_messages()
        smi_event_handler()
         ...

The depth of the recursive call depends on the number of queued
messages, so it can cause a stack overflow if many messages have
been queued.

To solve this problem, this patch removes flush_messages()
from sender()@ipmi_si_intf.c.  Instead, add flush_messages() to
caller side of sender() if needed.  Additionally, to implement this,
add new handler flush_messages to struct ipmi_smi_handlers.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>

Fixed up a comment and some spacing issues.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
---
 drivers/char/ipmi/ipmi_msghandler.c |  3 +++
 drivers/char/ipmi/ipmi_si_intf.c    | 10 +++++-----
 include/linux/ipmi_smi.h            |  5 +++++
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 5e31c339062e..6e191ff910e6 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -4295,6 +4295,9 @@ static void ipmi_panic_request_and_wait(ipmi_smi_t           intf,
 			    0, 1); /* Don't retry, and don't wait. */
 	if (rv)
 		atomic_sub(2, &panic_done_count);
+	else if (intf->handlers->flush_messages)
+		intf->handlers->flush_messages(intf->send_info);
+
 	while (atomic_read(&panic_done_count) != 0)
 		ipmi_poll(intf);
 }
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 5bd6d5b974cd..2f4cf6e78f72 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -924,8 +924,9 @@ static void check_start_timer_thread(struct smi_info *smi_info)
 	}
 }
 
-static void flush_messages(struct smi_info *smi_info)
+static void flush_messages(void *send_info)
 {
+	struct smi_info *smi_info = send_info;
 	enum si_sm_result result;
 
 	/*
@@ -949,12 +950,10 @@ static void sender(void                *send_info,
 
 	if (smi_info->run_to_completion) {
 		/*
-		 * If we are running to completion, start it and run
-		 * transactions until everything is clear.
+		 * If we are running to completion, start it.  Upper
+		 * layer will call flush_messages to clear it out.
 		 */
 		smi_info->waiting_msg = msg;
-
-		flush_messages(smi_info);
 		return;
 	}
 
@@ -1260,6 +1259,7 @@ static const struct ipmi_smi_handlers handlers = {
 	.set_need_watch		= set_need_watch,
 	.set_maintenance_mode   = set_maintenance_mode,
 	.set_run_to_completion  = set_run_to_completion,
+	.flush_messages		= flush_messages,
 	.poll			= poll,
 };
 
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index 41de0cf34c49..f8cea14485dd 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -115,6 +115,11 @@ struct ipmi_smi_handlers {
 	   implement it. */
 	void (*set_need_watch)(void *send_info, bool enable);
 
+	/*
+	 * Called when flushing all pending messages.
+	 */
+	void (*flush_messages)(void *send_info);
+
 	/* Called when the interface should go into "run to
 	   completion" mode.  If this call sets the value to true, the
 	   interface should make sure that all messages are flushed
-- 
cgit v1.2.3-70-g09d2


From 062a68a5e0aaa9577d75391ffafa11e3c2a5f892 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 4 Sep 2015 09:11:24 -0700
Subject: Revert "uart: pl011: Add support to ZTE ZX296702 uart"

This reverts commit 8cd90e50d1408c65c355084b1c7f8f9085f49c6b as with
this patch the serial console is broken on lots of platforms.

Reported-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jun Nie <jun.nie@linaro.org>
Acked-by: Will Deacon <will.deacon@arm.com>
Tested-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/Kconfig      |   4 +-
 drivers/tty/serial/amba-pl011.c | 195 +++-------------------------------------
 include/linux/amba/serial.h     |  14 ---
 3 files changed, 16 insertions(+), 197 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index ed299b9e6375..687b1ea294b7 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -47,12 +47,12 @@ config SERIAL_AMBA_PL010_CONSOLE
 
 config SERIAL_AMBA_PL011
 	tristate "ARM AMBA PL011 serial port support"
-	depends on ARM_AMBA || SOC_ZX296702
+	depends on ARM_AMBA
 	select SERIAL_CORE
 	help
 	  This selects the ARM(R) AMBA(R) PrimeCell PL011 UART.  If you have
 	  an Integrator/PP2, Integrator/CP or Versatile platform, say Y or M
-	  here. Say Y or M if you have SOC_ZX296702.
+	  here.
 
 	  If unsure, say N.
 
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index 2af09ab153b6..017443d092c1 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -74,10 +74,6 @@
 /* There is by now at least one vendor with differing details, so handle it */
 struct vendor_data {
 	unsigned int		ifls;
-	unsigned int		fr_busy;
-	unsigned int		fr_dsr;
-	unsigned int		fr_cts;
-	unsigned int		fr_ri;
 	unsigned int		lcrh_tx;
 	unsigned int		lcrh_rx;
 	u16			*reg_lut;
@@ -131,7 +127,6 @@ static u16 arm_reg[] = {
 	[REG_DMACR]		= UART011_DMACR,
 };
 
-#ifdef CONFIG_ARM_AMBA
 static unsigned int get_fifosize_arm(struct amba_device *dev)
 {
 	return amba_rev(dev) < 3 ? 16 : 32;
@@ -139,10 +134,6 @@ static unsigned int get_fifosize_arm(struct amba_device *dev)
 
 static struct vendor_data vendor_arm = {
 	.ifls			= UART011_IFLS_RX4_8|UART011_IFLS_TX4_8,
-	.fr_busy		= UART01x_FR_BUSY,
-	.fr_dsr			= UART01x_FR_DSR,
-	.fr_cts			= UART01x_FR_CTS,
-	.fr_ri			= UART011_FR_RI,
 	.lcrh_tx		= REG_LCRH,
 	.lcrh_rx		= REG_LCRH,
 	.reg_lut		= arm_reg,
@@ -153,13 +144,8 @@ static struct vendor_data vendor_arm = {
 	.fixed_options		= false,
 	.get_fifosize		= get_fifosize_arm,
 };
-#endif
 
 static struct vendor_data vendor_sbsa = {
-	.fr_busy		= UART01x_FR_BUSY,
-	.fr_dsr			= UART01x_FR_DSR,
-	.fr_cts			= UART01x_FR_CTS,
-	.fr_ri			= UART011_FR_RI,
 	.reg_lut		= arm_reg,
 	.oversampling		= false,
 	.dma_threshold		= false,
@@ -168,7 +154,6 @@ static struct vendor_data vendor_sbsa = {
 	.fixed_options		= true,
 };
 
-#ifdef CONFIG_ARM_AMBA
 static u16 st_reg[] = {
 	[REG_DR]		= UART01x_DR,
 	[REG_RSR]		= UART01x_RSR,
@@ -195,10 +180,6 @@ static unsigned int get_fifosize_st(struct amba_device *dev)
 
 static struct vendor_data vendor_st = {
 	.ifls			= UART011_IFLS_RX_HALF|UART011_IFLS_TX_HALF,
-	.fr_busy		= UART01x_FR_BUSY,
-	.fr_dsr			= UART01x_FR_DSR,
-	.fr_cts			= UART01x_FR_CTS,
-	.fr_ri			= UART011_FR_RI,
 	.lcrh_tx		= REG_LCRH,
 	.lcrh_rx		= REG_ST_LCRH_RX,
 	.reg_lut		= st_reg,
@@ -209,43 +190,6 @@ static struct vendor_data vendor_st = {
 	.fixed_options		= false,
 	.get_fifosize		= get_fifosize_st,
 };
-#endif
-
-#ifdef CONFIG_SOC_ZX296702
-static u16 zte_reg[] = {
-	[REG_DR]		= ZX_UART01x_DR,
-	[REG_RSR]		= UART01x_RSR,
-	[REG_ST_DMAWM]		= ST_UART011_DMAWM,
-	[REG_FR]		= ZX_UART01x_FR,
-	[REG_ST_LCRH_RX]	= ST_UART011_LCRH_RX,
-	[REG_ILPR]		= UART01x_ILPR,
-	[REG_IBRD]		= UART011_IBRD,
-	[REG_FBRD]		= UART011_FBRD,
-	[REG_LCRH]		= ZX_UART011_LCRH_TX,
-	[REG_CR]		= ZX_UART011_CR,
-	[REG_IFLS]		= ZX_UART011_IFLS,
-	[REG_IMSC]		= ZX_UART011_IMSC,
-	[REG_RIS]		= ZX_UART011_RIS,
-	[REG_MIS]		= ZX_UART011_MIS,
-	[REG_ICR]		= ZX_UART011_ICR,
-	[REG_DMACR]		= ZX_UART011_DMACR,
-};
-
-static struct vendor_data vendor_zte = {
-	.ifls			= UART011_IFLS_RX4_8|UART011_IFLS_TX4_8,
-	.fr_busy		= ZX_UART01x_FR_BUSY,
-	.fr_dsr			= ZX_UART01x_FR_DSR,
-	.fr_cts			= ZX_UART01x_FR_CTS,
-	.fr_ri			= ZX_UART011_FR_RI,
-	.lcrh_tx		= REG_LCRH,
-	.lcrh_rx		= REG_ST_LCRH_RX,
-	.reg_lut		= zte_reg,
-	.oversampling		= false,
-	.dma_threshold		= false,
-	.cts_event_workaround	= false,
-	.fixed_options		= false,
-};
-#endif
 
 /* Deals with DMA transactions */
 
@@ -289,10 +233,6 @@ struct uart_amba_port {
 	unsigned int		im;		/* interrupt mask */
 	unsigned int		old_status;
 	unsigned int		fifosize;	/* vendor-specific */
-	unsigned int		fr_busy;        /* vendor-specific */
-	unsigned int		fr_dsr;		/* vendor-specific */
-	unsigned int		fr_cts;         /* vendor-specific */
-	unsigned int		fr_ri;		/* vendor-specific */
 	unsigned int		lcrh_tx;	/* vendor-specific */
 	unsigned int		lcrh_rx;	/* vendor-specific */
 	unsigned int		old_cr;		/* state during shutdown */
@@ -1223,7 +1163,7 @@ static void pl011_dma_shutdown(struct uart_amba_port *uap)
 		return;
 
 	/* Disable RX and TX DMA */
-	while (pl011_readw(uap, REG_FR) & uap->fr_busy)
+	while (pl011_readw(uap, REG_FR) & UART01x_FR_BUSY)
 		barrier();
 
 	spin_lock_irq(&uap->port.lock);
@@ -1472,11 +1412,11 @@ static void pl011_modem_status(struct uart_amba_port *uap)
 	if (delta & UART01x_FR_DCD)
 		uart_handle_dcd_change(&uap->port, status & UART01x_FR_DCD);
 
-	if (delta & uap->fr_dsr)
+	if (delta & UART01x_FR_DSR)
 		uap->port.icount.dsr++;
 
-	if (delta & uap->fr_cts)
-		uart_handle_cts_change(&uap->port, status & uap->fr_cts);
+	if (delta & UART01x_FR_CTS)
+		uart_handle_cts_change(&uap->port, status & UART01x_FR_CTS);
 
 	wake_up_interruptible(&uap->port.state->port.delta_msr_wait);
 }
@@ -1547,7 +1487,7 @@ static unsigned int pl011_tx_empty(struct uart_port *port)
 	struct uart_amba_port *uap =
 	    container_of(port, struct uart_amba_port, port);
 	unsigned int status = pl011_readw(uap, REG_FR);
-	return status & (uap->fr_busy|UART01x_FR_TXFF) ? 0 : TIOCSER_TEMT;
+	return status & (UART01x_FR_BUSY|UART01x_FR_TXFF) ? 0 : TIOCSER_TEMT;
 }
 
 static unsigned int pl011_get_mctrl(struct uart_port *port)
@@ -1562,9 +1502,9 @@ static unsigned int pl011_get_mctrl(struct uart_port *port)
 		result |= tiocmbit
 
 	TIOCMBIT(UART01x_FR_DCD, TIOCM_CAR);
-	TIOCMBIT(uap->fr_dsr, TIOCM_DSR);
-	TIOCMBIT(uap->fr_cts, TIOCM_CTS);
-	TIOCMBIT(uap->fr_ri, TIOCM_RNG);
+	TIOCMBIT(UART01x_FR_DSR, TIOCM_DSR);
+	TIOCMBIT(UART01x_FR_CTS, TIOCM_CTS);
+	TIOCMBIT(UART011_FR_RI, TIOCM_RNG);
 #undef TIOCMBIT
 	return result;
 }
@@ -1780,7 +1720,8 @@ static int pl011_startup(struct uart_port *port)
 	/*
 	 * initialise the old status of the modem signals
 	 */
-	uap->old_status = pl011_readw(uap, REG_FR) & UART01x_FR_MODEM_ANY;
+	uap->old_status = pl011_readw(uap, REG_FR) &
+			UART01x_FR_MODEM_ANY;
 
 	/* Startup DMA */
 	pl011_dma_startup(uap);
@@ -1859,7 +1800,7 @@ static void pl011_disable_interrupts(struct uart_amba_port *uap)
 	/* mask all interrupts and clear all pending ones */
 	uap->im = 0;
 	pl011_writew(uap, uap->im, REG_IMSC);
-	pl011_writew(uap, 0xffff, REG_ICR);
+	pl011_writew(0xffff, REG_ICR);
 
 	spin_unlock_irq(&uap->port.lock);
 }
@@ -2237,7 +2178,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
 	 */
 	do {
 		status = pl011_readw(uap, REG_FR);
-	} while (status & uap->fr_busy);
+	} while (status & UART01x_FR_BUSY);
 	if (!uap->vendor->always_enabled)
 		pl011_writew(uap, old_cr, REG_CR);
 
@@ -2354,7 +2295,7 @@ static void pl011_putc(struct uart_port *port, int c)
 	while (pl011_readw(uap, REG_FR) & UART01x_FR_TXFF)
 		;
 	pl011_writeb(uap, c, REG_DR);
-	while (pl011_readw(uap, REG_FR) & uap->fr_busy)
+	while (pl011_readw(uap, REG_FR) & UART01x_FR_BUSY)
 		;
 }
 
@@ -2500,7 +2441,6 @@ static int pl011_register_port(struct uart_amba_port *uap)
 	return ret;
 }
 
-#ifdef CONFIG_ARM_AMBA
 static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 {
 	struct uart_amba_port *uap;
@@ -2524,10 +2464,6 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 	uap->reg_lut = vendor->reg_lut;
 	uap->lcrh_rx = vendor->lcrh_rx;
 	uap->lcrh_tx = vendor->lcrh_tx;
-	uap->fr_busy = vendor->fr_busy;
-	uap->fr_dsr = vendor->fr_dsr;
-	uap->fr_cts = vendor->fr_cts;
-	uap->fr_ri = vendor->fr_ri;
 	uap->fifosize = vendor->get_fifosize(dev);
 	uap->port.irq = dev->irq[0];
 	uap->port.ops = &amba_pl011_pops;
@@ -2551,67 +2487,6 @@ static int pl011_remove(struct amba_device *dev)
 	pl011_unregister_port(uap);
 	return 0;
 }
-#endif
-
-#ifdef CONFIG_SOC_ZX296702
-static int zx_uart_probe(struct platform_device *pdev)
-{
-	struct uart_amba_port *uap;
-	struct vendor_data *vendor = &vendor_zte;
-	struct resource *res;
-	int portnr, ret;
-
-	portnr = pl011_find_free_port();
-	if (portnr < 0)
-		return portnr;
-
-	uap = devm_kzalloc(&pdev->dev, sizeof(struct uart_amba_port),
-			GFP_KERNEL);
-	if (!uap) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	uap->clk = devm_clk_get(&pdev->dev, NULL);
-	if (IS_ERR(uap->clk)) {
-		ret = PTR_ERR(uap->clk);
-		goto out;
-	}
-
-	uap->vendor	= vendor;
-	uap->reg_lut	= vendor->reg_lut;
-	uap->lcrh_rx	= vendor->lcrh_rx;
-	uap->lcrh_tx	= vendor->lcrh_tx;
-	uap->fr_busy	= vendor->fr_busy;
-	uap->fr_dsr	= vendor->fr_dsr;
-	uap->fr_cts	= vendor->fr_cts;
-	uap->fr_ri	= vendor->fr_ri;
-	uap->fifosize	= 16;
-	uap->port.irq	= platform_get_irq(pdev, 0);
-	uap->port.ops	= &amba_pl011_pops;
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-	ret = pl011_setup_port(&pdev->dev, uap, res, portnr);
-	if (ret)
-		return ret;
-
-	platform_set_drvdata(pdev, uap);
-
-	return pl011_register_port(uap);
-out:
-	return ret;
-}
-
-static int zx_uart_remove(struct platform_device *pdev)
-{
-	struct uart_amba_port *uap = platform_get_drvdata(pdev);
-
-	uart_remove_one_port(&amba_reg, &uap->port);
-	pl011_unregister_port(uap);
-	return 0;
-}
-#endif
 
 #ifdef CONFIG_PM_SLEEP
 static int pl011_suspend(struct device *dev)
@@ -2669,10 +2544,6 @@ static int sbsa_uart_probe(struct platform_device *pdev)
 
 	uap->vendor	= &vendor_sbsa;
 	uap->reg_lut	= vendor_sbsa.reg_lut;
-	uap->fr_busy	= vendor_sbsa.fr_busy;
-	uap->fr_dsr	= vendor_sbsa.fr_dsr;
-	uap->fr_cts	= vendor_sbsa.fr_cts;
-	uap->fr_ri	= vendor_sbsa.fr_ri;
 	uap->fifosize	= 32;
 	uap->port.irq	= platform_get_irq(pdev, 0);
 	uap->port.ops	= &sbsa_uart_pops;
@@ -2722,7 +2593,6 @@ static struct platform_driver arm_sbsa_uart_platform_driver = {
 	},
 };
 
-#ifdef CONFIG_ARM_AMBA
 static struct amba_id pl011_ids[] = {
 	{
 		.id	= 0x00041011,
@@ -2748,57 +2618,20 @@ static struct amba_driver pl011_driver = {
 	.probe		= pl011_probe,
 	.remove		= pl011_remove,
 };
-#endif
-
-#ifdef CONFIG_SOC_ZX296702
-static const struct of_device_id zx_uart_dt_ids[] = {
-	{ .compatible = "zte,zx296702-uart", },
-	{ /* sentinel */ }
-};
-MODULE_DEVICE_TABLE(of, zx_uart_dt_ids);
-
-static struct platform_driver zx_uart_driver = {
-	.driver = {
-		.name	= "zx-uart",
-		.owner	= THIS_MODULE,
-		.pm	= &pl011_dev_pm_ops,
-		.of_match_table = zx_uart_dt_ids,
-	},
-	.probe		= zx_uart_probe,
-	.remove		= zx_uart_remove,
-};
-#endif
-
 
 static int __init pl011_init(void)
 {
-	int ret;
 	printk(KERN_INFO "Serial: AMBA PL011 UART driver\n");
 
 	if (platform_driver_register(&arm_sbsa_uart_platform_driver))
 		pr_warn("could not register SBSA UART platform driver\n");
-
-#ifdef CONFIG_SOC_ZX296702
-	ret = platform_driver_register(&zx_uart_driver);
-	if (ret)
-		pr_warn("could not register ZX UART platform driver\n");
-#endif
-
-#ifdef CONFIG_ARM_AMBA
-	ret = amba_driver_register(&pl011_driver);
-#endif
-	return ret;
+	return amba_driver_register(&pl011_driver);
 }
 
 static void __exit pl011_exit(void)
 {
 	platform_driver_unregister(&arm_sbsa_uart_platform_driver);
-#ifdef CONFIG_SOC_ZX296702
-	platform_driver_unregister(&zx_uart_driver);
-#endif
-#ifdef CONFIG_ARM_AMBA
 	amba_driver_unregister(&pl011_driver);
-#endif
 }
 
 /*
diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 6a0a89ed7f81..0ddb5c02ad8b 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -33,14 +33,12 @@
 #define UART01x_DR		0x00	/* Data read or written from the interface. */
 #define UART01x_RSR		0x04	/* Receive status register (Read). */
 #define UART01x_ECR		0x04	/* Error clear register (Write). */
-#define ZX_UART01x_DR		0x04	/* Data read or written from the interface. */
 #define UART010_LCRH		0x08	/* Line control register, high byte. */
 #define ST_UART011_DMAWM	0x08    /* DMA watermark configure register. */
 #define UART010_LCRM		0x0C	/* Line control register, middle byte. */
 #define ST_UART011_TIMEOUT	0x0C    /* Timeout period register. */
 #define UART010_LCRL		0x10	/* Line control register, low byte. */
 #define UART010_CR		0x14	/* Control register. */
-#define ZX_UART01x_FR		0x14	/* Flag register (Read only). */
 #define UART01x_FR		0x18	/* Flag register (Read only). */
 #define UART010_IIR		0x1C	/* Interrupt identification register (Read). */
 #define UART010_ICR		0x1C	/* Interrupt clear register (Write). */
@@ -51,21 +49,13 @@
 #define UART011_LCRH		0x2c	/* Line control register. */
 #define ST_UART011_LCRH_TX	0x2c    /* Tx Line control register. */
 #define UART011_CR		0x30	/* Control register. */
-#define ZX_UART011_LCRH_TX	0x30    /* Tx Line control register. */
 #define UART011_IFLS		0x34	/* Interrupt fifo level select. */
-#define ZX_UART011_CR		0x34	/* Control register. */
-#define ZX_UART011_IFLS		0x38	/* Interrupt fifo level select. */
 #define UART011_IMSC		0x38	/* Interrupt mask. */
 #define UART011_RIS		0x3c	/* Raw interrupt status. */
 #define UART011_MIS		0x40	/* Masked interrupt status. */
-#define ZX_UART011_IMSC		0x40	/* Interrupt mask. */
 #define UART011_ICR		0x44	/* Interrupt clear register. */
-#define ZX_UART011_RIS		0x44	/* Raw interrupt status. */
 #define UART011_DMACR		0x48	/* DMA control register. */
-#define ZX_UART011_MIS		0x48	/* Masked interrupt status. */
-#define ZX_UART011_ICR		0x4c	/* Interrupt clear register. */
 #define ST_UART011_XFCR		0x50	/* XON/XOFF control register. */
-#define ZX_UART011_DMACR	0x50	/* DMA control register. */
 #define ST_UART011_XON1		0x54	/* XON1 register. */
 #define ST_UART011_XON2		0x58	/* XON2 register. */
 #define ST_UART011_XOFF1	0x5C	/* XON1 register. */
@@ -85,19 +75,15 @@
 #define UART01x_RSR_PE 		0x02
 #define UART01x_RSR_FE 		0x01
 
-#define ZX_UART01x_FR_BUSY	0x300
 #define UART011_FR_RI		0x100
 #define UART011_FR_TXFE		0x080
 #define UART011_FR_RXFF		0x040
 #define UART01x_FR_TXFF		0x020
 #define UART01x_FR_RXFE		0x010
 #define UART01x_FR_BUSY		0x008
-#define ZX_UART01x_FR_DSR       0x008
 #define UART01x_FR_DCD 		0x004
 #define UART01x_FR_DSR 		0x002
-#define ZX_UART01x_FR_CTS	0x002
 #define UART01x_FR_CTS 		0x001
-#define ZX_UART011_FR_RI	0x001
 #define UART01x_FR_TMSK		(UART01x_FR_TXFF + UART01x_FR_BUSY)
 
 #define UART011_CR_CTSEN	0x8000	/* CTS hardware flow control */
-- 
cgit v1.2.3-70-g09d2


From e9f069868d60550c4b46f084ac9276a57c1b4711 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 4 Sep 2015 15:42:42 -0700
Subject: kernel/kthread.c:kthread_create_on_node(): clarify documentation

- Make it clear that the `node' arg refers to memory allocations only:
  kthread_create_on_node() does not pin the new thread to that node's
  CPUs.

- Encourage the use of NUMA_NO_NODE.

[nzimmer@sgi.com: use NUMA_NO_NODE in kthread_create() also]
Cc: Nathan Zimmer <nzimmer@sgi.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kthread.h | 2 +-
 kernel/kthread.c        | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 869b21dcf503..e691b6a23f72 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 					   const char namefmt[], ...);
 
 #define kthread_create(threadfn, data, namefmt, arg...) \
-	kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
+	kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
 
 
 struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490924cc9e7c..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
  * kthread_create_on_node - create a kthread.
  * @threadfn: the function to run until signal_pending(current).
  * @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
  * @namefmt: printf-style name for the thread.
  *
  * Description: This helper function creates and names a kernel
  * thread.  The thread will be stopped: use wake_up_process() to start
- * it.  See also kthread_run().
+ * it.  See also kthread_run().  The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
  *
  * If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
  * When woken, the thread will run @threadfn() with @data as its
  * argument. @threadfn() can either call do_exit() directly if it is a
  * standalone thread for which no one will call kthread_stop(), or
-- 
cgit v1.2.3-70-g09d2


From 58319057b7847667f0c9585b9de0e8932b0fdb08 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 4 Sep 2015 15:42:45 -0700
Subject: capabilities: ambient capabilities

Credit where credit is due: this idea comes from Christoph Lameter with
a lot of valuable input from Serge Hallyn.  This patch is heavily based
on Christoph's patch.

===== The status quo =====

On Linux, there are a number of capabilities defined by the kernel.  To
perform various privileged tasks, processes can wield capabilities that
they hold.

Each task has four capability masks: effective (pE), permitted (pP),
inheritable (pI), and a bounding set (X).  When the kernel checks for a
capability, it checks pE.  The other capability masks serve to modify
what capabilities can be in pE.

Any task can remove capabilities from pE, pP, or pI at any time.  If a
task has a capability in pP, it can add that capability to pE and/or pI.
If a task has CAP_SETPCAP, then it can add any capability to pI, and it
can remove capabilities from X.

Tasks are not the only things that can have capabilities; files can also
have capabilities.  A file can have no capabilty information at all [1].
If a file has capability information, then it has a permitted mask (fP)
and an inheritable mask (fI) as well as a single effective bit (fE) [2].
File capabilities modify the capabilities of tasks that execve(2) them.

A task that successfully calls execve has its capabilities modified for
the file ultimately being excecuted (i.e.  the binary itself if that
binary is ELF or for the interpreter if the binary is a script.) [3] In
the capability evolution rules, for each mask Z, pZ represents the old
value and pZ' represents the new value.  The rules are:

  pP' = (X & fP) | (pI & fI)
  pI' = pI
  pE' = (fE ? pP' : 0)
  X is unchanged

For setuid binaries, fP, fI, and fE are modified by a moderately
complicated set of rules that emulate POSIX behavior.  Similarly, if
euid == 0 or ruid == 0, then fP, fI, and fE are modified differently
(primary, fP and fI usually end up being the full set).  For nonroot
users executing binaries with neither setuid nor file caps, fI and fP
are empty and fE is false.

As an extra complication, if you execute a process as nonroot and fE is
set, then the "secure exec" rules are in effect: AT_SECURE gets set,
LD_PRELOAD doesn't work, etc.

This is rather messy.  We've learned that making any changes is
dangerous, though: if a new kernel version allows an unprivileged
program to change its security state in a way that persists cross
execution of a setuid program or a program with file caps, this
persistent state is surprisingly likely to allow setuid or file-capped
programs to be exploited for privilege escalation.

===== The problem =====

Capability inheritance is basically useless.

If you aren't root and you execute an ordinary binary, fI is zero, so
your capabilities have no effect whatsoever on pP'.  This means that you
can't usefully execute a helper process or a shell command with elevated
capabilities if you aren't root.

On current kernels, you can sort of work around this by setting fI to
the full set for most or all non-setuid executable files.  This causes
pP' = pI for nonroot, and inheritance works.  No one does this because
it's a PITA and it isn't even supported on most filesystems.

If you try this, you'll discover that every nonroot program ends up with
secure exec rules, breaking many things.

This is a problem that has bitten many people who have tried to use
capabilities for anything useful.

===== The proposed change =====

This patch adds a fifth capability mask called the ambient mask (pA).
pA does what most people expect pI to do.

pA obeys the invariant that no bit can ever be set in pA if it is not
set in both pP and pI.  Dropping a bit from pP or pI drops that bit from
pA.  This ensures that existing programs that try to drop capabilities
still do so, with a complication.  Because capability inheritance is so
broken, setting KEEPCAPS, using setresuid to switch to nonroot uids, and
then calling execve effectively drops capabilities.  Therefore,
setresuid from root to nonroot conditionally clears pA unless
SECBIT_NO_SETUID_FIXUP is set.  Processes that don't like this can
re-add bits to pA afterwards.

The capability evolution rules are changed:

  pA' = (file caps or setuid or setgid ? 0 : pA)
  pP' = (X & fP) | (pI & fI) | pA'
  pI' = pI
  pE' = (fE ? pP' : pA')
  X is unchanged

If you are nonroot but you have a capability, you can add it to pA.  If
you do so, your children get that capability in pA, pP, and pE.  For
example, you can set pA = CAP_NET_BIND_SERVICE, and your children can
automatically bind low-numbered ports.  Hallelujah!

Unprivileged users can create user namespaces, map themselves to a
nonzero uid, and create both privileged (relative to their namespace)
and unprivileged process trees.  This is currently more or less
impossible.  Hallelujah!

You cannot use pA to try to subvert a setuid, setgid, or file-capped
program: if you execute any such program, pA gets cleared and the
resulting evolution rules are unchanged by this patch.

Users with nonzero pA are unlikely to unintentionally leak that
capability.  If they run programs that try to drop privileges, dropping
privileges will still work.

It's worth noting that the degree of paranoia in this patch could
possibly be reduced without causing serious problems.  Specifically, if
we allowed pA to persist across executing non-pA-aware setuid binaries
and across setresuid, then, naively, the only capabilities that could
leak as a result would be the capabilities in pA, and any attacker
*already* has those capabilities.  This would make me nervous, though --
setuid binaries that tried to privilege-separate might fail to do so,
and putting CAP_DAC_READ_SEARCH or CAP_DAC_OVERRIDE into pA could have
unexpected side effects.  (Whether these unexpected side effects would
be exploitable is an open question.) I've therefore taken the more
paranoid route.  We can revisit this later.

An alternative would be to require PR_SET_NO_NEW_PRIVS before setting
ambient capabilities.  I think that this would be annoying and would
make granting otherwise unprivileged users minor ambient capabilities
(CAP_NET_BIND_SERVICE or CAP_NET_RAW for example) much less useful than
it is with this patch.

===== Footnotes =====

[1] Files that are missing the "security.capability" xattr or that have
unrecognized values for that xattr end up with has_cap set to false.
The code that does that appears to be complicated for no good reason.

[2] The libcap capability mask parsers and formatters are dangerously
misleading and the documentation is flat-out wrong.  fE is *not* a mask;
it's a single bit.  This has probably confused every single person who
has tried to use file capabilities.

[3] Linux very confusingly processes both the script and the interpreter
if applicable, for reasons that elude me.  The results from thinking
about a script's file capabilities and/or setuid bits are mostly
discarded.

Preliminary userspace code is here, but it needs updating:
https://git.kernel.org/cgit/linux/kernel/git/luto/util-linux-playground.git/commit/?h=cap_ambient&id=7f5afbd175d2

Here is a test program that can be used to verify the functionality
(from Christoph):

/*
 * Test program for the ambient capabilities. This program spawns a shell
 * that allows running processes with a defined set of capabilities.
 *
 * (C) 2015 Christoph Lameter <cl@linux.com>
 * Released under: GPL v3 or later.
 *
 *
 * Compile using:
 *
 *	gcc -o ambient_test ambient_test.o -lcap-ng
 *
 * This program must have the following capabilities to run properly:
 * Permissions for CAP_NET_RAW, CAP_NET_ADMIN, CAP_SYS_NICE
 *
 * A command to equip the binary with the right caps is:
 *
 *	setcap cap_net_raw,cap_net_admin,cap_sys_nice+p ambient_test
 *
 *
 * To get a shell with additional caps that can be inherited by other processes:
 *
 *	./ambient_test /bin/bash
 *
 *
 * Verifying that it works:
 *
 * From the bash spawed by ambient_test run
 *
 *	cat /proc/$$/status
 *
 * and have a look at the capabilities.
 */

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <cap-ng.h>
#include <sys/prctl.h>
#include <linux/capability.h>

/*
 * Definitions from the kernel header files. These are going to be removed
 * when the /usr/include files have these defined.
 */
#define PR_CAP_AMBIENT 47
#define PR_CAP_AMBIENT_IS_SET 1
#define PR_CAP_AMBIENT_RAISE 2
#define PR_CAP_AMBIENT_LOWER 3
#define PR_CAP_AMBIENT_CLEAR_ALL 4

static void set_ambient_cap(int cap)
{
	int rc;

	capng_get_caps_process();
	rc = capng_update(CAPNG_ADD, CAPNG_INHERITABLE, cap);
	if (rc) {
		printf("Cannot add inheritable cap\n");
		exit(2);
	}
	capng_apply(CAPNG_SELECT_CAPS);

	/* Note the two 0s at the end. Kernel checks for these */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0)) {
		perror("Cannot set cap");
		exit(1);
	}
}

int main(int argc, char **argv)
{
	int rc;

	set_ambient_cap(CAP_NET_RAW);
	set_ambient_cap(CAP_NET_ADMIN);
	set_ambient_cap(CAP_SYS_NICE);

	printf("Ambient_test forking shell\n");
	if (execv(argv[1], argv + 1))
		perror("Cannot exec");

	return 0;
}

Signed-off-by: Christoph Lameter <cl@linux.com> # Original author
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Aaron Jones <aaronmdjones@gmail.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Andrew G. Morgan <morgan@kernel.org>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Austin S Hemmelgarn <ahferroin7@gmail.com>
Cc: Markku Savela <msa@moth.iki.fi>
Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: James Morris <james.l.morris@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c              |   5 ++-
 include/linux/cred.h         |   8 ++++
 include/uapi/linux/prctl.h   |   7 +++
 kernel/user_namespace.c      |   1 +
 security/commoncap.c         | 102 ++++++++++++++++++++++++++++++++++++++-----
 security/keys/process_keys.c |   1 +
 6 files changed, 113 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf3104f..f60f0121e331 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
 static inline void task_cap(struct seq_file *m, struct task_struct *p)
 {
 	const struct cred *cred;
-	kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
+	kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
+			cap_bset, cap_ambient;
 
 	rcu_read_lock();
 	cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
 	cap_permitted	= cred->cap_permitted;
 	cap_effective	= cred->cap_effective;
 	cap_bset	= cred->cap_bset;
+	cap_ambient	= cred->cap_ambient;
 	rcu_read_unlock();
 
 	render_cap_t(m, "CapInh:\t", &cap_inheritable);
 	render_cap_t(m, "CapPrm:\t", &cap_permitted);
 	render_cap_t(m, "CapEff:\t", &cap_effective);
 	render_cap_t(m, "CapBnd:\t", &cap_bset);
+	render_cap_t(m, "CapAmb:\t", &cap_ambient);
 }
 
 static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8b6c083e68a7..8d70e1361ecd 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -137,6 +137,7 @@ struct cred {
 	kernel_cap_t	cap_permitted;	/* caps we're permitted */
 	kernel_cap_t	cap_effective;	/* caps we can actually use */
 	kernel_cap_t	cap_bset;	/* capability bounding set */
+	kernel_cap_t	cap_ambient;	/* Ambient capability set */
 #ifdef CONFIG_KEYS
 	unsigned char	jit_keyring;	/* default keyring to attach requested
 					 * keys to */
@@ -212,6 +213,13 @@ static inline void validate_process_creds(void)
 }
 #endif
 
+static inline bool cap_ambient_invariant_ok(const struct cred *cred)
+{
+	return cap_issubset(cred->cap_ambient,
+			    cap_intersect(cred->cap_permitted,
+					  cred->cap_inheritable));
+}
+
 /**
  * get_new_cred - Get a reference on a new set of credentials
  * @cred: The new credentials to reference
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..a8d0759a9e40 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,11 @@ struct prctl_mm_map {
 # define PR_FP_MODE_FR		(1 << 0)	/* 64b FP registers */
 # define PR_FP_MODE_FRE		(1 << 1)	/* 32b compatibility */
 
+/* Control the ambient capability set */
+#define PR_CAP_AMBIENT			47
+# define PR_CAP_AMBIENT_IS_SET		1
+# define PR_CAP_AMBIENT_RAISE		2
+# define PR_CAP_AMBIENT_LOWER		3
+# define PR_CAP_AMBIENT_CLEAR_ALL	4
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f65a0a06a8c0..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 	cred->cap_inheritable = CAP_EMPTY_SET;
 	cred->cap_permitted = CAP_FULL_SET;
 	cred->cap_effective = CAP_FULL_SET;
+	cred->cap_ambient = CAP_EMPTY_SET;
 	cred->cap_bset = CAP_FULL_SET;
 #ifdef CONFIG_KEYS
 	key_put(cred->request_key_auth);
diff --git a/security/commoncap.c b/security/commoncap.c
index d103f5a4043d..1f74dde1063e 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -267,6 +267,16 @@ int cap_capset(struct cred *new,
 	new->cap_effective   = *effective;
 	new->cap_inheritable = *inheritable;
 	new->cap_permitted   = *permitted;
+
+	/*
+	 * Mask off ambient bits that are no longer both permitted and
+	 * inheritable.
+	 */
+	new->cap_ambient = cap_intersect(new->cap_ambient,
+					 cap_intersect(*permitted,
+						       *inheritable));
+	if (WARN_ON(!cap_ambient_invariant_ok(new)))
+		return -EINVAL;
 	return 0;
 }
 
@@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 
 		/*
 		 * pP' = (X & fP) | (pI & fI)
+		 * The addition of pA' is handled later.
 		 */
 		new->cap_permitted.cap[i] =
 			(new->cap_bset.cap[i] & permitted) |
@@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 {
 	const struct cred *old = current_cred();
 	struct cred *new = bprm->cred;
-	bool effective, has_cap = false;
+	bool effective, has_cap = false, is_setid;
 	int ret;
 	kuid_t root_uid;
 
+	if (WARN_ON(!cap_ambient_invariant_ok(old)))
+		return -EPERM;
+
 	effective = false;
 	ret = get_file_caps(bprm, &effective, &has_cap);
 	if (ret < 0)
@@ -522,8 +536,9 @@ skip:
 	 *
 	 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
 	 */
-	if ((!uid_eq(new->euid, old->uid) ||
-	     !gid_eq(new->egid, old->gid) ||
+	is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid);
+
+	if ((is_setid ||
 	     !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
 	    bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
 		/* downgrade; they get no more than they had, and maybe less */
@@ -539,10 +554,28 @@ skip:
 	new->suid = new->fsuid = new->euid;
 	new->sgid = new->fsgid = new->egid;
 
+	/* File caps or setid cancels ambient. */
+	if (has_cap || is_setid)
+		cap_clear(new->cap_ambient);
+
+	/*
+	 * Now that we've computed pA', update pP' to give:
+	 *   pP' = (X & fP) | (pI & fI) | pA'
+	 */
+	new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
+
+	/*
+	 * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
+	 * this is the same as pE' = (fE ? pP' : 0) | pA'.
+	 */
 	if (effective)
 		new->cap_effective = new->cap_permitted;
 	else
-		cap_clear(new->cap_effective);
+		new->cap_effective = new->cap_ambient;
+
+	if (WARN_ON(!cap_ambient_invariant_ok(new)))
+		return -EPERM;
+
 	bprm->cap_effective = effective;
 
 	/*
@@ -557,7 +590,7 @@ skip:
 	 * Number 1 above might fail if you don't have a full bset, but I think
 	 * that is interesting information to audit.
 	 */
-	if (!cap_isclear(new->cap_effective)) {
+	if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
 		if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
 		    !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
 		    issecure(SECURE_NOROOT)) {
@@ -568,6 +601,10 @@ skip:
 	}
 
 	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+
+	if (WARN_ON(!cap_ambient_invariant_ok(new)))
+		return -EPERM;
+
 	return 0;
 }
 
@@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
 	if (!uid_eq(cred->uid, root_uid)) {
 		if (bprm->cap_effective)
 			return 1;
-		if (!cap_isclear(cred->cap_permitted))
+		if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
 			return 1;
 	}
 
@@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
 	     uid_eq(old->suid, root_uid)) &&
 	    (!uid_eq(new->uid, root_uid) &&
 	     !uid_eq(new->euid, root_uid) &&
-	     !uid_eq(new->suid, root_uid)) &&
-	    !issecure(SECURE_KEEP_CAPS)) {
-		cap_clear(new->cap_permitted);
-		cap_clear(new->cap_effective);
+	     !uid_eq(new->suid, root_uid))) {
+		if (!issecure(SECURE_KEEP_CAPS)) {
+			cap_clear(new->cap_permitted);
+			cap_clear(new->cap_effective);
+		}
+
+		/*
+		 * Pre-ambient programs expect setresuid to nonroot followed
+		 * by exec to drop capabilities.  We should make sure that
+		 * this remains the case.
+		 */
+		cap_clear(new->cap_ambient);
 	}
 	if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
 		cap_clear(new->cap_effective);
@@ -924,6 +969,43 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
 		return commit_creds(new);
 
+	case PR_CAP_AMBIENT:
+		if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
+			if (arg3 | arg4 | arg5)
+				return -EINVAL;
+
+			new = prepare_creds();
+			if (!new)
+				return -ENOMEM;
+			cap_clear(new->cap_ambient);
+			return commit_creds(new);
+		}
+
+		if (((!cap_valid(arg3)) | arg4 | arg5))
+			return -EINVAL;
+
+		if (arg2 == PR_CAP_AMBIENT_IS_SET) {
+			return !!cap_raised(current_cred()->cap_ambient, arg3);
+		} else if (arg2 != PR_CAP_AMBIENT_RAISE &&
+			   arg2 != PR_CAP_AMBIENT_LOWER) {
+			return -EINVAL;
+		} else {
+			if (arg2 == PR_CAP_AMBIENT_RAISE &&
+			    (!cap_raised(current_cred()->cap_permitted, arg3) ||
+			     !cap_raised(current_cred()->cap_inheritable,
+					 arg3)))
+				return -EPERM;
+
+			new = prepare_creds();
+			if (!new)
+				return -ENOMEM;
+			if (arg2 == PR_CAP_AMBIENT_RAISE)
+				cap_raise(new->cap_ambient, arg3);
+			else
+				cap_lower(new->cap_ambient, arg3);
+			return commit_creds(new);
+		}
+
 	default:
 		/* No functionality available - continue with default */
 		return -ENOSYS;
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index bd536cb221e2..43b4cddbf2b3 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork)
 	new->cap_inheritable	= old->cap_inheritable;
 	new->cap_permitted	= old->cap_permitted;
 	new->cap_effective	= old->cap_effective;
+	new->cap_ambient	= old->cap_ambient;
 	new->cap_bset		= old->cap_bset;
 
 	new->jit_keyring	= old->jit_keyring;
-- 
cgit v1.2.3-70-g09d2


From 1e39fc01836d02a11515aaabd97a0a938326bfe2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.com>
Date: Fri, 4 Sep 2015 15:43:06 -0700
Subject: fsnotify: document mark locking

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fsnotify_backend.h | 50 ++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 65a517dd32f7..dd6ddb0287ed 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -195,40 +195,50 @@ struct fsnotify_group {
 #define FSNOTIFY_EVENT_INODE	2
 
 /*
- * a mark is simply an object attached to an in core inode which allows an
+ * A mark is simply an object attached to an in core inode which allows an
  * fsnotify listener to indicate they are either no longer interested in events
  * of a type matching mask or only interested in those events.
  *
- * these are flushed when an inode is evicted from core and may be flushed
- * when the inode is modified (as seen by fsnotify_access).  Some fsnotify users
- * (such as dnotify) will flush these when the open fd is closed and not at
- * inode eviction or modification.
+ * These are flushed when an inode is evicted from core and may be flushed
+ * when the inode is modified (as seen by fsnotify_access).  Some fsnotify
+ * users (such as dnotify) will flush these when the open fd is closed and not
+ * at inode eviction or modification.
+ *
+ * Text in brackets is showing the lock(s) protecting modifications of a
+ * particular entry. obj_lock means either inode->i_lock or
+ * mnt->mnt_root->d_lock depending on the mark type.
  */
 struct fsnotify_mark {
-	__u32 mask;			/* mask this mark is for */
-	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
+	/* Mask this mark is for [mark->lock, group->mark_mutex] */
+	__u32 mask;
+	/* We hold one for presence in g_list. Also one ref for each 'thing'
 	 * in kernel that found and may be using this mark. */
-	atomic_t refcnt;		/* active things looking at this mark */
-	struct fsnotify_group *group;	/* group this mark is for */
-	struct list_head g_list;	/* list of marks by group->i_fsnotify_marks
-					 * Also reused for queueing mark into
-					 * destroy_list when it's waiting for
-					 * the end of SRCU period before it can
-					 * be freed */
-	spinlock_t lock;		/* protect group and inode */
-	struct hlist_node obj_list;	/* list of marks for inode / vfsmount */
-	struct list_head free_list;	/* tmp list used when freeing this mark */
-	union {
+	atomic_t refcnt;
+	/* Group this mark is for. Set on mark creation, stable until last ref
+	 * is dropped */
+	struct fsnotify_group *group;
+	/* List of marks by group->i_fsnotify_marks. Also reused for queueing
+	 * mark into destroy_list when it's waiting for the end of SRCU period
+	 * before it can be freed. [group->mark_mutex] */
+	struct list_head g_list;
+	/* Protects inode / mnt pointers, flags, masks */
+	spinlock_t lock;
+	/* List of marks for inode / vfsmount [obj_lock] */
+	struct hlist_node obj_list;
+	/* tmp list used when freeing this mark */
+	struct list_head free_list;
+	union {	/* Object pointer [mark->lock, group->mark_mutex] */
 		struct inode *inode;	/* inode this mark is associated with */
 		struct vfsmount *mnt;	/* vfsmount this mark is associated with */
 	};
-	__u32 ignored_mask;		/* events types to ignore */
+	/* Events types to ignore [mark->lock, group->mark_mutex] */
+	__u32 ignored_mask;
 #define FSNOTIFY_MARK_FLAG_INODE		0x01
 #define FSNOTIFY_MARK_FLAG_VFSMOUNT		0x02
 #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED	0x04
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x08
 #define FSNOTIFY_MARK_FLAG_ALIVE		0x10
-	unsigned int flags;		/* vfsmount or inode mark? */
+	unsigned int flags;		/* flags [mark->lock] */
 	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
 };
 
-- 
cgit v1.2.3-70-g09d2


From 925d1132a03e33cb8f29a0057300d023b4f1be23 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.com>
Date: Fri, 4 Sep 2015 15:43:09 -0700
Subject: fsnotify: remove mark->free_list

Free list is used when all marks on given inode / mount should be
destroyed when inode / mount is going away.  However we can free all of
the marks without using a special list with some care.

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fsnotify.c             |  1 -
 fs/notify/fsnotify.h             | 21 +++++++++++++++------
 fs/notify/inode_mark.c           | 20 --------------------
 fs/notify/mark.c                 | 40 +++++++++++++++++++++++++---------------
 fs/notify/vfsmount_mark.c        | 19 -------------------
 include/linux/fsnotify_backend.h |  2 --
 6 files changed, 40 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index d675e76251d3..db39de2dd4cb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,7 +26,6 @@
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
-#include "../mount.h"
 
 /*
  * Clear all of the marks on an inode when it is being evicted from core
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 13a00be516d2..b44c68a857e7 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,6 +6,8 @@
 #include <linux/srcu.h>
 #include <linux/types.h>
 
+#include "../mount.h"
+
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
@@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
 /* inode specific destruction of a mark */
 extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Destroy all marks in the given list */
-extern void fsnotify_destroy_marks(struct list_head *to_free);
 /* Find mark belonging to given group in the list of marks */
 extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
 						struct fsnotify_group *group);
-/* run the list of all marks associated with inode and flag them to be freed */
-extern void fsnotify_clear_marks_by_inode(struct inode *inode);
-/* run the list of all marks associated with vfsmount and flag them to be freed */
-extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
+/* Destroy all marks in the given list protected by 'lock' */
+extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* run the list of all marks associated with inode and destroy them */
+static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+	fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+}
+/* run the list of all marks associated with vfsmount and destroy them */
+static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
+			       &mnt->mnt_root->d_lock);
+}
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
  * about events that happen to its children.
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3daf513ee99e..474a3ce1b5e1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -64,26 +64,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
 	spin_unlock(&inode->i_lock);
 }
 
-/*
- * Given an inode, destroy all of the marks associated with that inode.
- */
-void fsnotify_clear_marks_by_inode(struct inode *inode)
-{
-	struct fsnotify_mark *mark;
-	struct hlist_node *n;
-	LIST_HEAD(free_list);
-
-	spin_lock(&inode->i_lock);
-	hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
-		list_add(&mark->free_list, &free_list);
-		hlist_del_init_rcu(&mark->obj_list);
-		fsnotify_get_mark(mark);
-	}
-	spin_unlock(&inode->i_lock);
-
-	fsnotify_destroy_marks(&free_list);
-}
-
 /*
  * Given a group clear all of the inode marks associated with that group.
  */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 39ddcaf0918f..3b2d1ba41e7b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -203,24 +203,34 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 	mutex_unlock(&group->mark_mutex);
 }
 
-/*
- * Destroy all marks in the given list. The marks must be already detached from
- * the original inode / vfsmount.
- */
-void fsnotify_destroy_marks(struct list_head *to_free)
+void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
 {
-	struct fsnotify_mark *mark, *lmark;
-	struct fsnotify_group *group;
-
-	list_for_each_entry_safe(mark, lmark, to_free, free_list) {
-		spin_lock(&mark->lock);
-		fsnotify_get_group(mark->group);
-		group = mark->group;
-		spin_unlock(&mark->lock);
+	struct fsnotify_mark *mark;
 
-		fsnotify_destroy_mark(mark, group);
+	while (1) {
+		/*
+		 * We have to be careful since we can race with e.g.
+		 * fsnotify_clear_marks_by_group() and once we drop 'lock',
+		 * mark can get removed from the obj_list and destroyed. But
+		 * we are holding mark reference so mark cannot be freed and
+		 * calling fsnotify_destroy_mark() more than once is fine.
+		 */
+		spin_lock(lock);
+		if (hlist_empty(head)) {
+			spin_unlock(lock);
+			break;
+		}
+		mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
+		/*
+		 * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
+		 * since inode / mount is going away anyway. So just remove
+		 * mark from the list.
+		 */
+		hlist_del_init_rcu(&mark->obj_list);
+		fsnotify_get_mark(mark);
+		spin_unlock(lock);
+		fsnotify_destroy_mark(mark, mark->group);
 		fsnotify_put_mark(mark);
-		fsnotify_put_group(group);
 	}
 }
 
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 326b148e623c..a8fcab68faef 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -28,25 +28,6 @@
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
-#include "../mount.h"
-
-void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
-{
-	struct fsnotify_mark *mark;
-	struct hlist_node *n;
-	struct mount *m = real_mount(mnt);
-	LIST_HEAD(free_list);
-
-	spin_lock(&mnt->mnt_root->d_lock);
-	hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
-		list_add(&mark->free_list, &free_list);
-		hlist_del_init_rcu(&mark->obj_list);
-		fsnotify_get_mark(mark);
-	}
-	spin_unlock(&mnt->mnt_root->d_lock);
-
-	fsnotify_destroy_marks(&free_list);
-}
 
 void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
 {
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index dd6ddb0287ed..f044fe30e8c3 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -225,8 +225,6 @@ struct fsnotify_mark {
 	spinlock_t lock;
 	/* List of marks for inode / vfsmount [obj_lock] */
 	struct hlist_node obj_list;
-	/* tmp list used when freeing this mark */
-	struct list_head free_list;
 	union {	/* Object pointer [mark->lock, group->mark_mutex] */
 		struct inode *inode;	/* inode this mark is associated with */
 		struct vfsmount *mnt;	/* vfsmount this mark is associated with */
-- 
cgit v1.2.3-70-g09d2


From 4712e722f91457e60723b9cef6265a74290efba9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.com>
Date: Fri, 4 Sep 2015 15:43:12 -0700
Subject: fsnotify: get rid of fsnotify_destroy_mark_locked()

fsnotify_destroy_mark_locked() is subtle to use because it temporarily
releases group->mark_mutex.  To avoid future problems with this
function, split it into two.

fsnotify_detach_mark() is the part that needs group->mark_mutex and
fsnotify_free_mark() is the part that must be called outside of
group->mark_mutex.  This way it's much clearer what's going on and we
also avoid some pointless acquisitions of group->mark_mutex.

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/dnotify/dnotify.c        | 14 +++++---
 fs/notify/fanotify/fanotify_user.c |  8 +++--
 fs/notify/mark.c                   | 73 +++++++++++++++++++++-----------------
 include/linux/fsnotify_backend.h   |  7 ++--
 4 files changed, 61 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 44523f4a6084..6faaf710e563 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct inode *inode;
+	bool free = false;
 
 	inode = file_inode(filp);
 	if (!S_ISDIR(inode->i_mode))
@@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 
 	/* nothing else could have found us thanks to the dnotify_groups
 	   mark_mutex */
-	if (dn_mark->dn == NULL)
-		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
+	if (dn_mark->dn == NULL) {
+		fsnotify_detach_mark(fsn_mark);
+		free = true;
+	}
 
 	mutex_unlock(&dnotify_group->mark_mutex);
 
+	if (free)
+		fsnotify_free_mark(fsn_mark);
 	fsnotify_put_mark(fsn_mark);
 }
 
@@ -362,9 +367,10 @@ out:
 	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
-
+		fsnotify_detach_mark(fsn_mark);
 	mutex_unlock(&dnotify_group->mark_mutex);
+	if (destroy)
+		fsnotify_free_mark(fsn_mark);
 	fsnotify_put_mark(fsn_mark);
 out_err:
 	if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf275500a665..8e8e6bcd1d43 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark_locked(fsn_mark, group);
+		fsnotify_detach_mark(fsn_mark);
 	mutex_unlock(&group->mark_mutex);
+	if (destroy_mark)
+		fsnotify_free_mark(fsn_mark);
 
 	fsnotify_put_mark(fsn_mark);
 	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark_locked(fsn_mark, group);
+		fsnotify_detach_mark(fsn_mark);
 	mutex_unlock(&group->mark_mutex);
+	if (destroy_mark)
+		fsnotify_free_mark(fsn_mark);
 
 	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 3b2d1ba41e7b..fc0df4442f7b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head)
 }
 
 /*
- * Any time a mark is getting freed we end up here.
- * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the mark->lock
+ * Remove mark from inode / vfsmount list, group list, drop inode reference
+ * if we got one.
+ *
+ * Must be called with group->mark_mutex held.
  */
-void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
-				  struct fsnotify_group *group)
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
 {
 	struct inode *inode = NULL;
+	struct fsnotify_group *group = mark->group;
 
 	BUG_ON(!mutex_is_locked(&group->mark_mutex));
 
 	spin_lock(&mark->lock);
 
 	/* something else already called this function on this mark */
-	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
 		spin_unlock(&mark->lock);
 		return;
 	}
 
-	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
 
 	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
 		inode = mark->inode;
@@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
 		fsnotify_destroy_vfsmount_mark(mark);
 	else
 		BUG();
+	/*
+	 * Note that we didn't update flags telling whether inode cares about
+	 * what's happening with children. We update these flags from
+	 * __fsnotify_parent() lazily when next event happens on one of our
+	 * children.
+	 */
 
 	list_del_init(&mark->g_list);
 
@@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
 
 	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
 		iput(inode);
-	/* release lock temporarily */
-	mutex_unlock(&group->mark_mutex);
+
+	atomic_dec(&group->num_marks);
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+	struct fsnotify_group *group = mark->group;
+
+	spin_lock(&mark->lock);
+	/* something else already called this function on this mark */
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+		spin_unlock(&mark->lock);
+		return;
+	}
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+	spin_unlock(&mark->lock);
 
 	spin_lock(&destroy_lock);
 	list_add(&mark->g_list, &destroy_list);
 	spin_unlock(&destroy_lock);
 	wake_up(&destroy_waitq);
-	/*
-	 * We don't necessarily have a ref on mark from caller so the above destroy
-	 * may have actually freed it, unless this group provides a 'freeing_mark'
-	 * function which must be holding a reference.
-	 */
 
 	/*
 	 * Some groups like to know that marks are being freed.  This is a
@@ -177,30 +198,15 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
 	 */
 	if (group->ops->freeing_mark)
 		group->ops->freeing_mark(mark, group);
-
-	/*
-	 * __fsnotify_update_child_dentry_flags(inode);
-	 *
-	 * I really want to call that, but we can't, we have no idea if the inode
-	 * still exists the second we drop the mark->lock.
-	 *
-	 * The next time an event arrive to this inode from one of it's children
-	 * __fsnotify_parent will see that the inode doesn't care about it's
-	 * children and will update all of these flags then.  So really this
-	 * is just a lazy update (and could be a perf win...)
-	 */
-
-	atomic_dec(&group->num_marks);
-
-	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
 }
 
 void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 			   struct fsnotify_group *group)
 {
 	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
-	fsnotify_destroy_mark_locked(mark, group);
+	fsnotify_detach_mark(mark);
 	mutex_unlock(&group->mark_mutex);
+	fsnotify_free_mark(mark);
 }
 
 void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
@@ -342,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 	 * inode->i_lock
 	 */
 	spin_lock(&mark->lock);
-	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
 
 	fsnotify_get_group(group);
 	mark->group = group;
@@ -448,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
 		}
 		mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
 		fsnotify_get_mark(mark);
-		fsnotify_destroy_mark_locked(mark, group);
+		fsnotify_detach_mark(mark);
 		mutex_unlock(&group->mark_mutex);
+		fsnotify_free_mark(mark);
 		fsnotify_put_mark(mark);
 	}
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index f044fe30e8c3..e0727d77feaf 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -236,6 +236,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED	0x04
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x08
 #define FSNOTIFY_MARK_FLAG_ALIVE		0x10
+#define FSNOTIFY_MARK_FLAG_ATTACHED		0x20
 	unsigned int flags;		/* flags [mark->lock] */
 	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
 };
@@ -353,8 +354,10 @@ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_
 /* given a group and a mark, flag mark to be freed when all references are dropped */
 extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 				  struct fsnotify_group *group);
-extern void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
-					 struct fsnotify_group *group);
+/* detach mark from inode / mount list, group list, drop inode reference */
+extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
+/* free mark */
+extern void fsnotify_free_mark(struct fsnotify_mark *mark);
 /* run all the marks in a group, and clear all of the vfsmount marks */
 extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
 /* run all the marks in a group, and clear all of the inode marks */
-- 
cgit v1.2.3-70-g09d2


From a068acf2ee77693e0bf39d6e07139ba704f461c3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 4 Sep 2015 15:44:57 -0700
Subject: fs: create and use seq_show_option for escaping

Many file systems that implement the show_options hook fail to correctly
escape their output which could lead to unescaped characters (e.g.  new
lines) leaking into /proc/mounts and /proc/[pid]/mountinfo files.  This
could lead to confusion, spoofed entries (resulting in things like
systemd issuing false d-bus "mount" notifications), and who knows what
else.  This looks like it would only be the root user stepping on
themselves, but it's possible weird things could happen in containers or
in other situations with delegated mount privileges.

Here's an example using overlay with setuid fusermount trusting the
contents of /proc/mounts (via the /etc/mtab symlink).  Imagine the use
of "sudo" is something more sneaky:

  $ BASE="ovl"
  $ MNT="$BASE/mnt"
  $ LOW="$BASE/lower"
  $ UP="$BASE/upper"
  $ WORK="$BASE/work/ 0 0
  none /proc fuse.pwn user_id=1000"
  $ mkdir -p "$LOW" "$UP" "$WORK"
  $ sudo mount -t overlay -o "lowerdir=$LOW,upperdir=$UP,workdir=$WORK" none /mnt
  $ cat /proc/mounts
  none /root/ovl/mnt overlay rw,relatime,lowerdir=ovl/lower,upperdir=ovl/upper,workdir=ovl/work/ 0 0
  none /proc fuse.pwn user_id=1000 0 0
  $ fusermount -u /proc
  $ cat /proc/mounts
  cat: /proc/mounts: No such file or directory

This fixes the problem by adding new seq_show_option and
seq_show_option_n helpers, and updating the vulnerable show_option
handlers to use them as needed.  Some, like SELinux, need to be open
coded due to unusual existing escape mechanisms.

[akpm@linux-foundation.org: add lost chunk, per Kees]
[keescook@chromium.org: seq_show_option should be using const parameters]
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Acked-by: Jan Kara <jack@suse.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Cc: J. R. Okajima <hooanon05g@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ceph/super.c          |  2 +-
 fs/cifs/cifsfs.c         |  6 +++---
 fs/ext4/super.c          |  4 ++--
 fs/gfs2/super.c          |  6 +++---
 fs/hfs/super.c           |  4 ++--
 fs/hfsplus/options.c     |  4 ++--
 fs/hostfs/hostfs_kern.c  |  2 +-
 fs/ocfs2/super.c         |  4 ++--
 fs/overlayfs/super.c     |  6 +++---
 fs/reiserfs/super.c      |  8 +++++---
 fs/xfs/xfs_super.c       |  4 ++--
 include/linux/seq_file.h | 35 +++++++++++++++++++++++++++++++++++
 kernel/cgroup.c          |  7 ++++---
 net/ceph/ceph_common.c   |  7 +++++--
 security/selinux/hooks.c |  2 +-
 15 files changed, 71 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c321b9..7b6bfcbf801c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
 		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
 	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+		seq_show_option(m, "snapdirname", fsopt->snapdir_name);
 
 	return 0;
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a9fb6b53126..6a1119e87fbb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	struct sockaddr *srcaddr;
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
-	seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
+	seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
 	cifs_show_security(s, tcon->ses);
 	cifs_show_cache_flavor(s, cifs_sb);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
 		seq_puts(s, ",multiuser");
 	else if (tcon->ses->user_name)
-		seq_printf(s, ",username=%s", tcon->ses->user_name);
+		seq_show_option(s, "username", tcon->ses->user_name);
 
 	if (tcon->ses->domainName)
-		seq_printf(s, ",domain=%s", tcon->ses->domainName);
+		seq_show_option(s, "domain", tcon->ses->domainName);
 
 	if (srcaddr->sa_family != AF_UNSPEC) {
 		struct sockaddr_in *saddr4;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee3878262a49..a63c7b0a10cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1776,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 	}
 
 	if (sbi->s_qf_names[USRQUOTA])
-		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+		seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
 
 	if (sbi->s_qf_names[GRPQUOTA])
-		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+		seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
 #endif
 }
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445947e1..894fb01a91da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 	if (is_ancestor(root, sdp->sd_master_dir))
 		seq_puts(s, ",meta");
 	if (args->ar_lockproto[0])
-		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+		seq_show_option(s, "lockproto", args->ar_lockproto);
 	if (args->ar_locktable[0])
-		seq_printf(s, ",locktable=%s", args->ar_locktable);
+		seq_show_option(s, "locktable", args->ar_locktable);
 	if (args->ar_hostdata[0])
-		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+		seq_show_option(s, "hostdata", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_puts(s, ",spectator");
 	if (args->ar_localflocks)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 55c03b9e9070..4574fdd3d421 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
 	struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
 
 	if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
-		seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+		seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
 	if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
-		seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+		seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
 	seq_printf(seq, ",uid=%u,gid=%u",
 			from_kuid_munged(&init_user_ns, sbi->s_uid),
 			from_kgid_munged(&init_user_ns, sbi->s_gid));
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c90b72ee676d..bb806e58c977 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
 
 	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
-		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+		seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
 	if (sbi->type != HFSPLUS_DEF_CR_TYPE)
-		seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+		seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
 	seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
 			from_kuid_munged(&init_user_ns, sbi->uid),
 			from_kgid_munged(&init_user_ns, sbi->gid));
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 059597b23f67..2ac99db3750e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
 	size_t offset = strlen(root_ino) + 1;
 
 	if (strlen(root_path) > offset)
-		seq_printf(seq, ",%s", root_path + offset);
+		seq_show_option(seq, root_path + offset, NULL);
 
 	if (append)
 		seq_puts(seq, ",append");
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3a9a1af39ad7..2de4c8a9340c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1563,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",localflocks,");
 
 	if (osb->osb_cluster_stack[0])
-		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
-			   osb->osb_cluster_stack);
+		seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
+				  OCFS2_STACK_LABEL_LEN);
 	if (opts & OCFS2_MOUNT_USRQUOTA)
 		seq_printf(s, ",usrquota");
 	if (opts & OCFS2_MOUNT_GRPQUOTA)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7466ff339c66..79073d68b475 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
 	struct super_block *sb = dentry->d_sb;
 	struct ovl_fs *ufs = sb->s_fs_info;
 
-	seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+	seq_show_option(m, "lowerdir", ufs->config.lowerdir);
 	if (ufs->config.upperdir) {
-		seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
-		seq_printf(m, ",workdir=%s", ufs->config.workdir);
+		seq_show_option(m, "upperdir", ufs->config.upperdir);
+		seq_show_option(m, "workdir", ufs->config.workdir);
 	}
 	return 0;
 }
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0e4cf728126f..4a62fe8cc3bf 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_puts(seq, ",acl");
 
 	if (REISERFS_SB(s)->s_jdev)
-		seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev);
+		seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
 
 	if (journal->j_max_commit_age != journal->j_default_max_commit_age)
 		seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
 
 #ifdef CONFIG_QUOTA
 	if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
-		seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]);
+		seq_show_option(seq, "usrjquota",
+				REISERFS_SB(s)->s_qf_names[USRQUOTA]);
 	else if (opts & (1 << REISERFS_USRQUOTA))
 		seq_puts(seq, ",usrquota");
 	if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
-		seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
+		seq_show_option(seq, "grpjquota",
+				REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
 	else if (opts & (1 << REISERFS_GRPQUOTA))
 		seq_puts(seq, ",grpquota");
 	if (REISERFS_SB(s)->s_jquota_fmt) {
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1fb16562c159..bbd9b1f10ffb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -511,9 +511,9 @@ xfs_showargs(
 		seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
 
 	if (mp->m_logname)
-		seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+		seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
 	if (mp->m_rtname)
-		seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+		seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
 
 	if (mp->m_dalign > 0)
 		seq_printf(m, "," MNTOPT_SUNIT "=%d",
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 912a7c482649..d4c7271382cb 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -149,6 +149,41 @@ static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
 #endif
 }
 
+/**
+ * seq_show_options - display mount options with appropriate escapes.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option name's value, can be NULL
+ */
+static inline void seq_show_option(struct seq_file *m, const char *name,
+				   const char *value)
+{
+	seq_putc(m, ',');
+	seq_escape(m, name, ",= \t\n\\");
+	if (value) {
+		seq_putc(m, '=');
+		seq_escape(m, value, ", \t\n\\");
+	}
+}
+
+/**
+ * seq_show_option_n - display mount options with appropriate escapes
+ *		       where @value must be a specific length.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option name's value, cannot be NULL
+ * @length: the length of @value to display
+ *
+ * This is a macro since this uses "length" to define the size of the
+ * stack buffer.
+ */
+#define seq_show_option_n(m, name, value, length) {	\
+	char val_buf[length + 1];			\
+	strncpy(val_buf, value, length);		\
+	val_buf[length] = '\0';				\
+	seq_show_option(m, name, val_buf);		\
+}
+
 #define SEQ_START_TOKEN ((void *)1)
 /*
  * Helpers for iteration over list_head-s in seq_files
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f3f5cd5e2c0d..a8538e443784 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
 	if (root != &cgrp_dfl_root)
 		for_each_subsys(ss, ssid)
 			if (root->subsys_mask & (1 << ssid))
-				seq_printf(seq, ",%s", ss->legacy_name);
+				seq_show_option(seq, ss->name, NULL);
 	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
@@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
 
 	spin_lock(&release_agent_path_lock);
 	if (strlen(root->release_agent_path))
-		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+		seq_show_option(seq, "release_agent",
+				root->release_agent_path);
 	spin_unlock(&release_agent_path_lock);
 
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
-		seq_printf(seq, ",name=%s", root->name);
+		seq_show_option(seq, "name", root->name);
 	return 0;
 }
 
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index f30329f72641..69a4d30a9ccf 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -517,8 +517,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 	struct ceph_options *opt = client->options;
 	size_t pos = m->count;
 
-	if (opt->name)
-		seq_printf(m, "name=%s,", opt->name);
+	if (opt->name) {
+		seq_puts(m, "name=");
+		seq_escape(m, opt->name, ", \t\n\\");
+		seq_putc(m, ',');
+	}
 	if (opt->key)
 		seq_puts(m, "secret=<hidden>,");
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 564079c5c49d..cdf4c589a391 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1100,7 +1100,7 @@ static void selinux_write_opts(struct seq_file *m,
 		seq_puts(m, prefix);
 		if (has_comma)
 			seq_putc(m, '\"');
-		seq_puts(m, opts->mnt_opts[i]);
+		seq_escape(m, opts->mnt_opts[i], "\"\n\\");
 		if (has_comma)
 			seq_putc(m, '\"');
 	}
-- 
cgit v1.2.3-70-g09d2


From 230ec93909f00678401cb2d63b8b95f1dea68e40 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 4 Sep 2015 15:45:06 -0700
Subject: smpboot: allow passing the cpumask on per-cpu thread registration

It makes the registration cheaper and simpler for the smpboot per-cpu
kthread users that don't need to always update the cpumask after threads
creation.

[sfr@canb.auug.org.au: fix for allow passing the cpumask on per-cpu thread registration]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/smpboot.h | 11 ++++++++++-
 kernel/smpboot.c        | 14 +++++++++-----
 kernel/watchdog.c       |  9 +++------
 3 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index da3c593f9845..e6109a6cd8f6 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -48,7 +48,16 @@ struct smp_hotplug_thread {
 	const char			*thread_comm;
 };
 
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+					   const struct cpumask *cpumask);
+
+static inline int
+smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+	return smpboot_register_percpu_thread_cpumask(plug_thread,
+						      cpu_possible_mask);
+}
+
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
 int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
 					 const struct cpumask *);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 60aa858a6a07..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -273,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 }
 
 /**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * 					    to hotplug
  * @plug_thread:	Hotplug thread descriptor
+ * @cpumask:		The cpumask where threads run
  *
  * Creates and starts the threads on all online cpus.
  */
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+					   const struct cpumask *cpumask)
 {
 	unsigned int cpu;
 	int ret = 0;
 
 	if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
 		return -ENOMEM;
-	cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+	cpumask_copy(plug_thread->cpumask, cpumask);
 
 	get_online_cpus();
 	mutex_lock(&smpboot_threads_lock);
@@ -296,7 +299,8 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 			free_cpumask_var(plug_thread->cpumask);
 			goto out;
 		}
-		smpboot_unpark_thread(plug_thread, cpu);
+		if (cpumask_test_cpu(cpu, cpumask))
+			smpboot_unpark_thread(plug_thread, cpu);
 	}
 	list_add(&plug_thread->list, &hotplug_threads);
 out:
@@ -304,7 +308,7 @@ out:
 	put_online_cpus();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
 
 /**
  * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..e5bb86fb0ea5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -713,15 +713,12 @@ static int watchdog_enable_all_cpus(void)
 	int err = 0;
 
 	if (!watchdog_running) {
-		err = smpboot_register_percpu_thread(&watchdog_threads);
+		err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+							     &watchdog_cpumask);
 		if (err)
 			pr_err("Failed to create watchdog threads, disabled\n");
-		else {
-			if (smpboot_update_cpumask_percpu_thread(
-				    &watchdog_threads, &watchdog_cpumask))
-				pr_err("Failed to set cpumask for watchdog threads\n");
+		else
 			watchdog_running = 1;
-		}
 	} else {
 		/*
 		 * Enable/disable the lockup detectors or
-- 
cgit v1.2.3-70-g09d2


From aacfbe6a9724bb6d66a656a5abcc681d5649ed92 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 4 Sep 2015 15:45:12 -0700
Subject: kernel/watchdog: move NMI function header declarations from
 watchdog.h to nmi.h

The kernel's NMI watchdog has nothing to do with the watchdog subsystem.
Its header declarations should be in linux/nmi.h, not linux/watchdog.h.

The code provided two sets of dummy functions if HARDLOCKUP_DETECTOR is
not configured, one in the include file and one in kernel/watchdog.c.
Remove the dummy functions from kernel/watchdog.c and use those from the
include file.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
 include/linux/nmi.h                    | 8 +++++---
 include/linux/watchdog.h               | 8 --------
 kernel/watchdog.c                      | 2 --
 4 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3f124d553c5a..36bd8250934b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index f94da0e65dea..088714537d10 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -26,10 +26,12 @@ static inline void touch_nmi_watchdog(void)
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
+void watchdog_nmi_disable_all(void);
+void watchdog_nmi_enable_all(void);
 #else
-static inline void hardlockup_detector_disable(void)
-{
-}
+static inline void hardlockup_detector_disable(void) {}
+static inline void watchdog_nmi_disable_all(void) {}
+static inline void watchdog_nmi_enable_all(void) {}
 #endif
 
 /*
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index f47feada5b42..d74a0e907b9e 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
 extern int watchdog_register_device(struct watchdog_device *);
 extern void watchdog_unregister_device(struct watchdog_device *);
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void watchdog_nmi_disable_all(void);
-void watchdog_nmi_enable_all(void);
-#else
-static inline void watchdog_nmi_disable_all(void) {}
-static inline void watchdog_nmi_enable_all(void) {}
-#endif
-
 #endif  /* ifndef _LINUX_WATCHDOG_H */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d18330fa4776..e74d48bc3e61 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -651,8 +651,6 @@ unlock:
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
-void watchdog_nmi_enable_all(void) {}
-void watchdog_nmi_disable_all(void) {}
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 static struct smp_hotplug_thread watchdog_threads = {
-- 
cgit v1.2.3-70-g09d2


From 8c073d27d7ad293bf734cc8475689413afadab81 Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell <uobergfe@redhat.com>
Date: Fri, 4 Sep 2015 15:45:18 -0700
Subject: watchdog: introduce watchdog_suspend() and watchdog_resume()

This interface can be utilized to deactivate the hard and soft lockup
detector temporarily.  Callers are expected to minimize the duration of
deactivation.  Multiple deactivations are allowed to occur in parallel but
should be rare in practice.

[akpm@linux-foundation.org: remove unneeded static initialization]
Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Stephane Eranian <eranian@google.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nmi.h |  2 ++
 kernel/watchdog.c   | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 088714537d10..e9f213c337bb 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -82,6 +82,8 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
+extern int watchdog_suspend(void);
+extern void watchdog_resume(void);
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6c489e49c610..e6eb5b697212 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -67,6 +67,7 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
+static int __read_mostly watchdog_suspended;
 static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
 
@@ -700,6 +701,50 @@ static void watchdog_unpark_threads(void)
 	put_online_cpus();
 }
 
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int watchdog_suspend(void)
+{
+	int ret = 0;
+
+	mutex_lock(&watchdog_proc_mutex);
+	/*
+	 * Multiple suspend requests can be active in parallel (counted by
+	 * the 'watchdog_suspended' variable). If the watchdog threads are
+	 * running, the first caller takes care that they will be parked.
+	 * The state of 'watchdog_running' cannot change while a suspend
+	 * request is active (see related changes in 'proc' handlers).
+	 */
+	if (watchdog_running && !watchdog_suspended)
+		ret = watchdog_park_threads();
+
+	if (ret == 0)
+		watchdog_suspended++;
+
+	mutex_unlock(&watchdog_proc_mutex);
+
+	return ret;
+}
+
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void watchdog_resume(void)
+{
+	mutex_lock(&watchdog_proc_mutex);
+
+	watchdog_suspended--;
+	/*
+	 * The watchdog threads are unparked if they were previously running
+	 * and if there is no more active suspend request.
+	 */
+	if (watchdog_running && !watchdog_suspended)
+		watchdog_unpark_threads();
+
+	mutex_unlock(&watchdog_proc_mutex);
+}
+
 static void restart_watchdog_hrtimer(void *info)
 {
 	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
@@ -818,6 +863,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 
 	mutex_lock(&watchdog_proc_mutex);
 
+	if (watchdog_suspended) {
+		/* no parameter changes allowed while watchdog is suspended */
+		err = -EAGAIN;
+		goto out;
+	}
+
 	/*
 	 * If the parameter is being read return the state of the corresponding
 	 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -903,6 +954,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 
 	mutex_lock(&watchdog_proc_mutex);
 
+	if (watchdog_suspended) {
+		/* no parameter changes allowed while watchdog is suspended */
+		err = -EAGAIN;
+		goto out;
+	}
+
 	old = ACCESS_ONCE(watchdog_thresh);
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
@@ -934,6 +991,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 	int err;
 
 	mutex_lock(&watchdog_proc_mutex);
+
+	if (watchdog_suspended) {
+		/* no parameter changes allowed while watchdog is suspended */
+		err = -EAGAIN;
+		goto out;
+	}
+
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
 	if (!err && write) {
 		/* Remove impossible cpus to keep sysctl output cleaner. */
@@ -951,6 +1015,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 				pr_err("cpumask update failed\n");
 		}
 	}
+out:
 	mutex_unlock(&watchdog_proc_mutex);
 	return err;
 }
-- 
cgit v1.2.3-70-g09d2


From 999bbe49ea0118b70ddf3f5d679f51dc7a97ae55 Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell <uobergfe@redhat.com>
Date: Fri, 4 Sep 2015 15:45:25 -0700
Subject: watchdog: use suspend/resume interface in fixup_ht_bug()

Remove watchdog_nmi_disable_all() and watchdog_nmi_enable_all() since
these functions are no longer needed.  If a subsystem has a need to
deactivate the watchdog temporarily, it should utilize the
watchdog_suspend() and watchdog_resume() functions.

[akpm@linux-foundation.org: fix build with CONFIG_LOCKUP_DETECTOR=m]
Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Stephane Eranian <eranian@google.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c |  7 +++++--
 include/linux/nmi.h                    | 13 +++++++++----
 kernel/watchdog.c                      | 35 ----------------------------------
 3 files changed, 14 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 36bd8250934b..144ab91951a7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	watchdog_nmi_disable_all();
+	if (watchdog_suspend() != 0) {
+		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+		return 0;
+	}
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	watchdog_nmi_enable_all();
+	watchdog_resume();
 
 	get_online_cpus();
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e9f213c337bb..e5afe8bae202 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -26,12 +26,8 @@ static inline void touch_nmi_watchdog(void)
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
-void watchdog_nmi_disable_all(void);
-void watchdog_nmi_enable_all(void);
 #else
 static inline void hardlockup_detector_disable(void) {}
-static inline void watchdog_nmi_disable_all(void) {}
-static inline void watchdog_nmi_enable_all(void) {}
 #endif
 
 /*
@@ -84,6 +80,15 @@ extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
 extern int watchdog_suspend(void);
 extern void watchdog_resume(void);
+#else
+static inline int watchdog_suspend(void)
+{
+	return 0;
+}
+
+static inline void watchdog_resume(void)
+{
+}
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index eb8f94b50101..69666f4b8e8f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -615,41 +615,6 @@ static void watchdog_nmi_disable(unsigned int cpu)
 	}
 }
 
-void watchdog_nmi_enable_all(void)
-{
-	int cpu;
-
-	mutex_lock(&watchdog_proc_mutex);
-
-	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-		goto unlock;
-
-	get_online_cpus();
-	for_each_watchdog_cpu(cpu)
-		watchdog_nmi_enable(cpu);
-	put_online_cpus();
-
-unlock:
-	mutex_unlock(&watchdog_proc_mutex);
-}
-
-void watchdog_nmi_disable_all(void)
-{
-	int cpu;
-
-	mutex_lock(&watchdog_proc_mutex);
-
-	if (!watchdog_running)
-		goto unlock;
-
-	get_online_cpus();
-	for_each_watchdog_cpu(cpu)
-		watchdog_nmi_disable(cpu);
-	put_online_cpus();
-
-unlock:
-	mutex_unlock(&watchdog_proc_mutex);
-}
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
-- 
cgit v1.2.3-70-g09d2


From ec6a90661a0d6ce1461d05c7a58a0a151154e14a Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell <uobergfe@redhat.com>
Date: Fri, 4 Sep 2015 15:45:28 -0700
Subject: watchdog: rename watchdog_suspend() and watchdog_resume()

Rename watchdog_suspend() to lockup_detector_suspend() and
watchdog_resume() to lockup_detector_resume() to avoid confusion with the
watchdog subsystem and to be consistent with the existing name
lockup_detector_init().

Also provide comment blocks to explain the watchdog_running and
watchdog_suspended variables and their relationship.

Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Stephane Eranian <eranian@google.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c |  4 ++--
 include/linux/nmi.h                    |  8 ++++----
 kernel/watchdog.c                      | 26 ++++++++++++++++++++++----
 3 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 144ab91951a7..cd9b6d0b10bf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -3627,7 +3627,7 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	if (watchdog_suspend() != 0) {
+	if (lockup_detector_suspend() != 0) {
 		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
 		return 0;
 	}
@@ -3638,7 +3638,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	watchdog_resume();
+	lockup_detector_resume();
 
 	get_online_cpus();
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e5afe8bae202..a91adf6e02f2 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -78,15 +78,15 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
-extern int watchdog_suspend(void);
-extern void watchdog_resume(void);
+extern int lockup_detector_suspend(void);
+extern void lockup_detector_resume(void);
 #else
-static inline int watchdog_suspend(void)
+static inline int lockup_detector_suspend(void)
 {
 	return 0;
 }
 
-static inline void watchdog_resume(void)
+static inline void lockup_detector_resume(void)
 {
 }
 #endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 69666f4b8e8f..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -67,8 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
-static int __read_mostly watchdog_suspended;
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it is an indicator whether the threads exist.
+ */
 static int __read_mostly watchdog_running;
+/*
+ * If a subsystem has a need to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -669,7 +687,7 @@ static void watchdog_unpark_threads(void)
 /*
  * Suspend the hard and soft lockup detector by parking the watchdog threads.
  */
-int watchdog_suspend(void)
+int lockup_detector_suspend(void)
 {
 	int ret = 0;
 
@@ -679,7 +697,7 @@ int watchdog_suspend(void)
 	 * the 'watchdog_suspended' variable). If the watchdog threads are
 	 * running, the first caller takes care that they will be parked.
 	 * The state of 'watchdog_running' cannot change while a suspend
-	 * request is active (see related changes in 'proc' handlers).
+	 * request is active (see related code in 'proc' handlers).
 	 */
 	if (watchdog_running && !watchdog_suspended)
 		ret = watchdog_park_threads();
@@ -695,7 +713,7 @@ int watchdog_suspend(void)
 /*
  * Resume the hard and soft lockup detector by unparking the watchdog threads.
  */
-void watchdog_resume(void)
+void lockup_detector_resume(void)
 {
 	mutex_lock(&watchdog_proc_mutex);
 
-- 
cgit v1.2.3-70-g09d2


From 484748f0b65a1950b2b93f444a2287e8dd2cedd6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Fri, 4 Sep 2015 15:45:34 -0700
Subject: slab: infrastructure for bulk object allocation and freeing

Add the basic infrastructure for alloc/free operations on pointer arrays.
It includes a generic function in the common slab code that is used in
this infrastructure patch to create the unoptimized functionality for slab
bulk operations.

Allocators can then provide optimized allocation functions for situations
in which large numbers of objects are needed.  These optimization may
avoid taking locks repeatedly and bypass metadata creation if all objects
in slab pages can be used to provide the objects required.

Allocators can extend the skeletons provided and add their own code to the
bulk alloc and free functions.  They can keep the generic allocation and
freeing and just fall back to those if optimizations would not work (like
for example when debugging is on).

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 10 ++++++++++
 mm/slab.c            | 13 +++++++++++++
 mm/slab.h            |  9 +++++++++
 mm/slab_common.c     | 23 +++++++++++++++++++++++
 mm/slob.c            | 13 +++++++++++++
 mm/slub.c            | 14 ++++++++++++++
 6 files changed, 82 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index a99f0e5243e1..7e37d448ed91 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags);
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
 void kmem_cache_free(struct kmem_cache *, void *);
 
+/*
+ * Bulk allocation and freeing operations. These are accellerated in an
+ * allocator specific way to avoid taking locks repeatedly or building
+ * metadata structures unnecessarily.
+ *
+ * Note that interrupts must be enabled when calling these functions.
+ */
+void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
 #ifdef CONFIG_NUMA
 void *__kmalloc_node(size_t size, gfp_t flags, int node);
 void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
diff --git a/mm/slab.c b/mm/slab.c
index bbd0b47dc6a9..60c936938b84 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+	__kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+								void **p)
+{
+	return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
 #ifdef CONFIG_TRACING
 void *
 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
diff --git a/mm/slab.h b/mm/slab.h
index 8da63e4e470f..88b55497738c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 		       size_t count, loff_t *ppos);
 
+/*
+ * Generic implementation of bulk operations
+ * These are useful for situations in which the allocator cannot
+ * perform optimizations. In that case segments of the objecct listed
+ * may be allocated or freed using these operations.
+ */
+void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * Iterate over all memcg caches of the given root cache. The caller must hold
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 86831105a09f..c26829fe4e37 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
 }
 #endif
 
+void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
+{
+	size_t i;
+
+	for (i = 0; i < nr; i++)
+		kmem_cache_free(s, p[i]);
+}
+
+bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
+								void **p)
+{
+	size_t i;
+
+	for (i = 0; i < nr; i++) {
+		void *x = p[i] = kmem_cache_alloc(s, flags);
+		if (!x) {
+			__kmem_cache_free_bulk(s, i, p);
+			return false;
+		}
+	}
+	return true;
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 void slab_init_memcg_params(struct kmem_cache *s)
 {
diff --git a/mm/slob.c b/mm/slob.c
index 4765f65019c7..165bbd3cd606 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+	__kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+								void **p)
+{
+	return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
 int __kmem_cache_shutdown(struct kmem_cache *c)
 {
 	/* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index defd76f98648..3ca89ef9b7b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2750,6 +2750,20 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+	__kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+								void **p)
+{
+	return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
+
 /*
  * Object placement in a slab is made very easy because we always start at
  * offset 0. If we tune the size of the object to the alignment then we can
-- 
cgit v1.2.3-70-g09d2


From 51360155eccb907ff8635bd10fc7de876408c2e0 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:46:04 -0700
Subject: userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key

userfaultfd needs to wake all waitqueues (pass 0 as nr parameter), instead
of the current hardcoded 1 (that would wake just the first waitqueue in
the head list).

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h | 5 +++--
 kernel/sched/wait.c  | 7 ++++---
 net/sunrpc/sched.c   | 2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..d3d077228d4c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
 
 typedef int wait_bit_action_f(struct wait_bit_key *);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+			  void *key);
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 #define wake_up_poll(x, m)						\
 	__wake_up(x, TASK_NORMAL, 1, (void *) (m))
 #define wake_up_locked_poll(x, m)					\
-	__wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
+	__wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
 #define wake_up_interruptible_poll(x, m)				\
 	__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
 #define wake_up_interruptible_sync_poll(x, m)				\
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+			  void *key)
 {
-	__wake_up_common(q, mode, 1, 0, key);
+	__wake_up_common(q, mode, nr, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_locked_key(q, mode, key);
+		__wake_up_locked_key(q, mode, 1, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 337ca851a350..b140c092d226 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)
 	clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
 	ret = atomic_dec_and_test(&task->tk_count);
 	if (waitqueue_active(wq))
-		__wake_up_locked_key(wq, TASK_NORMAL, &k);
+		__wake_up_locked_key(wq, TASK_NORMAL, 1, &k);
 	spin_unlock_irqrestore(&wq->lock, flags);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 932b18e0aec65acb089f4bd8761ee85e70f8eb6a Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:46:10 -0700
Subject: userfaultfd: linux/userfaultfd_k.h

Kernel header defining the methods needed by the VM common code to
interact with the userfaultfd.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/userfaultfd_k.h | 79 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 include/linux/userfaultfd_k.h

(limited to 'include/linux')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
new file mode 100644
index 000000000000..e1e43609a179
--- /dev/null
+++ b/include/linux/userfaultfd_k.h
@@ -0,0 +1,79 @@
+/*
+ *  include/linux/userfaultfd_k.h
+ *
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_K_H
+#define _LINUX_USERFAULTFD_K_H
+
+#ifdef CONFIG_USERFAULTFD
+
+#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
+
+#include <linux/fcntl.h>
+
+/*
+ * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
+ * new flags, since they might collide with O_* ones. We want
+ * to re-use O_* flags that couldn't possibly have a meaning
+ * from userfaultfd, in order to leave a free define-space for
+ * shared O_* flags.
+ */
+#define UFFD_CLOEXEC O_CLOEXEC
+#define UFFD_NONBLOCK O_NONBLOCK
+
+#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
+#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
+
+extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+			    unsigned int flags, unsigned long reason);
+
+/* mm helpers */
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+					struct vm_userfaultfd_ctx vm_ctx)
+{
+	return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_UFFD_MISSING;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+}
+
+#else /* CONFIG_USERFAULTFD */
+
+/* mm helpers */
+static inline int handle_userfault(struct vm_area_struct *vma,
+				   unsigned long address,
+				   unsigned int flags,
+				   unsigned long reason)
+{
+	return VM_FAULT_SIGBUS;
+}
+
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+					struct vm_userfaultfd_ctx vm_ctx)
+{
+	return true;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+	return false;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+	return false;
+}
+
+#endif /* CONFIG_USERFAULTFD */
+
+#endif /* _LINUX_USERFAULTFD_K_H */
-- 
cgit v1.2.3-70-g09d2


From 745f234be12b6191b15eae8dd415cc81a9137f47 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:46:14 -0700
Subject: userfaultfd: add vm_userfaultfd_ctx to the vm_area_struct

This adds the vm_userfaultfd_ctx to the vm_area_struct.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 11 +++++++++++
 kernel/fork.c            |  1 +
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 15549578d559..26a30c3566f0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -256,6 +256,16 @@ struct vm_region {
 						* this region */
 };
 
+#ifdef CONFIG_USERFAULTFD
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
+struct vm_userfaultfd_ctx {
+	struct userfaultfd_ctx *ctx;
+};
+#else /* CONFIG_USERFAULTFD */
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
+struct vm_userfaultfd_ctx {};
+#endif /* CONFIG_USERFAULTFD */
+
 /*
  * This struct defines a memory VMM memory area. There is one of these
  * per VM-area/task.  A VM area is any part of the process virtual memory
@@ -322,6 +332,7 @@ struct vm_area_struct {
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
+	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 };
 
 struct core_thread {
diff --git a/kernel/fork.c b/kernel/fork.c
index 03aa2e6de7a4..ceb4eb4abb9d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -456,6 +456,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			goto fail_nomem_anon_vma_fork;
 		tmp->vm_flags &= ~VM_LOCKED;
 		tmp->vm_next = tmp->vm_prev = NULL;
+		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		file = tmp->vm_file;
 		if (file) {
 			struct inode *inode = file_inode(file);
-- 
cgit v1.2.3-70-g09d2


From 16ba6f811dfe44bc14f7946a4b257b85476fc16e Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:46:17 -0700
Subject: userfaultfd: add VM_UFFD_MISSING and VM_UFFD_WP

These two flags gets set in vma->vm_flags to tell the VM common code
if the userfaultfd is armed and in which mode (only tracking missing
faults, only tracking wrprotect faults or both). If neither flags is
set it means the userfaultfd is not armed on the vma.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 2 ++
 include/linux/mm.h | 2 ++
 kernel/fork.c      | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..3b4d8255e806 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_HUGEPAGE)]	= "hg",
 		[ilog2(VM_NOHUGEPAGE)]	= "nh",
 		[ilog2(VM_MERGEABLE)]	= "mg",
+		[ilog2(VM_UFFD_MISSING)]= "um",
+		[ilog2(VM_UFFD_WP)]	= "uw",
 	};
 	size_t i;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf6f117fcf4d..0f7cd30039ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_MAYSHARE	0x00000080
 
 #define VM_GROWSDOWN	0x00000100	/* general info on the segment */
+#define VM_UFFD_MISSING	0x00000200	/* missing pages tracking */
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */
 
 #define VM_LOCKED	0x00002000
 #define VM_IO           0x00004000	/* Memory mapped I/O or similar */
diff --git a/kernel/fork.c b/kernel/fork.c
index ceb4eb4abb9d..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,7 +454,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		tmp->vm_mm = mm;
 		if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &= ~VM_LOCKED;
+		tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
 		tmp->vm_next = tmp->vm_prev = NULL;
 		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		file = tmp->vm_file;
-- 
cgit v1.2.3-70-g09d2


From 19a809afe2fe089317226bbe5c5a1ce7f53dcdca Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:46:24 -0700
Subject: userfaultfd: teach vma_merge to merge across vma->vm_userfaultfd_ctx

vma->vm_userfaultfd_ctx is yet another vma parameter that vma_merge
must be aware about so that we can merge vmas back like they were
originally before arming the userfaultfd on some memory range.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/madvise.c       |  3 ++-
 mm/mempolicy.c     |  4 ++--
 mm/mlock.c         |  3 ++-
 mm/mmap.c          | 40 +++++++++++++++++++++++++++-------------
 mm/mprotect.c      |  3 ++-
 6 files changed, 36 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0f7cd30039ea..77a9d609523e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1835,7 +1835,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *);
+	struct mempolicy *, struct vm_userfaultfd_ctx);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
 	struct vm_area_struct *, unsigned long addr, int new_below);
diff --git a/mm/madvise.c b/mm/madvise.c
index 64bb8a22110c..911357973905 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
-				vma->vm_file, pgoff, vma_policy(vma));
+			  vma->vm_file, pgoff, vma_policy(vma),
+			  vma->vm_userfaultfd_ctx);
 	if (*prev) {
 		vma = *prev;
 		goto success;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 99d4c1d0b858..a7f1e0d1d6b8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 		pgoff = vma->vm_pgoff +
 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
-				  vma->anon_vma, vma->vm_file, pgoff,
-				  new_pol);
+				 vma->anon_vma, vma->vm_file, pgoff,
+				 new_pol, vma->vm_userfaultfd_ctx);
 		if (prev) {
 			vma = prev;
 			next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 6fd2cf15e868..25936680064f 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
-			  vma->vm_file, pgoff, vma_policy(vma));
+			  vma->vm_file, pgoff, vma_policy(vma),
+			  vma->vm_userfaultfd_ctx);
 	if (*prev) {
 		vma = *prev;
 		goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index f126923ce683..82db4fc0a9d3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -41,6 +41,7 @@
 #include <linux/notifier.h>
 #include <linux/memory.h>
 #include <linux/printk.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -919,7 +920,8 @@ again:			remove_next = 1 + (end > next->vm_end);
  * per-vma resources, so we don't attempt to merge those.
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
-			struct file *file, unsigned long vm_flags)
+				struct file *file, unsigned long vm_flags,
+				struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 {
 	/*
 	 * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 		return 0;
 	if (vma->vm_ops && vma->vm_ops->close)
 		return 0;
+	if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
+		return 0;
 	return 1;
 }
 
@@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  */
 static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+		     struct anon_vma *anon_vma, struct file *file,
+		     pgoff_t vm_pgoff,
+		     struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		if (vma->vm_pgoff == vm_pgoff)
 			return 1;
@@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  */
 static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+		    struct anon_vma *anon_vma, struct file *file,
+		    pgoff_t vm_pgoff,
+		    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
 		vm_pglen = vma_pages(vma);
@@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
 			struct anon_vma *anon_vma, struct file *file,
-			pgoff_t pgoff, struct mempolicy *policy)
+			pgoff_t pgoff, struct mempolicy *policy,
+			struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
@@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	if (prev && prev->vm_end == addr &&
 			mpol_equal(vma_policy(prev), policy) &&
 			can_vma_merge_after(prev, vm_flags,
-						anon_vma, file, pgoff)) {
+					    anon_vma, file, pgoff,
+					    vm_userfaultfd_ctx)) {
 		/*
 		 * OK, it can.  Can we now merge in the successor as well?
 		 */
 		if (next && end == next->vm_start &&
 				mpol_equal(policy, vma_policy(next)) &&
 				can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen) &&
+						     anon_vma, file,
+						     pgoff+pglen,
+						     vm_userfaultfd_ctx) &&
 				is_mergeable_anon_vma(prev->anon_vma,
 						      next->anon_vma, NULL)) {
 							/* cases 1, 6 */
@@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	if (next && end == next->vm_start &&
 			mpol_equal(policy, vma_policy(next)) &&
 			can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen)) {
+					     anon_vma, file, pgoff+pglen,
+					     vm_userfaultfd_ctx)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = vma_adjust(prev, prev->vm_start,
 				addr, prev->vm_pgoff, NULL);
@@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	/*
 	 * Can we just expand an old mapping?
 	 */
-	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
-			NULL);
+	vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
 	if (vma)
 		goto out;
 
@@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
-					NULL, NULL, pgoff, NULL);
+			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
 	if (vma)
 		goto out;
 
@@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
 		return NULL;	/* should never get here */
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			    vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			    vma->vm_userfaultfd_ctx);
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e7d6f1171ecb..ef5be8eaab00 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 */
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*pprev = vma_merge(mm, *pprev, start, end, newflags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			   vma->vm_userfaultfd_ctx);
 	if (*pprev) {
 		vma = *pprev;
 		goto success;
-- 
cgit v1.2.3-70-g09d2


From 1380fca084743fef8d17e59b273473393944ce58 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:46:58 -0700
Subject: userfaultfd: activate syscall

This activates the userfaultfd syscall.

[sfr@canb.auug.org.au: activate syscall fix]
[akpm@linux-foundation.org: don't enable userfaultfd on powerpc]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/entry/syscalls/syscall_32.tbl | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 include/linux/syscalls.h               | 1 +
 kernel/sys_ni.c                        | 1 +
 4 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 25e3cf1cd8fd..477bfa6db370 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -380,3 +380,4 @@
 371	i386	recvfrom		sys_recvfrom			compat_sys_recvfrom
 372	i386	recvmsg			sys_recvmsg			compat_sys_recvmsg
 373	i386	shutdown		sys_shutdown
+374	i386	userfaultfd		sys_userfaultfd
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..81c490634db9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,7 @@
 320	common	kexec_file_load		sys_kexec_file_load
 321	common	bpf			sys_bpf
 322	64	execveat		stub_execveat
+323	common	userfaultfd		sys_userfaultfd
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b45c45b8c829..08001317aee7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_eventfd2(unsigned int count, int flags);
 asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
+asmlinkage long sys_userfaultfd(int flags);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ca7d84f438f1..03c3875d9958 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
 cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
 
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
-- 
cgit v1.2.3-70-g09d2


From c1a4de99fada21e2e9251e52cbb51eff5aadc757 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:47:04 -0700
Subject: userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE
 preparation

This implements mcopy_atomic and mfill_zeropage that are the lowlevel
VM methods that are invoked respectively by the UFFDIO_COPY and
UFFDIO_ZEROPAGE userfaultfd commands.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/userfaultfd_k.h |   6 +
 mm/Makefile                   |   1 +
 mm/userfaultfd.c              | 269 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 276 insertions(+)
 create mode 100644 mm/userfaultfd.c

(limited to 'include/linux')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e1e43609a179..587480ad41b7 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -30,6 +30,12 @@
 extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 			    unsigned int flags, unsigned long reason);
 
+extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+			    unsigned long src_start, unsigned long len);
+extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
+			      unsigned long dst_start,
+			      unsigned long len);
+
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 					struct vm_userfaultfd_ctx vm_ctx)
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eaeabdcb..b424d5e5b6ff 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
new file mode 100644
index 000000000000..c54c761609fc
--- /dev/null
+++ b/mm/userfaultfd.c
@@ -0,0 +1,269 @@
+/*
+ *  mm/userfaultfd.c
+ *
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mmu_notifier.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+static int mcopy_atomic_pte(struct mm_struct *dst_mm,
+			    pmd_t *dst_pmd,
+			    struct vm_area_struct *dst_vma,
+			    unsigned long dst_addr,
+			    unsigned long src_addr)
+{
+	struct mem_cgroup *memcg;
+	pte_t _dst_pte, *dst_pte;
+	spinlock_t *ptl;
+	struct page *page;
+	void *page_kaddr;
+	int ret;
+
+	ret = -ENOMEM;
+	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
+	if (!page)
+		goto out;
+
+	page_kaddr = kmap(page);
+	ret = -EFAULT;
+	if (copy_from_user(page_kaddr, (const void __user *) src_addr,
+			   PAGE_SIZE))
+		goto out_kunmap_release;
+	kunmap(page);
+
+	/*
+	 * The memory barrier inside __SetPageUptodate makes sure that
+	 * preceeding stores to the page contents become visible before
+	 * the set_pte_at() write.
+	 */
+	__SetPageUptodate(page);
+
+	ret = -ENOMEM;
+	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+		goto out_release;
+
+	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+	if (dst_vma->vm_flags & VM_WRITE)
+		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+
+	ret = -EEXIST;
+	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+	if (!pte_none(*dst_pte))
+		goto out_release_uncharge_unlock;
+
+	inc_mm_counter(dst_mm, MM_ANONPAGES);
+	page_add_new_anon_rmap(page, dst_vma, dst_addr);
+	mem_cgroup_commit_charge(page, memcg, false);
+	lru_cache_add_active_or_unevictable(page, dst_vma);
+
+	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+	pte_unmap_unlock(dst_pte, ptl);
+	ret = 0;
+out:
+	return ret;
+out_release_uncharge_unlock:
+	pte_unmap_unlock(dst_pte, ptl);
+	mem_cgroup_cancel_charge(page, memcg);
+out_release:
+	page_cache_release(page);
+	goto out;
+out_kunmap_release:
+	kunmap(page);
+	goto out_release;
+}
+
+static int mfill_zeropage_pte(struct mm_struct *dst_mm,
+			      pmd_t *dst_pmd,
+			      struct vm_area_struct *dst_vma,
+			      unsigned long dst_addr)
+{
+	pte_t _dst_pte, *dst_pte;
+	spinlock_t *ptl;
+	int ret;
+
+	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+					 dst_vma->vm_page_prot));
+	ret = -EEXIST;
+	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+	if (!pte_none(*dst_pte))
+		goto out_unlock;
+	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
+	ret = 0;
+out_unlock:
+	pte_unmap_unlock(dst_pte, ptl);
+	return ret;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd = NULL;
+
+	pgd = pgd_offset(mm, address);
+	pud = pud_alloc(mm, pgd, address);
+	if (pud)
+		/*
+		 * Note that we didn't run this because the pmd was
+		 * missing, the *pmd may be already established and in
+		 * turn it may also be a trans_huge_pmd.
+		 */
+		pmd = pmd_alloc(mm, pud, address);
+	return pmd;
+}
+
+static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+					      unsigned long dst_start,
+					      unsigned long src_start,
+					      unsigned long len,
+					      bool zeropage)
+{
+	struct vm_area_struct *dst_vma;
+	ssize_t err;
+	pmd_t *dst_pmd;
+	unsigned long src_addr, dst_addr;
+	long copied = 0;
+
+	/*
+	 * Sanitize the command parameters:
+	 */
+	BUG_ON(dst_start & ~PAGE_MASK);
+	BUG_ON(len & ~PAGE_MASK);
+
+	/* Does the address range wrap, or is the span zero-sized? */
+	BUG_ON(src_start + len <= src_start);
+	BUG_ON(dst_start + len <= dst_start);
+
+	down_read(&dst_mm->mmap_sem);
+
+	/*
+	 * Make sure the vma is not shared, that the dst range is
+	 * both valid and fully within a single existing vma.
+	 */
+	err = -EINVAL;
+	dst_vma = find_vma(dst_mm, dst_start);
+	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+		goto out;
+	if (dst_start < dst_vma->vm_start ||
+	    dst_start + len > dst_vma->vm_end)
+		goto out;
+
+	/*
+	 * Be strict and only allow __mcopy_atomic on userfaultfd
+	 * registered ranges to prevent userland errors going
+	 * unnoticed. As far as the VM consistency is concerned, it
+	 * would be perfectly safe to remove this check, but there's
+	 * no useful usage for __mcopy_atomic ouside of userfaultfd
+	 * registered ranges. This is after all why these are ioctls
+	 * belonging to the userfaultfd and not syscalls.
+	 */
+	if (!dst_vma->vm_userfaultfd_ctx.ctx)
+		goto out;
+
+	/*
+	 * FIXME: only allow copying on anonymous vmas, tmpfs should
+	 * be added.
+	 */
+	if (dst_vma->vm_ops)
+		goto out;
+
+	/*
+	 * Ensure the dst_vma has a anon_vma or this page
+	 * would get a NULL anon_vma when moved in the
+	 * dst_vma.
+	 */
+	err = -ENOMEM;
+	if (unlikely(anon_vma_prepare(dst_vma)))
+		goto out;
+
+	for (src_addr = src_start, dst_addr = dst_start;
+	     src_addr < src_start + len; ) {
+		pmd_t dst_pmdval;
+		BUG_ON(dst_addr >= dst_start + len);
+		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+		if (unlikely(!dst_pmd)) {
+			err = -ENOMEM;
+			break;
+		}
+
+		dst_pmdval = pmd_read_atomic(dst_pmd);
+		/*
+		 * If the dst_pmd is mapped as THP don't
+		 * override it and just be strict.
+		 */
+		if (unlikely(pmd_trans_huge(dst_pmdval))) {
+			err = -EEXIST;
+			break;
+		}
+		if (unlikely(pmd_none(dst_pmdval)) &&
+		    unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
+					 dst_addr))) {
+			err = -ENOMEM;
+			break;
+		}
+		/* If an huge pmd materialized from under us fail */
+		if (unlikely(pmd_trans_huge(*dst_pmd))) {
+			err = -EFAULT;
+			break;
+		}
+
+		BUG_ON(pmd_none(*dst_pmd));
+		BUG_ON(pmd_trans_huge(*dst_pmd));
+
+		if (!zeropage)
+			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+					       dst_addr, src_addr);
+		else
+			err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
+						 dst_addr);
+
+		cond_resched();
+
+		if (!err) {
+			dst_addr += PAGE_SIZE;
+			src_addr += PAGE_SIZE;
+			copied += PAGE_SIZE;
+
+			if (fatal_signal_pending(current))
+				err = -EINTR;
+		}
+		if (err)
+			break;
+	}
+
+out:
+	up_read(&dst_mm->mmap_sem);
+	BUG_ON(copied < 0);
+	BUG_ON(err > 0);
+	BUG_ON(!copied && !err);
+	return copied ? copied : err;
+}
+
+ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+		     unsigned long src_start, unsigned long len)
+{
+	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+}
+
+ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
+		       unsigned long len)
+{
+	return __mcopy_atomic(dst_mm, start, 0, len, true);
+}
-- 
cgit v1.2.3-70-g09d2


From 5b74283ab251b9db55cbbe31d19ca72482103290 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 4 Sep 2015 15:47:29 -0700
Subject: x86, mm: trace when an IPI is about to be sent

When unmapping pages it is necessary to flush the TLB.  If that page was
accessed by another CPU then an IPI is used to flush the remote CPU.  That
is a lot of IPIs if kswapd is scanning and unmapping >100K pages per
second.

There already is a window between when a page is unmapped and when it is
TLB flushed.  This series increases the window so multiple pages can be
flushed using a single IPI.  This should be safe or the kernel is hosed
already.

Patch 1 simply made the rest of the series easier to write as ftrace
        could identify all the senders of TLB flush IPIS.

Patch 2 tracks what CPUs potentially map a PFN and then sends an IPI
        to flush the entire TLB.

Patch 3 tracks when there potentially are writable TLB entries that
        need to be batched differently

Patch 4 increases SWAP_CLUSTER_MAX to further batch flushes

The performance impact is documented in the changelogs but in the optimistic
case on a 4-socket machine the full series reduces interrupts from 900K
interrupts/second to 60K interrupts/second.

This patch (of 4):

It is easy to trace when an IPI is received to flush a TLB but harder to
detect what event sent it.  This patch makes it easy to identify the
source of IPIs being transmitted for TLB flushes on x86.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/tlb.c          | 1 +
 include/linux/mm_types.h   | 1 +
 include/trace/events/tlb.h | 3 ++-
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 90b924acd982..8ddb5d0d66fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 	info.flush_end = end;
 
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+	trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
 	if (is_uv_system()) {
 		unsigned int cpu;
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 26a30c3566f0..c8d0a73d64c4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -554,6 +554,7 @@ enum tlb_flush_reason {
 	TLB_REMOTE_SHOOTDOWN,
 	TLB_LOCAL_SHOOTDOWN,
 	TLB_LOCAL_MM_SHOOTDOWN,
+	TLB_REMOTE_SEND_IPI,
 	NR_TLB_FLUSH_REASONS,
 };
 
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
index 4250f364a6ca..bc8815f45f3b 100644
--- a/include/trace/events/tlb.h
+++ b/include/trace/events/tlb.h
@@ -11,7 +11,8 @@
 	EM(  TLB_FLUSH_ON_TASK_SWITCH,	"flush on task switch" )	\
 	EM(  TLB_REMOTE_SHOOTDOWN,	"remote shootdown" )		\
 	EM(  TLB_LOCAL_SHOOTDOWN,	"local shootdown" )		\
-	EMe( TLB_LOCAL_MM_SHOOTDOWN,	"local mm shootdown" )
+	EM(  TLB_LOCAL_MM_SHOOTDOWN,	"local mm shootdown" )		\
+	EMe( TLB_REMOTE_SEND_IPI,	"remote ipi send" )
 
 /*
  * First define the enums in TLB_FLUSH_REASON to be exported to userspace
-- 
cgit v1.2.3-70-g09d2


From 72b252aed506b8f1a03f7abd29caef4cdf6a043b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 4 Sep 2015 15:47:32 -0700
Subject: mm: send one IPI per CPU to TLB flush all entries after unmapping
 pages

An IPI is sent to flush remote TLBs when a page is unmapped that was
potentially accesssed by other CPUs.  There are many circumstances where
this happens but the obvious one is kswapd reclaiming pages belonging to a
running process as kswapd and the task are likely running on separate
CPUs.

On small machines, this is not a significant problem but as machine gets
larger with more cores and more memory, the cost of these IPIs can be
high.  This patch uses a simple structure that tracks CPUs that
potentially have TLB entries for pages being unmapped.  When the unmapping
is complete, the full TLB is flushed on the assumption that a refill cost
is lower than flushing individual entries.

Architectures wishing to do this must give the following guarantee.

        If a clean page is unmapped and not immediately flushed, the
        architecture must guarantee that a write to that linear address
        from a CPU with a cached TLB entry will trap a page fault.

This is essentially what the kernel already depends on but the window is
much larger with this patch applied and is worth highlighting.  The
architecture should consider whether the cost of the full TLB flush is
higher than sending an IPI to flush each individual entry.  An additional
architecture helper called flush_tlb_local is required.  It's a trivial
wrapper with some accounting in the x86 case.

The impact of this patch depends on the workload as measuring any benefit
requires both mapped pages co-located on the LRU and memory pressure.  The
case with the biggest impact is multiple processes reading mapped pages
taken from the vm-scalability test suite.  The test case uses NR_CPU
readers of mapped files that consume 10*RAM.

Linear mapped reader on a 4-node machine with 64G RAM and 48 CPUs

                                           4.2.0-rc1          4.2.0-rc1
                                             vanilla       flushfull-v7
Ops lru-file-mmap-read-elapsed      159.62 (  0.00%)   120.68 ( 24.40%)
Ops lru-file-mmap-read-time_range    30.59 (  0.00%)     2.80 ( 90.85%)
Ops lru-file-mmap-read-time_stddv     6.70 (  0.00%)     0.64 ( 90.38%)

           4.2.0-rc1    4.2.0-rc1
             vanilla flushfull-v7
User          581.00       611.43
System       5804.93      4111.76
Elapsed       161.03       122.12

This is showing that the readers completed 24.40% faster with 29% less
system CPU time.  From vmstats, it is known that the vanilla kernel was
interrupted roughly 900K times per second during the steady phase of the
test and the patched kernel was interrupts 180K times per second.

The impact is lower on a single socket machine.

                                           4.2.0-rc1          4.2.0-rc1
                                             vanilla       flushfull-v7
Ops lru-file-mmap-read-elapsed       25.33 (  0.00%)    20.38 ( 19.54%)
Ops lru-file-mmap-read-time_range     0.91 (  0.00%)     1.44 (-58.24%)
Ops lru-file-mmap-read-time_stddv     0.28 (  0.00%)     0.47 (-65.34%)

           4.2.0-rc1    4.2.0-rc1
             vanilla flushfull-v7
User           58.09        57.64
System        111.82        76.56
Elapsed        27.29        22.55

It's still a noticeable improvement with vmstat showing interrupts went
from roughly 500K per second to 45K per second.

The patch will have no impact on workloads with no memory pressure or have
relatively few mapped pages.  It will have an unpredictable impact on the
workload running on the CPU being flushed as it'll depend on how many TLB
entries need to be refilled and how long that takes.  Worst case, the TLB
will be completely cleared of active entries when the target PFNs were not
resident at all.

[sasha.levin@oracle.com: trace tlb flush after disabling preemption in try_to_unmap_flush]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig                |   1 +
 arch/x86/include/asm/tlbflush.h |   6 +++
 include/linux/rmap.h            |   3 ++
 include/linux/sched.h           |  16 +++++++
 init/Kconfig                    |  10 ++++
 mm/internal.h                   |  11 +++++
 mm/rmap.c                       | 104 +++++++++++++++++++++++++++++++++++++++-
 mm/vmscan.c                     |  23 ++++++++-
 8 files changed, 172 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 48f7433dac6f..117e2f373e50 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF		if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION	if X86_32
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
 
 #endif	/* SMP */
 
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() {		\
+	inc_irq_stat(irq_tlb_count);	\
+	local_flush_tlb();		\
+}
+
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, mm, start, end)	\
 	native_flush_tlb_others(mask, mm, start, end)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8..29446aeef36e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ enum ttu_flags {
 	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
 	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
 	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+	TTU_BATCH_FLUSH = (1 << 11),	/* Batch TLB flushes where possible
+					 * and caller guarantees they will
+					 * do a final flush if necessary */
 };
 
 #ifdef CONFIG_MMU
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 119823decc46..3c602c20c717 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1344,6 +1344,18 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+	/*
+	 * Each bit set is a CPU that potentially has a TLB entry for one of
+	 * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+	 */
+	struct cpumask cpumask;
+
+	/* True if any bit in cpumask is set */
+	bool flush_required;
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1700,6 +1712,10 @@ struct task_struct {
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
 	struct rcu_head rcu;
 
 	/*
diff --git a/init/Kconfig b/init/Kconfig
index 161acd8bc56f..cf7e4824c8d0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -882,6 +882,16 @@ config GENERIC_SCHED_CLOCK
 config ARCH_SUPPORTS_NUMA_BALANCING
 	bool
 
+#
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	bool
+
 #
 # For architectures that know their GCC __int128 support is sound
 #
diff --git a/mm/internal.h b/mm/internal.h
index 36b23f1e2ca6..bd6372ac5f7f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -426,4 +426,15 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
 #define ALLOC_FAIR		0x100 /* fair zone allocation */
 
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index 171b68768df1..326d5d89e45c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,8 @@
 
 #include <asm/tlbflush.h>
 
+#include <trace/events/tlb.h>
+
 #include "internal.h"
 
 static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +585,89 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	return address;
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+	/*
+	 * All TLB entries are flushed on the assumption that it is
+	 * cheaper to flush all TLBs and let them be refilled than
+	 * flushing individual PFNs. Note that we do not track mm's
+	 * to flush as that might simply be multiple full TLB flushes
+	 * for no gain.
+	 */
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. It is
+ * important if a PTE was dirty when it was unmapped that it's flushed
+ * before any IO is initiated on the page to prevent lost writes. Similarly,
+ * it must be flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	int cpu;
+
+	if (!tlb_ubc->flush_required)
+		return;
+
+	cpu = get_cpu();
+
+	trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+	if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+		percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+	if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+		smp_call_function_many(&tlb_ubc->cpumask,
+			percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+	}
+	cpumask_clear(&tlb_ubc->cpumask);
+	tlb_ubc->flush_required = false;
+	put_cpu();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+		struct page *page)
+{
+	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+	tlb_ubc->flush_required = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+	bool should_defer = false;
+
+	if (!(flags & TTU_BATCH_FLUSH))
+		return false;
+
+	/* If remote CPUs need to be flushed then defer batch the flush */
+	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+		should_defer = true;
+	put_cpu();
+
+	return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+		struct page *page)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+	return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * At what user virtual address is page expected in vma?
  * Caller should check the page is actually part of the vma.
@@ -1220,7 +1305,24 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush(vma, address, pte);
+	if (should_defer_flush(mm, flags)) {
+		/*
+		 * We clear the PTE but do not flush so potentially a remote
+		 * CPU could still be writing to the page. If the entry was
+		 * previously clean then the architecture must guarantee that
+		 * a clear->dirty transition on a cached TLB entry is written
+		 * through and traps if the PTE is unmapped.
+		 */
+		pteval = ptep_get_and_clear(mm, address, pte);
+
+		/* Potentially writable TLBs must be flushed before IO */
+		if (pte_dirty(pteval))
+			flush_tlb_page(vma, address);
+		else
+			set_tlb_ubc_flush_pending(mm, page);
+	} else {
+		pteval = ptep_clear_flush(vma, address, pte);
+	}
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8286938c70de..99ec00d6a5dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page, ttu_flags)) {
+			switch (try_to_unmap(page,
+					ttu_flags|TTU_BATCH_FLUSH)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -1208,6 +1209,7 @@ keep:
 	}
 
 	mem_cgroup_uncharge_list(&free_pages);
+	try_to_unmap_flush();
 	free_hot_cold_page_list(&free_pages, true);
 
 	list_splice(&ret_pages, page_list);
@@ -2151,6 +2153,23 @@ out:
 	}
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+	/*
+	 * This deliberately does not clear the cpumask as it's expensive
+	 * and unnecessary. If there happens to be data in there then the
+	 * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
+	 * then will be cleared.
+	 */
+	current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -2185,6 +2204,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
 	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
 			 sc->priority == DEF_PRIORITY);
 
+	init_tlb_ubc();
+
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
-- 
cgit v1.2.3-70-g09d2


From d950c9477d51f0cefc2ed3cf76e695d46af0d9c1 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 4 Sep 2015 15:47:35 -0700
Subject: mm: defer flush of writable TLB entries

If a PTE is unmapped and it's dirty then it was writable recently.  Due to
deferred TLB flushing, it's best to assume a writable TLB cache entry
exists.  With that assumption, the TLB must be flushed before any IO can
start or the page is freed to avoid lost writes or data corruption.  This
patch defers flushing of potentially writable TLBs as long as possible.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  7 +++++++
 mm/internal.h         |  4 ++++
 mm/rmap.c             | 28 +++++++++++++++++++++-------
 mm/vmscan.c           |  7 ++++++-
 4 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c602c20c717..a4ab9daa387c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1354,6 +1354,13 @@ struct tlbflush_unmap_batch {
 
 	/* True if any bit in cpumask is set */
 	bool flush_required;
+
+	/*
+	 * If true then the PTE was dirty when unmapped. The entry must be
+	 * flushed before IO is initiated or a stale TLB entry potentially
+	 * allows an update without redirtying the page.
+	 */
+	bool writable;
 };
 
 struct task_struct {
diff --git a/mm/internal.h b/mm/internal.h
index bd6372ac5f7f..1195dd2d6a2b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -431,10 +431,14 @@ struct tlbflush_unmap_batch;
 
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
+void try_to_unmap_flush_dirty(void);
 #else
 static inline void try_to_unmap_flush(void)
 {
 }
+static inline void try_to_unmap_flush_dirty(void)
+{
+}
 
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index 326d5d89e45c..0db38e7d0a72 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -626,16 +626,34 @@ void try_to_unmap_flush(void)
 	}
 	cpumask_clear(&tlb_ubc->cpumask);
 	tlb_ubc->flush_required = false;
+	tlb_ubc->writable = false;
 	put_cpu();
 }
 
+/* Flush iff there are potentially writable TLB entries that can race with IO */
+void try_to_unmap_flush_dirty(void)
+{
+	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+	if (tlb_ubc->writable)
+		try_to_unmap_flush();
+}
+
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
-		struct page *page)
+		struct page *page, bool writable)
 {
 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
 
 	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
 	tlb_ubc->flush_required = true;
+
+	/*
+	 * If the PTE was dirty then it's best to assume it's writable. The
+	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
+	 * before the page is queued for IO.
+	 */
+	if (writable)
+		tlb_ubc->writable = true;
 }
 
 /*
@@ -658,7 +676,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 }
 #else
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
-		struct page *page)
+		struct page *page, bool writable)
 {
 }
 
@@ -1315,11 +1333,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		 */
 		pteval = ptep_get_and_clear(mm, address, pte);
 
-		/* Potentially writable TLBs must be flushed before IO */
-		if (pte_dirty(pteval))
-			flush_tlb_page(vma, address);
-		else
-			set_tlb_ubc_flush_pending(mm, page);
+		set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
 	} else {
 		pteval = ptep_clear_flush(vma, address, pte);
 	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99ec00d6a5dd..b1139039122a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1098,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			if (!sc->may_writepage)
 				goto keep_locked;
 
-			/* Page is dirty, try to write it out here */
+			/*
+			 * Page is dirty. Flush the TLB if a writable entry
+			 * potentially exists to avoid CPU writes after IO
+			 * starts and then write it out here.
+			 */
+			try_to_unmap_flush_dirty();
 			switch (pageout(page, mapping, sc)) {
 			case PAGE_KEEP:
 				goto keep_locked;
-- 
cgit v1.2.3-70-g09d2


From 73858173593c31cb94bce63fe1c24eb803bb04e6 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Date: Fri, 4 Sep 2015 15:47:43 -0700
Subject: genalloc: add name arg to gen_pool_get() and devm_gen_pool_create()

This change modifies gen_pool_get() and devm_gen_pool_create() client
interfaces adding one more argument "name" of a gen_pool object.

Due to implementation gen_pool_get() is capable to retrieve only one
gen_pool associated with a device even if multiple gen_pools are created,
fortunately right at the moment it is sufficient for the clients, hence
provide NULL as a valid argument on both producer devm_gen_pool_create()
and consumer gen_pool_get() sides.

Because only one created gen_pool per device is addressable, explicitly
add a restriction to devm_gen_pool_create() to create only one gen_pool
per device, this implies two possible error codes returned by the
function, account it on client side (only misc/sram).  This completes
client side changes related to genalloc updates.

[akpm@linux-foundation.org: gen_pool_get() cleanup]
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Cc: Philipp Zabel <p.zabel@pengutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Cc: Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Sascha Hauer <kernel@pengutronix.de>
Cc: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-at91/pm.c                   |  2 +-
 arch/arm/mach-imx/pm-imx5.c               |  2 +-
 arch/arm/mach-imx/pm-imx6.c               |  2 +-
 arch/arm/mach-socfpga/pm.c                |  2 +-
 drivers/media/platform/coda/coda-common.c |  2 +-
 drivers/misc/sram.c                       |  8 ++---
 include/linux/genalloc.h                  |  4 +--
 lib/genalloc.c                            | 49 ++++++++++++++++++-------------
 8 files changed, 39 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 265ffeb2037e..80e277cfcc8b 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void)
 		return;
 	}
 
-	sram_pool = gen_pool_get(&pdev->dev);
+	sram_pool = gen_pool_get(&pdev->dev, NULL);
 	if (!sram_pool) {
 		pr_warn("%s: sram pool unavailable!\n", __func__);
 		return;
diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c
index 1885676c23c0..532d4b08276d 100644
--- a/arch/arm/mach-imx/pm-imx5.c
+++ b/arch/arm/mach-imx/pm-imx5.c
@@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram(
 		goto put_node;
 	}
 
-	ocram_pool = gen_pool_get(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev, NULL);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c
index 93ecf559d06d..8ff8fc0b261c 100644
--- a/arch/arm/mach-imx/pm-imx6.c
+++ b/arch/arm/mach-imx/pm-imx6.c
@@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
 		goto put_node;
 	}
 
-	ocram_pool = gen_pool_get(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev, NULL);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c
index 6a4199f2bffb..c378ab0c2431 100644
--- a/arch/arm/mach-socfpga/pm.c
+++ b/arch/arm/mach-socfpga/pm.c
@@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void)
 		goto put_node;
 	}
 
-	ocram_pool = gen_pool_get(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev, NULL);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c
index 58f65486de33..284ac4c934ba 100644
--- a/drivers/media/platform/coda/coda-common.c
+++ b/drivers/media/platform/coda/coda-common.c
@@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev)
 	/* Get IRAM pool from device tree or platform data */
 	pool = of_gen_pool_get(np, "iram", 0);
 	if (!pool && pdata)
-		pool = gen_pool_get(pdata->iram_dev);
+		pool = gen_pool_get(pdata->iram_dev, NULL);
 	if (!pool) {
 		dev_err(&pdev->dev, "iram pool not available\n");
 		return -ENOMEM;
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 15c33cc34a80..431e1dd528bc 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev)
 	if (IS_ERR(sram->virt_base))
 		return PTR_ERR(sram->virt_base);
 
-	sram->pool = devm_gen_pool_create(sram->dev,
-					  ilog2(SRAM_GRANULARITY), -1);
-	if (!sram->pool)
-		return -ENOMEM;
+	sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY),
+					  NUMA_NO_NODE, NULL);
+	if (IS_ERR(sram->pool))
+		return PTR_ERR(sram->pool);
 
 	ret = sram_reserve_regions(sram, res);
 	if (ret)
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 5383bb1394a1..6afa65e6cdb7 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -118,8 +118,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
 		unsigned long start, unsigned int nr, void *data);
 
 extern struct gen_pool *devm_gen_pool_create(struct device *dev,
-		int min_alloc_order, int nid);
-extern struct gen_pool *gen_pool_get(struct device *dev);
+		int min_alloc_order, int nid, const char *name);
+extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
 
 bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
 			size_t size);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index daf0afb6d979..b13cfd1a366e 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -570,24 +570,47 @@ static void devm_gen_pool_release(struct device *dev, void *res)
 	gen_pool_destroy(*(struct gen_pool **)res);
 }
 
+/**
+ * gen_pool_get - Obtain the gen_pool (if any) for a device
+ * @dev: device to retrieve the gen_pool from
+ * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
+ *
+ * Returns the gen_pool for the device if one is present, or NULL.
+ */
+struct gen_pool *gen_pool_get(struct device *dev, const char *name)
+{
+	struct gen_pool **p;
+
+	p = devres_find(dev, devm_gen_pool_release, NULL, NULL);
+	if (!p)
+		return NULL;
+	return *p;
+}
+EXPORT_SYMBOL_GPL(gen_pool_get);
+
 /**
  * devm_gen_pool_create - managed gen_pool_create
  * @dev: device that provides the gen_pool
  * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
- * @nid: node id of the node the pool structure should be allocated on, or -1
+ * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes
+ * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
  *
  * Create a new special memory pool that can be used to manage special purpose
  * memory not managed by the regular kmalloc/kfree interface. The pool will be
  * automatically destroyed by the device management code.
  */
 struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
-		int nid)
+				      int nid, const char *name)
 {
 	struct gen_pool **ptr, *pool;
 
+	/* Check that genpool to be created is uniquely addressed on device */
+	if (gen_pool_get(dev, name))
+		return ERR_PTR(-EINVAL);
+
 	ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
 	if (!ptr)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	pool = gen_pool_create(min_alloc_order, nid);
 	if (pool) {
@@ -595,29 +618,13 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
 		devres_add(dev, ptr);
 	} else {
 		devres_free(ptr);
+		return ERR_PTR(-ENOMEM);
 	}
 
 	return pool;
 }
 EXPORT_SYMBOL(devm_gen_pool_create);
 
-/**
- * gen_pool_get - Obtain the gen_pool (if any) for a device
- * @dev: device to retrieve the gen_pool from
- *
- * Returns the gen_pool for the device if one is present, or NULL.
- */
-struct gen_pool *gen_pool_get(struct device *dev)
-{
-	struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
-					NULL);
-
-	if (!p)
-		return NULL;
-	return *p;
-}
-EXPORT_SYMBOL_GPL(gen_pool_get);
-
 #ifdef CONFIG_OF
 /**
  * of_gen_pool_get - find a pool by phandle property
@@ -642,7 +649,7 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
 	of_node_put(np_pool);
 	if (!pdev)
 		return NULL;
-	return gen_pool_get(&pdev->dev);
+	return gen_pool_get(&pdev->dev, NULL);
 }
 EXPORT_SYMBOL_GPL(of_gen_pool_get);
 #endif /* CONFIG_OF */
-- 
cgit v1.2.3-70-g09d2


From c98c36355dd6d5c4433c8d17e8eb839ca9b97606 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Date: Fri, 4 Sep 2015 15:47:47 -0700
Subject: genalloc: add support of multiple gen_pools per device

This change fills devm_gen_pool_create()/gen_pool_get() "name" argument
stub with contents and extends of_gen_pool_get() functionality on this
basis.

If there is no associated platform device with a device node passed to
of_gen_pool_get(), the function attempts to get a label property or device
node name (= repeats MTD OF partition standard) and seeks for a named
gen_pool registered by device of the parent device node.

The main idea of the change is to allow registration of independent
gen_pools under the same umbrella device, say "partitions" on "storage
device", the original functionality of one "partition" per "storage
device" is untouched.

[akpm@linux-foundation.org: fix constness in devres_find()]
[dan.carpenter@oracle.com: freeing const data pointers]
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Cc: Philipp Zabel <p.zabel@pengutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Cc: Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Sascha Hauer <kernel@pengutronix.de>
Cc: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/genalloc.h |  2 ++
 lib/genalloc.c           | 71 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 59 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 6afa65e6cdb7..7ff168d06967 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -59,6 +59,8 @@ struct gen_pool {
 
 	genpool_algo_t algo;		/* allocation function */
 	void *data;
+
+	const char *name;
 };
 
 /*
diff --git a/lib/genalloc.c b/lib/genalloc.c
index b13cfd1a366e..116a166b096f 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -160,6 +160,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
 		pool->min_alloc_order = min_alloc_order;
 		pool->algo = gen_pool_first_fit;
 		pool->data = NULL;
+		pool->name = NULL;
 	}
 	return pool;
 }
@@ -252,8 +253,8 @@ void gen_pool_destroy(struct gen_pool *pool)
 
 		kfree(chunk);
 	}
+	kfree_const(pool->name);
 	kfree(pool);
-	return;
 }
 EXPORT_SYMBOL(gen_pool_destroy);
 
@@ -570,6 +571,20 @@ static void devm_gen_pool_release(struct device *dev, void *res)
 	gen_pool_destroy(*(struct gen_pool **)res);
 }
 
+static int devm_gen_pool_match(struct device *dev, void *res, void *data)
+{
+	struct gen_pool **p = res;
+
+	/* NULL data matches only a pool without an assigned name */
+	if (!data && !(*p)->name)
+		return 1;
+
+	if (!data || !(*p)->name)
+		return 0;
+
+	return !strcmp((*p)->name, data);
+}
+
 /**
  * gen_pool_get - Obtain the gen_pool (if any) for a device
  * @dev: device to retrieve the gen_pool from
@@ -581,7 +596,8 @@ struct gen_pool *gen_pool_get(struct device *dev, const char *name)
 {
 	struct gen_pool **p;
 
-	p = devres_find(dev, devm_gen_pool_release, NULL, NULL);
+	p = devres_find(dev, devm_gen_pool_release, devm_gen_pool_match,
+			(void *)name);
 	if (!p)
 		return NULL;
 	return *p;
@@ -603,25 +619,38 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
 				      int nid, const char *name)
 {
 	struct gen_pool **ptr, *pool;
+	const char *pool_name = NULL;
 
 	/* Check that genpool to be created is uniquely addressed on device */
 	if (gen_pool_get(dev, name))
 		return ERR_PTR(-EINVAL);
 
+	if (name) {
+		pool_name = kstrdup_const(name, GFP_KERNEL);
+		if (!pool_name)
+			return ERR_PTR(-ENOMEM);
+	}
+
 	ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
 	if (!ptr)
-		return ERR_PTR(-ENOMEM);
+		goto free_pool_name;
 
 	pool = gen_pool_create(min_alloc_order, nid);
-	if (pool) {
-		*ptr = pool;
-		devres_add(dev, ptr);
-	} else {
-		devres_free(ptr);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!pool)
+		goto free_devres;
+
+	*ptr = pool;
+	pool->name = pool_name;
+	devres_add(dev, ptr);
 
 	return pool;
+
+free_devres:
+	devres_free(ptr);
+free_pool_name:
+	kfree_const(pool_name);
+
+	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(devm_gen_pool_create);
 
@@ -640,16 +669,30 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
 	const char *propname, int index)
 {
 	struct platform_device *pdev;
-	struct device_node *np_pool;
+	struct device_node *np_pool, *parent;
+	const char *name = NULL;
+	struct gen_pool *pool = NULL;
 
 	np_pool = of_parse_phandle(np, propname, index);
 	if (!np_pool)
 		return NULL;
+
 	pdev = of_find_device_by_node(np_pool);
+	if (!pdev) {
+		/* Check if named gen_pool is created by parent node device */
+		parent = of_get_parent(np_pool);
+		pdev = of_find_device_by_node(parent);
+		of_node_put(parent);
+
+		of_property_read_string(np_pool, "label", &name);
+		if (!name)
+			name = np_pool->name;
+	}
+	if (pdev)
+		pool = gen_pool_get(&pdev->dev, name);
 	of_node_put(np_pool);
-	if (!pdev)
-		return NULL;
-	return gen_pool_get(&pdev->dev, NULL);
+
+	return pool;
 }
 EXPORT_SYMBOL_GPL(of_gen_pool_get);
 #endif /* CONFIG_OF */
-- 
cgit v1.2.3-70-g09d2


From 5477e70a6420a6b7ca96c8e21413ee1c96a84260 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 4 Sep 2015 15:48:04 -0700
Subject: mm: move ->mremap() from file_operations to vm_operations_struct

vma->vm_ops->mremap() looks more natural and clean in move_vma(), and this
way ->mremap() can have more users.  Say, vdso.

While at it, s/aio_ring_remap/aio_ring_mremap/.

Note: this is the minimal change before ->mremap() finds another user in
file_operations; this method should have more arguments, and it can be
used to kill arch_remap().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c           | 27 ++++++++++++++++++---------
 include/linux/fs.h |  1 -
 include/linux/mm.h |  1 +
 mm/mremap.c        |  4 ++--
 4 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index 480440f4701f..155f84253f33 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
 	}
 }
 
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	vma->vm_flags |= VM_DONTEXPAND;
-	vma->vm_ops = &generic_file_vm_ops;
-	return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
 {
+	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
 	struct kioctx_table *table;
 	int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
 	return res;
 }
 
+static const struct vm_operations_struct aio_ring_vm_ops = {
+	.mremap		= aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_DONTEXPAND;
+	vma->vm_ops = &aio_ring_vm_ops;
+	return 0;
+}
+
 static const struct file_operations aio_ring_fops = {
 	.mmap = aio_ring_mmap,
-	.mremap = aio_ring_remap,
 };
 
 #if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fbd780c33c5f..864203c10dbc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1612,7 +1612,6 @@ struct file_operations {
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
-	int (*mremap)(struct file *, struct vm_area_struct *);
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 77a9d609523e..8b257c43855b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -247,6 +247,7 @@ struct vm_fault {
 struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
+	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
 	void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
diff --git a/mm/mremap.c b/mm/mremap.c
index f54a43fa4b79..3310378bb60f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -277,8 +277,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 				     need_rmap_locks);
 	if (moved_len < old_len) {
 		err = -ENOMEM;
-	} else if (vma->vm_file && vma->vm_file->f_op->mremap) {
-		err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+	} else if (vma->vm_ops && vma->vm_ops->mremap) {
+		err = vma->vm_ops->mremap(new_vma);
 	}
 
 	if (unlikely(err)) {
-- 
cgit v1.2.3-70-g09d2


From 4e6dab4233f667c0ae465e5cb46603b49b4f6d74 Mon Sep 17 00:00:00 2001
From: "minkyung88.kim" <minkyung88.kim@lge.com>
Date: Fri, 4 Sep 2015 15:48:16 -0700
Subject: mm: remove struct node_active_region

struct node_active_region is not used anymore.  Remove it.

Signed-off-by: minkyung88.kim <minkyung88.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 754c25966a0a..ac00e2050943 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -690,14 +690,6 @@ struct zonelist {
 #endif
 };
 
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-struct node_active_region {
-	unsigned long start_pfn;
-	unsigned long end_pfn;
-	int nid;
-};
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
 #ifndef CONFIG_DISCONTIGMEM
 /* The array of struct pages - for discontigmem use pgdat->lmem_map */
 extern struct page *mem_map;
-- 
cgit v1.2.3-70-g09d2


From 19020f8ab83de9dc5a9c8af1f321a526f38bbc40 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 3 Sep 2015 22:07:37 +0800
Subject: KVM: make halt_poll_ns per-vCPU

Change halt_poll_ns into per-VCPU variable, seeded from module parameter,
to allow greater flexibility.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 81089cf1f0c1..1bef9e21e725 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -242,6 +242,7 @@ struct kvm_vcpu {
 	int sigset_active;
 	sigset_t sigset;
 	struct kvm_vcpu_stat stat;
+	unsigned int halt_poll_ns;
 
 #ifdef CONFIG_HAS_IOMEM
 	int mmio_needed;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d8db2f8fce9c..c06e57cd1269 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -217,6 +217,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
+	vcpu->halt_poll_ns = 0;
 	init_waitqueue_head(&vcpu->wq);
 	kvm_async_pf_vcpu_init(vcpu);
 
@@ -1930,8 +1931,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	bool waited = false;
 
 	start = cur = ktime_get();
-	if (halt_poll_ns) {
-		ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+	if (vcpu->halt_poll_ns) {
+		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
 		do {
 			/*
-- 
cgit v1.2.3-70-g09d2


From 7d160a6c462c2c690e074c173b43aad7204049ad Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 5 Sep 2015 19:06:57 -0400
Subject: NFSv4: Express delegation limit in units of pages

Since we're tracking modifications to the page cache on a per-page
basis, it makes sense to express the limit to how much we may cache
in units of pages.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/delegation.c     |  4 ++--
 fs/nfs/delegation.h     |  2 +-
 fs/nfs/nfs4xdr.c        | 16 ++++++++++------
 include/linux/nfs_xdr.h |  2 +-
 4 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 029d688a969f..cd503cc2251c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
 		if (delegation->inode != NULL) {
 			nfs4_stateid_copy(&delegation->stateid, &res->delegation);
 			delegation->type = res->delegation_type;
-			delegation->maxsize = res->maxsize;
+			delegation->pagemod_limit = res->pagemod_limit;
 			oldcred = delegation->cred;
 			delegation->cred = get_rpccred(cred);
 			clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 		return -ENOMEM;
 	nfs4_stateid_copy(&delegation->stateid, &res->delegation);
 	delegation->type = res->delegation_type;
-	delegation->maxsize = res->maxsize;
+	delegation->pagemod_limit = res->pagemod_limit;
 	delegation->change_attr = inode->i_version;
 	delegation->cred = get_rpccred(cred);
 	delegation->inode = inode;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index e3c20a3ccc93..554178a17376 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -18,7 +18,7 @@ struct nfs_delegation {
 	struct inode *inode;
 	nfs4_stateid stateid;
 	fmode_t type;
-	loff_t maxsize;
+	unsigned long pagemod_limit;
 	__u64 change_attr;
 	unsigned long flags;
 	spinlock_t lock;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ff4784c54e04..788adf3897c7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4932,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr)
 }
 
 /* This is too sick! */
-static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
+static int decode_space_limit(struct xdr_stream *xdr,
+		unsigned long *pagemod_limit)
 {
 	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;
+	u64 maxsize = 0;
 
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
-	case 1:
-		xdr_decode_hyper(p, maxsize);
+	case NFS4_LIMIT_SIZE:
+		xdr_decode_hyper(p, &maxsize);
 		break;
-	case 2:
+	case NFS4_LIMIT_BLOCKS:
 		nblocks = be32_to_cpup(p++);
 		blocksize = be32_to_cpup(p);
-		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+		maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
+	maxsize >>= PAGE_CACHE_SHIFT;
+	*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -4977,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
 		break;
 	case NFS4_OPEN_DELEGATE_WRITE:
 		res->delegation_type = FMODE_WRITE|FMODE_READ;
-		if (decode_space_limit(xdr, &res->maxsize) < 0)
+		if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
 				return -EIO;
 	}
 	return decode_ace(xdr, NULL, res->server->nfs_client);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b4392d86d157..52faf7e96c65 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -406,8 +406,8 @@ struct nfs_openres {
 	const struct nfs_server *server;
 	fmode_t			delegation_type;
 	nfs4_stateid		delegation;
+	unsigned long		pagemod_limit;
 	__u32			do_recall;
-	__u64			maxsize;
 	__u32			attrset[NFS4_BITMAP_SIZE];
 	struct nfs4_string	*owner;
 	struct nfs4_string	*group_owner;
-- 
cgit v1.2.3-70-g09d2


From e74bfeedad08180b968d8613dcde141ffb0720c3 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Mon, 13 Jul 2015 08:07:17 -0400
Subject: NTB: Add flow control to the ntb_netdev

Right now if we push the NTB really hard, we start dropping packets due
to not able to process the packets fast enough. We need to st:qop the
upper layer from flooding us when that happens.

A timer is necessary in order to restart the queue once the resource has
been processed on the receive side. Due to the way NTB is setup, the
resources on the tx side are tied to the processing of the rx side and
there's no async way to know when the rx side has released those
resources.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/net/ntb_netdev.c      | 77 +++++++++++++++++++++++++++++++++++++++++++
 drivers/ntb/ntb_transport.c   | 18 +++++++++-
 include/linux/ntb_transport.h |  1 +
 3 files changed, 95 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c
index d8757bf9ad75..a9acf7156855 100644
--- a/drivers/net/ntb_netdev.c
+++ b/drivers/net/ntb_netdev.c
@@ -61,11 +61,21 @@ MODULE_VERSION(NTB_NETDEV_VER);
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Intel Corporation");
 
+/* Time in usecs for tx resource reaper */
+static unsigned int tx_time = 1;
+
+/* Number of descriptors to free before resuming tx */
+static unsigned int tx_start = 10;
+
+/* Number of descriptors still available before stop upper layer tx */
+static unsigned int tx_stop = 5;
+
 struct ntb_netdev {
 	struct list_head list;
 	struct pci_dev *pdev;
 	struct net_device *ndev;
 	struct ntb_transport_qp *qp;
+	struct timer_list tx_timer;
 };
 
 #define	NTB_TX_TIMEOUT_MS	1000
@@ -136,11 +146,42 @@ enqueue_again:
 	}
 }
 
+static int __ntb_netdev_maybe_stop_tx(struct net_device *netdev,
+				      struct ntb_transport_qp *qp, int size)
+{
+	struct ntb_netdev *dev = netdev_priv(netdev);
+
+	netif_stop_queue(netdev);
+	/* Make sure to see the latest value of ntb_transport_tx_free_entry()
+	 * since the queue was last started.
+	 */
+	smp_mb();
+
+	if (likely(ntb_transport_tx_free_entry(qp) < size)) {
+		mod_timer(&dev->tx_timer, jiffies + usecs_to_jiffies(tx_time));
+		return -EBUSY;
+	}
+
+	netif_start_queue(netdev);
+	return 0;
+}
+
+static int ntb_netdev_maybe_stop_tx(struct net_device *ndev,
+				    struct ntb_transport_qp *qp, int size)
+{
+	if (netif_queue_stopped(ndev) ||
+	    (ntb_transport_tx_free_entry(qp) >= size))
+		return 0;
+
+	return __ntb_netdev_maybe_stop_tx(ndev, qp, size);
+}
+
 static void ntb_netdev_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
 				  void *data, int len)
 {
 	struct net_device *ndev = qp_data;
 	struct sk_buff *skb;
+	struct ntb_netdev *dev = netdev_priv(ndev);
 
 	skb = data;
 	if (!skb || !ndev)
@@ -155,6 +196,15 @@ static void ntb_netdev_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
 	}
 
 	dev_kfree_skb(skb);
+
+	if (ntb_transport_tx_free_entry(dev->qp) >= tx_start) {
+		/* Make sure anybody stopping the queue after this sees the new
+		 * value of ntb_transport_tx_free_entry()
+		 */
+		smp_mb();
+		if (netif_queue_stopped(ndev))
+			netif_wake_queue(ndev);
+	}
 }
 
 static netdev_tx_t ntb_netdev_start_xmit(struct sk_buff *skb,
@@ -163,10 +213,15 @@ static netdev_tx_t ntb_netdev_start_xmit(struct sk_buff *skb,
 	struct ntb_netdev *dev = netdev_priv(ndev);
 	int rc;
 
+	ntb_netdev_maybe_stop_tx(ndev, dev->qp, tx_stop);
+
 	rc = ntb_transport_tx_enqueue(dev->qp, skb, skb->data, skb->len);
 	if (rc)
 		goto err;
 
+	/* check for next submit */
+	ntb_netdev_maybe_stop_tx(ndev, dev->qp, tx_stop);
+
 	return NETDEV_TX_OK;
 
 err:
@@ -175,6 +230,23 @@ err:
 	return NETDEV_TX_BUSY;
 }
 
+static void ntb_netdev_tx_timer(unsigned long data)
+{
+	struct net_device *ndev = (struct net_device *)data;
+	struct ntb_netdev *dev = netdev_priv(ndev);
+
+	if (ntb_transport_tx_free_entry(dev->qp) < tx_stop) {
+		mod_timer(&dev->tx_timer, jiffies + msecs_to_jiffies(tx_time));
+	} else {
+		/* Make sure anybody stopping the queue after this sees the new
+		 * value of ntb_transport_tx_free_entry()
+		 */
+		smp_mb();
+		if (netif_queue_stopped(ndev))
+			netif_wake_queue(ndev);
+	}
+}
+
 static int ntb_netdev_open(struct net_device *ndev)
 {
 	struct ntb_netdev *dev = netdev_priv(ndev);
@@ -197,8 +269,11 @@ static int ntb_netdev_open(struct net_device *ndev)
 		}
 	}
 
+	setup_timer(&dev->tx_timer, ntb_netdev_tx_timer, (unsigned long)ndev);
+
 	netif_carrier_off(ndev);
 	ntb_transport_link_up(dev->qp);
+	netif_start_queue(ndev);
 
 	return 0;
 
@@ -219,6 +294,8 @@ static int ntb_netdev_close(struct net_device *ndev)
 	while ((skb = ntb_transport_rx_remove(dev->qp, &len)))
 		dev_kfree_skb(skb);
 
+	del_timer_sync(&dev->tx_timer);
+
 	return 0;
 }
 
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index 1c6386d5f79c..0d851d684523 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -494,6 +494,12 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
 			       "tx_index - \t%u\n", qp->tx_index);
 	out_offset += snprintf(buf + out_offset, out_count - out_offset,
 			       "tx_max_entry - \t%u\n", qp->tx_max_entry);
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "qp->remote_rx_info->entry - \t%u\n",
+			       qp->remote_rx_info->entry);
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "free tx - \t%u\n",
+			       ntb_transport_tx_free_entry(qp));
 
 	out_offset += snprintf(buf + out_offset, out_count - out_offset,
 			       "\nQP Link %s\n",
@@ -535,6 +541,7 @@ static struct ntb_queue_entry *ntb_list_rm(spinlock_t *lock,
 	}
 	entry = list_first_entry(list, struct ntb_queue_entry, entry);
 	list_del(&entry->entry);
+
 out:
 	spin_unlock_irqrestore(lock, flags);
 
@@ -1843,7 +1850,7 @@ int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
 	entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
 	if (!entry) {
 		qp->tx_err_no_buf++;
-		return -ENOMEM;
+		return -EBUSY;
 	}
 
 	entry->cb_data = cb;
@@ -1969,6 +1976,15 @@ unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp)
 }
 EXPORT_SYMBOL_GPL(ntb_transport_max_size);
 
+unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp)
+{
+	unsigned int head = qp->tx_index;
+	unsigned int tail = qp->remote_rx_info->entry;
+
+	return tail > head ? tail - head : qp->tx_max_entry + tail - head;
+}
+EXPORT_SYMBOL_GPL(ntb_transport_tx_free_entry);
+
 static void ntb_transport_doorbell_callback(void *data, int vector)
 {
 	struct ntb_transport_ctx *nt = data;
diff --git a/include/linux/ntb_transport.h b/include/linux/ntb_transport.h
index 2862861366a5..7243eb98a722 100644
--- a/include/linux/ntb_transport.h
+++ b/include/linux/ntb_transport.h
@@ -83,3 +83,4 @@ void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
 void ntb_transport_link_up(struct ntb_transport_qp *qp);
 void ntb_transport_link_down(struct ntb_transport_qp *qp);
 bool ntb_transport_link_query(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp);
-- 
cgit v1.2.3-70-g09d2


From a7c23237481782fbea3c2230e362b72863e144b0 Mon Sep 17 00:00:00 2001
From: Allen Hubbe <Allen.Hubbe@emc.com>
Date: Wed, 15 Jul 2015 04:15:28 -0400
Subject: NTB: Fix documentation for ntb_link_is_up

There was a copy and paste error in the documentation for
ntb_link_is_up.  The long description was mistakenly copied from
ntb_link_set_trans.

This adds the appropriate long description for ntb_link_is_up.

Reported-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/ntb.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index b02f72bb8e32..e3d3299c6052 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -522,10 +522,9 @@ static inline int ntb_mw_clear_trans(struct ntb_dev *ntb, int idx)
  * @speed:	OUT - The link speed expressed as PCIe generation number.
  * @width:	OUT - The link width expressed as the number of PCIe lanes.
  *
- * Set the translation of a memory window.  The peer may access local memory
- * through the window starting at the address, up to the size.  The address
- * must be aligned to the alignment specified by ntb_mw_get_range().  The size
- * must be aligned to the size alignment specified by ntb_mw_get_range().
+ * Get the current state of the ntb link.  It is recommended to query the link
+ * state once after every link event.  It is safe to query the link state in
+ * the context of the link event callback.
  *
  * Return: One if the link is up, zero if the link is down, otherwise a
  *		negative value indicating the error number.
-- 
cgit v1.2.3-70-g09d2


From 86663c91866ae85c219f1a80ef2c9460b7ca5cd8 Mon Sep 17 00:00:00 2001
From: Allen Hubbe <Allen.Hubbe@emc.com>
Date: Wed, 15 Jul 2015 12:43:21 -0400
Subject: NTB: Fix documentation for ntb_peer_db_clear.

The documentation should say "peer" not "local" when referring to the
peer doorbell register.

Reported-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/ntb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index e3d3299c6052..f798e2afba88 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -794,7 +794,7 @@ static inline int ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
 }
 
 /**
- * ntb_peer_db_clear() - clear bits in the local doorbell register
+ * ntb_peer_db_clear() - clear bits in the peer doorbell register
  * @ntb:	NTB device context.
  * @db_bits:	Doorbell bits to clear.
  *
-- 
cgit v1.2.3-70-g09d2


From 8b9558aab853e98ba6e3fee0dd8545544966958c Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 1 Sep 2015 17:19:38 +0800
Subject: libceph: use keepalive2 to verify the mon session is alive

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h   |  2 ++
 include/linux/ceph/messenger.h |  4 +++
 include/linux/ceph/msgr.h      |  4 ++-
 net/ceph/ceph_common.c         |  1 +
 net/ceph/messenger.c           | 59 ++++++++++++++++++++++++++++++++++++++----
 net/ceph/mon_client.c          | 37 ++++++++++++++++++++------
 6 files changed, 93 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 9ebee53d3bf5..397c5cd09794 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -46,6 +46,7 @@ struct ceph_options {
 	unsigned long mount_timeout;		/* jiffies */
 	unsigned long osd_idle_ttl;		/* jiffies */
 	unsigned long osd_keepalive_timeout;	/* jiffies */
+	unsigned long monc_ping_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -66,6 +67,7 @@ struct ceph_options {
 #define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
 #define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
 #define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_MONC_PING_TIMEOUT_DEFAULT	msecs_to_jiffies(30 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 37753278987a..7e1252e97a30 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -248,6 +248,8 @@ struct ceph_connection {
 	int in_base_pos;     /* bytes read */
 	__le64 in_temp_ack;  /* for reading an ack */
 
+	struct timespec last_keepalive_ack;
+
 	struct delayed_work work;	    /* send|recv work */
 	unsigned long       delay;          /* current delay interval */
 };
@@ -285,6 +287,8 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
 extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
 
 extern void ceph_con_keepalive(struct ceph_connection *con);
+extern bool ceph_con_keepalive_expired(struct ceph_connection *con,
+				       unsigned long interval);
 
 extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
 				size_t length, size_t alignment);
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 1c1887206ffa..0fe2656ac415 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -84,10 +84,12 @@ struct ceph_entity_inst {
 #define CEPH_MSGR_TAG_MSG           7  /* message */
 #define CEPH_MSGR_TAG_ACK           8  /* message ack */
 #define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADPROTOVER   10 /* bad protocol version */
 #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
 #define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
 #define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2    14 /* keepalive2 byte + ceph_timespec */
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
 
 
 /*
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index f30329f72641..3f56eefc2a07 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -357,6 +357,7 @@ ceph_parse_options(char *options, const char *dev_name,
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+	opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 101ab6285fba..36757d46ac40 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -163,6 +163,7 @@ static struct kmem_cache	*ceph_msg_data_cache;
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
 
 #ifdef CONFIG_LOCKDEP
 static struct lock_class_key socket_class;
@@ -1351,7 +1352,15 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 {
 	dout("prepare_write_keepalive %p\n", con);
 	con_out_kvec_reset(con);
-	con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+		struct timespec ts = CURRENT_TIME;
+		struct ceph_timespec ceph_ts;
+		ceph_encode_timespec(&ceph_ts, &ts);
+		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+		con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts);
+	} else {
+		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+	}
 	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
@@ -1625,6 +1634,12 @@ static void prepare_read_tag(struct ceph_connection *con)
 	con->in_tag = CEPH_MSGR_TAG_READY;
 }
 
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_keepalive_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
 /*
  * Prepare to read a message.
  */
@@ -2457,6 +2472,17 @@ static void process_message(struct ceph_connection *con)
 	mutex_lock(&con->mutex);
 }
 
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+	struct ceph_timespec ceph_ts;
+	size_t size = sizeof(ceph_ts);
+	int ret = read_partial(con, size, size, &ceph_ts);
+	if (ret <= 0)
+		return ret;
+	ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
+	prepare_read_tag(con);
+	return 1;
+}
 
 /*
  * Write something to the socket.  Called in a worker thread when the
@@ -2526,6 +2552,10 @@ more_kvec:
 
 do_next:
 	if (con->state == CON_STATE_OPEN) {
+		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
 		/* is anything else pending? */
 		if (!list_empty(&con->out_queue)) {
 			prepare_write_message(con);
@@ -2535,10 +2565,6 @@ do_next:
 			prepare_write_ack(con);
 			goto more;
 		}
-		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
-			prepare_write_keepalive(con);
-			goto more;
-		}
 	}
 
 	/* Nothing to do! */
@@ -2641,6 +2667,9 @@ more:
 		case CEPH_MSGR_TAG_ACK:
 			prepare_read_ack(con);
 			break;
+		case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+			prepare_read_keepalive_ack(con);
+			break;
 		case CEPH_MSGR_TAG_CLOSE:
 			con_close_socket(con);
 			con->state = CON_STATE_CLOSED;
@@ -2684,6 +2713,12 @@ more:
 		process_ack(con);
 		goto more;
 	}
+	if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+		ret = read_keepalive_ack(con);
+		if (ret <= 0)
+			goto out;
+		goto more;
+	}
 
 out:
 	dout("try_read done on %p ret %d\n", con, ret);
@@ -3101,6 +3136,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
 }
 EXPORT_SYMBOL(ceph_con_keepalive);
 
+bool ceph_con_keepalive_expired(struct ceph_connection *con,
+			       unsigned long interval)
+{
+	if (interval > 0 &&
+	    (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+		struct timespec now = CURRENT_TIME;
+		struct timespec ts;
+		jiffies_to_timespec(interval, &ts);
+		ts = timespec_add(con->last_keepalive_ack, ts);
+		return timespec_compare(&now, &ts) >= 0;
+	}
+	return false;
+}
+
 static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
 {
 	struct ceph_msg_data *data;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 9d6ff1215928..edda01626a45 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc)
 			      CEPH_ENTITY_TYPE_MON, monc->cur_mon,
 			      &monc->monmap->mon_inst[monc->cur_mon].addr);
 
+		/* send an initial keepalive to ensure our timestamp is
+		 * valid by the time we are in an OPENED state */
+		ceph_con_keepalive(&monc->con);
+
 		/* initiatiate authentication handshake */
 		ret = ceph_auth_build_hello(monc->auth,
 					    monc->m_auth->front.iov_base,
@@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc)
  */
 static void __schedule_delayed(struct ceph_mon_client *monc)
 {
-	unsigned int delay;
+	struct ceph_options *opt = monc->client->options;
+	unsigned long delay;
 
-	if (monc->cur_mon < 0 || __sub_expired(monc))
+	if (monc->cur_mon < 0 || __sub_expired(monc)) {
 		delay = 10 * HZ;
-	else
+	} else {
 		delay = 20 * HZ;
-	dout("__schedule_delayed after %u\n", delay);
-	schedule_delayed_work(&monc->delayed_work, delay);
+		if (opt->monc_ping_timeout > 0)
+			delay = min(delay, opt->monc_ping_timeout / 3);
+	}
+	dout("__schedule_delayed after %lu\n", delay);
+	schedule_delayed_work(&monc->delayed_work,
+			      round_jiffies_relative(delay));
 }
 
 /*
@@ -743,11 +752,23 @@ static void delayed_work(struct work_struct *work)
 		__close_session(monc);
 		__open_session(monc);  /* continue hunting */
 	} else {
-		ceph_con_keepalive(&monc->con);
+		struct ceph_options *opt = monc->client->options;
+		int is_auth = ceph_auth_is_authenticated(monc->auth);
+		if (ceph_con_keepalive_expired(&monc->con,
+					       opt->monc_ping_timeout)) {
+			dout("monc keepalive timeout\n");
+			is_auth = 0;
+			__close_session(monc);
+			monc->hunting = true;
+			__open_session(monc);
+		}
 
-		__validate_auth(monc);
+		if (!monc->hunting) {
+			ceph_con_keepalive(&monc->con);
+			__validate_auth(monc);
+		}
 
-		if (ceph_auth_is_authenticated(monc->auth))
+		if (is_auth)
 			__send_subscribe(monc);
 	}
 	__schedule_delayed(monc);
-- 
cgit v1.2.3-70-g09d2


From b5330628546616af14ff23075fbf8d4ad91f6e25 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 8 Sep 2015 14:58:28 -0700
Subject: mm: introduce vma_is_anonymous(vma) helper

special_mapping_fault() is absolutely broken.  It seems it was always
wrong, but this didn't matter until vdso/vvar started to use more than
one page.

And after this change vma_is_anonymous() becomes really trivial, it
simply checks vm_ops == NULL.  However, I do think the helper makes
sense.  There are a lot of ->vm_ops != NULL checks, the helper makes the
caller's code more understandable (self-documented) and this is more
grep-friendly.

This patch (of 3):

Preparation.  Add the new simple helper, vma_is_anonymous(vma), and change
handle_pte_fault() to use it.  It will have more users.

The name is not accurate, say a hpet_mmap()'ed vma is not anonymous.
Perhaps it should be named vma_has_fault() instead.  But it matches the
logic in mmap.c/memory.c (see next changes).  "True" just means that a
page fault will use do_anonymous_page().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 mm/memory.c        | 8 ++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b257c43855b..dfb7ce05f1e3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1260,6 +1260,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
 	return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
 }
 
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+	return !vma->vm_ops;
+}
+
 static inline int stack_guard_page_start(struct vm_area_struct *vma,
 					     unsigned long addr)
 {
diff --git a/mm/memory.c b/mm/memory.c
index bb04d8f2f86c..882c9d7ae2f5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3267,12 +3267,12 @@ static int handle_pte_fault(struct mm_struct *mm,
 	barrier();
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
-			if (vma->vm_ops)
+			if (vma_is_anonymous(vma))
+				return do_anonymous_page(mm, vma, address,
+							 pte, pmd, flags);
+			else
 				return do_fault(mm, vma, address, pte, pmd,
 						flags, entry);
-
-			return do_anonymous_page(mm, vma, address, pte, pmd,
-					flags);
 		}
 		return do_swap_page(mm, vma, address,
 					pte, pmd, flags, entry);
-- 
cgit v1.2.3-70-g09d2


From e1b9996b85ba3ff143ded04523cd015762d20f03 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 8 Sep 2015 14:58:37 -0700
Subject: thp: vma_adjust_trans_huge(): adjust file-backed VMA too

This series of patches adds support for using PMD page table entries to
map DAX files.  We expect NV-DIMMs to start showing up that are many
gigabytes in size and the memory consumption of 4kB PTEs will be
astronomical.

The patch series leverages much of the Transparant Huge Pages
infrastructure, going so far as to borrow one of Kirill's patches from
his THP page cache series.

This patch (of 10):

Since we're going to have huge pages in page cache, we need to call adjust
file-backed VMA, which potentially can contain huge pages.

For now we call it for all VMAs.

Probably later we will need to introduce a flag to indicate that the VMA
has huge pages.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Acked-by: Hillf Danton <dhillf@gmail.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 11 +----------
 mm/huge_memory.c        |  2 +-
 2 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f10b20f05159..1c53c7d7ef7e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -122,7 +122,7 @@ extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
 #endif
 extern int hugepage_madvise(struct vm_area_struct *vma,
 			    unsigned long *vm_flags, int advice);
-extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
 				    unsigned long start,
 				    unsigned long end,
 				    long adjust_next);
@@ -138,15 +138,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
 	else
 		return 0;
 }
-static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
-					 unsigned long start,
-					 unsigned long end,
-					 long adjust_next)
-{
-	if (!vma->anon_vma || vma->vm_ops)
-		return;
-	__vma_adjust_trans_huge(vma, start, end, adjust_next);
-}
 static inline int hpage_nr_pages(struct page *page)
 {
 	if (unlikely(PageTransHuge(page)))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 279a818a39b1..4d5fcb630d32 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2991,7 +2991,7 @@ static void split_huge_page_address(struct mm_struct *mm,
 	split_huge_page_pmd_mm(mm, address, pmd);
 }
 
-void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+void vma_adjust_trans_huge(struct vm_area_struct *vma,
 			     unsigned long start,
 			     unsigned long end,
 			     long adjust_next)
-- 
cgit v1.2.3-70-g09d2


From c94c2acf84dc16cf4b989bb0bc849785b7ff52f5 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 8 Sep 2015 14:58:40 -0700
Subject: dax: move DAX-related functions to a new header

In order to handle the !CONFIG_TRANSPARENT_HUGEPAGES case, we need to
return VM_FAULT_FALLBACK from the inlined dax_pmd_fault(), which is
defined in linux/mm.h.  Given that we don't want to include <linux/mm.h>
in <linux/fs.h>, the easiest solution is to move the DAX-related
functions to a new header, <linux/dax.h>.  We could also have moved
VM_FAULT_* definitions to a new header, or a different header that isn't
quite such a boil-the-ocean header as <linux/mm.h>, but this felt like
the best option.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c      |  1 +
 fs/ext2/file.c      |  1 +
 fs/ext2/inode.c     |  1 +
 fs/ext4/file.c      |  1 +
 fs/ext4/indirect.c  |  1 +
 fs/ext4/inode.c     |  1 +
 fs/xfs/xfs_buf.h    |  1 +
 include/linux/dax.h | 21 +++++++++++++++++++++
 include/linux/fs.h  | 14 --------------
 9 files changed, 28 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/dax.h

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 33b813e04f79..28cc525b8d59 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
+#include <linux/dax.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3b57c9f83c9b..db4c299b7cf6 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
 
 #include <linux/time.h>
 #include <linux/pagemap.h>
+#include <linux/dax.h>
 #include <linux/quotaops.h>
 #include "ext2.h"
 #include "xattr.h"
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a3a404c5df2e..c60a248c640c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -25,6 +25,7 @@
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
+#include <linux/dax.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc313ac5d3fa..f8a631047379 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/path.h>
+#include <linux/dax.h>
 #include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include <linux/uio.h>
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 4f6ac499f09e..2468261748b2 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
 
 #include "ext4_jbd2.h"
 #include "truncate.h"
+#include <linux/dax.h>
 #include <linux/uio.h>
 
 #include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 29f1af7c2cab..5ebcc7683a5c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -22,6 +22,7 @@
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
+#include <linux/dax.h>
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 331c1ccf8264..c79b717d9b88 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -23,6 +23,7 @@
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/uio.h>
 #include <linux/list_lru.h>
diff --git a/include/linux/dax.h b/include/linux/dax.h
new file mode 100644
index 000000000000..4f27d3dbf6e8
--- /dev/null
+++ b/include/linux/dax.h
@@ -0,0 +1,21 @@
+#ifndef _LINUX_DAX_H
+#define _LINUX_DAX_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
+		  get_block_t, dio_iodone_t, int flags);
+int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
+int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+		dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+		dax_iodone_t);
+int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
+#define dax_mkwrite(vma, vmf, gb, iod)		dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod)	__dax_fault(vma, vmf, gb, iod)
+
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b2f9b9c25e41..72d8a844c692 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -52,7 +52,6 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
-struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2678,19 +2677,6 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
 extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
 
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
-		  get_block_t, dio_iodone_t, int flags);
-int dax_clear_blocks(struct inode *, sector_t block, long size);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-		dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-		dax_iodone_t);
-int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod)		dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod)	__dax_fault(vma, vmf, gb, iod)
-
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
 			    loff_t file_offset);
-- 
cgit v1.2.3-70-g09d2


From 4897c7655d9419ba7e62bac145ec6a1847134d93 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 8 Sep 2015 14:58:45 -0700
Subject: thp: prepare for DAX huge pages

Add a vma_is_dax() helper macro to test whether the VMA is DAX, and use it
in zap_huge_pmd() and __split_huge_page_pmd().

[akpm@linux-foundation.org: fix build]
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dax.h |  4 ++++
 mm/huge_memory.c    | 48 +++++++++++++++++++++++++++++-------------------
 2 files changed, 33 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 4f27d3dbf6e8..9b51f9d40ad9 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -18,4 +18,8 @@ int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
 #define dax_mkwrite(vma, vmf, gb, iod)		dax_fault(vma, vmf, gb, iod)
 #define __dax_mkwrite(vma, vmf, gb, iod)	__dax_fault(vma, vmf, gb, iod)
 
+static inline bool vma_is_dax(struct vm_area_struct *vma)
+{
+	return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
+}
 #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ca475dfdb28f..9057241d5722 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
 #include <linux/swap.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
+#include <linux/dax.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
@@ -794,7 +795,7 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 }
 
 /* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 		struct page *zero_page)
 {
@@ -1421,7 +1422,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	int ret = 0;
 
 	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-		struct page *page;
 		pgtable_t pgtable;
 		pmd_t orig_pmd;
 		/*
@@ -1433,13 +1433,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
 							tlb->fullmm);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+		if (vma_is_dax(vma)) {
+			if (is_huge_zero_pmd(orig_pmd)) {
+				pgtable = NULL;
+			} else {
+				spin_unlock(ptl);
+				return 1;
+			}
+		} else {
+			pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+		}
 		if (is_huge_zero_pmd(orig_pmd)) {
 			atomic_long_dec(&tlb->mm->nr_ptes);
 			spin_unlock(ptl);
 			put_huge_zero_page();
 		} else {
-			page = pmd_page(orig_pmd);
+			struct page *page = pmd_page(orig_pmd);
 			page_remove_rmap(page);
 			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
@@ -1448,7 +1457,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			spin_unlock(ptl);
 			tlb_remove_page(tlb, page);
 		}
-		pte_free(tlb->mm, pgtable);
+		if (pgtable)
+			pte_free(tlb->mm, pgtable);
 		ret = 1;
 	}
 	return ret;
@@ -2914,7 +2924,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 		pmd_t *pmd)
 {
 	spinlock_t *ptl;
-	struct page *page;
+	struct page *page = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
 	unsigned long mmun_start;	/* For mmu_notifiers */
@@ -2927,25 +2937,25 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 again:
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_trans_huge(*pmd))) {
-		spin_unlock(ptl);
-		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-		return;
-	}
-	if (is_huge_zero_pmd(*pmd)) {
+	if (unlikely(!pmd_trans_huge(*pmd)))
+		goto unlock;
+	if (vma_is_dax(vma)) {
+		pmdp_huge_clear_flush(vma, haddr, pmd);
+	} else if (is_huge_zero_pmd(*pmd)) {
 		__split_huge_zero_page_pmd(vma, haddr, pmd);
-		spin_unlock(ptl);
-		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-		return;
+	} else {
+		page = pmd_page(*pmd);
+		VM_BUG_ON_PAGE(!page_count(page), page);
+		get_page(page);
 	}
-	page = pmd_page(*pmd);
-	VM_BUG_ON_PAGE(!page_count(page), page);
-	get_page(page);
+ unlock:
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
-	split_huge_page(page);
+	if (!page)
+		return;
 
+	split_huge_page(page);
 	put_page(page);
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From b96375f74a6d4f39fc6cbdc0bce5175115c7f96f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 8 Sep 2015 14:58:48 -0700
Subject: mm: add a pmd_fault handler

Allow non-anonymous VMAs to provide huge pages in response to a page fault.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 30 ++++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dfb7ce05f1e3..e1fbd18b8c4b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -249,6 +249,8 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
+						pmd_t *, unsigned int flags);
 	void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
 	/* notification that a previously read-only page is about to become
diff --git a/mm/memory.c b/mm/memory.c
index 882c9d7ae2f5..a3f9a8ccec0f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3232,6 +3232,27 @@ out:
 	return 0;
 }
 
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+	if (!vma->vm_ops)
+		return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+	if (vma->vm_ops->pmd_fault)
+		return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+	return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+			unsigned int flags)
+{
+	if (!vma->vm_ops)
+		return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
+	if (vma->vm_ops->pmd_fault)
+		return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+	return VM_FAULT_FALLBACK;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3334,10 +3355,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
-		int ret = VM_FAULT_FALLBACK;
-		if (!vma->vm_ops)
-			ret = do_huge_pmd_anonymous_page(mm, vma, address,
-					pmd, flags);
+		int ret = create_huge_pmd(mm, vma, address, pmd, flags);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
 	} else {
@@ -3361,8 +3379,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							     orig_pmd, pmd);
 
 			if (dirty && !pmd_write(orig_pmd)) {
-				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
-							  orig_pmd);
+				ret = wp_huge_pmd(mm, vma, address, pmd,
+							orig_pmd, flags);
 				if (!(ret & VM_FAULT_FALLBACK))
 					return ret;
 			} else {
-- 
cgit v1.2.3-70-g09d2


From fc43704437ebe40f642ac53f7ee73661fe74e6b8 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 8 Sep 2015 14:58:51 -0700
Subject: mm: export various functions for the benefit of DAX

To use the huge zero page in DAX, we need these functions exported.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 10 ++++++++++
 mm/huge_memory.c        |  7 +------
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1c53c7d7ef7e..70587ea079c3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -155,6 +155,16 @@ static inline bool is_huge_zero_page(struct page *page)
 	return ACCESS_ONCE(huge_zero_page) == page;
 }
 
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return is_huge_zero_page(pmd_page(pmd));
+}
+
+struct page *get_huge_zero_page(void);
+bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long haddr,
+		pmd_t *pmd, struct page *zero_page);
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9057241d5722..c426a89e025c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -173,12 +173,7 @@ fail:
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
-	return is_huge_zero_page(pmd_page(pmd));
-}
-
-static struct page *get_huge_zero_page(void)
+struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
-- 
cgit v1.2.3-70-g09d2


From 5cad465d7fa646bad3d677df276bfc8e2ad709e3 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 8 Sep 2015 14:58:54 -0700
Subject: mm: add vmf_insert_pfn_pmd()

Similar to vm_insert_pfn(), but for PMDs rather than PTEs.  The 'vmf_'
prefix instead of 'vm_' prefix is intended to indicate that it returns a
VMF_ value rather than an errno (which would only have to be converted
into a VMF_ value anyway).

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h |  2 ++
 mm/huge_memory.c        | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 70587ea079c3..f9b612fec4dd 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -33,6 +33,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot,
 			int prot_numa);
+int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
+			unsigned long pfn, bool write);
 
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c426a89e025c..3ea6f908a5e0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -869,6 +869,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 					    flags);
 }
 
+static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t entry;
+	spinlock_t *ptl;
+
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_none(*pmd)) {
+		entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+		if (write) {
+			entry = pmd_mkyoung(pmd_mkdirty(entry));
+			entry = maybe_pmd_mkwrite(entry, vma);
+		}
+		set_pmd_at(mm, addr, pmd, entry);
+		update_mmu_cache_pmd(vma, addr, pmd);
+	}
+	spin_unlock(ptl);
+	return VM_FAULT_NOPAGE;
+}
+
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+			pmd_t *pmd, unsigned long pfn, bool write)
+{
+	pgprot_t pgprot = vma->vm_page_prot;
+	/*
+	 * If we had pmd_special, we could avoid all these restrictions,
+	 * but we need to be consistent with PTEs and architectures that
+	 * can't support a 'special' bit.
+	 */
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+						(VM_PFNMAP|VM_MIXEDMAP));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return VM_FAULT_SIGBUS;
+	if (track_pfn_insert(vma, &pgprot, pfn))
+		return VM_FAULT_SIGBUS;
+	return insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *vma)
-- 
cgit v1.2.3-70-g09d2


From 844f35db1088dd1a9de37b53d4d823626232bd19 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 8 Sep 2015 14:58:57 -0700
Subject: dax: add huge page fault support

This is the support code for DAX-enabled filesystems to allow them to
provide huge pages in response to faults.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/dax.txt |   7 +-
 fs/dax.c                          | 152 ++++++++++++++++++++++++++++++++++++++
 include/linux/dax.h               |  14 ++++
 3 files changed, 170 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 7af2851d667c..7bde64014a89 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -60,9 +60,10 @@ Filesystem support consists of
 - implementing the direct_IO address space operation, and calling
   dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
 - implementing an mmap file operation for DAX files which sets the
-  VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
-  for fault and page_mkwrite (which should probably call dax_fault() and
-  dax_mkwrite(), passing the appropriate get_block() callback)
+  VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
+  include handlers for fault, pmd_fault and page_mkwrite (which should
+  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
+  appropriate get_block() callback)
 - calling dax_truncate_page() instead of block_truncate_page() for DAX files
 - calling dax_zero_page_range() instead of zero_user() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
diff --git a/fs/dax.c b/fs/dax.c
index a7f77e1fa18c..15f8ffc13fa6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -494,6 +494,158 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 EXPORT_SYMBOL_GPL(dax_fault);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, unsigned int flags, get_block_t get_block,
+		dax_iodone_t complete_unwritten)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head bh;
+	unsigned blkbits = inode->i_blkbits;
+	unsigned long pmd_addr = address & PMD_MASK;
+	bool write = flags & FAULT_FLAG_WRITE;
+	long length;
+	void *kaddr;
+	pgoff_t size, pgoff;
+	sector_t block, sector;
+	unsigned long pfn;
+	int result = 0;
+
+	/* Fall back to PTEs if we're going to COW */
+	if (write && !(vma->vm_flags & VM_SHARED))
+		return VM_FAULT_FALLBACK;
+	/* If the PMD would extend outside the VMA */
+	if (pmd_addr < vma->vm_start)
+		return VM_FAULT_FALLBACK;
+	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+		return VM_FAULT_FALLBACK;
+
+	pgoff = ((pmd_addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= size)
+		return VM_FAULT_SIGBUS;
+	/* If the PMD would cover blocks out of the file */
+	if ((pgoff | PG_PMD_COLOUR) >= size)
+		return VM_FAULT_FALLBACK;
+
+	memset(&bh, 0, sizeof(bh));
+	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+	bh.b_size = PMD_SIZE;
+	length = get_block(inode, block, &bh, write);
+	if (length)
+		return VM_FAULT_SIGBUS;
+	i_mmap_lock_read(mapping);
+
+	/*
+	 * If the filesystem isn't willing to tell us the length of a hole,
+	 * just fall back to PTEs.  Calling get_block 512 times in a loop
+	 * would be silly.
+	 */
+	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+		goto fallback;
+
+	/* Guard against a race with truncate */
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= size) {
+		result = VM_FAULT_SIGBUS;
+		goto out;
+	}
+	if ((pgoff | PG_PMD_COLOUR) >= size)
+		goto fallback;
+
+	if (is_huge_zero_pmd(*pmd))
+		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+
+	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+		bool set;
+		spinlock_t *ptl;
+		struct mm_struct *mm = vma->vm_mm;
+		struct page *zero_page = get_huge_zero_page();
+		if (unlikely(!zero_page))
+			goto fallback;
+
+		ptl = pmd_lock(mm, pmd);
+		set = set_huge_zero_page(NULL, mm, vma, pmd_addr, pmd,
+								zero_page);
+		spin_unlock(ptl);
+		result = VM_FAULT_NOPAGE;
+	} else {
+		sector = bh.b_blocknr << (blkbits - 9);
+		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+						bh.b_size);
+		if (length < 0) {
+			result = VM_FAULT_SIGBUS;
+			goto out;
+		}
+		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+			goto fallback;
+
+		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+			int i;
+			for (i = 0; i < PTRS_PER_PMD; i++)
+				clear_page(kaddr + i * PAGE_SIZE);
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			result |= VM_FAULT_MAJOR;
+		}
+
+		result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+	}
+
+ out:
+	i_mmap_unlock_read(mapping);
+
+	if (buffer_unwritten(&bh))
+		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+	return result;
+
+ fallback:
+	count_vm_event(THP_FAULT_FALLBACK);
+	result = VM_FAULT_FALLBACK;
+	goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+			pmd_t *pmd, unsigned int flags, get_block_t get_block,
+			dax_iodone_t complete_unwritten)
+{
+	int result;
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+	if (flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(sb);
+		file_update_time(vma->vm_file);
+	}
+	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+				complete_unwritten);
+	if (flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(sb);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGES */
+
 /**
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9b51f9d40ad9..b415e521528d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -14,6 +14,20 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 		dax_iodone_t);
 int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 		dax_iodone_t);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+				unsigned int flags, get_block_t, dax_iodone_t);
+int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+				unsigned int flags, get_block_t, dax_iodone_t);
+#else
+static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t *pmd, unsigned int flags, get_block_t gb,
+				dax_iodone_t di)
+{
+	return VM_FAULT_FALLBACK;
+}
+#define __dax_pmd_fault dax_pmd_fault
+#endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
 #define dax_mkwrite(vma, vmf, gb, iod)		dax_fault(vma, vmf, gb, iod)
 #define __dax_mkwrite(vma, vmf, gb, iod)	__dax_fault(vma, vmf, gb, iod)
-- 
cgit v1.2.3-70-g09d2


From d295e3415a88ae63a37a22652808b20c7fcb970e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 8 Sep 2015 14:59:34 -0700
Subject: dax: don't use set_huge_zero_page()

This is another place where DAX assumed that pgtable_t was a pointer.
Open code the important parts of set_huge_zero_page() in DAX and make
set_huge_zero_page() static again.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c                | 18 ++++++++++++------
 include/linux/huge_mm.h |  3 ---
 mm/huge_memory.c        |  2 +-
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 9593f4bee327..d778e5f1a01c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -572,18 +572,24 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
 
 	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
-		bool set;
 		spinlock_t *ptl;
-		struct mm_struct *mm = vma->vm_mm;
+		pmd_t entry;
 		struct page *zero_page = get_huge_zero_page();
+
 		if (unlikely(!zero_page))
 			goto fallback;
 
-		ptl = pmd_lock(mm, pmd);
-		set = set_huge_zero_page(NULL, mm, vma, pmd_addr, pmd,
-								zero_page);
-		spin_unlock(ptl);
+		ptl = pmd_lock(vma->vm_mm, pmd);
+		if (!pmd_none(*pmd)) {
+			spin_unlock(ptl);
+			goto fallback;
+		}
+
+		entry = mk_pmd(zero_page, vma->vm_page_prot);
+		entry = pmd_mkhuge(entry);
+		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
 		result = VM_FAULT_NOPAGE;
+		spin_unlock(ptl);
 	} else {
 		sector = bh.b_blocknr << (blkbits - 9);
 		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f9b612fec4dd..ecb080d6ff42 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -163,9 +163,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 }
 
 struct page *get_huge_zero_page(void);
-bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
-		struct vm_area_struct *vma, unsigned long haddr,
-		pmd_t *pmd, struct page *zero_page);
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 96dfd9d81fcb..3e574efad8f8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -790,7 +790,7 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 }
 
 /* Caller must hold page table lock. */
-bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 		struct page *zero_page)
 {
-- 
cgit v1.2.3-70-g09d2


From b5e3aa0a4d5e35329203fd09acb0dbc7f7fd64de Mon Sep 17 00:00:00 2001
From: Vineet Gupta <Vineet.Gupta1@synopsys.com>
Date: Tue, 8 Sep 2015 14:59:56 -0700
Subject: mm: remove put_page_unless_one()

It has no callers.

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e1fbd18b8c4b..4ec72ef1f04a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -361,18 +361,6 @@ static inline int get_page_unless_zero(struct page *page)
 	return atomic_inc_not_zero(&page->_count);
 }
 
-/*
- * Try to drop a ref unless the page has a refcount of one, return false if
- * that is the case.
- * This is to make sure that the refcount won't become zero after this drop.
- * This can be called when MMU is off so it must not access
- * any of the virtual mappings.
- */
-static inline int put_page_unless_one(struct page *page)
-{
-	return atomic_add_unless(&page->_count, -1, 1);
-}
-
 extern int page_is_ram(unsigned long pfn);
 extern int region_is_ram(resource_size_t phys_addr, unsigned long size);
 
-- 
cgit v1.2.3-70-g09d2


From 8334b96221ff0dcbde4873d31eb4d84774ed8ed4 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 8 Sep 2015 15:00:24 -0700
Subject: mm: /proc/pid/smaps:: show proportional swap share of the mapping

We want to know per-process workingset size for smart memory management
on userland and we use swap(ex, zram) heavily to maximize memory
efficiency so workingset includes swap as well as RSS.

On such system, if there are lots of shared anonymous pages, it's really
hard to figure out exactly how many each process consumes memory(ie, rss
+ wap) if the system has lots of shared anonymous memory(e.g, android).

This patch introduces SwapPss field on /proc/<pid>/smaps so we can get
more exact workingset size per process.

Bongkyu tested it. Result is below.

1. 50M used swap
SwapTotal: 461976 kB
SwapFree: 411192 kB

$ adb shell cat /proc/*/smaps | grep "SwapPss:" | awk '{sum += $2} END {print sum}';
48236
$ adb shell cat /proc/*/smaps | grep "Swap:" | awk '{sum += $2} END {print sum}';
141184

2. 240M used swap
SwapTotal: 461976 kB
SwapFree: 216808 kB

$ adb shell cat /proc/*/smaps | grep "SwapPss:" | awk '{sum += $2} END {print sum}';
230315
$ adb shell cat /proc/*/smaps | grep "Swap:" | awk '{sum += $2} END {print sum}';
1387744

[akpm@linux-foundation.org: simplify kunmap_atomic() call]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: Bongkyu Kim <bongkyu.kim@lge.com>
Tested-by: Bongkyu Kim <bongkyu.kim@lge.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 18 +++++++++++-----
 fs/proc/task_mmu.c                 | 18 ++++++++++++++--
 include/linux/swap.h               |  6 ++++++
 mm/swapfile.c                      | 42 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 77 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 6f7fafde0884..d411ca63c8b6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -424,6 +424,7 @@ Private_Dirty:         0 kB
 Referenced:          892 kB
 Anonymous:             0 kB
 Swap:                  0 kB
+SwapPss:               0 kB
 KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 Locked:              374 kB
@@ -433,16 +434,23 @@ the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
 (size), the amount of the mapping that is currently resident in RAM (RSS), the
 process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping.  Note that even a page which is part of a
-MAP_SHARED mapping, but has only a single pte mapped, i.e.  is currently used
-by only one process, is accounted as private and not as shared.  "Referenced"
-indicates the amount of memory currently marked as referenced or accessed.
+dirty private pages in the mapping.
+
+The "proportional set size" (PSS) of a process is the count of pages it has
+in memory, where each page is divided by the number of processes sharing it.
+So if a process has 1000 pages all to itself, and 1000 shared with one other
+process, its PSS will be 1500.
+Note that even a page which is part of a MAP_SHARED mapping, but has only
+a single pte mapped, i.e.  is currently used by only one process, is accounted
+as private and not as shared.
+"Referenced" indicates the amount of memory currently marked as referenced or
+accessed.
 "Anonymous" shows the amount of memory that does not belong to any file.  Even
 a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
 and a page is modified, the file page is replaced by a private anonymous copy.
 "Swap" shows how much would-be-anonymous memory is also used, but out on
 swap.
-
+"SwapPss" shows proportional swap share of this mapping.
 "VmFlags" field deserves a separate description. This member represents the kernel
 flags associated with the particular virtual memory area in two letter encoded
 manner. The codes are the following:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 67c76468a7be..41f1a50c10c9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -446,6 +446,7 @@ struct mem_size_stats {
 	unsigned long anonymous_thp;
 	unsigned long swap;
 	u64 pss;
+	u64 swap_pss;
 };
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -492,9 +493,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	} else if (is_swap_pte(*pte)) {
 		swp_entry_t swpent = pte_to_swp_entry(*pte);
 
-		if (!non_swap_entry(swpent))
+		if (!non_swap_entry(swpent)) {
+			int mapcount;
+
 			mss->swap += PAGE_SIZE;
-		else if (is_migration_entry(swpent))
+			mapcount = swp_swapcount(swpent);
+			if (mapcount >= 2) {
+				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+				do_div(pss_delta, mapcount);
+				mss->swap_pss += pss_delta;
+			} else {
+				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+			}
+		} else if (is_migration_entry(swpent))
 			page = migration_entry_to_page(swpent);
 	}
 
@@ -640,6 +652,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		   "Anonymous:      %8lu kB\n"
 		   "AnonHugePages:  %8lu kB\n"
 		   "Swap:           %8lu kB\n"
+		   "SwapPss:        %8lu kB\n"
 		   "KernelPageSize: %8lu kB\n"
 		   "MMUPageSize:    %8lu kB\n"
 		   "Locked:         %8lu kB\n",
@@ -654,6 +667,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		   mss.anonymous >> 10,
 		   mss.anonymous_thp >> 10,
 		   mss.swap >> 10,
+		   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
 		   vma_kernel_pagesize(vma) >> 10,
 		   vma_mmu_pagesize(vma) >> 10,
 		   (vma->vm_flags & VM_LOCKED) ?
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 31496d201fdc..6282f1eb3d6a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -431,6 +431,7 @@ extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
+extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
@@ -522,6 +523,11 @@ static inline int page_swapcount(struct page *page)
 	return 0;
 }
 
+static inline int swp_swapcount(swp_entry_t entry)
+{
+	return 0;
+}
+
 #define reuse_swap_page(page)	(page_mapcount(page) == 1)
 
 static inline int try_to_free_swap(struct page *page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index aebc2dd6e649..58877312cf6b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -874,6 +874,48 @@ int page_swapcount(struct page *page)
 	return count;
 }
 
+/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+	int count, tmp_count, n;
+	struct swap_info_struct *p;
+	struct page *page;
+	pgoff_t offset;
+	unsigned char *map;
+
+	p = swap_info_get(entry);
+	if (!p)
+		return 0;
+
+	count = swap_count(p->swap_map[swp_offset(entry)]);
+	if (!(count & COUNT_CONTINUED))
+		goto out;
+
+	count &= ~COUNT_CONTINUED;
+	n = SWAP_MAP_MAX + 1;
+
+	offset = swp_offset(entry);
+	page = vmalloc_to_page(p->swap_map + offset);
+	offset &= ~PAGE_MASK;
+	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+	do {
+		page = list_entry(page->lru.next, struct page, lru);
+		map = kmap_atomic(page);
+		tmp_count = map[offset];
+		kunmap_atomic(map);
+
+		count += (tmp_count & ~COUNT_CONTINUED) * n;
+		n *= (SWAP_CONT_MAX + 1);
+	} while (tmp_count & COUNT_CONTINUED);
+out:
+	spin_unlock(&p->lock);
+	return count;
+}
+
 /*
  * We can write to an anon page without COW if there are no other references
  * to it.  And as a side-effect, free up its swap: because the old content
-- 
cgit v1.2.3-70-g09d2


From 28c015d07507e164d93b33498b4e482ff81c0e9b Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 8 Sep 2015 15:00:31 -0700
Subject: mm: improve __GFP_NORETRY comment based on implementation

Explicitly state that __GFP_NORETRY will attempt direct reclaim and
memory compaction before returning NULL and that the oom killer is not
called in the current implementation of the page allocator.

[akpm@linux-foundation.org: s/has/have/]
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ad35f300b9a4..3bd64b115999 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -63,7 +63,10 @@ struct vm_area_struct;
  * but it is definitely preferable to use the flag rather than opencode endless
  * loop around allocator.
  *
- * __GFP_NORETRY: The VM implementation must not retry indefinitely.
+ * __GFP_NORETRY: The VM implementation must not retry indefinitely and will
+ * return NULL when direct reclaim and memory compaction have failed to allow
+ * the allocation to succeed.  The OOM killer is not called with the current
+ * implementation.
  *
  * __GFP_MOVABLE: Flag that this page will be movable by the page migration
  * mechanism or reclaimed
-- 
cgit v1.2.3-70-g09d2


From 6e0fc46dc2152d3e2d25a5d5b640ae3586c247c6 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 8 Sep 2015 15:00:36 -0700
Subject: mm, oom: organize oom context into struct

There are essential elements to an oom context that are passed around to
multiple functions.

Organize these elements into a new struct, struct oom_control, that
specifies the context for an oom condition.

This patch introduces no functional change.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/tty/sysrq.c |  12 +++++-
 include/linux/oom.h |  25 +++++++-----
 mm/memcontrol.c     |  16 +++++---
 mm/oom_kill.c       | 115 ++++++++++++++++++++++++----------------------------
 mm/page_alloc.c     |  10 ++++-
 5 files changed, 98 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index b5b427888b24..ed3e258f4ee9 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,17 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
+	const gfp_t gfp_mask = GFP_KERNEL;
+	struct oom_control oc = {
+		.zonelist = node_zonelist(first_memory_node, gfp_mask),
+		.nodemask = NULL,
+		.gfp_mask = gfp_mask,
+		.order = 0,
+		.force_kill = true,
+	};
+
 	mutex_lock(&oom_lock);
-	if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
-			   GFP_KERNEL, 0, NULL, true))
+	if (!out_of_memory(&oc))
 		pr_info("OOM request ignored because killer is disabled\n");
 	mutex_unlock(&oom_lock);
 }
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7deecb7bca5e..cb29085ded37 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -12,6 +12,14 @@ struct notifier_block;
 struct mem_cgroup;
 struct task_struct;
 
+struct oom_control {
+	struct zonelist *zonelist;
+	nodemask_t	*nodemask;
+	gfp_t		gfp_mask;
+	int		order;
+	bool		force_kill;
+};
+
 /*
  * Types of limitations to the nodes from which allocations may occur
  */
@@ -57,21 +65,18 @@ extern unsigned long oom_badness(struct task_struct *p,
 
 extern int oom_kills_count(void);
 extern void note_oom_kill(void);
-extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			     unsigned int points, unsigned long totalpages,
-			     struct mem_cgroup *memcg, nodemask_t *nodemask,
-			     const char *message);
+			     struct mem_cgroup *memcg, const char *message);
 
-extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
-			       int order, const nodemask_t *nodemask,
+extern void check_panic_on_oom(struct oom_control *oc,
+			       enum oom_constraint constraint,
 			       struct mem_cgroup *memcg);
 
-extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill);
+extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+		struct task_struct *task, unsigned long totalpages);
 
-extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		int order, nodemask_t *mask, bool force_kill);
+extern bool out_of_memory(struct oom_control *oc);
 
 extern void exit_oom_victim(void);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1af057575ce9..573d90347aa2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1545,6 +1545,13 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order)
 {
+	struct oom_control oc = {
+		.zonelist = NULL,
+		.nodemask = NULL,
+		.gfp_mask = gfp_mask,
+		.order = order,
+		.force_kill = false,
+	};
 	struct mem_cgroup *iter;
 	unsigned long chosen_points = 0;
 	unsigned long totalpages;
@@ -1563,7 +1570,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		goto unlock;
 	}
 
-	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+	check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
 	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct css_task_iter it;
@@ -1571,8 +1578,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 		css_task_iter_start(&iter->css, &it);
 		while ((task = css_task_iter_next(&it))) {
-			switch (oom_scan_process_thread(task, totalpages, NULL,
-							false)) {
+			switch (oom_scan_process_thread(&oc, task, totalpages)) {
 			case OOM_SCAN_SELECT:
 				if (chosen)
 					put_task_struct(chosen);
@@ -1610,8 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	if (chosen) {
 		points = chosen_points * 1000 / totalpages;
-		oom_kill_process(chosen, gfp_mask, order, points, totalpages,
-				 memcg, NULL, "Memory cgroup out of memory");
+		oom_kill_process(&oc, chosen, points, totalpages, memcg,
+				 "Memory cgroup out of memory");
 	}
 unlock:
 	mutex_unlock(&oom_lock);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dff991e0681e..80a7cbd89d66 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
  * Determine the type of allocation constraint.
  */
 #ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-				gfp_t gfp_mask, nodemask_t *nodemask,
-				unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+					     unsigned long *totalpages)
 {
 	struct zone *zone;
 	struct zoneref *z;
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
 	bool cpuset_limited = false;
 	int nid;
 
 	/* Default to all available memory */
 	*totalpages = totalram_pages + total_swap_pages;
 
-	if (!zonelist)
+	if (!oc->zonelist)
 		return CONSTRAINT_NONE;
 	/*
 	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
 	 * to kill current.We have to random task kill in this case.
 	 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
 	 */
-	if (gfp_mask & __GFP_THISNODE)
+	if (oc->gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
 	/*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 	 * the page allocator means a mempolicy is in effect.  Cpuset policy
 	 * is enforced in get_page_from_freelist().
 	 */
-	if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+	if (oc->nodemask &&
+	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
 		*totalpages = total_swap_pages;
-		for_each_node_mask(nid, *nodemask)
+		for_each_node_mask(nid, *oc->nodemask)
 			*totalpages += node_spanned_pages(nid);
 		return CONSTRAINT_MEMORY_POLICY;
 	}
 
 	/* Check this allocation failure is caused by cpuset's wall function */
-	for_each_zone_zonelist_nodemask(zone, z, zonelist,
-			high_zoneidx, nodemask)
-		if (!cpuset_zone_allowed(zone, gfp_mask))
+	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+			high_zoneidx, oc->nodemask)
+		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
 			cpuset_limited = true;
 
 	if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 	return CONSTRAINT_NONE;
 }
 #else
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-				gfp_t gfp_mask, nodemask_t *nodemask,
-				unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+					     unsigned long *totalpages)
 {
 	*totalpages = totalram_pages + total_swap_pages;
 	return CONSTRAINT_NONE;
 }
 #endif
 
-enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+			struct task_struct *task, unsigned long totalpages)
 {
-	if (oom_unkillable_task(task, NULL, nodemask))
+	if (oom_unkillable_task(task, NULL, oc->nodemask))
 		return OOM_SCAN_CONTINUE;
 
 	/*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 	 * Don't allow any other task to have access to the reserves.
 	 */
 	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (!force_kill)
+		if (!oc->force_kill)
 			return OOM_SCAN_ABORT;
 	}
 	if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 	if (oom_task_origin(task))
 		return OOM_SCAN_SELECT;
 
-	if (task_will_free_mem(task) && !force_kill)
+	if (task_will_free_mem(task) && !oc->force_kill)
 		return OOM_SCAN_ABORT;
 
 	return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 /*
  * Simple selection loop. We chose the process with the highest
  * number of 'points'.  Returns -1 on scan abort.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned int *ppoints,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill)
+static struct task_struct *select_bad_process(struct oom_control *oc,
+		unsigned int *ppoints, unsigned long totalpages)
 {
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 	for_each_process_thread(g, p) {
 		unsigned int points;
 
-		switch (oom_scan_process_thread(p, totalpages, nodemask,
-						force_kill)) {
+		switch (oom_scan_process_thread(oc, p, totalpages)) {
 		case OOM_SCAN_SELECT:
 			chosen = p;
 			chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 		case OOM_SCAN_OK:
 			break;
 		};
-		points = oom_badness(p, NULL, nodemask, totalpages);
+		points = oom_badness(p, NULL, oc->nodemask, totalpages);
 		if (!points || points < chosen_points)
 			continue;
 		/* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	rcu_read_unlock();
 }
 
-static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
-			struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_header(struct oom_control *oc, struct task_struct *p,
+			struct mem_cgroup *memcg)
 {
 	task_lock(current);
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
 		"oom_score_adj=%hd\n",
-		current->comm, gfp_mask, order,
+		current->comm, oc->gfp_mask, oc->order,
 		current->signal->oom_score_adj);
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 	else
 		show_mem(SHOW_MEM_FILTER_NODES);
 	if (sysctl_oom_dump_tasks)
-		dump_tasks(memcg, nodemask);
+		dump_tasks(memcg, oc->nodemask);
 }
 
 /*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
  * Must be called while holding a reference to p, which will be released upon
  * returning.
  */
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 		      unsigned int points, unsigned long totalpages,
-		      struct mem_cgroup *memcg, nodemask_t *nodemask,
-		      const char *message)
+		      struct mem_cgroup *memcg, const char *message)
 {
 	struct task_struct *victim = p;
 	struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	task_unlock(p);
 
 	if (__ratelimit(&oom_rs))
-		dump_header(p, gfp_mask, order, memcg, nodemask);
+		dump_header(oc, p, memcg);
 
 	task_lock(p);
 	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			/*
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
-			child_points = oom_badness(child, memcg, nodemask,
+			child_points = oom_badness(child, memcg, oc->nodemask,
 								totalpages);
 			if (child_points > victim_points) {
 				put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
-			int order, const nodemask_t *nodemask,
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
 			struct mem_cgroup *memcg)
 {
 	if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 		if (constraint != CONSTRAINT_NONE)
 			return;
 	}
-	dump_header(NULL, gfp_mask, order, memcg, nodemask);
+	dump_header(oc, NULL, memcg);
 	panic("Out of memory: %s panic_on_oom is enabled\n",
 		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
 }
@@ -635,22 +627,16 @@ int unregister_oom_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
 /**
- * __out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
  *
  * If we run out of memory, we have the choice between either
  * killing a random task (bad), letting the system crash (worse)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		   int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct oom_control *oc)
 {
-	const nodemask_t *mpol_mask;
 	struct task_struct *p;
 	unsigned long totalpages;
 	unsigned long freed = 0;
@@ -684,30 +670,29 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
-	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
-						&totalpages);
-	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
-	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+	constraint = constrained_alloc(oc, &totalpages);
+	if (constraint != CONSTRAINT_MEMORY_POLICY)
+		oc->nodemask = NULL;
+	check_panic_on_oom(oc, constraint, NULL);
 
 	if (sysctl_oom_kill_allocating_task && current->mm &&
-	    !oom_unkillable_task(current, NULL, nodemask) &&
+	    !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
-		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
-				 nodemask,
+		oom_kill_process(oc, current, 0, totalpages, NULL,
 				 "Out of memory (oom_kill_allocating_task)");
 		goto out;
 	}
 
-	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+	p = select_bad_process(oc, &points, totalpages);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
-		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+		dump_header(oc, NULL, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
 	if (p != (void *)-1UL) {
-		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
-				 nodemask, "Out of memory");
+		oom_kill_process(oc, p, points, totalpages, NULL,
+				 "Out of memory");
 		killed = 1;
 	}
 out:
@@ -728,13 +713,21 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
+	struct oom_control oc = {
+		.zonelist = NULL,
+		.nodemask = NULL,
+		.gfp_mask = 0,
+		.order = 0,
+		.force_kill = false,
+	};
+
 	if (mem_cgroup_oom_synchronize(true))
 		return;
 
 	if (!mutex_trylock(&oom_lock))
 		return;
 
-	if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+	if (!out_of_memory(&oc)) {
 		/*
 		 * There shouldn't be any user tasks runnable while the
 		 * OOM killer is disabled, so the current task has to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index badc7d3bde43..96536144185c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2693,6 +2693,13 @@ static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	const struct alloc_context *ac, unsigned long *did_some_progress)
 {
+	struct oom_control oc = {
+		.zonelist = ac->zonelist,
+		.nodemask = ac->nodemask,
+		.gfp_mask = gfp_mask,
+		.order = order,
+		.force_kill = false,
+	};
 	struct page *page;
 
 	*did_some_progress = 0;
@@ -2744,8 +2751,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
-			|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
 		*did_some_progress = 1;
 out:
 	mutex_unlock(&oom_lock);
-- 
cgit v1.2.3-70-g09d2


From 54e9e29132d7caefcad470281cae06ac34a982c8 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 8 Sep 2015 15:00:39 -0700
Subject: mm, oom: pass an oom order of -1 when triggered by sysrq

The force_kill member of struct oom_control isn't needed if an order of -1
is used instead.  This is the same as order == -1 in struct
compact_control which requires full memory compaction.

This patch introduces no functional change.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/tty/sysrq.c | 3 +--
 include/linux/oom.h | 1 -
 mm/memcontrol.c     | 1 -
 mm/oom_kill.c       | 5 ++---
 mm/page_alloc.c     | 1 -
 5 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index ed3e258f4ee9..95b330a9ea98 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -358,8 +358,7 @@ static void moom_callback(struct work_struct *ignored)
 		.zonelist = node_zonelist(first_memory_node, gfp_mask),
 		.nodemask = NULL,
 		.gfp_mask = gfp_mask,
-		.order = 0,
-		.force_kill = true,
+		.order = -1,
 	};
 
 	mutex_lock(&oom_lock);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index cb29085ded37..8fb67b9e6110 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -17,7 +17,6 @@ struct oom_control {
 	nodemask_t	*nodemask;
 	gfp_t		gfp_mask;
 	int		order;
-	bool		force_kill;
 };
 
 /*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 573d90347aa2..9871f13fc35b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1550,7 +1550,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		.nodemask = NULL,
 		.gfp_mask = gfp_mask,
 		.order = order,
-		.force_kill = false,
 	};
 	struct mem_cgroup *iter;
 	unsigned long chosen_points = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 80a7cbd89d66..77adc8e876aa 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -265,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	 * Don't allow any other task to have access to the reserves.
 	 */
 	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (!oc->force_kill)
+		if (oc->order != -1)
 			return OOM_SCAN_ABORT;
 	}
 	if (!task->mm)
@@ -278,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	if (oom_task_origin(task))
 		return OOM_SCAN_SELECT;
 
-	if (task_will_free_mem(task) && !oc->force_kill)
+	if (task_will_free_mem(task) && oc->order != -1)
 		return OOM_SCAN_ABORT;
 
 	return OOM_SCAN_OK;
@@ -718,7 +718,6 @@ void pagefault_out_of_memory(void)
 		.nodemask = NULL,
 		.gfp_mask = 0,
 		.order = 0,
-		.force_kill = false,
 	};
 
 	if (mem_cgroup_oom_synchronize(true))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 96536144185c..5f9394df19bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2698,7 +2698,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 		.nodemask = ac->nodemask,
 		.gfp_mask = gfp_mask,
 		.order = order,
-		.force_kill = false,
 	};
 	struct page *page;
 
-- 
cgit v1.2.3-70-g09d2


From 8989e4c7d4e3c30b55c998a1138cd06c92df7295 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 8 Sep 2015 15:00:44 -0700
Subject: mm, oom: add description of struct oom_control

Describe the purpose of struct oom_control and what each member does.

Also make gfp_mask and order const since they are never manipulated or
passed to functions that discard the qualifier.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 8fb67b9e6110..03e6257321f0 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -12,11 +12,25 @@ struct notifier_block;
 struct mem_cgroup;
 struct task_struct;
 
+/*
+ * Details of the page allocation that triggered the oom killer that are used to
+ * determine what should be killed.
+ */
 struct oom_control {
+	/* Used to determine cpuset */
 	struct zonelist *zonelist;
-	nodemask_t	*nodemask;
-	gfp_t		gfp_mask;
-	int		order;
+
+	/* Used to determine mempolicy */
+	nodemask_t *nodemask;
+
+	/* Used to determine cpuset and node locality requirement */
+	const gfp_t gfp_mask;
+
+	/*
+	 * order == -1 means the oom kill is required by sysrq, otherwise only
+	 * for display purposes.
+	 */
+	const int order;
 };
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 33398cf2f360c5ce24c8a22436d52a06ad4e5eb5 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Tue, 8 Sep 2015 15:01:02 -0700
Subject: memcg: export struct mem_cgroup

mem_cgroup structure is defined in mm/memcontrol.c currently which means
that the code outside of this file has to use external API even for
trivial access stuff.

This patch exports mm_struct with its dependencies and makes some of the
exported functions inlines.  This even helps to reduce the code size a bit
(make defconfig + CONFIG_MEMCG=y)

  text		data    bss     dec     	 hex 	filename
  12355346        1823792 1089536 15268674         e8fb42 vmlinux.before
  12354970        1823792 1089536 15268298         e8f9ca vmlinux.after

This is not much (370B) but better than nothing.

We also save a function call in some hot paths like callers of
mem_cgroup_count_vm_event which is used for accounting.

The patch doesn't introduce any functional changes.

[vdavykov@parallels.com: inline memcg_kmem_is_active]
[vdavykov@parallels.com: do not expose type outside of CONFIG_MEMCG]
[akpm@linux-foundation.org: memcontrol.h needs eventfd.h for eventfd_ctx]
[akpm@linux-foundation.org: export mem_cgroup_from_task() to modules]
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 370 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/swap.h       |  10 +-
 include/net/sock.h         |  28 ----
 mm/memcontrol.c            | 315 +-------------------------------------
 mm/memory-failure.c        |   2 +-
 mm/slab_common.c           |   2 +-
 mm/vmscan.c                |   2 +-
 7 files changed, 351 insertions(+), 378 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 73b02b0a8f60..ab2f6880e27b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,6 +23,11 @@
 #include <linux/vm_event_item.h>
 #include <linux/hardirq.h>
 #include <linux/jump_label.h>
+#include <linux/page_counter.h>
+#include <linux/vmpressure.h>
+#include <linux/eventfd.h>
+#include <linux/mmzone.h>
+#include <linux/writeback.h>
 
 struct mem_cgroup;
 struct page;
@@ -67,12 +72,221 @@ enum mem_cgroup_events_index {
 	MEMCG_NR_EVENTS,
 };
 
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremated by the number of pages. This counter is used for
+ * for trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+	MEM_CGROUP_TARGET_THRESH,
+	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
+	MEM_CGROUP_NTARGETS,
+};
+
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+	/* Currently active and new sockets should be assigned to cgroups */
+	MEMCG_SOCK_ACTIVE,
+	/* It was ever activated; we must disarm static keys on destruction */
+	MEMCG_SOCK_ACTIVATED,
+};
+
+struct cg_proto {
+	struct page_counter	memory_allocated;	/* Current allocated memory. */
+	struct percpu_counter	sockets_allocated;	/* Current number of sockets. */
+	int			memory_pressure;
+	long			sysctl_mem[3];
+	unsigned long		flags;
+	/*
+	 * memcg field is used to find which memcg we belong directly
+	 * Each memcg struct can hold more than one cg_proto, so container_of
+	 * won't really cut.
+	 *
+	 * The elegant solution would be having an inverse function to
+	 * proto_cgroup in struct proto, but that means polluting the structure
+	 * for everybody, instead of just for memcg users.
+	 */
+	struct mem_cgroup	*memcg;
+};
+
 #ifdef CONFIG_MEMCG
+struct mem_cgroup_stat_cpu {
+	long count[MEM_CGROUP_STAT_NSTATS];
+	unsigned long events[MEMCG_NR_EVENTS];
+	unsigned long nr_page_events;
+	unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct mem_cgroup_reclaim_iter {
+	struct mem_cgroup *position;
+	/* scan generation, increased every round-trip */
+	unsigned int generation;
+};
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+	struct lruvec		lruvec;
+	unsigned long		lru_size[NR_LRU_LISTS];
+
+	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
+
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long		usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
+	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
+						/* use container_of	   */
+};
+
+struct mem_cgroup_per_node {
+	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_threshold {
+	struct eventfd_ctx *eventfd;
+	unsigned long threshold;
+};
+
+/* For threshold */
+struct mem_cgroup_threshold_ary {
+	/* An array index points to threshold just below or equal to usage. */
+	int current_threshold;
+	/* Size of entries[] */
+	unsigned int size;
+	/* Array of thresholds */
+	struct mem_cgroup_threshold entries[0];
+};
+
+struct mem_cgroup_thresholds {
+	/* Primary thresholds array */
+	struct mem_cgroup_threshold_ary *primary;
+	/*
+	 * Spare threshold array.
+	 * This is needed to make mem_cgroup_unregister_event() "never fail".
+	 * It must be able to store at least primary->size - 1 entries.
+	 */
+	struct mem_cgroup_threshold_ary *spare;
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ */
+struct mem_cgroup {
+	struct cgroup_subsys_state css;
+
+	/* Accounted resources */
+	struct page_counter memory;
+	struct page_counter memsw;
+	struct page_counter kmem;
+
+	/* Normal memory consumption range */
+	unsigned long low;
+	unsigned long high;
+
+	unsigned long soft_limit;
+
+	/* vmpressure notifications */
+	struct vmpressure vmpressure;
+
+	/* css_online() has been completed */
+	int initialized;
+
+	/*
+	 * Should the accounting and control be hierarchical, per subtree?
+	 */
+	bool use_hierarchy;
+
+	/* protected by memcg_oom_lock */
+	bool		oom_lock;
+	int		under_oom;
+
+	int	swappiness;
+	/* OOM-Killer disable */
+	int		oom_kill_disable;
+
+	/* protect arrays of thresholds */
+	struct mutex thresholds_lock;
+
+	/* thresholds for memory usage. RCU-protected */
+	struct mem_cgroup_thresholds thresholds;
+
+	/* thresholds for mem+swap usage. RCU-protected */
+	struct mem_cgroup_thresholds memsw_thresholds;
+
+	/* For oom notifier event fd */
+	struct list_head oom_notify;
+
+	/*
+	 * Should we move charges of a task when a task is moved into this
+	 * mem_cgroup ? And what type of charges should we move ?
+	 */
+	unsigned long move_charge_at_immigrate;
+	/*
+	 * set > 0 if pages under this cgroup are moving to other cgroup.
+	 */
+	atomic_t		moving_account;
+	/* taken only while moving_account > 0 */
+	spinlock_t		move_lock;
+	struct task_struct	*move_lock_task;
+	unsigned long		move_lock_flags;
+	/*
+	 * percpu counter.
+	 */
+	struct mem_cgroup_stat_cpu __percpu *stat;
+	spinlock_t pcp_counter_lock;
+
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+	struct cg_proto tcp_mem;
+#endif
+#if defined(CONFIG_MEMCG_KMEM)
+        /* Index in the kmem_cache->memcg_params.memcg_caches array */
+	int kmemcg_id;
+	bool kmem_acct_activated;
+	bool kmem_acct_active;
+#endif
+
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	atomic_t	numainfo_events;
+	atomic_t	numainfo_updating;
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head cgwb_list;
+	struct wb_domain cgwb_domain;
+#endif
+
+	/* List of events which userspace want to receive */
+	struct list_head event_list;
+	spinlock_t event_list_lock;
+
+	struct mem_cgroup_per_node *nodeinfo[0];
+	/* WARNING: nodeinfo must be the last member here */
+};
 extern struct cgroup_subsys_state *mem_cgroup_root_css;
 
-void mem_cgroup_events(struct mem_cgroup *memcg,
+/**
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 		       enum mem_cgroup_events_index idx,
-		       unsigned int nr);
+		       unsigned int nr)
+{
+	this_cpu_add(memcg->stat->events[idx], nr);
+}
 
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 
@@ -90,15 +304,31 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
-			      struct mem_cgroup *root);
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
+	return css ? container_of(css, struct mem_cgroup, css) : NULL;
+}
+
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+				   struct mem_cgroup *,
+				   struct mem_cgroup_reclaim_cookie *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+
+static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
+			      struct mem_cgroup *root)
+{
+	if (root == memcg)
+		return true;
+	if (!root->use_hierarchy)
+		return false;
+	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
+}
 
 static inline bool mm_match_cgroup(struct mm_struct *mm,
 				   struct mem_cgroup *memcg)
@@ -114,22 +344,65 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return match;
 }
 
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
 extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
-				   struct mem_cgroup *,
-				   struct mem_cgroup_reclaim_cookie *);
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline bool mem_cgroup_disabled(void)
+{
+	if (memory_cgrp_subsys.disabled)
+		return true;
+	return false;
+}
 
 /*
  * For memory reclaim.
  */
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
-void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+		int nr_pages);
+
+static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled())
+		return true;
+
+	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	memcg = mz->memcg;
+
+	return !!(memcg->css.flags & CSS_ONLINE);
+}
+
+static inline
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	return mz->lru_size[lru];
+}
+
+static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+{
+	unsigned long inactive_ratio;
+	unsigned long inactive;
+	unsigned long active;
+	unsigned long gb;
+
+	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
+
+	gb = (inactive + active) >> (30 - PAGE_SHIFT);
+	if (gb)
+		inactive_ratio = int_sqrt(10 * gb);
+	else
+		inactive_ratio = 1;
+
+	return inactive * inactive_ratio < active;
+}
+
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 					struct task_struct *p);
 
@@ -156,18 +429,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-static inline bool mem_cgroup_disabled(void)
-{
-	if (memory_cgrp_subsys.disabled)
-		return true;
-	return false;
-}
-
 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-				 enum mem_cgroup_stat_index idx, int val);
 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
 
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+				 enum mem_cgroup_stat_index idx, int val)
+{
+	VM_BUG_ON(!rcu_read_lock_held());
+
+	if (memcg)
+		this_cpu_add(memcg->stat->count[idx], val);
+}
+
 static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
 					    enum mem_cgroup_stat_index idx)
 {
@@ -184,13 +465,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
 					     enum vm_event_item idx)
 {
+	struct mem_cgroup *memcg;
+
 	if (mem_cgroup_disabled())
 		return;
-	__mem_cgroup_count_vm_event(mm, idx);
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!memcg))
+		goto out;
+
+	switch (idx) {
+	case PGFAULT:
+		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+		break;
+	case PGMAJFAULT:
+		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+		break;
+	default:
+		BUG();
+	}
+out:
+	rcu_read_unlock();
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head);
@@ -275,12 +574,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
 	return true;
 }
 
-static inline struct cgroup_subsys_state
-		*mem_cgroup_css(struct mem_cgroup *memcg)
-{
-	return NULL;
-}
-
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
 		struct mem_cgroup *prev,
@@ -444,7 +737,10 @@ static inline bool memcg_kmem_enabled(void)
 	return static_key_false(&memcg_kmem_enabled_key);
 }
 
-bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+	return memcg->kmem_acct_active;
+}
 
 /*
  * In general, we'll do everything in our power to not incur in any overhead
@@ -463,7 +759,15 @@ void __memcg_kmem_commit_charge(struct page *page,
 				       struct mem_cgroup *memcg, int order);
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
-int memcg_cache_id(struct mem_cgroup *memcg);
+/*
+ * helper for acessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
+{
+	return memcg ? memcg->kmemcg_id : -1;
+}
 
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6282f1eb3d6a..2ce190709280 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -351,7 +351,15 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages);
 extern int kswapd_run(int nid);
 extern void kswapd_stop(int nid);
 #ifdef CONFIG_MEMCG
-extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
+static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+{
+	/* root ? */
+	if (mem_cgroup_disabled() || !memcg->css.parent)
+		return vm_swappiness;
+
+	return memcg->swappiness;
+}
+
 #else
 static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index 43c6abcf06ab..a98c71ea40c5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1042,34 +1042,6 @@ struct proto {
 #endif
 };
 
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
-	/* Currently active and new sockets should be assigned to cgroups */
-	MEMCG_SOCK_ACTIVE,
-	/* It was ever activated; we must disarm static keys on destruction */
-	MEMCG_SOCK_ACTIVATED,
-};
-
-struct cg_proto {
-	struct page_counter	memory_allocated;	/* Current allocated memory. */
-	struct percpu_counter	sockets_allocated;	/* Current number of sockets. */
-	int			memory_pressure;
-	long			sysctl_mem[3];
-	unsigned long		flags;
-	/*
-	 * memcg field is used to find which memcg we belong directly
-	 * Each memcg struct can hold more than one cg_proto, so container_of
-	 * won't really cut.
-	 *
-	 * The elegant solution would be having an inverse function to
-	 * proto_cgroup in struct proto, but that means polluting the structure
-	 * for everybody, instead of just for memcg users.
-	 */
-	struct mem_cgroup	*memcg;
-};
-
 int proto_register(struct proto *prot, int alloc_slab);
 void proto_unregister(struct proto *prot);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9871f13fc35b..6935f77589e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
 	"unevictable",
 };
 
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
-	MEM_CGROUP_TARGET_THRESH,
-	MEM_CGROUP_TARGET_SOFTLIMIT,
-	MEM_CGROUP_TARGET_NUMAINFO,
-	MEM_CGROUP_NTARGETS,
-};
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET	1024
 
-struct mem_cgroup_stat_cpu {
-	long count[MEM_CGROUP_STAT_NSTATS];
-	unsigned long events[MEMCG_NR_EVENTS];
-	unsigned long nr_page_events;
-	unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
-	struct mem_cgroup *position;
-	/* scan generation, increased every round-trip */
-	unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
-	struct lruvec		lruvec;
-	unsigned long		lru_size[NR_LRU_LISTS];
-
-	struct reclaim_iter	iter[DEF_PRIORITY + 1];
-
-	struct rb_node		tree_node;	/* RB tree node */
-	unsigned long		usage_in_excess;/* Set to the value by which */
-						/* the soft limit is exceeded*/
-	bool			on_tree;
-	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
-						/* use container_of	   */
-};
-
-struct mem_cgroup_per_node {
-	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
 
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 
-struct mem_cgroup_threshold {
-	struct eventfd_ctx *eventfd;
-	unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
-	/* An array index points to threshold just below or equal to usage. */
-	int current_threshold;
-	/* Size of entries[] */
-	unsigned int size;
-	/* Array of thresholds */
-	struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
-	/* Primary thresholds array */
-	struct mem_cgroup_threshold_ary *primary;
-	/*
-	 * Spare threshold array.
-	 * This is needed to make mem_cgroup_unregister_event() "never fail".
-	 * It must be able to store at least primary->size - 1 entries.
-	 */
-	struct mem_cgroup_threshold_ary *spare;
-};
-
 /* for OOM */
 struct mem_cgroup_eventfd_list {
 	struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
-	struct cgroup_subsys_state css;
-
-	/* Accounted resources */
-	struct page_counter memory;
-	struct page_counter memsw;
-	struct page_counter kmem;
-
-	/* Normal memory consumption range */
-	unsigned long low;
-	unsigned long high;
-
-	unsigned long soft_limit;
-
-	/* vmpressure notifications */
-	struct vmpressure vmpressure;
-
-	/* css_online() has been completed */
-	int initialized;
-
-	/*
-	 * Should the accounting and control be hierarchical, per subtree?
-	 */
-	bool use_hierarchy;
-
-	/* protected by memcg_oom_lock */
-	bool		oom_lock;
-	int		under_oom;
-
-	int	swappiness;
-	/* OOM-Killer disable */
-	int		oom_kill_disable;
-
-	/* protect arrays of thresholds */
-	struct mutex thresholds_lock;
-
-	/* thresholds for memory usage. RCU-protected */
-	struct mem_cgroup_thresholds thresholds;
-
-	/* thresholds for mem+swap usage. RCU-protected */
-	struct mem_cgroup_thresholds memsw_thresholds;
-
-	/* For oom notifier event fd */
-	struct list_head oom_notify;
-
-	/*
-	 * Should we move charges of a task when a task is moved into this
-	 * mem_cgroup ? And what type of charges should we move ?
-	 */
-	unsigned long move_charge_at_immigrate;
-	/*
-	 * set > 0 if pages under this cgroup are moving to other cgroup.
-	 */
-	atomic_t		moving_account;
-	/* taken only while moving_account > 0 */
-	spinlock_t		move_lock;
-	struct task_struct	*move_lock_task;
-	unsigned long		move_lock_flags;
-	/*
-	 * percpu counter.
-	 */
-	struct mem_cgroup_stat_cpu __percpu *stat;
-	spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-	struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
-        /* Index in the kmem_cache->memcg_params.memcg_caches array */
-	int kmemcg_id;
-	bool kmem_acct_activated;
-	bool kmem_acct_active;
-#endif
-
-	int last_scanned_node;
-#if MAX_NUMNODES > 1
-	nodemask_t	scan_nodes;
-	atomic_t	numainfo_events;
-	atomic_t	numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
-	struct list_head cgwb_list;
-	struct wb_domain cgwb_domain;
-#endif
-
-	/* List of events which userspace want to receive */
-	struct list_head event_list;
-	spinlock_t event_list_lock;
-
-	struct mem_cgroup_per_node *nodeinfo[0];
-	/* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
-	return memcg->kmem_acct_active;
-}
-#endif
-
 /* Stuffs for move charges at task migration. */
 /*
  * Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
-	return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
@@ -593,11 +409,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
 	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
-	return &memcg->css;
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -876,14 +687,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 }
 
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
-	struct mem_cgroup_per_zone *mz;
-
-	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-	return mz->lru_size[lru];
-}
-
 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 						  int nid,
 						  unsigned int lru_mask)
@@ -986,6 +789,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 
 	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 }
+EXPORT_SYMBOL(mem_cgroup_from_task);
 
 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
@@ -1031,7 +835,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-	struct reclaim_iter *uninitialized_var(iter);
+	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 	struct cgroup_subsys_state *css = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *pos = NULL;
@@ -1173,30 +977,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (unlikely(!memcg))
-		goto out;
-
-	switch (idx) {
-	case PGFAULT:
-		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
-		break;
-	case PGMAJFAULT:
-		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
-		break;
-	default:
-		BUG();
-	}
-out:
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
 /**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
@@ -1295,15 +1075,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	VM_BUG_ON((long)(*lru_size) < 0);
 }
 
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
-	if (root == memcg)
-		return true;
-	if (!root->use_hierarchy)
-		return false;
-	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *task_memcg;
@@ -1330,39 +1101,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 	return ret;
 }
 
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-	unsigned long inactive_ratio;
-	unsigned long inactive;
-	unsigned long active;
-	unsigned long gb;
-
-	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
-	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
-	gb = (inactive + active) >> (30 - PAGE_SHIFT);
-	if (gb)
-		inactive_ratio = int_sqrt(10 * gb);
-	else
-		inactive_ratio = 1;
-
-	return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup *memcg;
-
-	if (mem_cgroup_disabled())
-		return true;
-
-	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-	memcg = mz->memcg;
-
-	return !!(memcg->css.flags & CSS_ONLINE);
-}
-
 #define mem_cgroup_from_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
@@ -1394,15 +1132,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 	return margin;
 }
 
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
-	/* root ? */
-	if (mem_cgroup_disabled() || !memcg->css.parent)
-		return vm_swappiness;
-
-	return memcg->swappiness;
-}
-
 /*
  * A routine for checking "mem" is under move_account() or not.
  *
@@ -2067,23 +1796,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-				 enum mem_cgroup_stat_index idx, int val)
-{
-	VM_BUG_ON(!rcu_read_lock_held());
-
-	if (memcg)
-		this_cpu_add(memcg->stat->count[idx], val);
-}
-
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
  * TODO: maybe necessary to use big numbers in big irons.
@@ -2509,16 +2221,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
 	css_put_many(&memcg->css, nr_pages);
 }
 
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
-	return memcg ? memcg->kmemcg_id : -1;
-}
-
 static int memcg_alloc_cache_id(void)
 {
 	int id, size;
@@ -5525,19 +5227,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.early_init = 0,
 };
 
-/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
-		       enum mem_cgroup_events_index idx,
-		       unsigned int nr)
-{
-	this_cpu_add(memcg->stat->events[idx], nr);
-}
-
 /**
  * mem_cgroup_low - check if memory consumption is below the normal range
  * @root: the highest ancestor to consider
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1f4446a90cef..016c814101ed 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p)
 	if (!mem)
 		return -EINVAL;
 
-	css = mem_cgroup_css(mem);
+	css = &mem->css;
 	ino = cgroup_ino(css->cgroup);
 	css_put(css);
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index bde04a699ab6..5ce4faeb16fb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 			     struct kmem_cache *root_cache)
 {
 	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
-	struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+	struct cgroup_subsys_state *css = &memcg->css;
 	struct memcg_cache_array *arr;
 	struct kmem_cache *s = NULL;
 	char *cache_name;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1139039122a..bf23c88621ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
 	if (!memcg)
 		return true;
 #ifdef CONFIG_CGROUP_WRITEBACK
-	if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+	if (memcg->css.cgroup)
 		return true;
 #endif
 	return false;
-- 
cgit v1.2.3-70-g09d2


From fabc3fdde00b54825ba23230aedbf88a735b4e49 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Tue, 8 Sep 2015 15:01:04 -0700
Subject: memcg: get rid of mem_cgroup_root_css for !CONFIG_MEMCG

The only user is cgwb_bdi_init and that one depends on
CONFIG_CGROUP_WRITEBACK which in turn depends on CONFIG_MEMCG so it
doesn't make much sense to definte an empty stub for !CONFIG_MEMCG.
Moreover ERR_PTR(-EINVAL) is ugly and would lead to runtime crashes if
used in unguarded code paths.  Better fail during compilation.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab2f6880e27b..91bbe4ba6233 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -498,8 +498,6 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
-#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
-
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 				     enum mem_cgroup_events_index idx,
 				     unsigned int nr)
-- 
cgit v1.2.3-70-g09d2


From 64219994898c8689c3d57668996f476f8c2d398c Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Tue, 8 Sep 2015 15:01:07 -0700
Subject: memcg: get rid of extern for functions in memcontrol.h

Most of the exported functions in this header are not marked extern so
change the rest to follow the same style.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 91bbe4ba6233..d92b80b63c5c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -306,10 +306,10 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 
-extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
-extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
-extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -344,7 +344,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return match;
 }
 
-extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 
 static inline bool mem_cgroup_disabled(void)
 {
@@ -403,8 +403,8 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 	return inactive * inactive_ratio < active;
 }
 
-extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
-					struct task_struct *p);
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
+				struct task_struct *p);
 
 static inline void mem_cgroup_oom_enable(void)
 {
@@ -719,8 +719,8 @@ static inline void sock_release_memcg(struct sock *sk)
 extern struct static_key memcg_kmem_enabled_key;
 
 extern int memcg_nr_cache_ids;
-extern void memcg_get_cache_ids(void);
-extern void memcg_put_cache_ids(void);
+void memcg_get_cache_ids(void);
+void memcg_put_cache_ids(void);
 
 /*
  * Helper macro to loop through all memcg-specific caches. Callers must still
-- 
cgit v1.2.3-70-g09d2


From bb14c2c75db972a1bf65fd63c8d5a0b41a8f263a Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 8 Sep 2015 15:01:25 -0700
Subject: mm: rename and move get/set_freepage_migratetype

The pair of get/set_freepage_migratetype() functions are used to cache
pageblock migratetype for a page put on a pcplist, so that it does not
have to be retrieved again when the page is put on a free list (e.g.
when pcplists become full).  Historically it was also assumed that the
value is accurate for pages on freelists (as the functions' names
unfortunately suggest), but that cannot be guaranteed without affecting
various allocator fast paths.  It is in fact not needed and all such
uses have been removed.

The last remaining (but pointless) usage related to pages of freelists
is in move_freepages(), which this patch removes.

To prevent further confusion, rename the functions to
get/set_pcppage_migratetype() and expand their description.  Since all
the users are now in mm/page_alloc.c, move the functions there from the
shared header.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: Laura Abbott <lauraa@codeaurora.org>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Seungho Park <seungho1.park@lge.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 12 ------------
 mm/page_alloc.c    | 41 ++++++++++++++++++++++++++++-------------
 2 files changed, 28 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4ec72ef1f04a..bab8ff89da50 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -309,18 +309,6 @@ struct inode;
 #define page_private(page)		((page)->private)
 #define set_page_private(page, v)	((page)->private = (v))
 
-/* It's valid only if the page is free path or free_list */
-static inline void set_freepage_migratetype(struct page *page, int migratetype)
-{
-	page->index = migratetype;
-}
-
-/* It's valid only if the page is free path or free_list */
-static inline int get_freepage_migratetype(struct page *page)
-{
-	return page->index;
-}
-
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a329cfaf634d..252665d553b4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+	return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+	page->index = migratetype;
+}
+
 #ifdef CONFIG_PM_SLEEP
 /*
  * The following functions are used by the suspend/hibernate code to temporarily
@@ -789,7 +807,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			/* must delete as __free_one_page list manipulates */
 			list_del(&page->lru);
 
-			mt = get_freepage_migratetype(page);
+			mt = get_pcppage_migratetype(page);
 			/* MIGRATE_ISOLATE page should not go to pcplists */
 			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
 			/* Pageblock could have been isolated meanwhile */
@@ -956,7 +974,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	local_irq_save(flags);
 	__count_vm_events(PGFREE, 1 << order);
-	set_freepage_migratetype(page, migratetype);
 	free_one_page(page_zone(page), page, pfn, order, migratetype);
 	local_irq_restore(flags);
 }
@@ -1384,7 +1401,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 		rmv_page_order(page);
 		area->nr_free--;
 		expand(zone, page, order, current_order, area, migratetype);
-		set_freepage_migratetype(page, migratetype);
+		set_pcppage_migratetype(page, migratetype);
 		return page;
 	}
 
@@ -1461,7 +1478,6 @@ int move_freepages(struct zone *zone,
 		order = page_order(page);
 		list_move(&page->lru,
 			  &zone->free_area[order].free_list[migratetype]);
-		set_freepage_migratetype(page, migratetype);
 		page += 1 << order;
 		pages_moved += 1 << order;
 	}
@@ -1631,14 +1647,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 		expand(zone, page, order, current_order, area,
 					start_migratetype);
 		/*
-		 * The freepage_migratetype may differ from pageblock's
+		 * The pcppage_migratetype may differ from pageblock's
 		 * migratetype depending on the decisions in
-		 * try_to_steal_freepages(). This is OK as long as it
-		 * does not differ for MIGRATE_CMA pageblocks. For CMA
-		 * we need to make sure unallocated pages flushed from
-		 * pcp lists are returned to the correct freelist.
+		 * find_suitable_fallback(). This is OK as long as it does not
+		 * differ for MIGRATE_CMA pageblocks. Those can be used as
+		 * fallback only via special __rmqueue_cma_fallback() function
 		 */
-		set_freepage_migratetype(page, start_migratetype);
+		set_pcppage_migratetype(page, start_migratetype);
 
 		trace_mm_page_alloc_extfrag(page, order, current_order,
 			start_migratetype, fallback_mt);
@@ -1714,7 +1729,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		else
 			list_add_tail(&page->lru, list);
 		list = &page->lru;
-		if (is_migrate_cma(get_freepage_migratetype(page)))
+		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
 					      -(1 << order));
 	}
@@ -1911,7 +1926,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
-	set_freepage_migratetype(page, migratetype);
+	set_pcppage_migratetype(page, migratetype);
 	local_irq_save(flags);
 	__count_vm_event(PGFREE);
 
@@ -2116,7 +2131,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		if (!page)
 			goto failed;
 		__mod_zone_freepage_state(zone, -(1 << order),
-					  get_freepage_migratetype(page));
+					  get_pcppage_migratetype(page));
 	}
 
 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
-- 
cgit v1.2.3-70-g09d2


From 5e9113731a3ce616e8b5aa128ffc1aeaa4942571 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 8 Sep 2015 15:01:28 -0700
Subject: mm/hugetlb: add cache of descriptors to resv_map for region_add

hugetlbfs is used today by applications that want a high degree of
control over huge page usage.  Often, large hugetlbfs files are used to
map a large number huge pages into the application processes.  The
applications know when page ranges within these large files will no
longer be used, and ideally would like to release them back to the
subpool or global pools for other uses.  The fallocate() system call
provides an interface for preallocation and hole punching within files.
This patch set adds fallocate functionality to hugetlbfs.

fallocate hole punch will want to remove a specific range of pages.
When pages are removed, their associated entries in the region/reserve
map will also be removed.  This will break an assumption in the
region_chg/region_add calling sequence.  If a new region descriptor must
be allocated, it is done as part of the region_chg processing.  In this
way, region_add can not fail because it does not need to attempt an
allocation.

To prepare for fallocate hole punch, create a "cache" of descriptors
that can be used by region_add if necessary.  region_chg will ensure
there are sufficient entries in the cache.  It will be necessary to
track the number of in progress add operations to know a sufficient
number of descriptors reside in the cache.  A new routine region_abort
is added to adjust this in progress count when add operations are
aborted.  vma_abort_reservation is also added for callers creating
reservations with vma_needs_reservation/vma_commit_reservation.

[akpm@linux-foundation.org: fix typo in comment, use more cols]
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |   3 +
 mm/hugetlb.c            | 174 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 155 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d891f949466a..e2d94960b38b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,6 +35,9 @@ struct resv_map {
 	struct kref refs;
 	spinlock_t lock;
 	struct list_head regions;
+	long adds_in_progress;
+	struct list_head region_cache;
+	long region_cache_count;
 };
 extern struct resv_map *resv_map_alloc(void);
 void resv_map_release(struct kref *ref);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51ae41d0fbc0..4e5815ed7a8e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -240,11 +240,14 @@ struct file_region {
 
 /*
  * Add the huge page range represented by [f, t) to the reserve
- * map.  Existing regions will be expanded to accommodate the
- * specified range.  We know only existing regions need to be
- * expanded, because region_add is only called after region_chg
- * with the same range.  If a new file_region structure must
- * be allocated, it is done in region_chg.
+ * map.  In the normal case, existing regions will be expanded
+ * to accommodate the specified range.  Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range.  However, it is possible that region_del
+ * could have been called after region_chg and modifed the map
+ * in such a way that no region exists to be expanded.  In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
  *
  * Return the number of new huge pages added to the map.  This
  * number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
 		if (f <= rg->to)
 			break;
 
+	/*
+	 * If no region exists which can be expanded to include the
+	 * specified range, the list must have been modified by an
+	 * interleving call to region_del().  Pull a region descriptor
+	 * from the cache and use it for this range.
+	 */
+	if (&rg->link == head || t < rg->from) {
+		VM_BUG_ON(resv->region_cache_count <= 0);
+
+		resv->region_cache_count--;
+		nrg = list_first_entry(&resv->region_cache, struct file_region,
+					link);
+		list_del(&nrg->link);
+
+		nrg->from = f;
+		nrg->to = t;
+		list_add(&nrg->link, rg->link.prev);
+
+		add += t - f;
+		goto out_locked;
+	}
+
 	/* Round our left edge to the current segment if it encloses us. */
 	if (f > rg->from)
 		f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
 	add += t - nrg->to;		/* Added to end of region */
 	nrg->to = t;
 
+out_locked:
+	resv->adds_in_progress--;
 	spin_unlock(&resv->lock);
 	VM_BUG_ON(add < 0);
 	return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
  * so that the subsequent region_add call will have all the
  * regions it needs and will not fail.
  *
- * Returns the number of huge pages that need to be added
- * to the existing reservation map for the range [f, t).
- * This number is greater or equal to zero.  -ENOMEM is
- * returned if a new file_region structure is needed and can
- * not be allocated.
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map.  If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t).  This number is greater or equal to
+ * zero.  -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
  */
 static long region_chg(struct resv_map *resv, long f, long t)
 {
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
 
 retry:
 	spin_lock(&resv->lock);
+retry_locked:
+	resv->adds_in_progress++;
+
+	/*
+	 * Check for sufficient descriptors in the cache to accommodate
+	 * the number of in progress add operations.
+	 */
+	if (resv->adds_in_progress > resv->region_cache_count) {
+		struct file_region *trg;
+
+		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+		/* Must drop lock to allocate a new descriptor. */
+		resv->adds_in_progress--;
+		spin_unlock(&resv->lock);
+
+		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+		if (!trg)
+			return -ENOMEM;
+
+		spin_lock(&resv->lock);
+		list_add(&trg->link, &resv->region_cache);
+		resv->region_cache_count++;
+		goto retry_locked;
+	}
+
 	/* Locate the region we are before or in. */
 	list_for_each_entry(rg, head, link)
 		if (f <= rg->to)
@@ -336,6 +391,7 @@ retry:
 	 * size such that we can guarantee to record the reservation. */
 	if (&rg->link == head || t < rg->from) {
 		if (!nrg) {
+			resv->adds_in_progress--;
 			spin_unlock(&resv->lock);
 			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
 			if (!nrg)
@@ -384,6 +440,25 @@ out_nrg:
 	return chg;
 }
 
+/*
+ * Abort the in progress add operation.  The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add.  Operations are sometimes
+ * aborted after the call to region_chg.  In such cases, region_abort
+ * is called to decrement the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine.  They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
+ */
+static void region_abort(struct resv_map *resv, long f, long t)
+{
+	spin_lock(&resv->lock);
+	VM_BUG_ON(!resv->region_cache_count);
+	resv->adds_in_progress--;
+	spin_unlock(&resv->lock);
+}
+
 /*
  * Truncate the reserve map at index 'end'.  Modify/truncate any
  * region which contains end.  Delete any regions past end.
@@ -544,22 +619,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
 struct resv_map *resv_map_alloc(void)
 {
 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
-	if (!resv_map)
+	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+	if (!resv_map || !rg) {
+		kfree(resv_map);
+		kfree(rg);
 		return NULL;
+	}
 
 	kref_init(&resv_map->refs);
 	spin_lock_init(&resv_map->lock);
 	INIT_LIST_HEAD(&resv_map->regions);
 
+	resv_map->adds_in_progress = 0;
+
+	INIT_LIST_HEAD(&resv_map->region_cache);
+	list_add(&rg->link, &resv_map->region_cache);
+	resv_map->region_cache_count = 1;
+
 	return resv_map;
 }
 
 void resv_map_release(struct kref *ref)
 {
 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+	struct list_head *head = &resv_map->region_cache;
+	struct file_region *rg, *trg;
 
 	/* Clear out any active regions before we release the map. */
 	region_truncate(resv_map, 0);
+
+	/* ... and any entries left in the cache */
+	list_for_each_entry_safe(rg, trg, head, link) {
+		list_del(&rg->link);
+		kfree(rg);
+	}
+
+	VM_BUG_ON(resv_map->adds_in_progress);
+
 	kfree(resv_map);
 }
 
@@ -1473,16 +1570,18 @@ static void return_unused_surplus_pages(struct hstate *h,
 	}
 }
 
+
 /*
- * vma_needs_reservation and vma_commit_reservation are used by the huge
- * page allocation routines to manage reservations.
+ * vma_needs_reservation, vma_commit_reservation and vma_abort_reservation
+ * are used by the huge page allocation routines to manage reservations.
  *
  * vma_needs_reservation is called to determine if the huge page at addr
  * within the vma has an associated reservation.  If a reservation is
  * needed, the value 1 is returned.  The caller is then responsible for
  * managing the global reservation and subpool usage counts.  After
  * the huge page has been allocated, vma_commit_reservation is called
- * to add the page to the reservation map.
+ * to add the page to the reservation map.  If the reservation must be
+ * aborted instead of committed, vma_abort_reservation is called.
  *
  * In the normal case, vma_commit_reservation returns the same value
  * as the preceding vma_needs_reservation call.  The only time this
@@ -1490,9 +1589,14 @@ static void return_unused_surplus_pages(struct hstate *h,
  * is the responsibility of the caller to notice the difference and
  * take appropriate action.
  */
+enum vma_resv_mode {
+	VMA_NEEDS_RESV,
+	VMA_COMMIT_RESV,
+	VMA_ABORT_RESV,
+};
 static long __vma_reservation_common(struct hstate *h,
 				struct vm_area_struct *vma, unsigned long addr,
-				bool commit)
+				enum vma_resv_mode mode)
 {
 	struct resv_map *resv;
 	pgoff_t idx;
@@ -1503,10 +1607,20 @@ static long __vma_reservation_common(struct hstate *h,
 		return 1;
 
 	idx = vma_hugecache_offset(h, vma, addr);
-	if (commit)
-		ret = region_add(resv, idx, idx + 1);
-	else
+	switch (mode) {
+	case VMA_NEEDS_RESV:
 		ret = region_chg(resv, idx, idx + 1);
+		break;
+	case VMA_COMMIT_RESV:
+		ret = region_add(resv, idx, idx + 1);
+		break;
+	case VMA_ABORT_RESV:
+		region_abort(resv, idx, idx + 1);
+		ret = 0;
+		break;
+	default:
+		BUG();
+	}
 
 	if (vma->vm_flags & VM_MAYSHARE)
 		return ret;
@@ -1517,13 +1631,19 @@ static long __vma_reservation_common(struct hstate *h,
 static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
 {
-	return __vma_reservation_common(h, vma, addr, false);
+	return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
 }
 
 static long vma_commit_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
 {
-	return __vma_reservation_common(h, vma, addr, true);
+	return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
+}
+
+static void vma_abort_reservation(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	(void)__vma_reservation_common(h, vma, addr, VMA_ABORT_RESV);
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -1549,8 +1669,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (chg < 0)
 		return ERR_PTR(-ENOMEM);
 	if (chg || avoid_reserve)
-		if (hugepage_subpool_get_pages(spool, 1) < 0)
+		if (hugepage_subpool_get_pages(spool, 1) < 0) {
+			vma_abort_reservation(h, vma, addr);
 			return ERR_PTR(-ENOSPC);
+		}
 
 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
 	if (ret)
@@ -1596,6 +1718,7 @@ out_uncharge_cgroup:
 out_subpool_put:
 	if (chg || avoid_reserve)
 		hugepage_subpool_put_pages(spool, 1);
+	vma_abort_reservation(h, vma, addr);
 	return ERR_PTR(-ENOSPC);
 }
 
@@ -3236,11 +3359,14 @@ retry:
 	 * any allocations necessary to record that reservation occur outside
 	 * the spinlock.
 	 */
-	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		if (vma_needs_reservation(h, vma, address) < 0) {
 			ret = VM_FAULT_OOM;
 			goto backout_unlocked;
 		}
+		/* Just decrements count, does not deallocate */
+		vma_abort_reservation(h, vma, address);
+	}
 
 	ptl = huge_pte_lockptr(h, mm, ptep);
 	spin_lock(ptl);
@@ -3387,6 +3513,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
 		}
+		/* Just decrements count, does not deallocate */
+		vma_abort_reservation(h, vma, address);
 
 		if (!(vma->vm_flags & VM_MAYSHARE))
 			pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3726,6 +3854,8 @@ int hugetlb_reserve_pages(struct inode *inode,
 	}
 	return 0;
 out_err:
+	if (!vma || vma->vm_flags & VM_MAYSHARE)
+		region_abort(resv_map, from, to);
 	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		kref_put(&resv_map->refs, resv_map_release);
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From c672c7f29f2fdb73e1f72911bf499675c81fcdbb Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 8 Sep 2015 15:01:35 -0700
Subject: mm/hugetlb: expose hugetlb fault mutex for use by fallocate

hugetlb page faults are currently synchronized by the table of mutexes
(htlb_fault_mutex_table).  fallocate code will need to synchronize with
the page fault code when it allocates or deletes pages.  Expose
interfaces so that fallocate operations can be synchronized with page
faults.  Minor name changes to be more consistent with other global
hugetlb symbols.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  5 +++++
 mm/hugetlb.c            | 20 ++++++++++----------
 2 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e2d94960b38b..530cf6fc24c7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -88,6 +88,11 @@ int dequeue_hwpoisoned_huge_page(struct page *page);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
 void free_huge_page(struct page *page);
+extern struct mutex *hugetlb_fault_mutex_table;
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				struct address_space *mapping,
+				pgoff_t idx, unsigned long address);
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 78e7eded4063..070880fe1ff7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
  * prevent spurious OOMs when the hugepage pool is fully utilized.
  */
 static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -2482,7 +2482,7 @@ static void __exit hugetlb_exit(void)
 	}
 
 	kobject_put(hugepages_kobj);
-	kfree(htlb_fault_mutex_table);
+	kfree(hugetlb_fault_mutex_table);
 }
 module_exit(hugetlb_exit);
 
@@ -2515,12 +2515,12 @@ static int __init hugetlb_init(void)
 #else
 	num_fault_mutexes = 1;
 #endif
-	htlb_fault_mutex_table =
+	hugetlb_fault_mutex_table =
 		kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
-	BUG_ON(!htlb_fault_mutex_table);
+	BUG_ON(!hugetlb_fault_mutex_table);
 
 	for (i = 0; i < num_fault_mutexes; i++)
-		mutex_init(&htlb_fault_mutex_table[i]);
+		mutex_init(&hugetlb_fault_mutex_table[i]);
 	return 0;
 }
 module_init(hugetlb_init);
@@ -3454,7 +3454,7 @@ backout_unlocked:
 }
 
 #ifdef CONFIG_SMP
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 			    struct vm_area_struct *vma,
 			    struct address_space *mapping,
 			    pgoff_t idx, unsigned long address)
@@ -3479,7 +3479,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
  * For uniprocesor systems we always use a single mutex, so just
  * return 0 and avoid the hashing overhead.
  */
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 			    struct vm_area_struct *vma,
 			    struct address_space *mapping,
 			    pgoff_t idx, unsigned long address)
@@ -3527,8 +3527,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
-	mutex_lock(&htlb_fault_mutex_table[hash]);
+	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
@@ -3613,7 +3613,7 @@ out_ptl:
 		put_page(pagecache_page);
 	}
 out_mutex:
-	mutex_unlock(&htlb_fault_mutex_table[hash]);
+	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 	/*
 	 * Generally it's safe to hold refcount during waiting page lock. But
 	 * here we just wait to defer the next page fault to avoid busy loop and
-- 
cgit v1.2.3-70-g09d2


From b5cec28d36f5ee6b4e6f68a0a40aa1e4045d6d99 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 8 Sep 2015 15:01:41 -0700
Subject: hugetlbfs: truncate_hugepages() takes a range of pages

Modify truncate_hugepages() to take a range of pages (start, end)
instead of simply start.  If an end value of LLONG_MAX is passed, the
current "truncate" functionality is maintained.  Existing callers are
modified to pass LLONG_MAX as end of range.  By keying off end ==
LLONG_MAX, the routine behaves differently for truncate and hole punch.
Page removal is now synchronized with page allocation via faults by
using the fault mutex table.  The hole punch case can experience the
rare region_del error and must handle accordingly.

Add the routine hugetlb_fix_reserve_counts to fix up reserve counts in
the case where region_del returns an error.

Since the routine handles more than just the truncate case, it is
renamed to remove_inode_hugepages().  To be consistent, the routine
truncate_huge_page() is renamed remove_huge_page().

Downstream of remove_inode_hugepages(), the routine
hugetlb_unreserve_pages() is also modified to take a range of pages.
hugetlb_unreserve_pages is modified to detect an error from region_del and
pass it back to the caller.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 98 ++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/hugetlb.h |  4 +-
 mm/hugetlb.c            | 40 ++++++++++++++++++--
 3 files changed, 128 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b1e197d38abb..1ef630f81c99 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -293,26 +293,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
 	return -EINVAL;
 }
 
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
 {
 	ClearPageDirty(page);
 	ClearPageUptodate(page);
 	delete_from_page_cache(page);
 }
 
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch.  There are subtle differences in operation for each case.
+
+ * truncation is indicated by end of range being LLONG_MAX
+ *	In this case, we first scan the range and release found pages.
+ *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ *	maps and global counts.
+ * hole punch is indicated if end is not LLONG_MAX
+ *	In the hole punch case we scan the range and release found pages.
+ *	Only when releasing a page is the associated region/reserv map
+ *	deleted.  The region/reserv map for ranges without associated
+ *	pages are not modified.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+				   loff_t lend)
 {
 	struct hstate *h = hstate_inode(inode);
 	struct address_space *mapping = &inode->i_data;
 	const pgoff_t start = lstart >> huge_page_shift(h);
+	const pgoff_t end = lend >> huge_page_shift(h);
+	struct vm_area_struct pseudo_vma;
 	struct pagevec pvec;
 	pgoff_t next;
 	int i, freed = 0;
+	long lookup_nr = PAGEVEC_SIZE;
+	bool truncate_op = (lend == LLONG_MAX);
 
+	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (1) {
-		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next < end) {
+		/*
+		 * Make sure to never grab more pages that we
+		 * might possibly need.
+		 */
+		if (end - next < lookup_nr)
+			lookup_nr = end - next;
+
+		/*
+		 * This pagevec_lookup() may return pages past 'end',
+		 * so we must check for page->index > end.
+		 */
+		if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
 			if (next == start)
 				break;
 			next = start;
@@ -321,26 +356,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
+			u32 hash;
+
+			hash = hugetlb_fault_mutex_hash(h, current->mm,
+							&pseudo_vma,
+							mapping, next, 0);
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 			lock_page(page);
+			if (page->index >= end) {
+				unlock_page(page);
+				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+				next = end;	/* we are done */
+				break;
+			}
+
+			/*
+			 * If page is mapped, it was faulted in after being
+			 * unmapped.  Do nothing in this race case.  In the
+			 * normal case page is not mapped.
+			 */
+			if (!page_mapped(page)) {
+				bool rsv_on_error = !PagePrivate(page);
+				/*
+				 * We must free the huge page and remove
+				 * from page cache (remove_huge_page) BEFORE
+				 * removing the region/reserve map
+				 * (hugetlb_unreserve_pages).  In rare out
+				 * of memory conditions, removal of the
+				 * region/reserve map could fail.  Before
+				 * free'ing the page, note PagePrivate which
+				 * is used in case of error.
+				 */
+				remove_huge_page(page);
+				freed++;
+				if (!truncate_op) {
+					if (unlikely(hugetlb_unreserve_pages(
+							inode, next,
+							next + 1, 1)))
+						hugetlb_fix_reserve_counts(
+							inode, rsv_on_error);
+				}
+			}
+
 			if (page->index > next)
 				next = page->index;
+
 			++next;
-			truncate_huge_page(page);
 			unlock_page(page);
-			freed++;
+
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		}
 		huge_pagevec_release(&pvec);
 	}
-	BUG_ON(!lstart && mapping->nrpages);
-	hugetlb_unreserve_pages(inode, start, freed);
+
+	if (truncate_op)
+		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
 }
 
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
 	struct resv_map *resv_map;
 
-	truncate_hugepages(inode, 0);
+	remove_inode_hugepages(inode, 0, LLONG_MAX);
 	resv_map = (struct resv_map *)inode->i_mapping->private_data;
 	/* root inode doesn't have the resv_map, so we should check it */
 	if (resv_map)
@@ -397,7 +475,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
 	i_mmap_unlock_write(mapping);
-	truncate_hugepages(inode, offset);
+	remove_inode_hugepages(inode, offset, LLONG_MAX);
 	return 0;
 }
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 530cf6fc24c7..35afca1692fb 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -83,11 +83,13 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						vm_flags_t vm_flags);
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+						long freed);
 int dequeue_hwpoisoned_huge_page(struct page *page);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
 void free_huge_page(struct page *page);
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
 extern struct mutex *hugetlb_fault_mutex_table;
 u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 				struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 070880fe1ff7..61c52cd5f77b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -546,6 +546,28 @@ retry:
 	return del;
 }
 
+/*
+ * A rare out of memory error was encountered which prevented removal of
+ * the reserve map region for a page.  The huge page itself was free'ed
+ * and removed from the page cache.  This routine will adjust the subpool
+ * usage count, and the global reserve count if needed.  By incrementing
+ * these counts, the reserve map entry which could not be deleted will
+ * appear as a "reserved" entry instead of simply dangling with incorrect
+ * counts.
+ */
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+{
+	struct hugepage_subpool *spool = subpool_inode(inode);
+	long rsv_adjust;
+
+	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+	if (restore_reserve && rsv_adjust) {
+		struct hstate *h = hstate_inode(inode);
+
+		hugetlb_acct_memory(h, 1);
+	}
+}
+
 /*
  * Count and return the number of huge pages in the reserve map
  * that intersect with the range [f, t).
@@ -3909,7 +3931,8 @@ out_err:
 	return ret;
 }
 
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+								long freed)
 {
 	struct hstate *h = hstate_inode(inode);
 	struct resv_map *resv_map = inode_resv_map(inode);
@@ -3917,8 +3940,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	long gbl_reserve;
 
-	if (resv_map)
-		chg = region_del(resv_map, offset, LONG_MAX);
+	if (resv_map) {
+		chg = region_del(resv_map, start, end);
+		/*
+		 * region_del() can fail in the rare case where a region
+		 * must be split and another region descriptor can not be
+		 * allocated.  If end == LONG_MAX, it will not fail.
+		 */
+		if (chg < 0)
+			return chg;
+	}
+
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
@@ -3929,6 +3961,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	 */
 	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
 	hugetlb_acct_memory(h, -gbl_reserve);
+
+	return 0;
 }
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
-- 
cgit v1.2.3-70-g09d2


From ab76ad540a50191308e5bb6b5e2d9e26c78616d3 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 8 Sep 2015 15:01:50 -0700
Subject: hugetlbfs: New huge_add_to_page_cache helper routine

Currently, there is only a single place where hugetlbfs pages are added
to the page cache.  The new fallocate code be adding a second one, so
break the functionality out into its own helper.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  2 ++
 mm/hugetlb.c            | 27 ++++++++++++++++++---------
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 35afca1692fb..1222fb07a746 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -333,6 +333,8 @@ struct huge_bootmem_page {
 struct page *alloc_huge_page_node(struct hstate *h, int nid);
 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+			pgoff_t idx);
 
 /* arch callback */
 int __init alloc_bootmem_huge_page(struct hstate *h);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 114ad6ce7030..d45eacc5653e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3375,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
 	return page != NULL;
 }
 
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+			   pgoff_t idx)
+{
+	struct inode *inode = mapping->host;
+	struct hstate *h = hstate_inode(inode);
+	int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+
+	if (err)
+		return err;
+	ClearPagePrivate(page);
+
+	spin_lock(&inode->i_lock);
+	inode->i_blocks += blocks_per_huge_page(h);
+	spin_unlock(&inode->i_lock);
+	return 0;
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			   struct address_space *mapping, pgoff_t idx,
 			   unsigned long address, pte_t *ptep, unsigned int flags)
@@ -3422,21 +3439,13 @@ retry:
 		set_page_huge_active(page);
 
 		if (vma->vm_flags & VM_MAYSHARE) {
-			int err;
-			struct inode *inode = mapping->host;
-
-			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+			int err = huge_add_to_page_cache(page, mapping, idx);
 			if (err) {
 				put_page(page);
 				if (err == -EEXIST)
 					goto retry;
 				goto out;
 			}
-			ClearPagePrivate(page);
-
-			spin_lock(&inode->i_lock);
-			inode->i_blocks += blocks_per_huge_page(h);
-			spin_unlock(&inode->i_lock);
 		} else {
 			lock_page(page);
 			if (unlikely(anon_vma_prepare(vma))) {
-- 
cgit v1.2.3-70-g09d2


From 70c3547e36f5c9fbc4caecfeca98f0effa6932c5 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 8 Sep 2015 15:01:54 -0700
Subject: hugetlbfs: add hugetlbfs_fallocate()

This is based on the shmem version, but it has diverged quite a bit.  We
have no swap to worry about, nor the new file sealing.  Add
synchronication via the fault mutex table to coordinate page faults,
fallocate allocation and fallocate hole punch.

What this allows us to do is move physical memory in and out of a
hugetlbfs file without having it mapped.  This also gives us the ability
to support MADV_REMOVE since it is currently implemented using
fallocate().  MADV_REMOVE lets madvise() remove pages from the middle of
a hugetlbfs file, which wasn't possible before.

hugetlbfs fallocate only operates on whole huge pages.

Based on code by Dave Hansen.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 179 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/hugetlb.h |   3 +
 mm/hugetlb.c            |   2 +-
 3 files changed, 182 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1ef630f81c99..316adb968b65 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
 #include <linux/thread_info.h>
 #include <asm/current.h>
 #include <linux/sched.h>		/* remove ASAP */
+#include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
 	{Opt_err,	NULL},
 };
 
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+					struct inode *inode, pgoff_t index)
+{
+	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+							index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+	mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+					struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
 static void huge_pagevec_release(struct pagevec *pvec)
 {
 	int i;
@@ -479,6 +503,158 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	return 0;
 }
 
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct hstate *h = hstate_inode(inode);
+	loff_t hpage_size = huge_page_size(h);
+	loff_t hole_start, hole_end;
+
+	/*
+	 * For hole punch round up the beginning offset of the hole and
+	 * round down the end.
+	 */
+	hole_start = round_up(offset, hpage_size);
+	hole_end = round_down(offset + len, hpage_size);
+
+	if (hole_end > hole_start) {
+		struct address_space *mapping = inode->i_mapping;
+
+		mutex_lock(&inode->i_mutex);
+		i_mmap_lock_write(mapping);
+		if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+			hugetlb_vmdelete_list(&mapping->i_mmap,
+						hole_start >> PAGE_SHIFT,
+						hole_end  >> PAGE_SHIFT);
+		i_mmap_unlock_write(mapping);
+		remove_inode_hugepages(inode, hole_start, hole_end);
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+				loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	struct hstate *h = hstate_inode(inode);
+	struct vm_area_struct pseudo_vma;
+	struct mm_struct *mm = current->mm;
+	loff_t hpage_size = huge_page_size(h);
+	unsigned long hpage_shift = huge_page_shift(h);
+	pgoff_t start, index, end;
+	int error;
+	u32 hash;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return hugetlbfs_punch_hole(inode, offset, len);
+
+	/*
+	 * Default preallocate case.
+	 * For this range, start is rounded down and end is rounded up
+	 * as well as being converted to page offsets.
+	 */
+	start = offset >> hpage_shift;
+	end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+	mutex_lock(&inode->i_mutex);
+
+	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+	error = inode_newsize_ok(inode, offset + len);
+	if (error)
+		goto out;
+
+	/*
+	 * Initialize a pseudo vma as this is required by the huge page
+	 * allocation routines.  If NUMA is configured, use page index
+	 * as input to create an allocation policy.
+	 */
+	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+	pseudo_vma.vm_file = file;
+
+	for (index = start; index < end; index++) {
+		/*
+		 * This is supposed to be the vaddr where the page is being
+		 * faulted in, but we have no vaddr here.
+		 */
+		struct page *page;
+		unsigned long addr;
+		int avoid_reserve = 0;
+
+		cond_resched();
+
+		/*
+		 * fallocate(2) manpage permits EINTR; we may have been
+		 * interrupted because we are using up too much memory.
+		 */
+		if (signal_pending(current)) {
+			error = -EINTR;
+			break;
+		}
+
+		/* Set numa allocation policy based on index */
+		hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+		/* addr is the offset within the file (zero based) */
+		addr = index * hpage_size;
+
+		/* mutex taken here, fault path and hole punch */
+		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+						index, addr);
+		mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+		/* See if already present in mapping to avoid alloc/free */
+		page = find_get_page(mapping, index);
+		if (page) {
+			put_page(page);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+			hugetlb_drop_vma_policy(&pseudo_vma);
+			continue;
+		}
+
+		/* Allocate page and add to page cache */
+		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+		hugetlb_drop_vma_policy(&pseudo_vma);
+		if (IS_ERR(page)) {
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+			error = PTR_ERR(page);
+			goto out;
+		}
+		clear_huge_page(page, addr, pages_per_huge_page(h));
+		__SetPageUptodate(page);
+		error = huge_add_to_page_cache(page, mapping, index);
+		if (unlikely(error)) {
+			put_page(page);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+			goto out;
+		}
+
+		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+		/*
+		 * page_put due to reference from alloc_huge_page()
+		 * unlock_page because locked by add_to_page_cache()
+		 */
+		put_page(page);
+		unlock_page(page);
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+		i_size_write(inode, offset + len);
+	inode->i_ctime = CURRENT_TIME;
+	spin_lock(&inode->i_lock);
+	inode->i_private = NULL;
+	spin_unlock(&inode->i_lock);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+
 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
@@ -790,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
-	.llseek		= default_llseek,
+	.llseek			= default_llseek,
+	.fallocate		= hugetlbfs_fallocate,
 };
 
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1222fb07a746..5e35379f58a5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -330,6 +330,8 @@ struct huge_bootmem_page {
 #endif
 };
 
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+				unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_node(struct hstate *h, int nid);
 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
@@ -483,6 +485,7 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
+#define alloc_huge_page(v, a, r) NULL
 #define alloc_huge_page_node(h, nid) NULL
 #define alloc_huge_page_noerr(v, a, r) NULL
 #define alloc_bootmem_huge_page(h) NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d45eacc5653e..cd1280c487ff 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1727,7 +1727,7 @@ static void vma_end_reservation(struct hstate *h,
 	(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
 }
 
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
 	struct hugepage_subpool *spool = subpool_vma(vma);
-- 
cgit v1.2.3-70-g09d2


From c5c5c9d1008fb15945d0173b3ca75931ef53ae1f Mon Sep 17 00:00:00 2001
From: Tang Chen <tangchen@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 15:02:00 -0700
Subject: mm/memblock.c: make memblock_overlaps_region() return bool.

memblock_overlaps_region() checks if the given memblock region
intersects a region in memblock.  If so, it returns the index of the
intersected region.

But its only caller is memblock_is_region_reserved(), and it returns 0
if false, non-zero if true.

Both of these should return bool.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  2 +-
 mm/memblock.c            | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index cc4b01972060..d312ae3b51fc 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -323,7 +323,7 @@ void memblock_enforce_memory_limit(phys_addr_t memory_limit);
 int memblock_is_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
 int memblock_is_reserved(phys_addr_t addr);
-int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
 
 extern void __memblock_dump_all(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index bde61e8c28c5..08a5126338db 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
 	return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
 }
 
-static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+static bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
 					phys_addr_t base, phys_addr_t size)
 {
 	unsigned long i;
@@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
 			break;
 	}
 
-	return (i < type->cnt) ? i : -1;
+	return i < type->cnt;
 }
 
 /*
@@ -1566,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
  * Check if the region [@base, @base+@size) intersects a reserved memory block.
  *
  * RETURNS:
- * 0 if false, non-zero if true
+ * True if they intersect, false if not.
  */
-int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
 {
 	memblock_cap_size(base, &size);
-	return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+	return memblock_overlaps_region(&memblock.reserved, base, size);
 }
 
 void __init_memblock memblock_trim_memory(phys_addr_t align)
-- 
cgit v1.2.3-70-g09d2


From 95cf82ecc1fcb44df1768162343cc8eb88083b86 Mon Sep 17 00:00:00 2001
From: Tang Chen <tangchen@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 15:02:03 -0700
Subject: mem-hotplug: handle node hole when initializing numa_meminfo.

When parsing SRAT, all memory ranges are added into numa_meminfo.  In
numa_init(), before entering numa_cleanup_meminfo(), all possible memory
ranges are in numa_meminfo.  And numa_cleanup_meminfo() removes all
ranges over max_pfn or empty.

But, this only works if the nodes are continuous.  Let's have a look at
the following example:

We have an SRAT like this:
SRAT: Node 0 PXM 0 [mem 0x00000000-0x5fffffff]
SRAT: Node 0 PXM 0 [mem 0x100000000-0x1ffffffffff]
SRAT: Node 1 PXM 1 [mem 0x20000000000-0x3ffffffffff]
SRAT: Node 4 PXM 2 [mem 0x40000000000-0x5ffffffffff] hotplug
SRAT: Node 5 PXM 3 [mem 0x60000000000-0x7ffffffffff] hotplug
SRAT: Node 2 PXM 4 [mem 0x80000000000-0x9ffffffffff] hotplug
SRAT: Node 3 PXM 5 [mem 0xa0000000000-0xbffffffffff] hotplug
SRAT: Node 6 PXM 6 [mem 0xc0000000000-0xdffffffffff] hotplug
SRAT: Node 7 PXM 7 [mem 0xe0000000000-0xfffffffffff] hotplug

On boot, only node 0,1,2,3 exist.

And the numa_meminfo will look like this:
numa_meminfo.nr_blks = 9
1. on node 0: [0, 60000000]
2. on node 0: [100000000, 20000000000]
3. on node 1: [20000000000, 40000000000]
4. on node 4: [40000000000, 60000000000]
5. on node 5: [60000000000, 80000000000]
6. on node 2: [80000000000, a0000000000]
7. on node 3: [a0000000000, a0800000000]
8. on node 6: [c0000000000, a0800000000]
9. on node 7: [e0000000000, a0800000000]

And numa_cleanup_meminfo() will merge 1 and 2, and remove 8,9 because the
end address is over max_pfn, which is a0800000000.  But 4 and 5 are not
removed because their end addresses are less then max_pfn.  But in fact,
node 4 and 5 don't exist.

In a word, numa_cleanup_meminfo() is not able to handle holes between nodes.

Since memory ranges in node 4 and 5 are in numa_meminfo, in
numa_register_memblks(), node 4 and 5 will be mistakenly set to online.

If you run lscpu, it will show:
NUMA node0 CPU(s):     0-14,128-142
NUMA node1 CPU(s):     15-29,143-157
NUMA node2 CPU(s):
NUMA node3 CPU(s):
NUMA node4 CPU(s):     62-76,190-204
NUMA node5 CPU(s):     78-92,206-220

In this patch, we use memblock_overlaps_region() to check if ranges in
numa_meminfo overlap with ranges in memory_block.  Since memory_block
contains all available memory at boot time, if they overlap, it means the
ranges exist.  If not, then remove them from numa_meminfo.

After this patch, lscpu will show:
NUMA node0 CPU(s):     0-14,128-142
NUMA node1 CPU(s):     15-29,143-157
NUMA node4 CPU(s):     62-76,190-204
NUMA node5 CPU(s):     78-92,206-220

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/numa.c       | 6 ++++--
 include/linux/memblock.h | 2 ++
 mm/memblock.c            | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb58bf92..c3b3f653ed0c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 		bi->start = max(bi->start, low);
 		bi->end = min(bi->end, high);
 
-		/* and there's no empty block */
-		if (bi->start >= bi->end)
+		/* and there's no empty or non-exist block */
+		if (bi->start >= bi->end ||
+		    !memblock_overlaps_region(&memblock.memory,
+			bi->start, bi->end - bi->start))
 			numa_remove_memblk_from(i--, mi);
 	}
 
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index d312ae3b51fc..c518eb589260 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -77,6 +77,8 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
+bool memblock_overlaps_region(struct memblock_type *type,
+			      phys_addr_t base, phys_addr_t size);
 int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
diff --git a/mm/memblock.c b/mm/memblock.c
index 08a5126338db..509255223688 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
 	return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
 }
 
-static bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
+bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
 					phys_addr_t base, phys_addr_t size)
 {
 	unsigned long i;
-- 
cgit v1.2.3-70-g09d2


From c5b4e1b02f2a0c2309ecd58a235a2f5ee4eb0074 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Tue, 8 Sep 2015 15:02:09 -0700
Subject: mm, page_isolation: make set/unset_migratetype_isolate() file-local

Nowaday, set/unset_migratetype_isolate() is defined and used only in
mm/page_isolation, so let's limit the scope within the file.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-isolation.h | 5 -----
 mm/page_isolation.c            | 5 +++--
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 2dc1e1697b45..047d64706f2a 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -65,11 +65,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 			bool skip_hwpoisoned_pages);
 
-/*
- * Internal functions. Changes pageblock's migrate type.
- */
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
-void unset_migratetype_isolate(struct page *page, unsigned migratetype);
 struct page *alloc_migrate_target(struct page *page, unsigned long private,
 				int **resultp);
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 32fdc1df05e5..4568fd58f70a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,7 +9,8 @@
 #include <linux/hugetlb.h>
 #include "internal.h"
 
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
+static int set_migratetype_isolate(struct page *page,
+				bool skip_hwpoisoned_pages)
 {
 	struct zone *zone;
 	unsigned long flags, pfn;
@@ -72,7 +73,7 @@ out:
 	return ret;
 }
 
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 {
 	struct zone *zone;
 	unsigned long flags, nr_pages;
-- 
cgit v1.2.3-70-g09d2


From 64b990d2957cb535fe1c17b9694d5d4f7de69962 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 8 Sep 2015 15:02:15 -0700
Subject: mm: drop __nocast from vm_flags_t definition

__nocast does no good for vm_flags_t. It only produces useless sparse
warnings.

Let's drop it.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c8d0a73d64c4..3d6baa7d4534 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -235,7 +235,7 @@ struct page_frag_cache {
 	bool pfmemalloc;
 };
 
-typedef unsigned long __nocast vm_flags_t;
+typedef unsigned long vm_flags_t;
 
 /*
  * A region containing a mapping of a non-memory backed file under NOMMU
-- 
cgit v1.2.3-70-g09d2


From ad82362b2defd4adad87d8538617b2f51a4bf9c3 Mon Sep 17 00:00:00 2001
From: "Sean O. Stalley" <sean.stalley@intel.com>
Date: Tue, 8 Sep 2015 15:02:27 -0700
Subject: mm: add dma_pool_zalloc() call to DMA API

Add a wrapper function for dma_pool_alloc() to get zeroed memory.

Signed-off-by: Sean O. Stalley <sean.stalley@intel.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Gilles Muller <Gilles.Muller@lip6.fr>
Cc: Nicolas Palix <nicolas.palix@imag.fr>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DMA-API.txt | 7 +++++++
 include/linux/dmapool.h   | 6 ++++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 7eba542eff7c..edccacd4f048 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -104,6 +104,13 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
 from this pool must not cross 4KByte boundaries.
 
 
+	void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
+			      dma_addr_t *handle)
+
+Wraps dma_pool_alloc() and also zeroes the returned memory if the
+allocation attempt succeeded.
+
+
 	void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
 			dma_addr_t *dma_handle);
 
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index e1043f79122f..53ba737505df 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -24,6 +24,12 @@ void dma_pool_destroy(struct dma_pool *pool);
 void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 		     dma_addr_t *handle);
 
+static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
+				    dma_addr_t *handle)
+{
+	return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle);
+}
+
 void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 01a7fd337b2c2af97e9c55bb9406a222a2e209d3 Mon Sep 17 00:00:00 2001
From: "Sean O. Stalley" <sean.stalley@intel.com>
Date: Tue, 8 Sep 2015 15:02:30 -0700
Subject: pci: mm: add pci_pool_zalloc() call

Add a wrapper function for pci_pool_alloc() to get zeroed memory.

Signed-off-by: Sean O. Stalley <sean.stalley@intel.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Gilles Muller <Gilles.Muller@lip6.fr>
Cc: Nicolas Palix <nicolas.palix@imag.fr>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 1a64733c48c7..e90eb22de628 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1227,6 +1227,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
 		dma_pool_create(name, &pdev->dev, size, align, allocation)
 #define	pci_pool_destroy(pool) dma_pool_destroy(pool)
 #define	pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
+#define	pci_pool_zalloc(pool, flags, handle) \
+		dma_pool_zalloc(pool, flags, handle)
 #define	pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
 
 struct msix_entry {
-- 
cgit v1.2.3-70-g09d2


From 94bf4ec84a84d3ab2513b4e681fd3d083328d76d Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Tue, 8 Sep 2015 15:03:15 -0700
Subject: mm/hwpoison: introduce put_hwpoison_page to put refcount for memory
 error handling

Introduce put_hwpoison_page to put refcount for memory error handling.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Suggested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h  |  1 +
 mm/memory-failure.c | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bab8ff89da50..11df1a8ea38b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2169,6 +2169,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
 extern int get_hwpoison_page(struct page *page);
+extern void put_hwpoison_page(struct page *page);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 863544d84a09..5ceb8253e33b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page)
 }
 EXPORT_SYMBOL_GPL(get_hwpoison_page);
 
+/**
+ * put_hwpoison_page() - Put refcount for memory error handling:
+ * @page:	raw error page (hit by memory error)
+ */
+void put_hwpoison_page(struct page *page)
+{
+	struct page *head = compound_head(page);
+
+	if (PageHuge(head)) {
+		put_page(head);
+		return;
+	}
+
+	if (PageTransHuge(head))
+		if (page != head)
+			put_page(head);
+
+	put_page(page);
+}
+EXPORT_SYMBOL_GPL(put_hwpoison_page);
+
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
-- 
cgit v1.2.3-70-g09d2


From 8e30456b6c56029ecbb43b777519175e478adfbf Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Tue, 8 Sep 2015 15:03:24 -0700
Subject: mm/hwpoison: introduce num_poisoned_pages wrappers

num_poisoned_pages counter will be changed outside mm/memory-failure.c
by a subsequent patch, so this patch prepares wrappers to manipulate it.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Tested-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapops.h | 23 +++++++++++++++++++++++
 mm/memory-failure.c     | 30 ++++++++++++++----------------
 2 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cedf3d3c373f..ec04669f2a3b 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -164,6 +164,9 @@ static inline int is_write_migration_entry(swp_entry_t entry)
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
+
+extern atomic_long_t num_poisoned_pages __read_mostly;
+
 /*
  * Support for hardware poisoned pages
  */
@@ -177,6 +180,26 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
 {
 	return swp_type(entry) == SWP_HWPOISON;
 }
+
+static inline void num_poisoned_pages_inc(void)
+{
+	atomic_long_inc(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_dec(void)
+{
+	atomic_long_dec(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_add(long num)
+{
+	atomic_long_add(num, &num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_sub(long num)
+{
+	atomic_long_sub(num, &num_poisoned_pages);
+}
 #else
 
 static inline swp_entry_t make_hwpoison_entry(struct page *page)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5420d3819adf..393ea13b0754 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1121,7 +1121,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		nr_pages = 1 << compound_order(hpage);
 	else /* normal page or thp */
 		nr_pages = 1;
-	atomic_long_add(nr_pages, &num_poisoned_pages);
+	num_poisoned_pages_add(nr_pages);
 
 	/*
 	 * We need/can do nothing about count=0 pages.
@@ -1149,7 +1149,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 			if (PageHWPoison(hpage)) {
 				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
 				    || (p != hpage && TestSetPageHWPoison(hpage))) {
-					atomic_long_sub(nr_pages, &num_poisoned_pages);
+					num_poisoned_pages_sub(nr_pages);
 					unlock_page(hpage);
 					return 0;
 				}
@@ -1173,7 +1173,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 			else
 				pr_err("MCE: %#lx: thp split failed\n", pfn);
 			if (TestClearPageHWPoison(p))
-				atomic_long_sub(nr_pages, &num_poisoned_pages);
+				num_poisoned_pages_sub(nr_pages);
 			put_hwpoison_page(p);
 			return -EBUSY;
 		}
@@ -1233,14 +1233,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 */
 	if (!PageHWPoison(p)) {
 		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
-		atomic_long_sub(nr_pages, &num_poisoned_pages);
+		num_poisoned_pages_sub(nr_pages);
 		unlock_page(hpage);
 		put_hwpoison_page(hpage);
 		return 0;
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &num_poisoned_pages);
+			num_poisoned_pages_sub(nr_pages);
 		unlock_page(hpage);
 		put_hwpoison_page(hpage);
 		return 0;
@@ -1469,7 +1469,7 @@ int unpoison_memory(unsigned long pfn)
 			return 0;
 		}
 		if (TestClearPageHWPoison(p))
-			atomic_long_dec(&num_poisoned_pages);
+			num_poisoned_pages_dec();
 		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
 		return 0;
 	}
@@ -1483,7 +1483,7 @@ int unpoison_memory(unsigned long pfn)
 	 */
 	if (TestClearPageHWPoison(page)) {
 		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_sub(nr_pages, &num_poisoned_pages);
+		num_poisoned_pages_sub(nr_pages);
 		freeit = 1;
 		if (PageHuge(page))
 			clear_page_hwpoison_huge_page(page);
@@ -1619,11 +1619,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		if (PageHuge(page)) {
 			set_page_hwpoison_huge_page(hpage);
 			dequeue_hwpoisoned_huge_page(hpage);
-			atomic_long_add(1 << compound_order(hpage),
-					&num_poisoned_pages);
+			num_poisoned_pages_add(1 << compound_order(hpage));
 		} else {
 			SetPageHWPoison(page);
-			atomic_long_inc(&num_poisoned_pages);
+			num_poisoned_pages_inc();
 		}
 	}
 	return ret;
@@ -1662,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		put_hwpoison_page(page);
 		pr_info("soft_offline: %#lx: invalidated\n", pfn);
 		SetPageHWPoison(page);
-		atomic_long_inc(&num_poisoned_pages);
+		num_poisoned_pages_inc();
 		return 0;
 	}
 
@@ -1683,7 +1682,7 @@ static int __soft_offline_page(struct page *page, int flags)
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		if (!TestSetPageHWPoison(page))
-			atomic_long_inc(&num_poisoned_pages);
+			num_poisoned_pages_dec();
 		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
@@ -1699,7 +1698,7 @@ static int __soft_offline_page(struct page *page, int flags)
 			if (ret > 0)
 				ret = -EIO;
 			if (TestClearPageHWPoison(page))
-				atomic_long_dec(&num_poisoned_pages);
+				num_poisoned_pages_dec();
 		}
 	} else {
 		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1765,11 +1764,10 @@ int soft_offline_page(struct page *page, int flags)
 		if (PageHuge(page)) {
 			set_page_hwpoison_huge_page(hpage);
 			if (!dequeue_hwpoisoned_huge_page(hpage))
-				atomic_long_add(1 << compound_order(hpage),
-					&num_poisoned_pages);
+				num_poisoned_pages_add(1 << compound_order(hpage));
 		} else {
 			if (!TestSetPageHWPoison(page))
-				atomic_long_inc(&num_poisoned_pages);
+				num_poisoned_pages_inc();
 		}
 	}
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From da1b13ccfbebe0b9d69b5d61eff0a675e19e69a5 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Tue, 8 Sep 2015 15:03:27 -0700
Subject: mm/hwpoison: fix race between soft_offline_page and unpoison_memory

Wanpeng Li reported a race between soft_offline_page() and
unpoison_memory(), which causes the following kernel panic:

   BUG: Bad page state in process bash  pfn:97000
   page:ffffea00025c0000 count:0 mapcount:1 mapping:          (null) index:0x7f4fdbe00
   flags: 0x1fffff80080048(uptodate|active|swapbacked)
   page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
   bad because of flags:
   flags: 0x40(active)
   Modules linked in: snd_hda_codec_hdmi i915 rpcsec_gss_krb5 nfsv4 dns_resolver bnep rfcomm nfsd bluetooth auth_rpcgss nfs_acl nfs rfkill lockd grace sunrpc i2c_algo_bit drm_kms_helper snd_hda_codec_realtek snd_hda_codec_generic drm snd_hda_intel fscache snd_hda_codec x86_pkg_temp_thermal coretemp kvm_intel snd_hda_core snd_hwdep kvm snd_pcm snd_seq_dummy snd_seq_oss crct10dif_pclmul snd_seq_midi crc32_pclmul snd_seq_midi_event ghash_clmulni_intel snd_rawmidi aesni_intel lrw gf128mul snd_seq glue_helper ablk_helper snd_seq_device cryptd fuse snd_timer dcdbas serio_raw mei_me parport_pc snd mei ppdev i2c_core video lp soundcore parport lpc_ich shpchp mfd_core ext4 mbcache jbd2 sd_mod e1000e ahci ptp libahci crc32c_intel libata pps_core
   CPU: 3 PID: 2211 Comm: bash Not tainted 4.2.0-rc5-mm1+ #45
   Hardware name: Dell Inc. OptiPlex 7020/0F5C5X, BIOS A03 01/08/2015
   Call Trace:
     dump_stack+0x48/0x5c
     bad_page+0xe6/0x140
     free_pages_prepare+0x2f9/0x320
     ? uncharge_list+0xdd/0x100
     free_hot_cold_page+0x40/0x170
     __put_single_page+0x20/0x30
     put_page+0x25/0x40
     unmap_and_move+0x1a6/0x1f0
     migrate_pages+0x100/0x1d0
     ? kill_procs+0x100/0x100
     ? unlock_page+0x6f/0x90
     __soft_offline_page+0x127/0x2a0
     soft_offline_page+0xa6/0x200

This race is explained like below:

  CPU0                    CPU1

  soft_offline_page
  __soft_offline_page
  TestSetPageHWPoison
                        unpoison_memory
                        PageHWPoison check (true)
                        TestClearPageHWPoison
                        put_page    -> release refcount held by get_hwpoison_page in unpoison_memory
                        put_page    -> release refcount held by isolate_lru_page in __soft_offline_page
  migrate_pages

The second put_page() releases refcount held by isolate_lru_page() which
will lead to unmap_and_move() releases the last refcount of page and w/
mapcount still 1 since try_to_unmap() is not called if there is only one
user map the page.  Anyway, the page refcount and mapcount will still
mess if the page is mapped by multiple users.

This race was introduced by commit 4491f71260 ("mm/memory-failure: set
PageHWPoison before migrate_pages()"), which focuses on preventing the
reuse of successfully migrated page.  Before this commit we prevent the
reuse by changing the migratetype to MIGRATE_ISOLATE during soft
offlining, which has the following problems, so simply reverting the
commit is not a best option:

  1) it doesn't eliminate the reuse completely, because
     set_migratetype_isolate() can fail to set MIGRATE_ISOLATE to the
     target page if the pageblock of the page contains one or more
     unmovable pages (i.e.  has_unmovable_pages() returns true).

  2) the original code changes migratetype to MIGRATE_ISOLATE
     forcibly, and sets it to MIGRATE_MOVABLE forcibly after soft offline,
     regardless of the original migratetype state, which could impact
     other subsystems like memory hotplug or compaction.

This patch moves PageSetHWPoison just after put_page() in
unmap_and_move(), which closes up the reported race window and minimizes
another race window b/w SetPageHWPoison and reallocation (which causes
the reuse of soft-offlined page.) The latter race window still exists
but it's acceptable, because it's rare and effectively the same as
ordinary "containment failure" case even if it happens, so keep the
window open is acceptable.

Fixes: 4491f71260 ("mm/memory-failure: set PageHWPoison before migrate_pages()")
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Wanpeng Li <wanpeng.li@hotmail.com>
Tested-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapops.h | 14 ++++++++++++++
 mm/memory-failure.c     |  4 ----
 mm/migrate.c            |  9 +++++----
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index ec04669f2a3b..5c3a5f3e7eec 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -181,6 +181,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
 	return swp_type(entry) == SWP_HWPOISON;
 }
 
+static inline bool test_set_page_hwpoison(struct page *page)
+{
+	return TestSetPageHWPoison(page);
+}
+
 static inline void num_poisoned_pages_inc(void)
 {
 	atomic_long_inc(&num_poisoned_pages);
@@ -211,6 +216,15 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
 {
 	return 0;
 }
+
+static inline bool test_set_page_hwpoison(struct page *page)
+{
+	return false;
+}
+
+static inline void num_poisoned_pages_inc(void)
+{
+}
 #endif
 
 #if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 393ea13b0754..b0664c23838b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1681,8 +1681,6 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		if (!TestSetPageHWPoison(page))
-			num_poisoned_pages_dec();
 		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
@@ -1697,8 +1695,6 @@ static int __soft_offline_page(struct page *page, int flags)
 				pfn, ret, page->flags);
 			if (ret > 0)
 				ret = -EIO;
-			if (TestClearPageHWPoison(page))
-				num_poisoned_pages_dec();
 		}
 	} else {
 		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
diff --git a/mm/migrate.c b/mm/migrate.c
index 5c08cab5419e..918defbdda0e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -880,8 +880,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	/* Establish migration ptes or remove ptes */
 	if (page_mapped(page)) {
 		try_to_unmap(page,
-			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-			TTU_IGNORE_HWPOISON);
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 		page_was_mapped = 1;
 	}
 
@@ -952,9 +951,11 @@ out:
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
 		/* Soft-offlined page shouldn't go through lru cache list */
-		if (reason == MR_MEMORY_FAILURE)
+		if (reason == MR_MEMORY_FAILURE) {
 			put_page(page);
-		else
+			if (!test_set_page_hwpoison(page))
+				num_poisoned_pages_inc();
+		} else
 			putback_lru_page(page);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 96db800f5d73cd5c49461253d45766e094f0f8c2 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 8 Sep 2015 15:03:50 -0700
Subject: mm: rename alloc_pages_exact_node() to __alloc_pages_node()

alloc_pages_exact_node() was introduced in commit 6484eb3e2a81 ("page
allocator: do not check NUMA node ID when the caller knows the node is
valid") as an optimized variant of alloc_pages_node(), that doesn't
fallback to current node for nid == NUMA_NO_NODE.  Unfortunately the
name of the function can easily suggest that the allocation is
restricted to the given node and fails otherwise.  In truth, the node is
only preferred, unless __GFP_THISNODE is passed among the gfp flags.

The misleading name has lead to mistakes in the past, see for example
commits 5265047ac301 ("mm, thp: really limit transparent hugepage
allocation to local node") and b360edb43f8e ("mm, mempolicy:
migrate_to_node should only migrate to node").

Another issue with the name is that there's a family of
alloc_pages_exact*() functions where 'exact' means exact size (instead
of page order), which leads to more confusion.

To prevent further mistakes, this patch effectively renames
alloc_pages_exact_node() to __alloc_pages_node() to better convey that
it's an optimized variant of alloc_pages_node() not intended for general
usage.  Both functions get described in comments.

It has been also considered to really provide a convenience function for
allocations restricted to a node, but the major opinion seems to be that
__GFP_THISNODE already provides that functionality and we shouldn't
duplicate the API needlessly.  The number of users would be small
anyway.

Existing callers of alloc_pages_exact_node() are simply converted to
call __alloc_pages_node(), with the exception of sba_alloc_coherent()
which open-codes the check for NUMA_NO_NODE, so it is converted to use
alloc_pages_node() instead.  This means it no longer performs some
VM_BUG_ON checks, and since the current check for nid in
alloc_pages_node() uses a 'nid < 0' comparison (which includes
NUMA_NO_NODE), it may hide wrong values which would be previously
exposed.

Both differences will be rectified by the next patch.

To sum up, this patch makes no functional changes, except temporarily
hiding potentially buggy callers.  Restricting the checks in
alloc_pages_node() is left for the next patch which can in turn expose
more existing buggy callers.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Robin Holt <robinmholt@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Gleb Natapov <gleb@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Cliff Whickman <cpw@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/hp/common/sba_iommu.c   |  6 +-----
 arch/ia64/kernel/uncached.c       |  2 +-
 arch/ia64/sn/pci/pci_dma.c        |  2 +-
 arch/powerpc/platforms/cell/ras.c |  2 +-
 arch/x86/kvm/vmx.c                |  2 +-
 drivers/misc/sgi-xp/xpc_uv.c      |  2 +-
 include/linux/gfp.h               | 23 +++++++++++++++--------
 kernel/profile.c                  |  8 ++++----
 mm/filemap.c                      |  2 +-
 mm/huge_memory.c                  |  2 +-
 mm/hugetlb.c                      |  4 ++--
 mm/memory-failure.c               |  2 +-
 mm/mempolicy.c                    |  4 ++--
 mm/migrate.c                      |  4 ++--
 mm/page_alloc.c                   |  2 --
 mm/slab.c                         |  2 +-
 mm/slob.c                         |  4 ++--
 mm/slub.c                         |  2 +-
 18 files changed, 38 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 344387a55406..a6d6190c9d24 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1140,13 +1140,9 @@ sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 
 #ifdef CONFIG_NUMA
 	{
-		int node = ioc->node;
 		struct page *page;
 
-		if (node == NUMA_NO_NODE)
-			node = numa_node_id();
-
-		page = alloc_pages_exact_node(node, flags, get_order(size));
+		page = alloc_pages_node(ioc->node, flags, get_order(size));
 		if (unlikely(!page))
 			return NULL;
 
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 20e8a9b21d75..f3976da36721 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -97,7 +97,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
 
 	/* attempt to allocate a granule's worth of cached memory pages */
 
-	page = alloc_pages_exact_node(nid,
+	page = __alloc_pages_node(nid,
 				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
 				IA64_GRANULE_SHIFT-PAGE_SHIFT);
 	if (!page) {
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index d0853e8e8623..8f59907007cb 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -92,7 +92,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size,
 	 */
 	node = pcibus_to_node(pdev->bus);
 	if (likely(node >=0)) {
-		struct page *p = alloc_pages_exact_node(node,
+		struct page *p = __alloc_pages_node(node,
 						flags, get_order(size));
 
 		if (likely(p))
diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c
index e865d748179b..2d4f60c0119a 100644
--- a/arch/powerpc/platforms/cell/ras.c
+++ b/arch/powerpc/platforms/cell/ras.c
@@ -123,7 +123,7 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order)
 
 	area->nid = nid;
 	area->order = order;
-	area->pages = alloc_pages_exact_node(area->nid,
+	area->pages = __alloc_pages_node(area->nid,
 						GFP_KERNEL|__GFP_THISNODE,
 						area->order);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4a4eec30cc08..148ea2016022 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 	struct page *pages;
 	struct vmcs *vmcs;
 
-	pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
+	pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
 	if (!pages)
 		return NULL;
 	vmcs = page_address(pages);
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index 95c894482fdd..340b44d9e8cf 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -239,7 +239,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
 	mq->mmr_blade = uv_cpu_to_blade_id(cpu);
 
 	nid = cpu_to_node(cpu);
-	page = alloc_pages_exact_node(nid,
+	page = __alloc_pages_node(nid,
 				      GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
 				      pg_order);
 	if (page == NULL) {
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3bd64b115999..d2c142bc872e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -303,20 +303,28 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
 }
 
-static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
-						unsigned int order)
+/*
+ * Allocate pages, preferring the node given as nid. The node must be valid and
+ * online. For more general interface, see alloc_pages_node().
+ */
+static inline struct page *
+__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 {
-	/* Unknown node is current node */
-	if (nid < 0)
-		nid = numa_node_id();
+	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
 
 	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
 }
 
-static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
+/*
+ * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
+ * prefer the current CPU's node.
+ */
+static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
-	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
+	/* Unknown node is current node */
+	if (nid < 0)
+		nid = numa_node_id();
 
 	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
 }
@@ -357,7 +365,6 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
 void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
 void free_pages_exact(void *virt, size_t size);
-/* This is different from alloc_pages_exact_node !!! */
 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 
 #define __get_free_page(gfp_mask) \
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28d6e9f..99513e1160e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
 		node = cpu_to_mem(cpu);
 		per_cpu(cpu_profile_flip, cpu) = 0;
 		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = alloc_pages_exact_node(node,
+			page = __alloc_pages_node(node,
 					GFP_KERNEL | __GFP_ZERO,
 					0);
 			if (!page)
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
 			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
 		}
 		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = alloc_pages_exact_node(node,
+			page = __alloc_pages_node(node,
 					GFP_KERNEL | __GFP_ZERO,
 					0);
 			if (!page)
@@ -543,14 +543,14 @@ static int create_hash_tables(void)
 		int node = cpu_to_mem(cpu);
 		struct page *page;
 
-		page = alloc_pages_exact_node(node,
+		page = __alloc_pages_node(node,
 				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
 				0);
 		if (!page)
 			goto out_cleanup;
 		per_cpu(cpu_profile_hits, cpu)[1]
 				= (struct profile_hit *)page_address(page);
-		page = alloc_pages_exact_node(node,
+		page = __alloc_pages_node(node,
 				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
 				0);
 		if (!page)
diff --git a/mm/filemap.c b/mm/filemap.c
index 30d69c0c5a38..72940fb38666 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
 		do {
 			cpuset_mems_cookie = read_mems_allowed_begin();
 			n = cpuset_mem_spread_node();
-			page = alloc_pages_exact_node(n, gfp, 0);
+			page = __alloc_pages_node(n, gfp, 0);
 		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 
 		return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 71a4822c832b..883f613ada7e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2414,7 +2414,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 	 */
 	up_read(&mm->mmap_sem);
 
-	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
+	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cd1280c487ff..999fb0aef8f1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1331,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
 
-	page = alloc_pages_exact_node(nid,
+	page = __alloc_pages_node(nid,
 		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
 						__GFP_REPEAT|__GFP_NOWARN,
 		huge_page_order(h));
@@ -1483,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 				   __GFP_REPEAT|__GFP_NOWARN,
 				   huge_page_order(h));
 	else
-		page = alloc_pages_exact_node(nid,
+		page = __alloc_pages_node(nid,
 			htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
 			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bba2d7c2c9ce..eeda6485e76c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1521,7 +1521,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
 		return alloc_huge_page_node(page_hstate(compound_head(p)),
 						   nid);
 	else
-		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+		return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d6f2caee28c0..87a177917cb2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -942,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
 		return alloc_huge_page_node(page_hstate(compound_head(page)),
 					node);
 	else
-		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
+		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
 						    __GFP_THISNODE, 0);
 }
 
@@ -1998,7 +1998,7 @@ retry_cpuset:
 		nmask = policy_nodemask(gfp, pol);
 		if (!nmask || node_isset(hpage_node, *nmask)) {
 			mpol_cond_put(pol);
-			page = alloc_pages_exact_node(hpage_node,
+			page = __alloc_pages_node(hpage_node,
 						gfp | __GFP_THISNODE, order);
 			goto out;
 		}
diff --git a/mm/migrate.c b/mm/migrate.c
index 918defbdda0e..02ce25df16c2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1195,7 +1195,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
 		return alloc_huge_page_node(page_hstate(compound_head(p)),
 					pm->node);
 	else
-		return alloc_pages_exact_node(pm->node,
+		return __alloc_pages_node(pm->node,
 				GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
 }
 
@@ -1555,7 +1555,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 	int nid = (int) data;
 	struct page *newpage;
 
-	newpage = alloc_pages_exact_node(nid,
+	newpage = __alloc_pages_node(nid,
 					 (GFP_HIGHUSER_MOVABLE |
 					  __GFP_THISNODE | __GFP_NOMEMALLOC |
 					  __GFP_NORETRY | __GFP_NOWARN) &
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 252665d553b4..bdaa0cf8fd41 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3511,8 +3511,6 @@ EXPORT_SYMBOL(alloc_pages_exact);
  *
  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
  * back.
- * Note this is not alloc_pages_exact_node() which allocates on a specific node,
- * but is not exact.
  */
 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 {
diff --git a/mm/slab.c b/mm/slab.c
index 60c936938b84..c77ebe6cc87c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 	if (memcg_charge_slab(cachep, flags, cachep->gfporder))
 		return NULL;
 
-	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+	page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
 	if (!page) {
 		memcg_uncharge_slab(cachep, cachep->gfporder);
 		slab_out_of_memory(cachep, flags, nodeid);
diff --git a/mm/slob.c b/mm/slob.c
index 165bbd3cd606..0d7e5df74d1f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -45,7 +45,7 @@
  * NUMA support in SLOB is fairly simplistic, pushing most of the real
  * logic down to the page allocator, and simply doing the node accounting
  * on the upper levels. In the event that a node id is explicitly
- * provided, alloc_pages_exact_node() with the specified node id is used
+ * provided, __alloc_pages_node() with the specified node id is used
  * instead. The common case (or when the node id isn't explicitly provided)
  * will default to the current node, as per numa_node_id().
  *
@@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
 
 #ifdef CONFIG_NUMA
 	if (node != NUMA_NO_NODE)
-		page = alloc_pages_exact_node(node, gfp, order);
+		page = __alloc_pages_node(node, gfp, order);
 	else
 #endif
 		page = alloc_pages(gfp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 084184e706c6..f614b5dc396b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
 	if (node == NUMA_NO_NODE)
 		page = alloc_pages(flags, order);
 	else
-		page = alloc_pages_exact_node(node, flags, order);
+		page = __alloc_pages_node(node, flags, order);
 
 	if (!page)
 		memcg_uncharge_slab(s, order);
-- 
cgit v1.2.3-70-g09d2


From 0bc35a970c01c50e3bcc4b5a612787346024e5db Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 8 Sep 2015 15:03:53 -0700
Subject: mm: unify checks in alloc_pages_node() and __alloc_pages_node()

Perform the same debug checks in alloc_pages_node() as are done in
__alloc_pages_node(), by making the former function a wrapper of the
latter one.

In addition to better diagnostics in DEBUG_VM builds for situations
which have been already fatal (e.g.  out-of-bounds node id), there are
two visible changes for potential existing buggy callers of
alloc_pages_node():

- calling alloc_pages_node() with any negative nid (e.g. due to arithmetic
  overflow) was treated as passing NUMA_NO_NODE and fallback to local node was
  applied. This will now be fatal.
- calling alloc_pages_node() with an offline node will now be checked for
  DEBUG_VM builds. Since it's not fatal if the node has been previously online,
  and this patch may expose some existing buggy callers, change the VM_BUG_ON
  in __alloc_pages_node() to VM_WARN_ON.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index d2c142bc872e..4a12cae2fb0c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -310,23 +310,23 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 static inline struct page *
 __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 {
-	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
+	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+	VM_WARN_ON(!node_online(nid));
 
 	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
 }
 
 /*
  * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
- * prefer the current CPU's node.
+ * prefer the current CPU's node. Otherwise node must be valid and online.
  */
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
-	/* Unknown node is current node */
-	if (nid < 0)
+	if (nid == NUMA_NO_NODE)
 		nid = numa_node_id();
 
-	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
+	return __alloc_pages_node(nid, gfp_mask, order);
 }
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.2.3-70-g09d2


From 82c1fc714763b823169958a98196d9be56c63b30 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 8 Sep 2015 15:03:56 -0700
Subject: mm: use numa_mem_id() in alloc_pages_node()

alloc_pages_node() might fail when called with NUMA_NO_NODE and
__GFP_THISNODE on a CPU belonging to a memoryless node.  To make the
local-node fallback more robust and prevent such situations, use
numa_mem_id(), which was introduced for similar scenarios in the slab
context.

Suggested-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4a12cae2fb0c..f92cbd2f4450 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -318,13 +318,14 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 
 /*
  * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
- * prefer the current CPU's node. Otherwise node must be valid and online.
+ * prefer the current CPU's closest node. Otherwise node must be valid and
+ * online.
  */
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
 	if (nid == NUMA_NO_NODE)
-		nid = numa_node_id();
+		nid = numa_mem_id();
 
 	return __alloc_pages_node(nid, gfp_mask, order);
 }
-- 
cgit v1.2.3-70-g09d2


From 7d3f3938236b4bb878214e6791e76fd8409bdeee Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 8 Sep 2015 15:04:35 -0700
Subject: zsmalloc/zram: introduce zs_pool_stats api

`zs_compact_control' accounts the number of migrated objects but it has
a limited lifespan -- we lose it as soon as zs_compaction() returns back
to zram.  It worked fine, because (a) zram had it's own counter of
migrated objects and (b) only zram could trigger compaction.  However,
this does not work for automatic pool compaction (not issued by zram).
To account objects migrated during auto-compaction (issued by the
shrinker) we need to store this number in zs_pool.

Define a new `struct zs_pool_stats' structure to keep zs_pool's stats
there.  It provides only `num_migrated', as of this writing, but it
surely can be extended.

A new zsmalloc zs_pool_stats() symbol exports zs_pool's stats back to
caller.

Use zs_pool_stats() in zram and remove `num_migrated' from zram_stats.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 15 +++++++++------
 drivers/block/zram/zram_drv.h |  1 -
 include/linux/zsmalloc.h      |  6 ++++++
 mm/zsmalloc.c                 | 29 +++++++++++++++--------------
 4 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9c01f5bfa33f..bcde5c321090 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -388,7 +388,6 @@ static ssize_t comp_algorithm_store(struct device *dev,
 static ssize_t compact_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
-	unsigned long nr_migrated;
 	struct zram *zram = dev_to_zram(dev);
 	struct zram_meta *meta;
 
@@ -399,8 +398,7 @@ static ssize_t compact_store(struct device *dev,
 	}
 
 	meta = zram->meta;
-	nr_migrated = zs_compact(meta->mem_pool);
-	atomic64_add(nr_migrated, &zram->stats.num_migrated);
+	zs_compact(meta->mem_pool);
 	up_read(&zram->init_lock);
 
 	return len;
@@ -428,26 +426,31 @@ static ssize_t mm_stat_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct zram *zram = dev_to_zram(dev);
+	struct zs_pool_stats pool_stats;
 	u64 orig_size, mem_used = 0;
 	long max_used;
 	ssize_t ret;
 
+	memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
+
 	down_read(&zram->init_lock);
-	if (init_done(zram))
+	if (init_done(zram)) {
 		mem_used = zs_get_total_pages(zram->meta->mem_pool);
+		zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+	}
 
 	orig_size = atomic64_read(&zram->stats.pages_stored);
 	max_used = atomic_long_read(&zram->stats.max_used_pages);
 
 	ret = scnprintf(buf, PAGE_SIZE,
-			"%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
 			orig_size << PAGE_SHIFT,
 			(u64)atomic64_read(&zram->stats.compr_data_size),
 			mem_used << PAGE_SHIFT,
 			zram->limit_pages << PAGE_SHIFT,
 			max_used << PAGE_SHIFT,
 			(u64)atomic64_read(&zram->stats.zero_pages),
-			(u64)atomic64_read(&zram->stats.num_migrated));
+			pool_stats.num_migrated);
 	up_read(&zram->init_lock);
 
 	return ret;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 6dbe2df506bf..8e92339686d7 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -78,7 +78,6 @@ struct zram_stats {
 	atomic64_t compr_data_size;	/* compressed size of pages stored */
 	atomic64_t num_reads;	/* failed + successful */
 	atomic64_t num_writes;	/* --do-- */
-	atomic64_t num_migrated;	/* no. of migrated object */
 	atomic64_t failed_reads;	/* can happen when memory is too low */
 	atomic64_t failed_writes;	/* can happen when memory is too low */
 	atomic64_t invalid_io;	/* non-page-aligned I/O requests */
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 1338190b5478..ad3d23239043 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -34,6 +34,11 @@ enum zs_mapmode {
 	 */
 };
 
+struct zs_pool_stats {
+	/* How many objects were migrated */
+	unsigned long num_migrated;
+};
+
 struct zs_pool;
 
 struct zs_pool *zs_create_pool(char *name, gfp_t flags);
@@ -49,4 +54,5 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
 unsigned long zs_get_total_pages(struct zs_pool *pool);
 unsigned long zs_compact(struct zs_pool *pool);
 
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
 #endif
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2a1f95249f12..8f76d8875aca 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -245,6 +245,7 @@ struct zs_pool {
 	gfp_t flags;	/* allocation flags used when growing pool */
 	atomic_long_t pages_allocated;
 
+	struct zs_pool_stats stats;
 #ifdef CONFIG_ZSMALLOC_STAT
 	struct dentry *stat_dentry;
 #endif
@@ -1578,7 +1579,7 @@ struct zs_compact_control {
 	 /* Starting object index within @s_page which used for live object
 	  * in the subpage. */
 	int index;
-	/* how many of objects are migrated */
+	/* How many of objects were migrated */
 	int nr_migrated;
 };
 
@@ -1590,7 +1591,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 	struct page *s_page = cc->s_page;
 	struct page *d_page = cc->d_page;
 	unsigned long index = cc->index;
-	int nr_migrated = 0;
 	int ret = 0;
 
 	while (1) {
@@ -1617,13 +1617,12 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 		record_obj(handle, free_obj);
 		unpin_tag(handle);
 		obj_free(pool, class, used_obj);
-		nr_migrated++;
+		cc->nr_migrated++;
 	}
 
 	/* Remember last position in this iteration */
 	cc->s_page = s_page;
 	cc->index = index;
-	cc->nr_migrated = nr_migrated;
 
 	return ret;
 }
@@ -1699,14 +1698,13 @@ static unsigned long zs_can_compact(struct size_class *class)
 	return obj_wasted * get_pages_per_zspage(class->size);
 }
 
-static unsigned long __zs_compact(struct zs_pool *pool,
-				struct size_class *class)
+static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 {
 	struct zs_compact_control cc;
 	struct page *src_page;
 	struct page *dst_page = NULL;
-	unsigned long nr_total_migrated = 0;
 
+	cc.nr_migrated = 0;
 	spin_lock(&class->lock);
 	while ((src_page = isolate_source_page(class))) {
 
@@ -1728,7 +1726,6 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 				break;
 
 			putback_zspage(pool, class, dst_page);
-			nr_total_migrated += cc.nr_migrated;
 		}
 
 		/* Stop if we couldn't find slot */
@@ -1738,7 +1735,6 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 		putback_zspage(pool, class, dst_page);
 		putback_zspage(pool, class, src_page);
 		spin_unlock(&class->lock);
-		nr_total_migrated += cc.nr_migrated;
 		cond_resched();
 		spin_lock(&class->lock);
 	}
@@ -1746,15 +1742,14 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 	if (src_page)
 		putback_zspage(pool, class, src_page);
 
-	spin_unlock(&class->lock);
+	pool->stats.num_migrated += cc.nr_migrated;
 
-	return nr_total_migrated;
+	spin_unlock(&class->lock);
 }
 
 unsigned long zs_compact(struct zs_pool *pool)
 {
 	int i;
-	unsigned long nr_migrated = 0;
 	struct size_class *class;
 
 	for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -1763,13 +1758,19 @@ unsigned long zs_compact(struct zs_pool *pool)
 			continue;
 		if (class->index != i)
 			continue;
-		nr_migrated += __zs_compact(pool, class);
+		__zs_compact(pool, class);
 	}
 
-	return nr_migrated;
+	return pool->stats.num_migrated;
 }
 EXPORT_SYMBOL_GPL(zs_compact);
 
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
+{
+	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
+}
+EXPORT_SYMBOL_GPL(zs_pool_stats);
+
 /**
  * zs_create_pool - Creates an allocation pool to work from.
  * @flags: allocation flags used to allocate pool metadata
-- 
cgit v1.2.3-70-g09d2


From 860c707dca155a56dfa115ddd6c00959296144a6 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 8 Sep 2015 15:04:38 -0700
Subject: zsmalloc: account the number of compacted pages

Compaction returns back to zram the number of migrated objects, which is
quite uninformative -- we have objects of different sizes so user space
cannot obtain any valuable data from that number.  Change compaction to
operate in terms of pages and return back to compaction issuer the
number of pages that were freed during compaction.  So from now on we
will export more meaningful value in zram<id>/mm_stat -- the number of
freed (compacted) pages.

This requires:
 (a) a rename of `num_migrated' to 'pages_compacted'
 (b) a internal API change -- return first_page's fullness_group from
     putback_zspage(), so we know when putback_zspage() did
     free_zspage().  It helps us to account compaction stats correctly.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/blockdev/zram.txt |  3 ++-
 drivers/block/zram/zram_drv.c   |  2 +-
 include/linux/zsmalloc.h        |  4 ++--
 mm/zsmalloc.c                   | 27 +++++++++++++++++----------
 4 files changed, 22 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index c4de576093af..62435bb25266 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -144,7 +144,8 @@ mem_used_max      RW    the maximum amount memory zram have consumed to
                         store compressed data
 mem_limit         RW    the maximum amount of memory ZRAM can use to store
                         the compressed data
-num_migrated      RO    the number of objects migrated migrated by compaction
+pages_compacted   RO    the number of pages freed during compaction
+                        (available only via zram<id>/mm_stat node)
 compact           WO    trigger memory compaction
 
 WARNING
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index bcde5c321090..f1c4bb34e007 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -450,7 +450,7 @@ static ssize_t mm_stat_show(struct device *dev,
 			zram->limit_pages << PAGE_SHIFT,
 			max_used << PAGE_SHIFT,
 			(u64)atomic64_read(&zram->stats.zero_pages),
-			pool_stats.num_migrated);
+			pool_stats.pages_compacted);
 	up_read(&zram->init_lock);
 
 	return ret;
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index ad3d23239043..6398dfae53f1 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -35,8 +35,8 @@ enum zs_mapmode {
 };
 
 struct zs_pool_stats {
-	/* How many objects were migrated */
-	unsigned long num_migrated;
+	/* How many pages were migrated (freed) */
+	unsigned long pages_compacted;
 };
 
 struct zs_pool;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 8f76d8875aca..b7b4a5612ec7 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1579,8 +1579,6 @@ struct zs_compact_control {
 	 /* Starting object index within @s_page which used for live object
 	  * in the subpage. */
 	int index;
-	/* How many of objects were migrated */
-	int nr_migrated;
 };
 
 static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1617,7 +1615,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 		record_obj(handle, free_obj);
 		unpin_tag(handle);
 		obj_free(pool, class, used_obj);
-		cc->nr_migrated++;
 	}
 
 	/* Remember last position in this iteration */
@@ -1643,8 +1640,17 @@ static struct page *isolate_target_page(struct size_class *class)
 	return page;
 }
 
-static void putback_zspage(struct zs_pool *pool, struct size_class *class,
-				struct page *first_page)
+/*
+ * putback_zspage - add @first_page into right class's fullness list
+ * @pool: target pool
+ * @class: destination class
+ * @first_page: target page
+ *
+ * Return @fist_page's fullness_group
+ */
+static enum fullness_group putback_zspage(struct zs_pool *pool,
+			struct size_class *class,
+			struct page *first_page)
 {
 	enum fullness_group fullness;
 
@@ -1662,6 +1668,8 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
 
 		free_zspage(first_page);
 	}
+
+	return fullness;
 }
 
 static struct page *isolate_source_page(struct size_class *class)
@@ -1704,7 +1712,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 	struct page *src_page;
 	struct page *dst_page = NULL;
 
-	cc.nr_migrated = 0;
 	spin_lock(&class->lock);
 	while ((src_page = isolate_source_page(class))) {
 
@@ -1733,7 +1740,9 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 			break;
 
 		putback_zspage(pool, class, dst_page);
-		putback_zspage(pool, class, src_page);
+		if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+			pool->stats.pages_compacted +=
+				get_pages_per_zspage(class->size);
 		spin_unlock(&class->lock);
 		cond_resched();
 		spin_lock(&class->lock);
@@ -1742,8 +1751,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 	if (src_page)
 		putback_zspage(pool, class, src_page);
 
-	pool->stats.num_migrated += cc.nr_migrated;
-
 	spin_unlock(&class->lock);
 }
 
@@ -1761,7 +1768,7 @@ unsigned long zs_compact(struct zs_pool *pool)
 		__zs_compact(pool, class);
 	}
 
-	return pool->stats.num_migrated;
+	return pool->stats.pages_compacted;
 }
 EXPORT_SYMBOL_GPL(zs_compact);
 
-- 
cgit v1.2.3-70-g09d2


From 5b999aadbae65696a148f55250d94b6f3d74071e Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <0x7f454c46@gmail.com>
Date: Tue, 8 Sep 2015 15:05:00 -0700
Subject: mm: swap: zswap: maybe_preload & refactoring

zswap_get_swap_cache_page and read_swap_cache_async have pretty much the
same code with only significant difference in return value and usage of
swap_readpage.

I a helper __read_swap_cache_async() with the common code.  Behavior
change: now zswap_get_swap_cache_page will use radix_tree_maybe_preload
instead radix_tree_preload.  Looks like, this wasn't changed only by the
reason of code duplication.

Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Herrmann <dh.herrmann@gmail.com>
Cc: Seth Jennings <sjennings@variantweb.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  3 +++
 mm/swap_state.c      | 37 ++++++++++++++++++--------
 mm/zswap.c           | 73 +++++-----------------------------------------------
 3 files changed, 35 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2ce190709280..7ba7dccaf0e7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -406,6 +406,9 @@ extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t);
 extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr);
+extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
+			struct vm_area_struct *vma, unsigned long addr,
+			bool *new_page_allocated);
 extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr);
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8bc8e66138da..d504adb7fa5f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 	return page;
 }
 
-/* 
- * Locate a page of swap in physical memory, reserving swap cache space
- * and reading the disk if it is not already cached.
- * A failure return means that either the page allocation failed or that
- * the swap entry is no longer in use.
- */
-struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-			struct vm_area_struct *vma, unsigned long addr)
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+			struct vm_area_struct *vma, unsigned long addr,
+			bool *new_page_allocated)
 {
 	struct page *found_page, *new_page = NULL;
+	struct address_space *swapper_space = swap_address_space(entry);
 	int err;
+	*new_page_allocated = false;
 
 	do {
 		/*
@@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * called after lookup_swap_cache() failed, re-calling
 		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(swap_address_space(entry),
-					entry.val);
+		found_page = find_get_page(swapper_space, entry.val);
 		if (found_page)
 			break;
 
@@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			 * Initiate read into locked page and return.
 			 */
 			lru_cache_add_anon(new_page);
-			swap_readpage(new_page);
+			*new_page_allocated = true;
 			return new_page;
 		}
 		radix_tree_preload_end();
@@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	return found_page;
 }
 
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	bool page_was_allocated;
+	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
+			vma, addr, &page_was_allocated);
+
+	if (page_was_allocated)
+		swap_readpage(retpage);
+
+	return retpage;
+}
+
 static unsigned long swapin_nr_pages(unsigned long offset)
 {
 	static unsigned long prev_offset;
diff --git a/mm/zswap.c b/mm/zswap.c
index 2d5727baed59..09208c7c86f3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -446,75 +446,14 @@ enum zswap_get_swap_ret {
 static int zswap_get_swap_cache_page(swp_entry_t entry,
 				struct page **retpage)
 {
-	struct page *found_page, *new_page = NULL;
-	struct address_space *swapper_space = swap_address_space(entry);
-	int err;
+	bool page_was_allocated;
 
-	*retpage = NULL;
-	do {
-		/*
-		 * First check the swap cache.  Since this is normally
-		 * called after lookup_swap_cache() failed, re-calling
-		 * that would confuse statistics.
-		 */
-		found_page = find_get_page(swapper_space, entry.val);
-		if (found_page)
-			break;
-
-		/*
-		 * Get a new page to read into from swap.
-		 */
-		if (!new_page) {
-			new_page = alloc_page(GFP_KERNEL);
-			if (!new_page)
-				break; /* Out of memory */
-		}
-
-		/*
-		 * call radix_tree_preload() while we can wait.
-		 */
-		err = radix_tree_preload(GFP_KERNEL);
-		if (err)
-			break;
-
-		/*
-		 * Swap entry may have been freed since our caller observed it.
-		 */
-		err = swapcache_prepare(entry);
-		if (err == -EEXIST) { /* seems racy */
-			radix_tree_preload_end();
-			continue;
-		}
-		if (err) { /* swp entry is obsolete ? */
-			radix_tree_preload_end();
-			break;
-		}
-
-		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
-		__set_page_locked(new_page);
-		SetPageSwapBacked(new_page);
-		err = __add_to_swap_cache(new_page, entry);
-		if (likely(!err)) {
-			radix_tree_preload_end();
-			lru_cache_add_anon(new_page);
-			*retpage = new_page;
-			return ZSWAP_SWAPCACHE_NEW;
-		}
-		radix_tree_preload_end();
-		ClearPageSwapBacked(new_page);
-		__clear_page_locked(new_page);
-		/*
-		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
-		 * clear SWAP_HAS_CACHE flag.
-		 */
-		swapcache_free(entry);
-	} while (err != -ENOMEM);
-
-	if (new_page)
-		page_cache_release(new_page);
-	if (!found_page)
+	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
+			NULL, 0, &page_was_allocated);
+	if (page_was_allocated)
+		return ZSWAP_SWAPCACHE_NEW;
+	if (!*retpage)
 		return ZSWAP_SWAPCACHE_FAIL;
-	*retpage = found_page;
 	return ZSWAP_SWAPCACHE_EXIST;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 786727799a85aeabc20cab5ecfb72771bcbd6b85 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Tue, 8 Sep 2015 15:05:03 -0700
Subject: mm: zpool: constify the zpool_ops

The structure zpool_ops is not modified so make the pointer to it a
pointer to const.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Acked-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zpool.h | 4 ++--
 mm/zbud.c             | 4 ++--
 mm/zpool.c            | 4 ++--
 mm/zsmalloc.c         | 3 ++-
 mm/zswap.c            | 2 +-
 5 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index d30eff3d84d5..c924a28d9805 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -37,7 +37,7 @@ enum zpool_mapmode {
 };
 
 struct zpool *zpool_create_pool(char *type, char *name,
-			gfp_t gfp, struct zpool_ops *ops);
+			gfp_t gfp, const struct zpool_ops *ops);
 
 char *zpool_get_type(struct zpool *pool);
 
@@ -81,7 +81,7 @@ struct zpool_driver {
 	atomic_t refcount;
 	struct list_head list;
 
-	void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
+	void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
 			struct zpool *zpool);
 	void (*destroy)(void *pool);
 
diff --git a/mm/zbud.c b/mm/zbud.c
index f3bf6f7627d8..6f8158d64864 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -99,7 +99,7 @@ struct zbud_pool {
 	struct zbud_ops *ops;
 #ifdef CONFIG_ZPOOL
 	struct zpool *zpool;
-	struct zpool_ops *zpool_ops;
+	const struct zpool_ops *zpool_ops;
 #endif
 };
 
@@ -138,7 +138,7 @@ static struct zbud_ops zbud_zpool_ops = {
 };
 
 static void *zbud_zpool_create(char *name, gfp_t gfp,
-			       struct zpool_ops *zpool_ops,
+			       const struct zpool_ops *zpool_ops,
 			       struct zpool *zpool)
 {
 	struct zbud_pool *pool;
diff --git a/mm/zpool.c b/mm/zpool.c
index 722a4f60e90b..951db32b833f 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -22,7 +22,7 @@ struct zpool {
 
 	struct zpool_driver *driver;
 	void *pool;
-	struct zpool_ops *ops;
+	const struct zpool_ops *ops;
 
 	struct list_head list;
 };
@@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
  * Returns: New zpool on success, NULL on failure.
  */
 struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
-		struct zpool_ops *ops)
+		const struct zpool_ops *ops)
 {
 	struct zpool_driver *driver;
 	struct zpool *zpool;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 089120429c18..f135b1b6fcdc 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -311,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
 
 #ifdef CONFIG_ZPOOL
 
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+static void *zs_zpool_create(char *name, gfp_t gfp,
+			     const struct zpool_ops *zpool_ops,
 			     struct zpool *zpool)
 {
 	return zs_create_pool(name, gfp);
diff --git a/mm/zswap.c b/mm/zswap.c
index 09208c7c86f3..48a1d081e2a5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -755,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
 	zswap_trees[type] = NULL;
 }
 
-static struct zpool_ops zswap_zpool_ops = {
+static const struct zpool_ops zswap_zpool_ops = {
 	.evict = zswap_writeback_entry
 };
 
-- 
cgit v1.2.3-70-g09d2


From c83db4f419e7105af38cdcca80cc51213214a2c8 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Tue, 8 Sep 2015 15:05:06 -0700
Subject: mm: zbud: constify the zbud_ops

The structure zbud_ops is not modified so make the pointer to it a
pointer to const.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Acked-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zbud.h | 2 +-
 mm/zbud.c            | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zbud.h b/include/linux/zbud.h
index f9d41a6e361f..e183a0a65ac1 100644
--- a/include/linux/zbud.h
+++ b/include/linux/zbud.h
@@ -9,7 +9,7 @@ struct zbud_ops {
 	int (*evict)(struct zbud_pool *pool, unsigned long handle);
 };
 
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
 void zbud_destroy_pool(struct zbud_pool *pool);
 int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
 	unsigned long *handle);
diff --git a/mm/zbud.c b/mm/zbud.c
index 6f8158d64864..fa48bcdff9d5 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -96,7 +96,7 @@ struct zbud_pool {
 	struct list_head buddied;
 	struct list_head lru;
 	u64 pages_nr;
-	struct zbud_ops *ops;
+	const struct zbud_ops *ops;
 #ifdef CONFIG_ZPOOL
 	struct zpool *zpool;
 	const struct zpool_ops *zpool_ops;
@@ -133,7 +133,7 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
 		return -ENOENT;
 }
 
-static struct zbud_ops zbud_zpool_ops = {
+static const struct zbud_ops zbud_zpool_ops = {
 	.evict =	zbud_zpool_evict
 };
 
@@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
  * Return: pointer to the new zbud pool or NULL if the metadata allocation
  * failed.
  */
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
 {
 	struct zbud_pool *pool;
 	int i;
-- 
cgit v1.2.3-70-g09d2


From 4eafbd15b6c84cd3f6c76022c8a6c27f7cc076e1 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 8 Sep 2015 18:41:01 +0200
Subject: PM / OPP: add dev_pm_opp_get_suspend_opp() helper

Add dev_pm_opp_get_suspend_opp() helper to obtain suspend opp.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp.c | 30 ++++++++++++++++++++++++++++++
 include/linux/pm_opp.h   |  6 ++++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index bb703b5ebaff..3df62dbcec3a 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -340,6 +340,36 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
 
+/**
+ * dev_pm_opp_get_suspend_opp() - Get suspend opp
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns pointer to the suspend opp if it is
+ * defined, otherwise it returns NULL.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. The reason for the same is that the opp pointer which is
+ * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+	struct device_opp *dev_opp;
+	struct dev_pm_opp *opp;
+
+	opp_rcu_lockdep_assert();
+
+	dev_opp = _find_device_opp(dev);
+	if (IS_ERR(dev_opp))
+		opp = NULL;
+	else
+		opp = dev_opp->suspend_opp;
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp);
+
 /**
  * dev_pm_opp_get_opp_count() - Get number of opps available in the opp list
  * @dev:	device for which we do this operation
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index cab7ba55bedb..e817722ee3f0 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -34,6 +34,7 @@ bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
 
 int dev_pm_opp_get_opp_count(struct device *dev);
 unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev);
 
 struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					      unsigned long freq,
@@ -80,6 +81,11 @@ static inline unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
 	return 0;
 }
 
+static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+	return NULL;
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					unsigned long freq, bool available)
 {
-- 
cgit v1.2.3-70-g09d2


From 792aec47d59d951865cc617a97b6e6be53d4b977 Mon Sep 17 00:00:00 2001
From: "Woojung.Huh@microchip.com" <Woojung.Huh@microchip.com>
Date: Wed, 9 Sep 2015 20:49:53 +0000
Subject: add microchip LAN88xx phy driver

Add Microchip LAN88XX phy driver for phylib.

Signed-off-by: Woojung Huh <woojung.huh@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig      |   5 ++
 drivers/net/phy/Makefile     |   1 +
 drivers/net/phy/microchip.c  | 148 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/microchipphy.h |  73 +++++++++++++++++++++
 4 files changed, 227 insertions(+)
 create mode 100644 drivers/net/phy/microchip.c
 create mode 100644 include/linux/microchipphy.h

(limited to 'include/linux')

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index c07030dbe748..c5ad98ace5d0 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -127,6 +127,11 @@ config DP83867_PHY
 	---help---
 	  Currently supports the DP83867 PHY.
 
+config MICROCHIP_PHY
+	tristate "Drivers for Microchip PHYs"
+	help
+	  Supports the LAN88XX PHYs.
+
 config FIXED_PHY
 	tristate "Driver for MDIO Bus/PHY emulation with fixed speed/link PHYs"
 	depends on PHYLIB
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 9bb103358c74..87f079c4b2c7 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_MDIO_BUS_MUX_MMIOREG) += mdio-mux-mmioreg.o
 obj-$(CONFIG_MDIO_SUN4I)	+= mdio-sun4i.o
 obj-$(CONFIG_MDIO_MOXART)	+= mdio-moxart.o
 obj-$(CONFIG_MDIO_BCM_UNIMAC)	+= mdio-bcm-unimac.o
+obj-$(CONFIG_MICROCHIP_PHY)	+= microchip.o
diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c
new file mode 100644
index 000000000000..c0a20ebd083b
--- /dev/null
+++ b/drivers/net/phy/microchip.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2015 Microchip Technology
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mii.h>
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include <linux/microchipphy.h>
+
+#define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
+#define DRIVER_DESC	"Microchip LAN88XX PHY driver"
+
+struct lan88xx_priv {
+	int	chip_id;
+	int	chip_rev;
+	__u32	wolopts;
+};
+
+static int lan88xx_phy_config_intr(struct phy_device *phydev)
+{
+	int rc;
+
+	if (phydev->interrupts == PHY_INTERRUPT_ENABLED) {
+		/* unmask all source and clear them before enable */
+		rc = phy_write(phydev, LAN88XX_INT_MASK, 0x7FFF);
+		rc = phy_read(phydev, LAN88XX_INT_STS);
+		rc = phy_write(phydev, LAN88XX_INT_MASK,
+			       LAN88XX_INT_MASK_MDINTPIN_EN_ |
+			       LAN88XX_INT_MASK_LINK_CHANGE_);
+	} else {
+		rc = phy_write(phydev, LAN88XX_INT_MASK, 0);
+	}
+
+	return rc < 0 ? rc : 0;
+}
+
+static int lan88xx_phy_ack_interrupt(struct phy_device *phydev)
+{
+	int rc = phy_read(phydev, LAN88XX_INT_STS);
+
+	return rc < 0 ? rc : 0;
+}
+
+int lan88xx_suspend(struct phy_device *phydev)
+{
+	struct lan88xx_priv *priv = phydev->priv;
+
+	/* do not power down PHY when WOL is enabled */
+	if (!priv->wolopts)
+		genphy_suspend(phydev);
+
+	return 0;
+}
+
+static int lan88xx_probe(struct phy_device *phydev)
+{
+	struct device *dev = &phydev->dev;
+	struct lan88xx_priv *priv;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->wolopts = 0;
+
+	/* these values can be used to identify internal PHY */
+	priv->chip_id = phy_read_mmd_indirect(phydev, LAN88XX_MMD3_CHIP_ID,
+					      3, phydev->addr);
+	priv->chip_rev = phy_read_mmd_indirect(phydev, LAN88XX_MMD3_CHIP_REV,
+					       3, phydev->addr);
+
+	phydev->priv = priv;
+
+	return 0;
+}
+
+static void lan88xx_remove(struct phy_device *phydev)
+{
+	struct device *dev = &phydev->dev;
+	struct lan88xx_priv *priv = phydev->priv;
+
+	if (priv)
+		devm_kfree(dev, priv);
+}
+
+static int lan88xx_set_wol(struct phy_device *phydev,
+			   struct ethtool_wolinfo *wol)
+{
+	struct lan88xx_priv *priv = phydev->priv;
+
+	priv->wolopts = wol->wolopts;
+
+	return 0;
+}
+
+static struct phy_driver microchip_phy_driver[] = {
+{
+	.phy_id		= 0x0007c130,
+	.phy_id_mask	= 0xfffffff0,
+	.name		= "Microchip LAN88xx",
+
+	.features	= (PHY_GBIT_FEATURES |
+			   SUPPORTED_Pause | SUPPORTED_Asym_Pause),
+	.flags		= PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
+
+	.probe		= lan88xx_probe,
+	.remove		= lan88xx_remove,
+
+	.config_init	= genphy_config_init,
+	.config_aneg	= genphy_config_aneg,
+	.read_status	= genphy_read_status,
+
+	.ack_interrupt	= lan88xx_phy_ack_interrupt,
+	.config_intr	= lan88xx_phy_config_intr,
+
+	.suspend	= lan88xx_suspend,
+	.resume		= genphy_resume,
+	.set_wol	= lan88xx_set_wol,
+
+	.driver		= { .owner = THIS_MODULE, }
+} };
+
+module_phy_driver(microchip_phy_driver);
+
+static struct mdio_device_id __maybe_unused microchip_tbl[] = {
+	{ 0x0007c130, 0xfffffff0 },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(mdio, microchip_tbl);
+
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h
new file mode 100644
index 000000000000..eb492d47f717
--- /dev/null
+++ b/include/linux/microchipphy.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2015 Microchip Technology
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _MICROCHIPPHY_H
+#define _MICROCHIPPHY_H
+
+#define LAN88XX_INT_MASK			(0x19)
+#define LAN88XX_INT_MASK_MDINTPIN_EN_		(0x8000)
+#define LAN88XX_INT_MASK_SPEED_CHANGE_		(0x4000)
+#define LAN88XX_INT_MASK_LINK_CHANGE_		(0x2000)
+#define LAN88XX_INT_MASK_FDX_CHANGE_		(0x1000)
+#define LAN88XX_INT_MASK_AUTONEG_ERR_		(0x0800)
+#define LAN88XX_INT_MASK_AUTONEG_DONE_		(0x0400)
+#define LAN88XX_INT_MASK_POE_DETECT_		(0x0200)
+#define LAN88XX_INT_MASK_SYMBOL_ERR_		(0x0100)
+#define LAN88XX_INT_MASK_FAST_LINK_FAIL_	(0x0080)
+#define LAN88XX_INT_MASK_WOL_EVENT_		(0x0040)
+#define LAN88XX_INT_MASK_EXTENDED_INT_		(0x0020)
+#define LAN88XX_INT_MASK_RESERVED_		(0x0010)
+#define LAN88XX_INT_MASK_FALSE_CARRIER_		(0x0008)
+#define LAN88XX_INT_MASK_LINK_SPEED_DS_		(0x0004)
+#define LAN88XX_INT_MASK_MASTER_SLAVE_DONE_	(0x0002)
+#define LAN88XX_INT_MASK_RX__ER_		(0x0001)
+
+#define LAN88XX_INT_STS				(0x1A)
+#define LAN88XX_INT_STS_INT_ACTIVE_		(0x8000)
+#define LAN88XX_INT_STS_SPEED_CHANGE_		(0x4000)
+#define LAN88XX_INT_STS_LINK_CHANGE_		(0x2000)
+#define LAN88XX_INT_STS_FDX_CHANGE_		(0x1000)
+#define LAN88XX_INT_STS_AUTONEG_ERR_		(0x0800)
+#define LAN88XX_INT_STS_AUTONEG_DONE_		(0x0400)
+#define LAN88XX_INT_STS_POE_DETECT_		(0x0200)
+#define LAN88XX_INT_STS_SYMBOL_ERR_		(0x0100)
+#define LAN88XX_INT_STS_FAST_LINK_FAIL_		(0x0080)
+#define LAN88XX_INT_STS_WOL_EVENT_		(0x0040)
+#define LAN88XX_INT_STS_EXTENDED_INT_		(0x0020)
+#define LAN88XX_INT_STS_RESERVED_		(0x0010)
+#define LAN88XX_INT_STS_FALSE_CARRIER_		(0x0008)
+#define LAN88XX_INT_STS_LINK_SPEED_DS_		(0x0004)
+#define LAN88XX_INT_STS_MASTER_SLAVE_DONE_	(0x0002)
+#define LAN88XX_INT_STS_RX_ER_			(0x0001)
+
+#define LAN88XX_EXT_PAGE_ACCESS			(0x1F)
+#define LAN88XX_EXT_PAGE_SPACE_0		(0x0000)
+#define LAN88XX_EXT_PAGE_SPACE_1		(0x0001)
+#define LAN88XX_EXT_PAGE_SPACE_2		(0x0002)
+
+/* Extended Register Page 1 space */
+#define LAN88XX_EXT_MODE_CTRL			(0x13)
+#define LAN88XX_EXT_MODE_CTRL_MDIX_MASK_	(0x000C)
+#define LAN88XX_EXT_MODE_CTRL_AUTO_MDIX_	(0x0000)
+#define LAN88XX_EXT_MODE_CTRL_MDI_		(0x0008)
+#define LAN88XX_EXT_MODE_CTRL_MDI_X_		(0x000C)
+
+/* MMD 3 Registers */
+#define	LAN88XX_MMD3_CHIP_ID			(32877)
+#define	LAN88XX_MMD3_CHIP_REV			(32878)
+
+#endif /* _MICROCHIPPHY_H */
-- 
cgit v1.2.3-70-g09d2


From 6bb0fef489f667cf701853054f44579754f00a06 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 10 Sep 2015 02:10:57 +0200
Subject: netlink, mmap: fix edge-case leakages in nf queue zero-copy

When netlink mmap on receive side is the consumer of nf queue data,
it can happen that in some edge cases, we write skb shared info into
the user space mmap buffer:

Assume a possible rx ring frame size of only 4096, and the network skb,
which is being zero-copied into the netlink skb, contains page frags
with an overall skb->len larger than the linear part of the netlink
skb.

skb_zerocopy(), which is generic and thus not aware of the fact that
shared info cannot be accessed for such skbs then tries to write and
fill frags, thus leaking kernel data/pointers and in some corner cases
possibly writing out of bounds of the mmap area (when filling the
last slot in the ring buffer this way).

I.e. the ring buffer slot is then of status NL_MMAP_STATUS_VALID, has
an advertised length larger than 4096, where the linear part is visible
at the slot beginning, and the leaked sizeof(struct skb_shared_info)
has been written to the beginning of the next slot (also corrupting
the struct nl_mmap_hdr slot header incl. status etc), since skb->end
points to skb->data + ring->frame_size - NL_MMAP_HDRLEN.

The fix adds and lets __netlink_alloc_skb() take the actual needed
linear room for the network skb + meta data into account. It's completely
irrelevant for non-mmaped netlink sockets, but in case mmap sockets
are used, it can be decided whether the available skb_tailroom() is
really large enough for the buffer, or whether it needs to internally
fallback to a normal alloc_skb().

>From nf queue side, the information whether the destination port is
an mmap RX ring is not really available without extra port-to-socket
lookup, thus it can only be determined in lower layers i.e. when
__netlink_alloc_skb() is called that checks internally for this. I
chose to add the extra ldiff parameter as mmap will then still work:
We have data_len and hlen in nfqnl_build_packet_message(), data_len
is the full length (capped at queue->copy_range) for skb_zerocopy()
and hlen some possible part of data_len that needs to be copied; the
rem_len variable indicates the needed remaining linear mmap space.

The only other workaround in nf queue internally would be after
allocation time by f.e. cap'ing the data_len to the skb_tailroom()
iff we deal with an mmap skb, but that would 1) expose the fact that
we use a mmap skb to upper layers, and 2) trim the skb where we
otherwise could just have moved the full skb into the normal receive
queue.

After the patch, in my test case the ring slot doesn't fit and therefore
shows NL_MMAP_STATUS_COPY, where a full skb carries all the data and
thus needs to be picked up via recv().

Fixes: 3ab1f683bf8b ("nfnetlink: add support for memory mapped netlink")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h              | 13 +++++++++++--
 net/netfilter/nfnetlink_queue_core.c |  5 +++--
 net/netlink/af_netlink.c             | 18 ++++++++++++------
 3 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 9120edb650a0..639e9b8b0e4d 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -68,8 +68,17 @@ extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
-extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
-					 u32 dst_portid, gfp_t gfp_mask);
+
+extern struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
+					   unsigned int ldiff, u32 dst_portid,
+					   gfp_t gfp_mask);
+static inline struct sk_buff *
+netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid,
+		  gfp_t gfp_mask)
+{
+	return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask);
+}
+
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
 			     __u32 group, gfp_t allocation);
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 685cc6a17163..a5cd6d90b78b 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 			   __be32 **packet_id_ptr)
 {
 	size_t size;
-	size_t data_len = 0, cap_len = 0;
+	size_t data_len = 0, cap_len = 0, rem_len = 0;
 	unsigned int hlen = 0;
 	struct sk_buff *skb;
 	struct nlattr *nla;
@@ -360,6 +360,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 		hlen = min_t(unsigned int, hlen, data_len);
 		size += sizeof(struct nlattr) + hlen;
 		cap_len = entskb->len;
+		rem_len = data_len - hlen;
 		break;
 	}
 
@@ -377,7 +378,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 			size += nla_total_size(seclen);
 	}
 
-	skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,
+	skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid,
 				  GFP_ATOMIC);
 	if (!skb) {
 		skb_tx_error(entskb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 173817a5dfad..7f86d3b55060 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1844,15 +1844,16 @@ retry:
 }
 EXPORT_SYMBOL(netlink_unicast);
 
-struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
-				  u32 dst_portid, gfp_t gfp_mask)
+struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
+				    unsigned int ldiff, u32 dst_portid,
+				    gfp_t gfp_mask)
 {
 #ifdef CONFIG_NETLINK_MMAP
+	unsigned int maxlen, linear_size;
 	struct sock *sk = NULL;
 	struct sk_buff *skb;
 	struct netlink_ring *ring;
 	struct nl_mmap_hdr *hdr;
-	unsigned int maxlen;
 
 	sk = netlink_getsockbyportid(ssk, dst_portid);
 	if (IS_ERR(sk))
@@ -1863,7 +1864,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
 	if (ring->pg_vec == NULL)
 		goto out_put;
 
-	if (ring->frame_size - NL_MMAP_HDRLEN < size)
+	/* We need to account the full linear size needed as a ring
+	 * slot cannot have non-linear parts.
+	 */
+	linear_size = size + ldiff;
+	if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
 		goto out_put;
 
 	skb = alloc_skb_head(gfp_mask);
@@ -1877,13 +1882,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
 
 	/* check again under lock */
 	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-	if (maxlen < size)
+	if (maxlen < linear_size)
 		goto out_free;
 
 	netlink_forward_ring(ring);
 	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
 	if (hdr == NULL)
 		goto err2;
+
 	netlink_ring_setup_skb(skb, sk, ring, hdr);
 	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
 	atomic_inc(&ring->pending);
@@ -1909,7 +1915,7 @@ out:
 #endif
 	return alloc_skb(size, gfp_mask);
 }
-EXPORT_SYMBOL_GPL(netlink_alloc_skb);
+EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
 
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
-- 
cgit v1.2.3-70-g09d2


From 3f0e131221eb951c45c93d1cce9db73889be2a5e Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 9 Sep 2015 15:35:16 -0700
Subject: zpool: add zpool_has_pool()

This series makes creation of the zpool and compressor dynamic, so that
they can be changed at runtime.  This makes using/configuring zswap
easier, as before this zswap had to be configured at boot time, using boot
params.

This uses a single list to track both the zpool and compressor together,
although Seth had mentioned an alternative which is to track the zpools
and compressors using separate lists.  In the most common case, only a
single zpool and single compressor, using one list is slightly simpler
than using two lists, and for the uncommon case of multiple zpools and/or
compressors, using one list is slightly less simple (and uses slightly
more memory, probably) than using two lists.

This patch (of 4):

Add zpool_has_pool() function, indicating if the specified type of zpool
is available (i.e.  zsmalloc or zbud).  This allows checking if a pool is
available, without actually trying to allocate it, similar to
crypto_has_alg().

This is used by a following patch to zswap that enables the dynamic
runtime creation of zswap zpools.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Seth Jennings <sjennings@variantweb.net>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zpool.h |  2 ++
 mm/zpool.c            | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index c924a28d9805..42f8ec992452 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -36,6 +36,8 @@ enum zpool_mapmode {
 	ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
 };
 
+bool zpool_has_pool(char *type);
+
 struct zpool *zpool_create_pool(char *type, char *name,
 			gfp_t gfp, const struct zpool_ops *ops);
 
diff --git a/mm/zpool.c b/mm/zpool.c
index 68d2dd8ed2d8..8f670d3e8706 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -99,6 +99,39 @@ static void zpool_put_driver(struct zpool_driver *driver)
 	module_put(driver->owner);
 }
 
+/**
+ * zpool_has_pool() - Check if the pool driver is available
+ * @type	The type of the zpool to check (e.g. zbud, zsmalloc)
+ *
+ * This checks if the @type pool driver is available.  This will try to load
+ * the requested module, if needed, but there is no guarantee the module will
+ * still be loaded and available immediately after calling.  If this returns
+ * true, the caller should assume the pool is available, but must be prepared
+ * to handle the @zpool_create_pool() returning failure.  However if this
+ * returns false, the caller should assume the requested pool type is not
+ * available; either the requested pool type module does not exist, or could
+ * not be loaded, and calling @zpool_create_pool() with the pool type will
+ * fail.
+ *
+ * Returns: true if @type pool is available, false if not
+ */
+bool zpool_has_pool(char *type)
+{
+	struct zpool_driver *driver = zpool_get_driver(type);
+
+	if (!driver) {
+		request_module("zpool-%s", type);
+		driver = zpool_get_driver(type);
+	}
+
+	if (!driver)
+		return false;
+
+	zpool_put_driver(driver);
+	return true;
+}
+EXPORT_SYMBOL(zpool_has_pool);
+
 /**
  * zpool_create_pool() - Create a new zpool
  * @type	The type of the zpool to create (e.g. zbud, zsmalloc)
-- 
cgit v1.2.3-70-g09d2


From 2fc045247089ad4ed611ec20cc3a736c0212bf1a Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 9 Sep 2015 15:35:28 -0700
Subject: memcg: add page_cgroup_ino helper

This patchset introduces a new user API for tracking user memory pages
that have not been used for a given period of time.  The purpose of this
is to provide the userspace with the means of tracking a workload's
working set, i.e.  the set of pages that are actively used by the
workload.  Knowing the working set size can be useful for partitioning the
system more efficiently, e.g.  by tuning memory cgroup limits
appropriately, or for job placement within a compute cluster.

==== USE CASES ====

The unified cgroup hierarchy has memory.low and memory.high knobs, which
are defined as the low and high boundaries for the workload working set
size.  However, the working set size of a workload may be unknown or
change in time.  With this patch set, one can periodically estimate the
amount of memory unused by each cgroup and tune their memory.low and
memory.high parameters accordingly, therefore optimizing the overall
memory utilization.

Another use case is balancing workloads within a compute cluster.  Knowing
how much memory is not really used by a workload unit may help take a more
optimal decision when considering migrating the unit to another node
within the cluster.

Also, as noted by Minchan, this would be useful for per-process reclaim
(https://lwn.net/Articles/545668/). With idle tracking, we could reclaim idle
pages only by smart user memory manager.

==== USER API ====

The user API consists of two new files:

 * /sys/kernel/mm/page_idle/bitmap.  This file implements a bitmap where each
   bit corresponds to a page, indexed by PFN. When the bit is set, the
   corresponding page is idle. A page is considered idle if it has not been
   accessed since it was marked idle. To mark a page idle one should set the
   bit corresponding to the page by writing to the file. A value written to the
   file is OR-ed with the current bitmap value. Only user memory pages can be
   marked idle, for other page types input is silently ignored. Writing to this
   file beyond max PFN results in the ENXIO error. Only available when
   CONFIG_IDLE_PAGE_TRACKING is set.

   This file can be used to estimate the amount of pages that are not
   used by a particular workload as follows:

   1. mark all pages of interest idle by setting corresponding bits in the
      /sys/kernel/mm/page_idle/bitmap
   2. wait until the workload accesses its working set
   3. read /sys/kernel/mm/page_idle/bitmap and count the number of bits set

 * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
   memory cgroup each page is charged to, indexed by PFN. Only available when
   CONFIG_MEMCG is set.

   This file can be used to find all pages (including unmapped file pages)
   accounted to a particular cgroup. Using /sys/kernel/mm/page_idle/bitmap, one
   can then estimate the cgroup working set size.

For an example of using these files for estimating the amount of unused
memory pages per each memory cgroup, please see the script attached
below.

==== REASONING ====

The reason to introduce the new user API instead of using
/proc/PID/{clear_refs,smaps} is that the latter has two serious
drawbacks:

 - it does not count unmapped file pages
 - it affects the reclaimer logic

The new API attempts to overcome them both. For more details on how it
is achieved, please see the comment to patch 6.

==== PATCHSET STRUCTURE ====

The patch set is organized as follows:

 - patch 1 adds page_cgroup_ino() helper for the sake of
   /proc/kpagecgroup and patches 2-3 do related cleanup
 - patch 4 adds /proc/kpagecgroup, which reports cgroup ino each page is
   charged to
 - patch 5 introduces a new mmu notifier callback, clear_young, which is
   a lightweight version of clear_flush_young; it is used in patch 6
 - patch 6 implements the idle page tracking feature, including the
   userspace API, /sys/kernel/mm/page_idle/bitmap
 - patch 7 exports idle flag via /proc/kpageflags

==== SIMILAR WORKS ====

Originally, the patch for tracking idle memory was proposed back in 2011
by Michel Lespinasse (see http://lwn.net/Articles/459269/).  The main
difference between Michel's patch and this one is that Michel implemented
a kernel space daemon for estimating idle memory size per cgroup while
this patch only provides the userspace with the minimal API for doing the
job, leaving the rest up to the userspace.  However, they both share the
same idea of Idle/Young page flags to avoid affecting the reclaimer logic.

==== PERFORMANCE EVALUATION ====

SPECjvm2008 (https://www.spec.org/jvm2008/) was used to evaluate the
performance impact introduced by this patch set.  Three runs were carried
out:

 - base: kernel without the patch
 - patched: patched kernel, the feature is not used
 - patched-active: patched kernel, 1 minute-period daemon is used for
   tracking idle memory

For tracking idle memory, idlememstat utility was used:
https://github.com/locker/idlememstat

testcase            base            patched        patched-active

compiler       537.40 ( 0.00)%   532.26 (-0.96)%   538.31 ( 0.17)%
compress       305.47 ( 0.00)%   301.08 (-1.44)%   300.71 (-1.56)%
crypto         284.32 ( 0.00)%   282.21 (-0.74)%   284.87 ( 0.19)%
derby          411.05 ( 0.00)%   413.44 ( 0.58)%   412.07 ( 0.25)%
mpegaudio      189.96 ( 0.00)%   190.87 ( 0.48)%   189.42 (-0.28)%
scimark.large   46.85 ( 0.00)%    46.41 (-0.94)%    47.83 ( 2.09)%
scimark.small  412.91 ( 0.00)%   415.41 ( 0.61)%   421.17 ( 2.00)%
serial         204.23 ( 0.00)%   213.46 ( 4.52)%   203.17 (-0.52)%
startup         36.76 ( 0.00)%    35.49 (-3.45)%    35.64 (-3.05)%
sunflow        115.34 ( 0.00)%   115.08 (-0.23)%   117.37 ( 1.76)%
xml            620.55 ( 0.00)%   619.95 (-0.10)%   620.39 (-0.03)%

composite      211.50 ( 0.00)%   211.15 (-0.17)%   211.67 ( 0.08)%

time idlememstat:

17.20user 65.16system 2:15:23elapsed 1%CPU (0avgtext+0avgdata 8476maxresident)k
448inputs+40outputs (1major+36052minor)pagefaults 0swaps

==== SCRIPT FOR COUNTING IDLE PAGES PER CGROUP ====
#! /usr/bin/python
#

import os
import stat
import errno
import struct

CGROUP_MOUNT = "/sys/fs/cgroup/memory"
BUFSIZE = 8 * 1024  # must be multiple of 8

def get_hugepage_size():
    with open("/proc/meminfo", "r") as f:
        for s in f:
            k, v = s.split(":")
            if k == "Hugepagesize":
                return int(v.split()[0]) * 1024

PAGE_SIZE = os.sysconf("SC_PAGE_SIZE")
HUGEPAGE_SIZE = get_hugepage_size()

def set_idle():
    f = open("/sys/kernel/mm/page_idle/bitmap", "wb", BUFSIZE)
    while True:
        try:
            f.write(struct.pack("Q", pow(2, 64) - 1))
        except IOError as err:
            if err.errno == errno.ENXIO:
                break
            raise
    f.close()

def count_idle():
    f_flags = open("/proc/kpageflags", "rb", BUFSIZE)
    f_cgroup = open("/proc/kpagecgroup", "rb", BUFSIZE)

    with open("/sys/kernel/mm/page_idle/bitmap", "rb", BUFSIZE) as f:
        while f.read(BUFSIZE): pass  # update idle flag

    idlememsz = {}
    while True:
        s1, s2 = f_flags.read(8), f_cgroup.read(8)
        if not s1 or not s2:
            break

        flags, = struct.unpack('Q', s1)
        cgino, = struct.unpack('Q', s2)

        unevictable = (flags >> 18) & 1
        huge = (flags >> 22) & 1
        idle = (flags >> 25) & 1

        if idle and not unevictable:
            idlememsz[cgino] = idlememsz.get(cgino, 0) + \
                (HUGEPAGE_SIZE if huge else PAGE_SIZE)

    f_flags.close()
    f_cgroup.close()
    return idlememsz

if __name__ == "__main__":
    print "Setting the idle flag for each page..."
    set_idle()

    raw_input("Wait until the workload accesses its working set, "
              "then press Enter")

    print "Counting idle pages..."
    idlememsz = count_idle()

    for dir, subdirs, files in os.walk(CGROUP_MOUNT):
        ino = os.stat(dir)[stat.ST_INO]
        print dir + ": " + str(idlememsz.get(ino, 0) / 1024) + " kB"
==== END SCRIPT ====

This patch (of 8):

Add page_cgroup_ino() helper to memcg.

This function returns the inode number of the closest online ancestor of
the memory cgroup a page is charged to.  It is required for exporting
information about which page is charged to which cgroup to userspace,
which will be introduced by a following patch.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  1 +
 mm/memcontrol.c            | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d92b80b63c5c..f56c818e56bc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -345,6 +345,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+ino_t page_cgroup_ino(struct page *page);
 
 static inline bool mem_cgroup_disabled(void)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1742a2db89c7..01009726d412 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -441,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 	return &memcg->css;
 }
 
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+	struct mem_cgroup *memcg;
+	unsigned long ino = 0;
+
+	rcu_read_lock();
+	memcg = READ_ONCE(page->mem_cgroup);
+	while (memcg && !(memcg->css.flags & CSS_ONLINE))
+		memcg = parent_mem_cgroup(memcg);
+	if (memcg)
+		ino = cgroup_ino(memcg->css.cgroup);
+	rcu_read_unlock();
+	return ino;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
-- 
cgit v1.2.3-70-g09d2


From e993d905c81e2c0f669f2f8e8327df86738baebe Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 9 Sep 2015 15:35:35 -0700
Subject: memcg: zap try_get_mem_cgroup_from_page

It is only used in mem_cgroup_try_charge, so fold it in and zap it.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  9 +--------
 mm/memcontrol.c            | 48 ++++++++++++----------------------------------
 2 files changed, 13 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f56c818e56bc..ad800e62cb7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -305,11 +305,9 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
-
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-
 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -556,11 +554,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
 	return &zone->lruvec;
 }
 
-static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-	return NULL;
-}
-
 static inline bool mm_match_cgroup(struct mm_struct *mm,
 		struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 01009726d412..6ddaeba34e09 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2099,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 	css_put_many(&memcg->css, nr_pages);
 }
 
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges.  If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-	struct mem_cgroup *memcg;
-	unsigned short id;
-	swp_entry_t ent;
-
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-
-	memcg = page->mem_cgroup;
-	if (memcg) {
-		if (!css_tryget_online(&memcg->css))
-			memcg = NULL;
-	} else if (PageSwapCache(page)) {
-		ent.val = page_private(page);
-		id = lookup_swap_cgroup_id(ent);
-		rcu_read_lock();
-		memcg = mem_cgroup_from_id(id);
-		if (memcg && !css_tryget_online(&memcg->css))
-			memcg = NULL;
-		rcu_read_unlock();
-	}
-	return memcg;
-}
-
 static void lock_page_lru(struct page *page, int *isolated)
 {
 	struct zone *zone = page_zone(page);
@@ -5329,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		 * the page lock, which serializes swap cache removal, which
 		 * in turn serializes uncharging.
 		 */
+		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		if (page->mem_cgroup)
 			goto out;
+
+		if (do_swap_account) {
+			swp_entry_t ent = { .val = page_private(page), };
+			unsigned short id = lookup_swap_cgroup_id(ent);
+
+			rcu_read_lock();
+			memcg = mem_cgroup_from_id(id);
+			if (memcg && !css_tryget_online(&memcg->css))
+				memcg = NULL;
+			rcu_read_unlock();
+		}
 	}
 
 	if (PageTransHuge(page)) {
@@ -5338,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 	}
 
-	if (do_swap_account && PageSwapCache(page))
-		memcg = try_get_mem_cgroup_from_page(page);
 	if (!memcg)
 		memcg = get_mem_cgroup_from_mm(mm);
 
-- 
cgit v1.2.3-70-g09d2


From 1d7715c676a1566c2e4c3e77d16b1f9bb4909025 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 9 Sep 2015 15:35:41 -0700
Subject: mmu-notifier: add clear_young callback

In the scope of the idle memory tracking feature, which is introduced by
the following patch, we need to clear the referenced/accessed bit not only
in primary, but also in secondary ptes.  The latter is required in order
to estimate wss of KVM VMs.  At the same time we want to avoid flushing
tlb, because it is quite expensive and it won't really affect the final
result.

Currently, there is no function for clearing pte young bit that would meet
our requirements, so this patch introduces one.  To achieve that we have
to add a new mmu-notifier callback, clear_young, since there is no method
for testing-and-clearing a secondary pte w/o flushing tlb.  The new method
is not mandatory and currently only implemented by KVM.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 mm/mmu_notifier.c            | 17 +++++++++++++++++
 virt/kvm/kvm_main.c          | 31 +++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 61cd67f4d788..a5b17137c683 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -65,6 +65,16 @@ struct mmu_notifier_ops {
 				 unsigned long start,
 				 unsigned long end);
 
+	/*
+	 * clear_young is a lightweight version of clear_flush_young. Like the
+	 * latter, it is supposed to test-and-clear the young/accessed bitflag
+	 * in the secondary pte, but it may omit flushing the secondary tlb.
+	 */
+	int (*clear_young)(struct mmu_notifier *mn,
+			   struct mm_struct *mm,
+			   unsigned long start,
+			   unsigned long end);
+
 	/*
 	 * test_young is called to check the young/accessed bitflag in
 	 * the secondary pte. This is used to know if the page is
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long start,
 					  unsigned long end);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+				      unsigned long start,
+				      unsigned long end);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_clear_young(mm, start, end);
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	__young;							\
 })
 
+#define ptep_clear_young_notify(__vma, __address, __ptep)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,	\
+					    ___address + PAGE_SIZE);	\
+	__young;							\
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,	\
+					    ___address + PMD_SIZE);	\
+	__young;							\
+})
+
 #define	ptep_clear_flush_notify(__vma, __address, __ptep)		\
 ({									\
 	unsigned long ___addr = __address & PAGE_MASK;			\
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 3b9b3d0741b2..5fbdd367bbed 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return young;
 }
 
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+			       unsigned long start,
+			       unsigned long end)
+{
+	struct mmu_notifier *mn;
+	int young = 0, id;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->clear_young)
+			young |= mn->ops->clear_young(mn, mm, start, end);
+	}
+	srcu_read_unlock(&srcu, id);
+
+	return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
 			      unsigned long address)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d8db2f8fce9c..268fc0a5a932 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -387,6 +387,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return young;
 }
 
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long start,
+					unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * Even though we do not flush TLB, this will still adversely
+	 * affect performance on pre-Haswell Intel EPT, where there is
+	 * no EPT Access Bit to clear so that we have to tear down EPT
+	 * tables instead. If we find this unacceptable, we can always
+	 * add a parameter to kvm_age_hva so that it effectively doesn't
+	 * do anything on clear_young.
+	 *
+	 * Also note that currently we never issue secondary TLB flushes
+	 * from clear_young, leaving this job up to the regular system
+	 * cadence. If we find this inaccurate, we might come up with a
+	 * more sophisticated heuristic later.
+	 */
+	young = kvm_age_hva(kvm, start, end);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
@@ -419,6 +449,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
-- 
cgit v1.2.3-70-g09d2


From 33c3fc71c8cfa3cc3a98beaa901c069c177dc295 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 9 Sep 2015 15:35:45 -0700
Subject: mm: introduce idle page tracking

Knowing the portion of memory that is not used by a certain application or
memory cgroup (idle memory) can be useful for partitioning the system
efficiently, e.g.  by setting memory cgroup limits appropriately.
Currently, the only means to estimate the amount of idle memory provided
by the kernel is /proc/PID/{clear_refs,smaps}: the user can clear the
access bit for all pages mapped to a particular process by writing 1 to
clear_refs, wait for some time, and then count smaps:Referenced.  However,
this method has two serious shortcomings:

 - it does not count unmapped file pages
 - it affects the reclaimer logic

To overcome these drawbacks, this patch introduces two new page flags,
Idle and Young, and a new sysfs file, /sys/kernel/mm/page_idle/bitmap.
A page's Idle flag can only be set from userspace by setting bit in
/sys/kernel/mm/page_idle/bitmap at the offset corresponding to the page,
and it is cleared whenever the page is accessed either through page tables
(it is cleared in page_referenced() in this case) or using the read(2)
system call (mark_page_accessed()). Thus by setting the Idle flag for
pages of a particular workload, which can be found e.g.  by reading
/proc/PID/pagemap, waiting for some time to let the workload access its
working set, and then reading the bitmap file, one can estimate the amount
of pages that are not used by the workload.

The Young page flag is used to avoid interference with the memory
reclaimer.  A page's Young flag is set whenever the Access bit of a page
table entry pointing to the page is cleared by writing to the bitmap file.
If page_referenced() is called on a Young page, it will add 1 to its
return value, therefore concealing the fact that the Access bit was
cleared.

Note, since there is no room for extra page flags on 32 bit, this feature
uses extended page flags when compiled on 32 bit.

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: kpageidle requires an MMU]
[akpm@linux-foundation.org: decouple from page-flags rework]
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/00-INDEX               |   2 +
 Documentation/vm/idle_page_tracking.txt |  98 ++++++++++++++
 fs/proc/page.c                          |   3 +
 fs/proc/task_mmu.c                      |   5 +-
 include/linux/mmu_notifier.h            |   2 +
 include/linux/page-flags.h              |  11 ++
 include/linux/page_ext.h                |   4 +
 include/linux/page_idle.h               | 110 +++++++++++++++
 mm/Kconfig                              |  12 ++
 mm/Makefile                             |   1 +
 mm/debug.c                              |   4 +
 mm/huge_memory.c                        |  12 +-
 mm/migrate.c                            |   6 +
 mm/page_ext.c                           |   4 +
 mm/page_idle.c                          | 232 ++++++++++++++++++++++++++++++++
 mm/rmap.c                               |   6 +
 mm/swap.c                               |   3 +
 17 files changed, 512 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/vm/idle_page_tracking.txt
 create mode 100644 include/linux/page_idle.h
 create mode 100644 mm/page_idle.c

(limited to 'include/linux')

diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 081c49777abb..6a5e2a102a45 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -14,6 +14,8 @@ hugetlbpage.txt
 	- a brief summary of hugetlbpage support in the Linux kernel.
 hwpoison.txt
 	- explains what hwpoison is
+idle_page_tracking.txt
+	- description of the idle page tracking feature.
 ksm.txt
 	- how to use the Kernel Samepage Merging feature.
 numa
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt
new file mode 100644
index 000000000000..85dcc3bb85dc
--- /dev/null
+++ b/Documentation/vm/idle_page_tracking.txt
@@ -0,0 +1,98 @@
+MOTIVATION
+
+The idle page tracking feature allows to track which memory pages are being
+accessed by a workload and which are idle. This information can be useful for
+estimating the workload's working set size, which, in turn, can be taken into
+account when configuring the workload parameters, setting memory cgroup limits,
+or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+USER API
+
+The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
+it consists of the only read-write file, /sys/kernel/mm/page_idle/bitmap.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
+set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the IMPLEMENTATION
+DETAILS section). To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
+
+Only accesses to user memory pages are tracked. These are pages mapped to a
+process address space, page cache and buffer pages, swap cache pages. For other
+page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
+and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+/proc/kpageflags in order to correctly count idle huge pages.
+
+Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
+
+That said, in order to estimate the amount of pages that are not used by a
+workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+    /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
+    /proc/pid/pagemap if the workload is represented by a process, or by
+    filtering out alien pages using /proc/kpagecgroup in case the workload is
+    placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
+    one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using /proc/kpageflags.
+
+See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
+/proc/kpageflags, and /proc/kpagecgroup.
+
+IMPLEMENTATION DETAILS
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first on memory shortage conditions. A page is
+considered referenced if it has been recently accessed via a process address
+space, in which case one or more PTEs it is mapped to will have the Accessed bit
+set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
+latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+   or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+   because a process needs filesystem metadata stored in it (e.g. lists a
+   directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list, other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many of them, it should not affect the overall
+result noticeably. In order not to stall scanning of the idle page bitmap,
+locked pages may be skipped too.
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 70d23245dd43..c2d29edcaa6b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -10,12 +10,15 @@
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 #include <linux/kernel-page-flags.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
 
 /* /proc/kpagecount - an array exposing page counts
  *
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 41f1a50c10c9..e2d46adb54b4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -459,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 
 	mss->resident += size;
 	/* Accumulate the size in pages that have been accessed. */
-	if (young || PageReferenced(page))
+	if (young || page_is_young(page) || PageReferenced(page))
 		mss->referenced += size;
 	mapcount = page_mapcount(page);
 	if (mapcount >= 2) {
@@ -807,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 
 		/* Clear accessed and referenced bits. */
 		pmdp_test_and_clear_young(vma, addr, pmd);
+		test_and_clear_page_young(page);
 		ClearPageReferenced(page);
 out:
 		spin_unlock(ptl);
@@ -834,6 +836,7 @@ out:
 
 		/* Clear accessed and referenced bits. */
 		ptep_test_and_clear_young(vma, addr, pte);
+		test_and_clear_page_young(page);
 		ClearPageReferenced(page);
 	}
 	pte_unmap_unlock(pte - 1, ptl);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index a5b17137c683..a1a210d59961 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -471,6 +471,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
+#define ptep_clear_young_notify ptep_test_and_clear_young
+#define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define	ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
 #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 41c93844fb1d..416509e26d6d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -108,6 +108,10 @@ enum pageflags {
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	PG_compound_lock,
+#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+	PG_young,
+	PG_idle,
 #endif
 	__NR_PAGEFLAGS,
 
@@ -289,6 +293,13 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+TESTPAGEFLAG(Young, young)
+SETPAGEFLAG(Young, young)
+TESTCLEARFLAG(Young, young)
+PAGEFLAG(Idle, idle)
+#endif
+
 /*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index c42981cd99aa..17f118a82854 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -26,6 +26,10 @@ enum page_ext_flags {
 	PAGE_EXT_DEBUG_POISON,		/* Page is poisoned */
 	PAGE_EXT_DEBUG_GUARD,
 	PAGE_EXT_OWNER,
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+	PAGE_EXT_YOUNG,
+	PAGE_EXT_IDLE,
+#endif
 };
 
 /*
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
new file mode 100644
index 000000000000..bf268fa92c5b
--- /dev/null
+++ b/include/linux/page_idle.h
@@ -0,0 +1,110 @@
+#ifndef _LINUX_MM_PAGE_IDLE_H
+#define _LINUX_MM_PAGE_IDLE_H
+
+#include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/page_ext.h>
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+
+#ifdef CONFIG_64BIT
+static inline bool page_is_young(struct page *page)
+{
+	return PageYoung(page);
+}
+
+static inline void set_page_young(struct page *page)
+{
+	SetPageYoung(page);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+	return TestClearPageYoung(page);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+	return PageIdle(page);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+	SetPageIdle(page);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+	ClearPageIdle(page);
+}
+#else /* !CONFIG_64BIT */
+/*
+ * If there is not enough space to store Idle and Young bits in page flags, use
+ * page ext flags instead.
+ */
+extern struct page_ext_operations page_idle_ops;
+
+static inline bool page_is_young(struct page *page)
+{
+	return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_young(struct page *page)
+{
+	set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+	return test_and_clear_bit(PAGE_EXT_YOUNG,
+				  &lookup_page_ext(page)->flags);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+	return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+	set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+	clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+#endif /* CONFIG_64BIT */
+
+#else /* !CONFIG_IDLE_PAGE_TRACKING */
+
+static inline bool page_is_young(struct page *page)
+{
+	return false;
+}
+
+static inline void set_page_young(struct page *page)
+{
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+	return false;
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+	return false;
+}
+
+static inline void set_page_idle(struct page *page)
+{
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+}
+
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
+#endif /* _LINUX_MM_PAGE_IDLE_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 3a4070f5ab79..6413d027c0b2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,6 +649,18 @@ config DEFERRED_STRUCT_PAGE_INIT
 	  processes running early in the lifetime of the systemm until kswapd
 	  finishes the initialisation.
 
+config IDLE_PAGE_TRACKING
+	bool "Enable idle page tracking"
+	depends on SYSFS && MMU
+	select PAGE_EXTENSION if !64BIT
+	help
+	  This feature allows to estimate the amount of user pages that have
+	  not been touched during a given period of time. This information can
+	  be useful to tune memory cgroup limits and/or for job placement
+	  within a compute cluster.
+
+	  See Documentation/vm/idle_page_tracking.txt for more details.
+
 config ZONE_DEVICE
 	bool "Device memory (pmem, etc...) hotplug support" if EXPERT
 	default !ZONE_DMA
diff --git a/mm/Makefile b/mm/Makefile
index b424d5e5b6ff..56f8eed73f1a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -79,3 +79,4 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
+obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
diff --git a/mm/debug.c b/mm/debug.c
index 76089ddf99ea..6c1b3ea61bfd 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	{1UL << PG_compound_lock,	"compound_lock"	},
 #endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+	{1UL << PG_young,		"young"		},
+	{1UL << PG_idle,		"idle"		},
+#endif
 };
 
 static void dump_flags(unsigned long flags,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b16279cbd91d..4b06b8db9df2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -25,6 +25,7 @@
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -1757,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page,
 		/* clear PageTail before overwriting first_page */
 		smp_wmb();
 
+		if (page_is_young(page))
+			set_page_young(page_tail);
+		if (page_is_idle(page))
+			set_page_idle(page_tail);
+
 		/*
 		 * __split_huge_page_splitting() already set the
 		 * splitting bit in all pmd that could map this
@@ -2262,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
 		/* If there is no mapped pte young don't collapse the page */
-		if (pte_young(pteval) || PageReferenced(page) ||
+		if (pte_young(pteval) ||
+		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = true;
 	}
@@ -2693,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		 */
 		if (page_count(page) != 1 + !!PageSwapCache(page))
 			goto out_unmap;
-		if (pte_young(pteval) || PageReferenced(page) ||
+		if (pte_young(pteval) ||
+		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = true;
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index 02ce25df16c2..c3cb566af3e2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,6 +37,7 @@
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 			__set_page_dirty_nobuffers(newpage);
  	}
 
+	if (page_is_young(page))
+		set_page_young(newpage);
+	if (page_is_idle(page))
+		set_page_idle(newpage);
+
 	/*
 	 * Copy NUMA information to the new page, to prevent over-eager
 	 * future migrations of this same page.
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d86fd2f5353f..292ca7b8debd 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -6,6 +6,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
 #include <linux/page_owner.h>
+#include <linux/page_idle.h>
 
 /*
  * struct page extension
@@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = {
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
 #endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+	&page_idle_ops,
+#endif
 };
 
 static unsigned long total_usage;
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644
index 000000000000..d5dd79041484
--- /dev/null
+++ b/mm/page_idle.c
@@ -0,0 +1,232 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_ext.h>
+#include <linux/page_idle.h>
+
+#define BITMAP_CHUNK_SIZE	sizeof(u64)
+#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page(unsigned long pfn)
+{
+	struct page *page;
+	struct zone *zone;
+
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	page = pfn_to_page(pfn);
+	if (!page || !PageLRU(page) ||
+	    !get_page_unless_zero(page))
+		return NULL;
+
+	zone = page_zone(page);
+	spin_lock_irq(&zone->lru_lock);
+	if (unlikely(!PageLRU(page))) {
+		put_page(page);
+		page = NULL;
+	}
+	spin_unlock_irq(&zone->lru_lock);
+	return page;
+}
+
+static int page_idle_clear_pte_refs_one(struct page *page,
+					struct vm_area_struct *vma,
+					unsigned long addr, void *arg)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	pmd_t *pmd;
+	pte_t *pte;
+	bool referenced = false;
+
+	if (unlikely(PageTransHuge(page))) {
+		pmd = page_check_address_pmd(page, mm, addr,
+					     PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+		if (pmd) {
+			referenced = pmdp_clear_young_notify(vma, addr, pmd);
+			spin_unlock(ptl);
+		}
+	} else {
+		pte = page_check_address(page, mm, addr, &ptl, 0);
+		if (pte) {
+			referenced = ptep_clear_young_notify(vma, addr, pte);
+			pte_unmap_unlock(pte, ptl);
+		}
+	}
+	if (referenced) {
+		clear_page_idle(page);
+		/*
+		 * We cleared the referenced bit in a mapping to this page. To
+		 * avoid interference with page reclaim, mark it young so that
+		 * page_referenced() will return > 0.
+		 */
+		set_page_young(page);
+	}
+	return SWAP_AGAIN;
+}
+
+static void page_idle_clear_pte_refs(struct page *page)
+{
+	/*
+	 * Since rwc.arg is unused, rwc is effectively immutable, so we
+	 * can make it static const to save some cycles and stack.
+	 */
+	static const struct rmap_walk_control rwc = {
+		.rmap_one = page_idle_clear_pte_refs_one,
+		.anon_lock = page_lock_anon_vma_read,
+	};
+	bool need_lock;
+
+	if (!page_mapped(page) ||
+	    !page_rmapping(page))
+		return;
+
+	need_lock = !PageAnon(page) || PageKsm(page);
+	if (need_lock && !trylock_page(page))
+		return;
+
+	rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+	if (need_lock)
+		unlock_page(page);
+}
+
+static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
+				     struct bin_attribute *attr, char *buf,
+				     loff_t pos, size_t count)
+{
+	u64 *out = (u64 *)buf;
+	struct page *page;
+	unsigned long pfn, end_pfn;
+	int bit;
+
+	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+		return -EINVAL;
+
+	pfn = pos * BITS_PER_BYTE;
+	if (pfn >= max_pfn)
+		return 0;
+
+	end_pfn = pfn + count * BITS_PER_BYTE;
+	if (end_pfn > max_pfn)
+		end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+	for (; pfn < end_pfn; pfn++) {
+		bit = pfn % BITMAP_CHUNK_BITS;
+		if (!bit)
+			*out = 0ULL;
+		page = page_idle_get_page(pfn);
+		if (page) {
+			if (page_is_idle(page)) {
+				/*
+				 * The page might have been referenced via a
+				 * pte, in which case it is not idle. Clear
+				 * refs and recheck.
+				 */
+				page_idle_clear_pte_refs(page);
+				if (page_is_idle(page))
+					*out |= 1ULL << bit;
+			}
+			put_page(page);
+		}
+		if (bit == BITMAP_CHUNK_BITS - 1)
+			out++;
+		cond_resched();
+	}
+	return (char *)out - buf;
+}
+
+static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
+				      struct bin_attribute *attr, char *buf,
+				      loff_t pos, size_t count)
+{
+	const u64 *in = (u64 *)buf;
+	struct page *page;
+	unsigned long pfn, end_pfn;
+	int bit;
+
+	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+		return -EINVAL;
+
+	pfn = pos * BITS_PER_BYTE;
+	if (pfn >= max_pfn)
+		return -ENXIO;
+
+	end_pfn = pfn + count * BITS_PER_BYTE;
+	if (end_pfn > max_pfn)
+		end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+	for (; pfn < end_pfn; pfn++) {
+		bit = pfn % BITMAP_CHUNK_BITS;
+		if ((*in >> bit) & 1) {
+			page = page_idle_get_page(pfn);
+			if (page) {
+				page_idle_clear_pte_refs(page);
+				set_page_idle(page);
+				put_page(page);
+			}
+		}
+		if (bit == BITMAP_CHUNK_BITS - 1)
+			in++;
+		cond_resched();
+	}
+	return (char *)in - buf;
+}
+
+static struct bin_attribute page_idle_bitmap_attr =
+		__BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
+			   page_idle_bitmap_read, page_idle_bitmap_write, 0);
+
+static struct bin_attribute *page_idle_bin_attrs[] = {
+	&page_idle_bitmap_attr,
+	NULL,
+};
+
+static struct attribute_group page_idle_attr_group = {
+	.bin_attrs = page_idle_bin_attrs,
+	.name = "page_idle",
+};
+
+#ifndef CONFIG_64BIT
+static bool need_page_idle(void)
+{
+	return true;
+}
+struct page_ext_operations page_idle_ops = {
+	.need = need_page_idle,
+};
+#endif
+
+static int __init page_idle_init(void)
+{
+	int err;
+
+	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
+	if (err) {
+		pr_err("page_idle: register sysfs failed\n");
+		return err;
+	}
+	return 0;
+}
+subsys_initcall(page_idle_init);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0db38e7d0a72..f5b5c1f3dcd7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -59,6 +59,7 @@
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
+	if (referenced)
+		clear_page_idle(page);
+	if (test_and_clear_page_young(page))
+		referenced++;
+
 	if (referenced) {
 		pra->referenced++;
 		pra->vm_flags |= vma->vm_flags;
diff --git a/mm/swap.c b/mm/swap.c
index a3a0a2f1f7c3..983f692a47fd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
 #include <linux/gfp.h>
 #include <linux/uio.h>
 #include <linux/hugetlb.h>
+#include <linux/page_idle.h>
 
 #include "internal.h"
 
@@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page)
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
 	}
+	if (page_is_idle(page))
+		clear_page_idle(page);
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
-- 
cgit v1.2.3-70-g09d2


From 8a5e5e02fc83aaf67053ab53b359af08c6c49aaf Mon Sep 17 00:00:00 2001
From: Vasily Kulikov <segoon@openwall.com>
Date: Wed, 9 Sep 2015 15:36:00 -0700
Subject: include/linux/poison.h: fix LIST_POISON{1,2} offset

Poison pointer values should be small enough to find a room in
non-mmap'able/hardly-mmap'able space.  E.g.  on x86 "poison pointer space"
is located starting from 0x0.  Given unprivileged users cannot mmap
anything below mmap_min_addr, it should be safe to use poison pointers
lower than mmap_min_addr.

The current poison pointer values of LIST_POISON{1,2} might be too big for
mmap_min_addr values equal or less than 1 MB (common case, e.g.  Ubuntu
uses only 0x10000).  There is little point to use such a big value given
the "poison pointer space" below 1 MB is not yet exhausted.  Changing it
to a smaller value solves the problem for small mmap_min_addr setups.

The values are suggested by Solar Designer:
http://www.openwall.com/lists/oss-security/2015/05/02/6

Signed-off-by: Vasily Kulikov <segoon@openwall.com>
Cc: Solar Designer <solar@openwall.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poison.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 2110a81c5e2a..253c9b4198ef 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -19,8 +19,8 @@
  * under normal circumstances, used to verify that nobody uses
  * non-initialized list entries.
  */
-#define LIST_POISON1  ((void *) 0x00100100 + POISON_POINTER_DELTA)
-#define LIST_POISON2  ((void *) 0x00200200 + POISON_POINTER_DELTA)
+#define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x200 + POISON_POINTER_DELTA)
 
 /********** include/linux/timer.h **********/
 /*
-- 
cgit v1.2.3-70-g09d2


From 8b839635e73575990e92cce1f19f5b1d7febd3fa Mon Sep 17 00:00:00 2001
From: Vasily Kulikov <segoon@openwall.com>
Date: Wed, 9 Sep 2015 15:36:03 -0700
Subject: include/linux/poison.h: remove not-used poison pointer macros

Signed-off-by: Vasily Kulikov <segoon@openwall.com>
Cc: Solar Designer <solar@openwall.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poison.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 253c9b4198ef..317e16de09e5 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -69,10 +69,6 @@
 #define ATM_POISON_FREE		0x12
 #define ATM_POISON		0xdeadbeef
 
-/********** net/ **********/
-#define NEIGHBOR_DEAD		0xdeadbeef
-#define NETFILTER_LINK_POISON	0xdead57ac
-
 /********** kernel/mutexes **********/
 #define MUTEX_DEBUG_INIT	0x11
 #define MUTEX_DEBUG_FREE	0x22
@@ -83,7 +79,4 @@
 /********** security/ **********/
 #define KEY_DESTROY		0xbd
 
-/********** sound/oss/ **********/
-#define OSS_POISON_FREE		0xAB
-
 #endif
-- 
cgit v1.2.3-70-g09d2


From 515a9adce0f0c3d2ef20f869c12902d03851a273 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Wed, 9 Sep 2015 15:36:12 -0700
Subject: include/linux/printk.h: include pr_fmt in pr_debug_ratelimited

The other two implementations of pr_debug_ratelimited include pr_fmt,
along with every other pr_* function.  But pr_debug_ratelimited forgot to
add it with the CONFIG_DYNAMIC_DEBUG implementation.

This patch unifies the behavior.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index a6298b27ac99..6545d911054f 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -404,10 +404,10 @@ do {									\
 	static DEFINE_RATELIMIT_STATE(_rs,				\
 				      DEFAULT_RATELIMIT_INTERVAL,	\
 				      DEFAULT_RATELIMIT_BURST);		\
-	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);			\
+	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));		\
 	if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) &&	\
 	    __ratelimit(&_rs))						\
-		__dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__);	\
+		__dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);	\
 } while (0)
 #elif defined(DEBUG)
 #define pr_debug_ratelimited(fmt, ...)					\
-- 
cgit v1.2.3-70-g09d2


From cdf17449af1d9b596742c260134edd6c1fac2792 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 9 Sep 2015 15:37:11 -0700
Subject: hexdump: do not print debug dumps for !CONFIG_DEBUG

print_hex_dump_debug() is likely supposed to be analogous to pr_debug() or
dev_dbg() & friends.  Currently it will adhere to dynamic debug, but will
not stub out prints if CONFIG_DEBUG is not set.  Let's make it do the
right thing, because I am tired of having my dmesg buffer full of hex
dumps on production systems.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 6545d911054f..9729565c25ff 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -456,11 +456,17 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
 			     groupsize, buf, len, ascii)	\
 	dynamic_hex_dump(prefix_str, prefix_type, rowsize,	\
 			 groupsize, buf, len, ascii)
-#else
+#elif defined(DEBUG)
 #define print_hex_dump_debug(prefix_str, prefix_type, rowsize,		\
 			     groupsize, buf, len, ascii)		\
 	print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,	\
 		       groupsize, buf, len, ascii)
-#endif /* defined(CONFIG_DYNAMIC_DEBUG) */
+#else
+static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
+					int rowsize, int groupsize,
+					const void *buf, size_t len, bool ascii)
+{
+}
+#endif
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From b40bdb7fb2b8359d5dfe19a91c147465c3d0359b Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 9 Sep 2015 15:37:16 -0700
Subject: lib/string_helpers: rename "esc" arg to "only"

To further clarify the purpose of the "esc" argument, rename it to "only"
to reflect that it is a limit, not a list of additional characters to
escape.

Signed-off-by: Kees Cook <keescook@chromium.org>
Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string_helpers.h | 14 +++++++-------
 lib/string_helpers.c           | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 71f711db4500..dabe643eb5fa 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -48,24 +48,24 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_HEX		0x20
 
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
-		unsigned int flags, const char *esc);
+		unsigned int flags, const char *only);
 
 static inline int string_escape_mem_any_np(const char *src, size_t isz,
-		char *dst, size_t osz, const char *esc)
+		char *dst, size_t osz, const char *only)
 {
-	return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc);
+	return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only);
 }
 
 static inline int string_escape_str(const char *src, char *dst, size_t sz,
-		unsigned int flags, const char *esc)
+		unsigned int flags, const char *only)
 {
-	return string_escape_mem(src, strlen(src), dst, sz, flags, esc);
+	return string_escape_mem(src, strlen(src), dst, sz, flags, only);
 }
 
 static inline int string_escape_str_any_np(const char *src, char *dst,
-		size_t sz, const char *esc)
+		size_t sz, const char *only)
 {
-	return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc);
+	return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only);
 }
 
 #endif
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 0a307a97d489..54036ce2e2dd 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -432,8 +432,8 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *		all previous together
  *	%ESCAPE_HEX:
  *		'\xHH' - byte with hexadecimal value HH (2 digits)
- * @esc:	NULL-terminated string containing characters used to limit
- *		the selected escape class. If characters are included in @esc
+ * @only:	NULL-terminated string containing characters used to limit
+ *		the selected escape class. If characters are included in @only
  *		that would not normally be escaped by the classes selected
  *		in @flags, they will be copied to @dst unescaped.
  *
@@ -442,7 +442,7 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * in the following sequence.
  *	1. The character is matched to the printable class, if asked, and in
  *	   case of match it passes through to the output.
- *	2. The character is not matched to the one from @esc string and thus
+ *	2. The character is not matched to the one from @only string and thus
  *	   must go as-is to the output.
  *	3. The character is checked if it falls into the class given by @flags.
  *	   %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
@@ -460,11 +460,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * dst for a '\0' terminator if and only if ret < osz.
  */
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
-		      unsigned int flags, const char *esc)
+		      unsigned int flags, const char *only)
 {
 	char *p = dst;
 	char *end = p + osz;
-	bool is_dict = esc && *esc;
+	bool is_dict = only && *only;
 
 	while (isz--) {
 		unsigned char c = *src++;
@@ -473,7 +473,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		 * Apply rules in the following sequence:
 		 *	- the character is printable, when @flags has
 		 *	  %ESCAPE_NP bit set
-		 *	- the @esc string is supplied and does not contain a
+		 *	- the @only string is supplied and does not contain a
 		 *	  character under question
 		 *	- the character doesn't fall into a class of symbols
 		 *	  defined by given @flags
@@ -481,7 +481,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		 * output buffer.
 		 */
 		if ((flags & ESCAPE_NP && isprint(c)) ||
-		    (is_dict && !strchr(esc, c))) {
+		    (is_dict && !strchr(only, c))) {
 			/* do nothing */
 		} else {
 			if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
-- 
cgit v1.2.3-70-g09d2


From 90f023030e26ce8f981b3e688cb79329d8d07cc3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 9 Sep 2015 15:38:22 -0700
Subject: kmod: use system_unbound_wq instead of khelper

We need to launch the usermodehelper kernel threads with the widest
affinity and this is partly why we use khelper.  This workqueue has
unbound properties and thus a wide affinity inherited by all its children.

Now khelper also has special properties that we aren't much interested in:
ordered and singlethread.  There is really no need about ordering as all
we do is creating kernel threads.  This can be done concurrently.  And
singlethread is a useless limitation as well.

The workqueue engine already proposes generic unbound workqueues that
don't share these useless properties and handle well parallel jobs.

The only worrysome specific is their affinity to the node of the current
CPU.  It's fine for creating the usermodehelper kernel threads but those
inherit this affinity for longer jobs such as requesting modules.

This patch proposes to use these node affine unbound workqueues assuming
that a node is sufficient to handle several parallel usermodehelper
requests.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmod.h |  2 --
 init/main.c          |  1 -
 kernel/kmod.c        | 40 +++++++++++++++++-----------------------
 3 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 0555cc66a15b..fcfd2bf14d3f 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -85,8 +85,6 @@ enum umh_disable_depth {
 	UMH_DISABLED,
 };
 
-extern void usermodehelper_init(void);
-
 extern int __usermodehelper_disable(enum umh_disable_depth depth);
 extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
 
diff --git a/init/main.c b/init/main.c
index 56506553d4d8..9e64d7097f1a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -877,7 +877,6 @@ static void __init do_initcalls(void)
 static void __init do_basic_setup(void)
 {
 	cpuset_init_smp();
-	usermodehelper_init();
 	shmem_init();
 	driver_init();
 	init_irq_proc();
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 81c67050c5aa..d38b2dab99a7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
 
 extern int max_threads;
 
-static struct workqueue_struct *khelper_wq;
-
 #define CAP_BSET	(void *)1
 #define CAP_PI		(void *)2
 
@@ -225,7 +223,7 @@ static int call_usermodehelper_exec_async(void *data)
 	spin_unlock_irq(&current->sighand->siglock);
 
 	/*
-	 * Our parent is khelper which runs with elevated scheduling
+	 * Our parent (unbound workqueue) runs with elevated scheduling
 	 * priority. Avoid propagating that into the userspace child.
 	 */
 	set_user_nice(current, 0);
@@ -268,9 +266,10 @@ out:
 }
 
 /*
- * Handles UMH_WAIT_PROC. Our parent khelper can't wait for usermodehelper
- * completion without blocking every other pending requests. That's why
- * we use a kernel thread dedicated for that purpose.
+ * Handles UMH_WAIT_PROC. Our parent (unbound workqueue) might not be able to
+ * run enough instances to handle usermodehelper completions without blocking
+ * some other pending requests. That's why we use a kernel thread dedicated for
+ * that purpose.
  */
 static int call_usermodehelper_exec_sync(void *data)
 {
@@ -312,14 +311,15 @@ static int call_usermodehelper_exec_sync(void *data)
 /*
  * This function doesn't strictly needs to be called asynchronously. But we
  * need to create the usermodehelper kernel threads from a task that is affine
- * to all CPUs (or nohz housekeeping ones) such that they inherit a widest
- * affinity irrespective of call_usermodehelper() callers with possibly reduced
- * affinity (eg: per-cpu workqueues). We don't want usermodehelper targets to
- * contend any busy CPU.
- * Khelper provides such wide affinity.
+ * to an optimized set of CPUs (or nohz housekeeping ones) such that they
+ * inherit a widest affinity irrespective of call_usermodehelper() callers with
+ * possibly reduced affinity (eg: per-cpu workqueues). We don't want
+ * usermodehelper targets to contend a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity.
  *
- * Besides, khelper provides the privilege level that caller might not have to
- * perform the usermodehelper request.
+ * Besides, workqueues provide the privilege level that caller might not have
+ * to perform the usermodehelper request.
  *
  */
 static void call_usermodehelper_exec_work(struct work_struct *work)
@@ -549,8 +549,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
  *        from interrupt context.
  *
  * Runs a user-space application.  The application is started
- * asynchronously if wait is not set, and runs as a child of khelper.
- * (ie. it runs with full root capabilities and wide affinity).
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
  */
 int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 {
@@ -562,7 +562,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 		return -EINVAL;
 	}
 	helper_lock();
-	if (!khelper_wq || usermodehelper_disabled) {
+	if (usermodehelper_disabled) {
 		retval = -EBUSY;
 		goto out;
 	}
@@ -574,7 +574,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 	sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
 	sub_info->wait = wait;
 
-	queue_work(khelper_wq, &sub_info->work);
+	queue_work(system_unbound_wq, &sub_info->work);
 	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */
 		goto unlock;
 
@@ -704,9 +704,3 @@ struct ctl_table usermodehelper_table[] = {
 	},
 	{ }
 };
-
-void __init usermodehelper_init(void)
-{
-	khelper_wq = create_singlethread_workqueue("khelper");
-	BUG_ON(!khelper_wq);
-}
-- 
cgit v1.2.3-70-g09d2


From 37607102c4426cf92aeb5da1b1d9a79ba6d95e3f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 9 Sep 2015 15:38:33 -0700
Subject: seq_file: provide an analogue of print_hex_dump()

This introduces a new helper and switches current users to use it.  All
patches are compiled tested. kmemleak is tested via its own test suite.

This patch (of 6):

The new seq_hex_dump() is a complete analogue of print_hex_dump().

We have few users of this functionality already. It allows to reduce their
codebase.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Joe Perches <joe@perches.com>
Cc: Tadeusz Struk <tadeusz.struk@intel.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Ingo Tuchscherer <ingo.tuchscherer@de.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Vladimir Kondratiev <qca_vkondrat@qca.qualcomm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c            | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/seq_file.h |  4 ++++
 2 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index ce9e39fd5daf..263b125dbcf4 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/cred.h>
 #include <linux/mm.h>
+#include <linux/printk.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -773,6 +774,47 @@ void seq_pad(struct seq_file *m, char c)
 }
 EXPORT_SYMBOL(seq_pad);
 
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+		  int rowsize, int groupsize, const void *buf, size_t len,
+		  bool ascii)
+{
+	const u8 *ptr = buf;
+	int i, linelen, remaining = len;
+	int ret;
+
+	if (rowsize != 16 && rowsize != 32)
+		rowsize = 16;
+
+	for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+		linelen = min(remaining, rowsize);
+		remaining -= rowsize;
+
+		switch (prefix_type) {
+		case DUMP_PREFIX_ADDRESS:
+			seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+			break;
+		case DUMP_PREFIX_OFFSET:
+			seq_printf(m, "%s%.8x: ", prefix_str, i);
+			break;
+		default:
+			seq_printf(m, "%s", prefix_str);
+			break;
+		}
+
+		ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+					 m->buf + m->count, m->size - m->count,
+					 ascii);
+		if (ret >= m->size - m->count) {
+			seq_set_overflow(m);
+		} else {
+			m->count += ret;
+			seq_putc(m, '\n');
+		}
+	}
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
 	struct list_head *lh;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index d4c7271382cb..adeadbd6d7bf 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -122,6 +122,10 @@ int seq_write(struct seq_file *seq, const void *data, size_t len);
 __printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
 __printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
 
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+		  int rowsize, int groupsize, const void *buf, size_t len,
+		  bool ascii);
+
 int seq_path(struct seq_file *, const struct path *, const char *);
 int seq_file_path(struct seq_file *, struct file *, const char *);
 int seq_dentry(struct seq_file *, struct dentry *, const char *);
-- 
cgit v1.2.3-70-g09d2


From a43cac0d9dc2073ff2245a171429ddbe1accece7 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Wed, 9 Sep 2015 15:38:51 -0700
Subject: kexec: split kexec_file syscall code to kexec_file.c

Split kexec_file syscall related code to another file kernel/kexec_file.c
so that the #ifdef CONFIG_KEXEC_FILE in kexec.c can be dropped.

Sharing variables and functions are moved to kernel/kexec_internal.h per
suggestion from Vivek and Petr.

[akpm@linux-foundation.org: fix bisectability]
[akpm@linux-foundation.org: declare the various arch_kexec functions]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Petr Tesarik <ptesarik@suse.cz>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Josh Boyer <jwboyer@fedoraproject.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kexec.h   |   11 +
 kernel/Makefile         |    1 +
 kernel/kexec.c          | 1056 +----------------------------------------------
 kernel/kexec_file.c     | 1045 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/kexec_internal.h |   22 +
 5 files changed, 1090 insertions(+), 1045 deletions(-)
 create mode 100644 kernel/kexec_file.c
 create mode 100644 kernel/kexec_internal.h

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index b63218f68c4b..ab150ade0d18 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -318,6 +318,17 @@ int crash_shrink_memory(unsigned long new_size);
 size_t crash_get_memory_size(void);
 void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
 
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+					 unsigned long buf_len);
+void * __weak arch_kexec_kernel_image_load(struct kimage *image);
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image);
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+					unsigned long buf_len);
+int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
+					Elf_Shdr *sechdrs, unsigned int relsec);
+int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+					unsigned int relsec);
+
 #else /* !CONFIG_KEXEC */
 struct pt_regs;
 struct task_struct;
diff --git a/kernel/Makefile b/kernel/Makefile
index e0d7587e7684..1b4890af5a65 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a785c1015e25..2d73ecfa5505 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -28,10 +28,10 @@
 #include <linux/suspend.h>
 #include <linux/device.h>
 #include <linux/freezer.h>
+#include <linux/vmalloc.h>
 #include <linux/pm.h>
 #include <linux/cpu.h>
 #include <linux/console.h>
-#include <linux/vmalloc.h>
 #include <linux/swap.h>
 #include <linux/syscore_ops.h>
 #include <linux/compiler.h>
@@ -44,6 +44,9 @@
 
 #include <crypto/hash.h>
 #include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
 
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
@@ -57,16 +60,6 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
 
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
 
 /* Location of the reserved area for the crash kernel */
 struct resource crashk_res = {
@@ -146,8 +139,6 @@ int kexec_should_crash(struct task_struct *p)
  */
 #define KIMAGE_NO_DEST (-1UL)
 
-static int kimage_is_destination_range(struct kimage *image,
-				       unsigned long start, unsigned long end);
 static struct page *kimage_alloc_page(struct kimage *image,
 				       gfp_t gfp_mask,
 				       unsigned long dest);
@@ -169,7 +160,7 @@ static int copy_user_segment_list(struct kimage *image,
 	return ret;
 }
 
-static int sanity_check_segment_list(struct kimage *image)
+int sanity_check_segment_list(struct kimage *image)
 {
 	int result, i;
 	unsigned long nr_segments = image->nr_segments;
@@ -259,7 +250,7 @@ static int sanity_check_segment_list(struct kimage *image)
 	return 0;
 }
 
-static struct kimage *do_kimage_alloc_init(void)
+struct kimage *do_kimage_alloc_init(void)
 {
 	struct kimage *image;
 
@@ -286,8 +277,6 @@ static struct kimage *do_kimage_alloc_init(void)
 	return image;
 }
 
-static void kimage_free_page_list(struct list_head *list);
-
 static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 			     unsigned long nr_segments,
 			     struct kexec_segment __user *segments,
@@ -354,283 +343,7 @@ out_free_image:
 	return ret;
 }
 
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
-	struct fd f = fdget(fd);
-	int ret;
-	struct kstat stat;
-	loff_t pos;
-	ssize_t bytes = 0;
-
-	if (!f.file)
-		return -EBADF;
-
-	ret = vfs_getattr(&f.file->f_path, &stat);
-	if (ret)
-		goto out;
-
-	if (stat.size > INT_MAX) {
-		ret = -EFBIG;
-		goto out;
-	}
-
-	/* Don't hand 0 to vmalloc, it whines. */
-	if (stat.size == 0) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	*buf = vmalloc(stat.size);
-	if (!*buf) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	pos = 0;
-	while (pos < stat.size) {
-		bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
-				    stat.size - pos);
-		if (bytes < 0) {
-			vfree(*buf);
-			ret = bytes;
-			goto out;
-		}
-
-		if (bytes == 0)
-			break;
-		pos += bytes;
-	}
-
-	if (pos != stat.size) {
-		ret = -EBADF;
-		vfree(*buf);
-		goto out;
-	}
-
-	*buf_len = pos;
-out:
-	fdput(f);
-	return ret;
-}
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-					 unsigned long buf_len)
-{
-	return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
-	return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
-					unsigned long buf_len)
-{
-	return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-				 unsigned int relsec)
-{
-	pr_err("RELA relocation unsupported.\n");
-	return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-			     unsigned int relsec)
-{
-	pr_err("REL relocation unsupported.\n");
-	return -ENOEXEC;
-}
-
-/*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
- */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
-	struct purgatory_info *pi = &image->purgatory_info;
-
-	vfree(image->kernel_buf);
-	image->kernel_buf = NULL;
-
-	vfree(image->initrd_buf);
-	image->initrd_buf = NULL;
-
-	kfree(image->cmdline_buf);
-	image->cmdline_buf = NULL;
-
-	vfree(pi->purgatory_buf);
-	pi->purgatory_buf = NULL;
-
-	vfree(pi->sechdrs);
-	pi->sechdrs = NULL;
-
-	/* See if architecture has anything to cleanup post load */
-	arch_kimage_file_post_load_cleanup(image);
-
-	/*
-	 * Above call should have called into bootloader to free up
-	 * any data stored in kimage->image_loader_data. It should
-	 * be ok now to free it up.
-	 */
-	kfree(image->image_loader_data);
-	image->image_loader_data = NULL;
-}
-
-/*
- * In file mode list of segments is prepared by kernel. Copy relevant
- * data from user space, do error checking, prepare segment list
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
-			     const char __user *cmdline_ptr,
-			     unsigned long cmdline_len, unsigned flags)
-{
-	int ret = 0;
-	void *ldata;
-
-	ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
-				&image->kernel_buf_len);
-	if (ret)
-		return ret;
-
-	/* Call arch image probe handlers */
-	ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
-					    image->kernel_buf_len);
-
-	if (ret)
-		goto out;
-
-#ifdef CONFIG_KEXEC_VERIFY_SIG
-	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
-					   image->kernel_buf_len);
-	if (ret) {
-		pr_debug("kernel signature verification failed.\n");
-		goto out;
-	}
-	pr_debug("kernel signature verification successful.\n");
-#endif
-	/* It is possible that there no initramfs is being loaded */
-	if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
-		ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
-					&image->initrd_buf_len);
-		if (ret)
-			goto out;
-	}
-
-	if (cmdline_len) {
-		image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
-		if (!image->cmdline_buf) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
-				     cmdline_len);
-		if (ret) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		image->cmdline_buf_len = cmdline_len;
-
-		/* command line should be a string with last byte null */
-		if (image->cmdline_buf[cmdline_len - 1] != '\0') {
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	/* Call arch image load handlers */
-	ldata = arch_kexec_kernel_image_load(image);
-
-	if (IS_ERR(ldata)) {
-		ret = PTR_ERR(ldata);
-		goto out;
-	}
-
-	image->image_loader_data = ldata;
-out:
-	/* In case of error, free up all allocated memory in this function */
-	if (ret)
-		kimage_file_post_load_cleanup(image);
-	return ret;
-}
-
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
-		       int initrd_fd, const char __user *cmdline_ptr,
-		       unsigned long cmdline_len, unsigned long flags)
-{
-	int ret;
-	struct kimage *image;
-	bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
-	image = do_kimage_alloc_init();
-	if (!image)
-		return -ENOMEM;
-
-	image->file_mode = 1;
-
-	if (kexec_on_panic) {
-		/* Enable special crash kernel control page alloc policy. */
-		image->control_page = crashk_res.start;
-		image->type = KEXEC_TYPE_CRASH;
-	}
-
-	ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
-					   cmdline_ptr, cmdline_len, flags);
-	if (ret)
-		goto out_free_image;
-
-	ret = sanity_check_segment_list(image);
-	if (ret)
-		goto out_free_post_load_bufs;
-
-	ret = -ENOMEM;
-	image->control_code_page = kimage_alloc_control_pages(image,
-					   get_order(KEXEC_CONTROL_PAGE_SIZE));
-	if (!image->control_code_page) {
-		pr_err("Could not allocate control_code_buffer\n");
-		goto out_free_post_load_bufs;
-	}
-
-	if (!kexec_on_panic) {
-		image->swap_page = kimage_alloc_control_pages(image, 0);
-		if (!image->swap_page) {
-			pr_err("Could not allocate swap buffer\n");
-			goto out_free_control_pages;
-		}
-	}
-
-	*rimage = image;
-	return 0;
-out_free_control_pages:
-	kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
-	kimage_file_post_load_cleanup(image);
-out_free_image:
-	kfree(image);
-	return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
+int kimage_is_destination_range(struct kimage *image,
 					unsigned long start,
 					unsigned long end)
 {
@@ -676,7 +389,7 @@ static void kimage_free_pages(struct page *page)
 	__free_pages(page, order);
 }
 
-static void kimage_free_page_list(struct list_head *list)
+void kimage_free_page_list(struct list_head *list)
 {
 	struct list_head *pos, *next;
 
@@ -892,7 +605,7 @@ static void kimage_free_extra_pages(struct kimage *image)
 	kimage_free_page_list(&image->unusable_pages);
 
 }
-static void kimage_terminate(struct kimage *image)
+void kimage_terminate(struct kimage *image)
 {
 	if (*image->entry != 0)
 		image->entry++;
@@ -913,7 +626,7 @@ static void kimage_free_entry(kimage_entry_t entry)
 	kimage_free_pages(page);
 }
 
-static void kimage_free(struct kimage *image)
+void kimage_free(struct kimage *image)
 {
 	kimage_entry_t *ptr, entry;
 	kimage_entry_t ind = 0;
@@ -1204,7 +917,7 @@ out:
 	return result;
 }
 
-static int kimage_load_segment(struct kimage *image,
+int kimage_load_segment(struct kimage *image,
 				struct kexec_segment *segment)
 {
 	int result = -ENOMEM;
@@ -1245,8 +958,6 @@ struct kimage *kexec_image;
 struct kimage *kexec_crash_image;
 int kexec_load_disabled;
 
-static DEFINE_MUTEX(kexec_mutex);
-
 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		struct kexec_segment __user *, segments, unsigned long, flags)
 {
@@ -1391,85 +1102,6 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
 }
 #endif
 
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
-		unsigned long, cmdline_len, const char __user *, cmdline_ptr,
-		unsigned long, flags)
-{
-	int ret = 0, i;
-	struct kimage **dest_image, *image;
-
-	/* We only trust the superuser with rebooting the system. */
-	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
-		return -EPERM;
-
-	/* Make sure we have a legal set of flags */
-	if (flags != (flags & KEXEC_FILE_FLAGS))
-		return -EINVAL;
-
-	image = NULL;
-
-	if (!mutex_trylock(&kexec_mutex))
-		return -EBUSY;
-
-	dest_image = &kexec_image;
-	if (flags & KEXEC_FILE_ON_CRASH)
-		dest_image = &kexec_crash_image;
-
-	if (flags & KEXEC_FILE_UNLOAD)
-		goto exchange;
-
-	/*
-	 * In case of crash, new kernel gets loaded in reserved region. It is
-	 * same memory where old crash kernel might be loaded. Free any
-	 * current crash dump kernel before we corrupt it.
-	 */
-	if (flags & KEXEC_FILE_ON_CRASH)
-		kimage_free(xchg(&kexec_crash_image, NULL));
-
-	ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
-				     cmdline_len, flags);
-	if (ret)
-		goto out;
-
-	ret = machine_kexec_prepare(image);
-	if (ret)
-		goto out;
-
-	ret = kexec_calculate_store_digests(image);
-	if (ret)
-		goto out;
-
-	for (i = 0; i < image->nr_segments; i++) {
-		struct kexec_segment *ksegment;
-
-		ksegment = &image->segment[i];
-		pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
-			 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
-			 ksegment->memsz);
-
-		ret = kimage_load_segment(image, &image->segment[i]);
-		if (ret)
-			goto out;
-	}
-
-	kimage_terminate(image);
-
-	/*
-	 * Free up any temporary buffers allocated which are not needed
-	 * after image has been loaded
-	 */
-	kimage_file_post_load_cleanup(image);
-exchange:
-	image = xchg(dest_image, image);
-out:
-	mutex_unlock(&kexec_mutex);
-	kimage_free(image);
-	return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
 void crash_kexec(struct pt_regs *regs)
 {
 	/* Take the kexec_mutex here to prevent sys_kexec_load
@@ -2024,672 +1656,6 @@ static int __init crash_save_vmcoreinfo_init(void)
 
 subsys_initcall(crash_save_vmcoreinfo_init);
 
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
-				    struct kexec_buf *kbuf)
-{
-	struct kimage *image = kbuf->image;
-	unsigned long temp_start, temp_end;
-
-	temp_end = min(end, kbuf->buf_max);
-	temp_start = temp_end - kbuf->memsz;
-
-	do {
-		/* align down start */
-		temp_start = temp_start & (~(kbuf->buf_align - 1));
-
-		if (temp_start < start || temp_start < kbuf->buf_min)
-			return 0;
-
-		temp_end = temp_start + kbuf->memsz - 1;
-
-		/*
-		 * Make sure this does not conflict with any of existing
-		 * segments
-		 */
-		if (kimage_is_destination_range(image, temp_start, temp_end)) {
-			temp_start = temp_start - PAGE_SIZE;
-			continue;
-		}
-
-		/* We found a suitable memory range */
-		break;
-	} while (1);
-
-	/* If we are here, we found a suitable memory range */
-	kbuf->mem = temp_start;
-
-	/* Success, stop navigating through remaining System RAM ranges */
-	return 1;
-}
-
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
-				     struct kexec_buf *kbuf)
-{
-	struct kimage *image = kbuf->image;
-	unsigned long temp_start, temp_end;
-
-	temp_start = max(start, kbuf->buf_min);
-
-	do {
-		temp_start = ALIGN(temp_start, kbuf->buf_align);
-		temp_end = temp_start + kbuf->memsz - 1;
-
-		if (temp_end > end || temp_end > kbuf->buf_max)
-			return 0;
-		/*
-		 * Make sure this does not conflict with any of existing
-		 * segments
-		 */
-		if (kimage_is_destination_range(image, temp_start, temp_end)) {
-			temp_start = temp_start + PAGE_SIZE;
-			continue;
-		}
-
-		/* We found a suitable memory range */
-		break;
-	} while (1);
-
-	/* If we are here, we found a suitable memory range */
-	kbuf->mem = temp_start;
-
-	/* Success, stop navigating through remaining System RAM ranges */
-	return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
-	struct kexec_buf *kbuf = (struct kexec_buf *)arg;
-	unsigned long sz = end - start + 1;
-
-	/* Returning 0 will take to next memory range */
-	if (sz < kbuf->memsz)
-		return 0;
-
-	if (end < kbuf->buf_min || start > kbuf->buf_max)
-		return 0;
-
-	/*
-	 * Allocate memory top down with-in ram range. Otherwise bottom up
-	 * allocation.
-	 */
-	if (kbuf->top_down)
-		return locate_mem_hole_top_down(start, end, kbuf);
-	return locate_mem_hole_bottom_up(start, end, kbuf);
-}
-
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
-		     unsigned long memsz, unsigned long buf_align,
-		     unsigned long buf_min, unsigned long buf_max,
-		     bool top_down, unsigned long *load_addr)
-{
-
-	struct kexec_segment *ksegment;
-	struct kexec_buf buf, *kbuf;
-	int ret;
-
-	/* Currently adding segment this way is allowed only in file mode */
-	if (!image->file_mode)
-		return -EINVAL;
-
-	if (image->nr_segments >= KEXEC_SEGMENT_MAX)
-		return -EINVAL;
-
-	/*
-	 * Make sure we are not trying to add buffer after allocating
-	 * control pages. All segments need to be placed first before
-	 * any control pages are allocated. As control page allocation
-	 * logic goes through list of segments to make sure there are
-	 * no destination overlaps.
-	 */
-	if (!list_empty(&image->control_pages)) {
-		WARN_ON(1);
-		return -EINVAL;
-	}
-
-	memset(&buf, 0, sizeof(struct kexec_buf));
-	kbuf = &buf;
-	kbuf->image = image;
-	kbuf->buffer = buffer;
-	kbuf->bufsz = bufsz;
-
-	kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
-	kbuf->buf_align = max(buf_align, PAGE_SIZE);
-	kbuf->buf_min = buf_min;
-	kbuf->buf_max = buf_max;
-	kbuf->top_down = top_down;
-
-	/* Walk the RAM ranges and allocate a suitable range for the buffer */
-	if (image->type == KEXEC_TYPE_CRASH)
-		ret = walk_iomem_res("Crash kernel",
-				     IORESOURCE_MEM | IORESOURCE_BUSY,
-				     crashk_res.start, crashk_res.end, kbuf,
-				     locate_mem_hole_callback);
-	else
-		ret = walk_system_ram_res(0, -1, kbuf,
-					  locate_mem_hole_callback);
-	if (ret != 1) {
-		/* A suitable memory range could not be found for buffer */
-		return -EADDRNOTAVAIL;
-	}
-
-	/* Found a suitable memory range */
-	ksegment = &image->segment[image->nr_segments];
-	ksegment->kbuf = kbuf->buffer;
-	ksegment->bufsz = kbuf->bufsz;
-	ksegment->mem = kbuf->mem;
-	ksegment->memsz = kbuf->memsz;
-	image->nr_segments++;
-	*load_addr = ksegment->mem;
-	return 0;
-}
-
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
-	struct crypto_shash *tfm;
-	struct shash_desc *desc;
-	int ret = 0, i, j, zero_buf_sz, sha_region_sz;
-	size_t desc_size, nullsz;
-	char *digest;
-	void *zero_buf;
-	struct kexec_sha_region *sha_regions;
-	struct purgatory_info *pi = &image->purgatory_info;
-
-	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
-	zero_buf_sz = PAGE_SIZE;
-
-	tfm = crypto_alloc_shash("sha256", 0, 0);
-	if (IS_ERR(tfm)) {
-		ret = PTR_ERR(tfm);
-		goto out;
-	}
-
-	desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
-	desc = kzalloc(desc_size, GFP_KERNEL);
-	if (!desc) {
-		ret = -ENOMEM;
-		goto out_free_tfm;
-	}
-
-	sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
-	sha_regions = vzalloc(sha_region_sz);
-	if (!sha_regions)
-		goto out_free_desc;
-
-	desc->tfm   = tfm;
-	desc->flags = 0;
-
-	ret = crypto_shash_init(desc);
-	if (ret < 0)
-		goto out_free_sha_regions;
-
-	digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
-	if (!digest) {
-		ret = -ENOMEM;
-		goto out_free_sha_regions;
-	}
-
-	for (j = i = 0; i < image->nr_segments; i++) {
-		struct kexec_segment *ksegment;
-
-		ksegment = &image->segment[i];
-		/*
-		 * Skip purgatory as it will be modified once we put digest
-		 * info in purgatory.
-		 */
-		if (ksegment->kbuf == pi->purgatory_buf)
-			continue;
-
-		ret = crypto_shash_update(desc, ksegment->kbuf,
-					  ksegment->bufsz);
-		if (ret)
-			break;
-
-		/*
-		 * Assume rest of the buffer is filled with zero and
-		 * update digest accordingly.
-		 */
-		nullsz = ksegment->memsz - ksegment->bufsz;
-		while (nullsz) {
-			unsigned long bytes = nullsz;
-
-			if (bytes > zero_buf_sz)
-				bytes = zero_buf_sz;
-			ret = crypto_shash_update(desc, zero_buf, bytes);
-			if (ret)
-				break;
-			nullsz -= bytes;
-		}
-
-		if (ret)
-			break;
-
-		sha_regions[j].start = ksegment->mem;
-		sha_regions[j].len = ksegment->memsz;
-		j++;
-	}
-
-	if (!ret) {
-		ret = crypto_shash_final(desc, digest);
-		if (ret)
-			goto out_free_digest;
-		ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
-						sha_regions, sha_region_sz, 0);
-		if (ret)
-			goto out_free_digest;
-
-		ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
-						digest, SHA256_DIGEST_SIZE, 0);
-		if (ret)
-			goto out_free_digest;
-	}
-
-out_free_digest:
-	kfree(digest);
-out_free_sha_regions:
-	vfree(sha_regions);
-out_free_desc:
-	kfree(desc);
-out_free_tfm:
-	kfree(tfm);
-out:
-	return ret;
-}
-
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
-				  unsigned long max, int top_down)
-{
-	struct purgatory_info *pi = &image->purgatory_info;
-	unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
-	unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
-	unsigned char *buf_addr, *src;
-	int i, ret = 0, entry_sidx = -1;
-	const Elf_Shdr *sechdrs_c;
-	Elf_Shdr *sechdrs = NULL;
-	void *purgatory_buf = NULL;
-
-	/*
-	 * sechdrs_c points to section headers in purgatory and are read
-	 * only. No modifications allowed.
-	 */
-	sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
-	/*
-	 * We can not modify sechdrs_c[] and its fields. It is read only.
-	 * Copy it over to a local copy where one can store some temporary
-	 * data and free it at the end. We need to modify ->sh_addr and
-	 * ->sh_offset fields to keep track of permanent and temporary
-	 * locations of sections.
-	 */
-	sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-	if (!sechdrs)
-		return -ENOMEM;
-
-	memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
-	/*
-	 * We seem to have multiple copies of sections. First copy is which
-	 * is embedded in kernel in read only section. Some of these sections
-	 * will be copied to a temporary buffer and relocated. And these
-	 * sections will finally be copied to their final destination at
-	 * segment load time.
-	 *
-	 * Use ->sh_offset to reflect section address in memory. It will
-	 * point to original read only copy if section is not allocatable.
-	 * Otherwise it will point to temporary copy which will be relocated.
-	 *
-	 * Use ->sh_addr to contain final address of the section where it
-	 * will go during execution time.
-	 */
-	for (i = 0; i < pi->ehdr->e_shnum; i++) {
-		if (sechdrs[i].sh_type == SHT_NOBITS)
-			continue;
-
-		sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
-						sechdrs[i].sh_offset;
-	}
-
-	/*
-	 * Identify entry point section and make entry relative to section
-	 * start.
-	 */
-	entry = pi->ehdr->e_entry;
-	for (i = 0; i < pi->ehdr->e_shnum; i++) {
-		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-			continue;
-
-		if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
-			continue;
-
-		/* Make entry section relative */
-		if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
-		    ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
-		     pi->ehdr->e_entry)) {
-			entry_sidx = i;
-			entry -= sechdrs[i].sh_addr;
-			break;
-		}
-	}
-
-	/* Determine how much memory is needed to load relocatable object. */
-	buf_align = 1;
-	bss_align = 1;
-	buf_sz = 0;
-	bss_sz = 0;
-
-	for (i = 0; i < pi->ehdr->e_shnum; i++) {
-		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-			continue;
-
-		align = sechdrs[i].sh_addralign;
-		if (sechdrs[i].sh_type != SHT_NOBITS) {
-			if (buf_align < align)
-				buf_align = align;
-			buf_sz = ALIGN(buf_sz, align);
-			buf_sz += sechdrs[i].sh_size;
-		} else {
-			/* bss section */
-			if (bss_align < align)
-				bss_align = align;
-			bss_sz = ALIGN(bss_sz, align);
-			bss_sz += sechdrs[i].sh_size;
-		}
-	}
-
-	/* Determine the bss padding required to align bss properly */
-	bss_pad = 0;
-	if (buf_sz & (bss_align - 1))
-		bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
-	memsz = buf_sz + bss_pad + bss_sz;
-
-	/* Allocate buffer for purgatory */
-	purgatory_buf = vzalloc(buf_sz);
-	if (!purgatory_buf) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	if (buf_align < bss_align)
-		buf_align = bss_align;
-
-	/* Add buffer to segment list */
-	ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
-				buf_align, min, max, top_down,
-				&pi->purgatory_load_addr);
-	if (ret)
-		goto out;
-
-	/* Load SHF_ALLOC sections */
-	buf_addr = purgatory_buf;
-	load_addr = curr_load_addr = pi->purgatory_load_addr;
-	bss_addr = load_addr + buf_sz + bss_pad;
-
-	for (i = 0; i < pi->ehdr->e_shnum; i++) {
-		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-			continue;
-
-		align = sechdrs[i].sh_addralign;
-		if (sechdrs[i].sh_type != SHT_NOBITS) {
-			curr_load_addr = ALIGN(curr_load_addr, align);
-			offset = curr_load_addr - load_addr;
-			/* We already modifed ->sh_offset to keep src addr */
-			src = (char *) sechdrs[i].sh_offset;
-			memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
-			/* Store load address and source address of section */
-			sechdrs[i].sh_addr = curr_load_addr;
-
-			/*
-			 * This section got copied to temporary buffer. Update
-			 * ->sh_offset accordingly.
-			 */
-			sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
-			/* Advance to the next address */
-			curr_load_addr += sechdrs[i].sh_size;
-		} else {
-			bss_addr = ALIGN(bss_addr, align);
-			sechdrs[i].sh_addr = bss_addr;
-			bss_addr += sechdrs[i].sh_size;
-		}
-	}
-
-	/* Update entry point based on load address of text section */
-	if (entry_sidx >= 0)
-		entry += sechdrs[entry_sidx].sh_addr;
-
-	/* Make kernel jump to purgatory after shutdown */
-	image->start = entry;
-
-	/* Used later to get/set symbol values */
-	pi->sechdrs = sechdrs;
-
-	/*
-	 * Used later to identify which section is purgatory and skip it
-	 * from checksumming.
-	 */
-	pi->purgatory_buf = purgatory_buf;
-	return ret;
-out:
-	vfree(sechdrs);
-	vfree(purgatory_buf);
-	return ret;
-}
-
-static int kexec_apply_relocations(struct kimage *image)
-{
-	int i, ret;
-	struct purgatory_info *pi = &image->purgatory_info;
-	Elf_Shdr *sechdrs = pi->sechdrs;
-
-	/* Apply relocations */
-	for (i = 0; i < pi->ehdr->e_shnum; i++) {
-		Elf_Shdr *section, *symtab;
-
-		if (sechdrs[i].sh_type != SHT_RELA &&
-		    sechdrs[i].sh_type != SHT_REL)
-			continue;
-
-		/*
-		 * For section of type SHT_RELA/SHT_REL,
-		 * ->sh_link contains section header index of associated
-		 * symbol table. And ->sh_info contains section header
-		 * index of section to which relocations apply.
-		 */
-		if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
-		    sechdrs[i].sh_link >= pi->ehdr->e_shnum)
-			return -ENOEXEC;
-
-		section = &sechdrs[sechdrs[i].sh_info];
-		symtab = &sechdrs[sechdrs[i].sh_link];
-
-		if (!(section->sh_flags & SHF_ALLOC))
-			continue;
-
-		/*
-		 * symtab->sh_link contain section header index of associated
-		 * string table.
-		 */
-		if (symtab->sh_link >= pi->ehdr->e_shnum)
-			/* Invalid section number? */
-			continue;
-
-		/*
-		 * Respective architecture needs to provide support for applying
-		 * relocations of type SHT_RELA/SHT_REL.
-		 */
-		if (sechdrs[i].sh_type == SHT_RELA)
-			ret = arch_kexec_apply_relocations_add(pi->ehdr,
-							       sechdrs, i);
-		else if (sechdrs[i].sh_type == SHT_REL)
-			ret = arch_kexec_apply_relocations(pi->ehdr,
-							   sechdrs, i);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
-			 unsigned long max, int top_down,
-			 unsigned long *load_addr)
-{
-	struct purgatory_info *pi = &image->purgatory_info;
-	int ret;
-
-	if (kexec_purgatory_size <= 0)
-		return -EINVAL;
-
-	if (kexec_purgatory_size < sizeof(Elf_Ehdr))
-		return -ENOEXEC;
-
-	pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
-	if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
-	    || pi->ehdr->e_type != ET_REL
-	    || !elf_check_arch(pi->ehdr)
-	    || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
-		return -ENOEXEC;
-
-	if (pi->ehdr->e_shoff >= kexec_purgatory_size
-	    || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
-	    kexec_purgatory_size - pi->ehdr->e_shoff))
-		return -ENOEXEC;
-
-	ret = __kexec_load_purgatory(image, min, max, top_down);
-	if (ret)
-		return ret;
-
-	ret = kexec_apply_relocations(image);
-	if (ret)
-		goto out;
-
-	*load_addr = pi->purgatory_load_addr;
-	return 0;
-out:
-	vfree(pi->sechdrs);
-	vfree(pi->purgatory_buf);
-	return ret;
-}
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
-					    const char *name)
-{
-	Elf_Sym *syms;
-	Elf_Shdr *sechdrs;
-	Elf_Ehdr *ehdr;
-	int i, k;
-	const char *strtab;
-
-	if (!pi->sechdrs || !pi->ehdr)
-		return NULL;
-
-	sechdrs = pi->sechdrs;
-	ehdr = pi->ehdr;
-
-	for (i = 0; i < ehdr->e_shnum; i++) {
-		if (sechdrs[i].sh_type != SHT_SYMTAB)
-			continue;
-
-		if (sechdrs[i].sh_link >= ehdr->e_shnum)
-			/* Invalid strtab section number */
-			continue;
-		strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
-		syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
-		/* Go through symbols for a match */
-		for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
-			if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
-				continue;
-
-			if (strcmp(strtab + syms[k].st_name, name) != 0)
-				continue;
-
-			if (syms[k].st_shndx == SHN_UNDEF ||
-			    syms[k].st_shndx >= ehdr->e_shnum) {
-				pr_debug("Symbol: %s has bad section index %d.\n",
-						name, syms[k].st_shndx);
-				return NULL;
-			}
-
-			/* Found the symbol we are looking for */
-			return &syms[k];
-		}
-	}
-
-	return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
-	struct purgatory_info *pi = &image->purgatory_info;
-	Elf_Sym *sym;
-	Elf_Shdr *sechdr;
-
-	sym = kexec_purgatory_find_symbol(pi, name);
-	if (!sym)
-		return ERR_PTR(-EINVAL);
-
-	sechdr = &pi->sechdrs[sym->st_shndx];
-
-	/*
-	 * Returns the address where symbol will finally be loaded after
-	 * kexec_load_segment()
-	 */
-	return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set value of a symbol. If "get_value" is true, symbol value is
- * returned in buf otherwise symbol value is set based on value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
-				   void *buf, unsigned int size, bool get_value)
-{
-	Elf_Sym *sym;
-	Elf_Shdr *sechdrs;
-	struct purgatory_info *pi = &image->purgatory_info;
-	char *sym_buf;
-
-	sym = kexec_purgatory_find_symbol(pi, name);
-	if (!sym)
-		return -EINVAL;
-
-	if (sym->st_size != size) {
-		pr_err("symbol %s size mismatch: expected %lu actual %u\n",
-		       name, (unsigned long)sym->st_size, size);
-		return -EINVAL;
-	}
-
-	sechdrs = pi->sechdrs;
-
-	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
-		pr_err("symbol %s is in a bss section. Cannot %s\n", name,
-		       get_value ? "get" : "set");
-		return -EINVAL;
-	}
-
-	sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
-					sym->st_value;
-
-	if (get_value)
-		memcpy((void *)buf, sym_buf, size);
-	else
-		memcpy((void *)sym_buf, buf, size);
-
-	return 0;
-}
-#endif /* CONFIG_KEXEC_FILE */
-
 /*
  * Move into place and start executing a preloaded standalone
  * executable.  If nothing was preloaded return an error.
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..6a9a3f2a0e8e
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1045 @@
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ *      Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include "kexec_internal.h"
+
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+	struct fd f = fdget(fd);
+	int ret;
+	struct kstat stat;
+	loff_t pos;
+	ssize_t bytes = 0;
+
+	if (!f.file)
+		return -EBADF;
+
+	ret = vfs_getattr(&f.file->f_path, &stat);
+	if (ret)
+		goto out;
+
+	if (stat.size > INT_MAX) {
+		ret = -EFBIG;
+		goto out;
+	}
+
+	/* Don't hand 0 to vmalloc, it whines. */
+	if (stat.size == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*buf = vmalloc(stat.size);
+	if (!*buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	pos = 0;
+	while (pos < stat.size) {
+		bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+				    stat.size - pos);
+		if (bytes < 0) {
+			vfree(*buf);
+			ret = bytes;
+			goto out;
+		}
+
+		if (bytes == 0)
+			break;
+		pos += bytes;
+	}
+
+	if (pos != stat.size) {
+		ret = -EBADF;
+		vfree(*buf);
+		goto out;
+	}
+
+	*buf_len = pos;
+out:
+	fdput(f);
+	return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+					 unsigned long buf_len)
+{
+	return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+	return ERR_PTR(-ENOEXEC);
+}
+
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+	return -EINVAL;
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+					unsigned long buf_len)
+{
+	return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+				 unsigned int relsec)
+{
+	pr_err("RELA relocation unsupported.\n");
+	return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+			     unsigned int relsec)
+{
+	pr_err("REL relocation unsupported.\n");
+	return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by kernel, initrd, and command line. This is temporary
+ * memory allocation which is not needed any more after these buffers have
+ * been loaded into separate segments and have been copied elsewhere.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+
+	vfree(image->kernel_buf);
+	image->kernel_buf = NULL;
+
+	vfree(image->initrd_buf);
+	image->initrd_buf = NULL;
+
+	kfree(image->cmdline_buf);
+	image->cmdline_buf = NULL;
+
+	vfree(pi->purgatory_buf);
+	pi->purgatory_buf = NULL;
+
+	vfree(pi->sechdrs);
+	pi->sechdrs = NULL;
+
+	/* See if architecture has anything to cleanup post load */
+	arch_kimage_file_post_load_cleanup(image);
+
+	/*
+	 * Above call should have called into bootloader to free up
+	 * any data stored in kimage->image_loader_data. It should
+	 * be ok now to free it up.
+	 */
+	kfree(image->image_loader_data);
+	image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode list of segments is prepared by kernel. Copy relevant
+ * data from user space, do error checking, prepare segment list
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+			     const char __user *cmdline_ptr,
+			     unsigned long cmdline_len, unsigned flags)
+{
+	int ret = 0;
+	void *ldata;
+
+	ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+				&image->kernel_buf_len);
+	if (ret)
+		return ret;
+
+	/* Call arch image probe handlers */
+	ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+					    image->kernel_buf_len);
+
+	if (ret)
+		goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+					   image->kernel_buf_len);
+	if (ret) {
+		pr_debug("kernel signature verification failed.\n");
+		goto out;
+	}
+	pr_debug("kernel signature verification successful.\n");
+#endif
+	/* It is possible that there no initramfs is being loaded */
+	if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+		ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+					&image->initrd_buf_len);
+		if (ret)
+			goto out;
+	}
+
+	if (cmdline_len) {
+		image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+		if (!image->cmdline_buf) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+				     cmdline_len);
+		if (ret) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		image->cmdline_buf_len = cmdline_len;
+
+		/* command line should be a string with last byte null */
+		if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	/* Call arch image load handlers */
+	ldata = arch_kexec_kernel_image_load(image);
+
+	if (IS_ERR(ldata)) {
+		ret = PTR_ERR(ldata);
+		goto out;
+	}
+
+	image->image_loader_data = ldata;
+out:
+	/* In case of error, free up all allocated memory in this function */
+	if (ret)
+		kimage_file_post_load_cleanup(image);
+	return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+		       int initrd_fd, const char __user *cmdline_ptr,
+		       unsigned long cmdline_len, unsigned long flags)
+{
+	int ret;
+	struct kimage *image;
+	bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+	image = do_kimage_alloc_init();
+	if (!image)
+		return -ENOMEM;
+
+	image->file_mode = 1;
+
+	if (kexec_on_panic) {
+		/* Enable special crash kernel control page alloc policy. */
+		image->control_page = crashk_res.start;
+		image->type = KEXEC_TYPE_CRASH;
+	}
+
+	ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+					   cmdline_ptr, cmdline_len, flags);
+	if (ret)
+		goto out_free_image;
+
+	ret = sanity_check_segment_list(image);
+	if (ret)
+		goto out_free_post_load_bufs;
+
+	ret = -ENOMEM;
+	image->control_code_page = kimage_alloc_control_pages(image,
+					   get_order(KEXEC_CONTROL_PAGE_SIZE));
+	if (!image->control_code_page) {
+		pr_err("Could not allocate control_code_buffer\n");
+		goto out_free_post_load_bufs;
+	}
+
+	if (!kexec_on_panic) {
+		image->swap_page = kimage_alloc_control_pages(image, 0);
+		if (!image->swap_page) {
+			pr_err("Could not allocate swap buffer\n");
+			goto out_free_control_pages;
+		}
+	}
+
+	*rimage = image;
+	return 0;
+out_free_control_pages:
+	kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+	kimage_file_post_load_cleanup(image);
+out_free_image:
+	kfree(image);
+	return ret;
+}
+
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+		unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+		unsigned long, flags)
+{
+	int ret = 0, i;
+	struct kimage **dest_image, *image;
+
+	/* We only trust the superuser with rebooting the system. */
+	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+		return -EPERM;
+
+	/* Make sure we have a legal set of flags */
+	if (flags != (flags & KEXEC_FILE_FLAGS))
+		return -EINVAL;
+
+	image = NULL;
+
+	if (!mutex_trylock(&kexec_mutex))
+		return -EBUSY;
+
+	dest_image = &kexec_image;
+	if (flags & KEXEC_FILE_ON_CRASH)
+		dest_image = &kexec_crash_image;
+
+	if (flags & KEXEC_FILE_UNLOAD)
+		goto exchange;
+
+	/*
+	 * In case of crash, new kernel gets loaded in reserved region. It is
+	 * same memory where old crash kernel might be loaded. Free any
+	 * current crash dump kernel before we corrupt it.
+	 */
+	if (flags & KEXEC_FILE_ON_CRASH)
+		kimage_free(xchg(&kexec_crash_image, NULL));
+
+	ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+				     cmdline_len, flags);
+	if (ret)
+		goto out;
+
+	ret = machine_kexec_prepare(image);
+	if (ret)
+		goto out;
+
+	ret = kexec_calculate_store_digests(image);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		struct kexec_segment *ksegment;
+
+		ksegment = &image->segment[i];
+		pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+			 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+			 ksegment->memsz);
+
+		ret = kimage_load_segment(image, &image->segment[i]);
+		if (ret)
+			goto out;
+	}
+
+	kimage_terminate(image);
+
+	/*
+	 * Free up any temporary buffers allocated which are not needed
+	 * after image has been loaded
+	 */
+	kimage_file_post_load_cleanup(image);
+exchange:
+	image = xchg(dest_image, image);
+out:
+	mutex_unlock(&kexec_mutex);
+	kimage_free(image);
+	return ret;
+}
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+				    struct kexec_buf *kbuf)
+{
+	struct kimage *image = kbuf->image;
+	unsigned long temp_start, temp_end;
+
+	temp_end = min(end, kbuf->buf_max);
+	temp_start = temp_end - kbuf->memsz;
+
+	do {
+		/* align down start */
+		temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+		if (temp_start < start || temp_start < kbuf->buf_min)
+			return 0;
+
+		temp_end = temp_start + kbuf->memsz - 1;
+
+		/*
+		 * Make sure this does not conflict with any of existing
+		 * segments
+		 */
+		if (kimage_is_destination_range(image, temp_start, temp_end)) {
+			temp_start = temp_start - PAGE_SIZE;
+			continue;
+		}
+
+		/* We found a suitable memory range */
+		break;
+	} while (1);
+
+	/* If we are here, we found a suitable memory range */
+	kbuf->mem = temp_start;
+
+	/* Success, stop navigating through remaining System RAM ranges */
+	return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+				     struct kexec_buf *kbuf)
+{
+	struct kimage *image = kbuf->image;
+	unsigned long temp_start, temp_end;
+
+	temp_start = max(start, kbuf->buf_min);
+
+	do {
+		temp_start = ALIGN(temp_start, kbuf->buf_align);
+		temp_end = temp_start + kbuf->memsz - 1;
+
+		if (temp_end > end || temp_end > kbuf->buf_max)
+			return 0;
+		/*
+		 * Make sure this does not conflict with any of existing
+		 * segments
+		 */
+		if (kimage_is_destination_range(image, temp_start, temp_end)) {
+			temp_start = temp_start + PAGE_SIZE;
+			continue;
+		}
+
+		/* We found a suitable memory range */
+		break;
+	} while (1);
+
+	/* If we are here, we found a suitable memory range */
+	kbuf->mem = temp_start;
+
+	/* Success, stop navigating through remaining System RAM ranges */
+	return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+	struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+	unsigned long sz = end - start + 1;
+
+	/* Returning 0 will take to next memory range */
+	if (sz < kbuf->memsz)
+		return 0;
+
+	if (end < kbuf->buf_min || start > kbuf->buf_max)
+		return 0;
+
+	/*
+	 * Allocate memory top down with-in ram range. Otherwise bottom up
+	 * allocation.
+	 */
+	if (kbuf->top_down)
+		return locate_mem_hole_top_down(start, end, kbuf);
+	return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+		     unsigned long memsz, unsigned long buf_align,
+		     unsigned long buf_min, unsigned long buf_max,
+		     bool top_down, unsigned long *load_addr)
+{
+
+	struct kexec_segment *ksegment;
+	struct kexec_buf buf, *kbuf;
+	int ret;
+
+	/* Currently adding segment this way is allowed only in file mode */
+	if (!image->file_mode)
+		return -EINVAL;
+
+	if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+		return -EINVAL;
+
+	/*
+	 * Make sure we are not trying to add buffer after allocating
+	 * control pages. All segments need to be placed first before
+	 * any control pages are allocated. As control page allocation
+	 * logic goes through list of segments to make sure there are
+	 * no destination overlaps.
+	 */
+	if (!list_empty(&image->control_pages)) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	memset(&buf, 0, sizeof(struct kexec_buf));
+	kbuf = &buf;
+	kbuf->image = image;
+	kbuf->buffer = buffer;
+	kbuf->bufsz = bufsz;
+
+	kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+	kbuf->buf_align = max(buf_align, PAGE_SIZE);
+	kbuf->buf_min = buf_min;
+	kbuf->buf_max = buf_max;
+	kbuf->top_down = top_down;
+
+	/* Walk the RAM ranges and allocate a suitable range for the buffer */
+	if (image->type == KEXEC_TYPE_CRASH)
+		ret = walk_iomem_res("Crash kernel",
+				     IORESOURCE_MEM | IORESOURCE_BUSY,
+				     crashk_res.start, crashk_res.end, kbuf,
+				     locate_mem_hole_callback);
+	else
+		ret = walk_system_ram_res(0, -1, kbuf,
+					  locate_mem_hole_callback);
+	if (ret != 1) {
+		/* A suitable memory range could not be found for buffer */
+		return -EADDRNOTAVAIL;
+	}
+
+	/* Found a suitable memory range */
+	ksegment = &image->segment[image->nr_segments];
+	ksegment->kbuf = kbuf->buffer;
+	ksegment->bufsz = kbuf->bufsz;
+	ksegment->mem = kbuf->mem;
+	ksegment->memsz = kbuf->memsz;
+	image->nr_segments++;
+	*load_addr = ksegment->mem;
+	return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+	struct crypto_shash *tfm;
+	struct shash_desc *desc;
+	int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+	size_t desc_size, nullsz;
+	char *digest;
+	void *zero_buf;
+	struct kexec_sha_region *sha_regions;
+	struct purgatory_info *pi = &image->purgatory_info;
+
+	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+	zero_buf_sz = PAGE_SIZE;
+
+	tfm = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(tfm)) {
+		ret = PTR_ERR(tfm);
+		goto out;
+	}
+
+	desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+	desc = kzalloc(desc_size, GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto out_free_tfm;
+	}
+
+	sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+	sha_regions = vzalloc(sha_region_sz);
+	if (!sha_regions)
+		goto out_free_desc;
+
+	desc->tfm   = tfm;
+	desc->flags = 0;
+
+	ret = crypto_shash_init(desc);
+	if (ret < 0)
+		goto out_free_sha_regions;
+
+	digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+	if (!digest) {
+		ret = -ENOMEM;
+		goto out_free_sha_regions;
+	}
+
+	for (j = i = 0; i < image->nr_segments; i++) {
+		struct kexec_segment *ksegment;
+
+		ksegment = &image->segment[i];
+		/*
+		 * Skip purgatory as it will be modified once we put digest
+		 * info in purgatory.
+		 */
+		if (ksegment->kbuf == pi->purgatory_buf)
+			continue;
+
+		ret = crypto_shash_update(desc, ksegment->kbuf,
+					  ksegment->bufsz);
+		if (ret)
+			break;
+
+		/*
+		 * Assume rest of the buffer is filled with zero and
+		 * update digest accordingly.
+		 */
+		nullsz = ksegment->memsz - ksegment->bufsz;
+		while (nullsz) {
+			unsigned long bytes = nullsz;
+
+			if (bytes > zero_buf_sz)
+				bytes = zero_buf_sz;
+			ret = crypto_shash_update(desc, zero_buf, bytes);
+			if (ret)
+				break;
+			nullsz -= bytes;
+		}
+
+		if (ret)
+			break;
+
+		sha_regions[j].start = ksegment->mem;
+		sha_regions[j].len = ksegment->memsz;
+		j++;
+	}
+
+	if (!ret) {
+		ret = crypto_shash_final(desc, digest);
+		if (ret)
+			goto out_free_digest;
+		ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+						sha_regions, sha_region_sz, 0);
+		if (ret)
+			goto out_free_digest;
+
+		ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+						digest, SHA256_DIGEST_SIZE, 0);
+		if (ret)
+			goto out_free_digest;
+	}
+
+out_free_digest:
+	kfree(digest);
+out_free_sha_regions:
+	vfree(sha_regions);
+out_free_desc:
+	kfree(desc);
+out_free_tfm:
+	kfree(tfm);
+out:
+	return ret;
+}
+
+/* Actually load purgatory. Lot of code taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+				  unsigned long max, int top_down)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+	unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+	unsigned char *buf_addr, *src;
+	int i, ret = 0, entry_sidx = -1;
+	const Elf_Shdr *sechdrs_c;
+	Elf_Shdr *sechdrs = NULL;
+	void *purgatory_buf = NULL;
+
+	/*
+	 * sechdrs_c points to section headers in purgatory and are read
+	 * only. No modifications allowed.
+	 */
+	sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+	/*
+	 * We can not modify sechdrs_c[] and its fields. It is read only.
+	 * Copy it over to a local copy where one can store some temporary
+	 * data and free it at the end. We need to modify ->sh_addr and
+	 * ->sh_offset fields to keep track of permanent and temporary
+	 * locations of sections.
+	 */
+	sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+	if (!sechdrs)
+		return -ENOMEM;
+
+	memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+	/*
+	 * We seem to have multiple copies of sections. First copy is which
+	 * is embedded in kernel in read only section. Some of these sections
+	 * will be copied to a temporary buffer and relocated. And these
+	 * sections will finally be copied to their final destination at
+	 * segment load time.
+	 *
+	 * Use ->sh_offset to reflect section address in memory. It will
+	 * point to original read only copy if section is not allocatable.
+	 * Otherwise it will point to temporary copy which will be relocated.
+	 *
+	 * Use ->sh_addr to contain final address of the section where it
+	 * will go during execution time.
+	 */
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		if (sechdrs[i].sh_type == SHT_NOBITS)
+			continue;
+
+		sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+						sechdrs[i].sh_offset;
+	}
+
+	/*
+	 * Identify entry point section and make entry relative to section
+	 * start.
+	 */
+	entry = pi->ehdr->e_entry;
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+			continue;
+
+		/* Make entry section relative */
+		if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+		    ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+		     pi->ehdr->e_entry)) {
+			entry_sidx = i;
+			entry -= sechdrs[i].sh_addr;
+			break;
+		}
+	}
+
+	/* Determine how much memory is needed to load relocatable object. */
+	buf_align = 1;
+	bss_align = 1;
+	buf_sz = 0;
+	bss_sz = 0;
+
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		align = sechdrs[i].sh_addralign;
+		if (sechdrs[i].sh_type != SHT_NOBITS) {
+			if (buf_align < align)
+				buf_align = align;
+			buf_sz = ALIGN(buf_sz, align);
+			buf_sz += sechdrs[i].sh_size;
+		} else {
+			/* bss section */
+			if (bss_align < align)
+				bss_align = align;
+			bss_sz = ALIGN(bss_sz, align);
+			bss_sz += sechdrs[i].sh_size;
+		}
+	}
+
+	/* Determine the bss padding required to align bss properly */
+	bss_pad = 0;
+	if (buf_sz & (bss_align - 1))
+		bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+	memsz = buf_sz + bss_pad + bss_sz;
+
+	/* Allocate buffer for purgatory */
+	purgatory_buf = vzalloc(buf_sz);
+	if (!purgatory_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (buf_align < bss_align)
+		buf_align = bss_align;
+
+	/* Add buffer to segment list */
+	ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+				buf_align, min, max, top_down,
+				&pi->purgatory_load_addr);
+	if (ret)
+		goto out;
+
+	/* Load SHF_ALLOC sections */
+	buf_addr = purgatory_buf;
+	load_addr = curr_load_addr = pi->purgatory_load_addr;
+	bss_addr = load_addr + buf_sz + bss_pad;
+
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		align = sechdrs[i].sh_addralign;
+		if (sechdrs[i].sh_type != SHT_NOBITS) {
+			curr_load_addr = ALIGN(curr_load_addr, align);
+			offset = curr_load_addr - load_addr;
+			/* We already modifed ->sh_offset to keep src addr */
+			src = (char *) sechdrs[i].sh_offset;
+			memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+			/* Store load address and source address of section */
+			sechdrs[i].sh_addr = curr_load_addr;
+
+			/*
+			 * This section got copied to temporary buffer. Update
+			 * ->sh_offset accordingly.
+			 */
+			sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+			/* Advance to the next address */
+			curr_load_addr += sechdrs[i].sh_size;
+		} else {
+			bss_addr = ALIGN(bss_addr, align);
+			sechdrs[i].sh_addr = bss_addr;
+			bss_addr += sechdrs[i].sh_size;
+		}
+	}
+
+	/* Update entry point based on load address of text section */
+	if (entry_sidx >= 0)
+		entry += sechdrs[entry_sidx].sh_addr;
+
+	/* Make kernel jump to purgatory after shutdown */
+	image->start = entry;
+
+	/* Used later to get/set symbol values */
+	pi->sechdrs = sechdrs;
+
+	/*
+	 * Used later to identify which section is purgatory and skip it
+	 * from checksumming.
+	 */
+	pi->purgatory_buf = purgatory_buf;
+	return ret;
+out:
+	vfree(sechdrs);
+	vfree(purgatory_buf);
+	return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+	int i, ret;
+	struct purgatory_info *pi = &image->purgatory_info;
+	Elf_Shdr *sechdrs = pi->sechdrs;
+
+	/* Apply relocations */
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		Elf_Shdr *section, *symtab;
+
+		if (sechdrs[i].sh_type != SHT_RELA &&
+		    sechdrs[i].sh_type != SHT_REL)
+			continue;
+
+		/*
+		 * For section of type SHT_RELA/SHT_REL,
+		 * ->sh_link contains section header index of associated
+		 * symbol table. And ->sh_info contains section header
+		 * index of section to which relocations apply.
+		 */
+		if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+		    sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+			return -ENOEXEC;
+
+		section = &sechdrs[sechdrs[i].sh_info];
+		symtab = &sechdrs[sechdrs[i].sh_link];
+
+		if (!(section->sh_flags & SHF_ALLOC))
+			continue;
+
+		/*
+		 * symtab->sh_link contain section header index of associated
+		 * string table.
+		 */
+		if (symtab->sh_link >= pi->ehdr->e_shnum)
+			/* Invalid section number? */
+			continue;
+
+		/*
+		 * Respective architecture needs to provide support for applying
+		 * relocations of type SHT_RELA/SHT_REL.
+		 */
+		if (sechdrs[i].sh_type == SHT_RELA)
+			ret = arch_kexec_apply_relocations_add(pi->ehdr,
+							       sechdrs, i);
+		else if (sechdrs[i].sh_type == SHT_REL)
+			ret = arch_kexec_apply_relocations(pi->ehdr,
+							   sechdrs, i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+			 unsigned long max, int top_down,
+			 unsigned long *load_addr)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	int ret;
+
+	if (kexec_purgatory_size <= 0)
+		return -EINVAL;
+
+	if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+		return -ENOEXEC;
+
+	pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+	if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+	    || pi->ehdr->e_type != ET_REL
+	    || !elf_check_arch(pi->ehdr)
+	    || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+		return -ENOEXEC;
+
+	if (pi->ehdr->e_shoff >= kexec_purgatory_size
+	    || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+	    kexec_purgatory_size - pi->ehdr->e_shoff))
+		return -ENOEXEC;
+
+	ret = __kexec_load_purgatory(image, min, max, top_down);
+	if (ret)
+		return ret;
+
+	ret = kexec_apply_relocations(image);
+	if (ret)
+		goto out;
+
+	*load_addr = pi->purgatory_load_addr;
+	return 0;
+out:
+	vfree(pi->sechdrs);
+	vfree(pi->purgatory_buf);
+	return ret;
+}
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+					    const char *name)
+{
+	Elf_Sym *syms;
+	Elf_Shdr *sechdrs;
+	Elf_Ehdr *ehdr;
+	int i, k;
+	const char *strtab;
+
+	if (!pi->sechdrs || !pi->ehdr)
+		return NULL;
+
+	sechdrs = pi->sechdrs;
+	ehdr = pi->ehdr;
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		if (sechdrs[i].sh_type != SHT_SYMTAB)
+			continue;
+
+		if (sechdrs[i].sh_link >= ehdr->e_shnum)
+			/* Invalid strtab section number */
+			continue;
+		strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+		syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+		/* Go through symbols for a match */
+		for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+			if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+				continue;
+
+			if (strcmp(strtab + syms[k].st_name, name) != 0)
+				continue;
+
+			if (syms[k].st_shndx == SHN_UNDEF ||
+			    syms[k].st_shndx >= ehdr->e_shnum) {
+				pr_debug("Symbol: %s has bad section index %d.\n",
+						name, syms[k].st_shndx);
+				return NULL;
+			}
+
+			/* Found the symbol we are looking for */
+			return &syms[k];
+		}
+	}
+
+	return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	Elf_Sym *sym;
+	Elf_Shdr *sechdr;
+
+	sym = kexec_purgatory_find_symbol(pi, name);
+	if (!sym)
+		return ERR_PTR(-EINVAL);
+
+	sechdr = &pi->sechdrs[sym->st_shndx];
+
+	/*
+	 * Returns the address where symbol will finally be loaded after
+	 * kexec_load_segment()
+	 */
+	return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+				   void *buf, unsigned int size, bool get_value)
+{
+	Elf_Sym *sym;
+	Elf_Shdr *sechdrs;
+	struct purgatory_info *pi = &image->purgatory_info;
+	char *sym_buf;
+
+	sym = kexec_purgatory_find_symbol(pi, name);
+	if (!sym)
+		return -EINVAL;
+
+	if (sym->st_size != size) {
+		pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+		       name, (unsigned long)sym->st_size, size);
+		return -EINVAL;
+	}
+
+	sechdrs = pi->sechdrs;
+
+	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+		pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+		       get_value ? "get" : "set");
+		return -EINVAL;
+	}
+
+	sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+					sym->st_value;
+
+	if (get_value)
+		memcpy((void *)buf, sym_buf, size);
+	else
+		memcpy((void *)sym_buf, buf, size);
+
+	return 0;
+}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..e4392a698ad4
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,22 @@
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+#include <linux/kexec.h>
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+void kimage_terminate(struct kimage *image);
+int kimage_is_destination_range(struct kimage *image,
+				unsigned long start, unsigned long end);
+
+extern struct mutex kexec_mutex;
+
+#ifdef CONFIG_KEXEC_FILE
+void kimage_file_post_load_cleanup(struct kimage *image);
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+#endif /* LINUX_KEXEC_INTERNAL_H */
-- 
cgit v1.2.3-70-g09d2


From 2965faa5e03d1e71e9ff9aa143fff39e0a77543a Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Wed, 9 Sep 2015 15:38:55 -0700
Subject: kexec: split kexec_load syscall from kexec core code

There are two kexec load syscalls, kexec_load another and kexec_file_load.
 kexec_file_load has been splited as kernel/kexec_file.c.  In this patch I
split kexec_load syscall code to kernel/kexec.c.

And add a new kconfig option KEXEC_CORE, so we can disable kexec_load and
use kexec_file_load only, or vice verse.

The original requirement is from Ted Ts'o, he want kexec kernel signature
being checked with CONFIG_KEXEC_VERIFY_SIG enabled.  But kexec-tools use
kexec_load syscall can bypass the checking.

Vivek Goyal proposed to create a common kconfig option so user can compile
in only one syscall for loading kexec kernel.  KEXEC/KEXEC_FILE selects
KEXEC_CORE so that old config files still work.

Because there's general code need CONFIG_KEXEC_CORE, so I updated all the
architecture Kconfig with a new option KEXEC_CORE, and let KEXEC selects
KEXEC_CORE in arch Kconfig.  Also updated general kernel code with to
kexec_load syscall.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Petr Tesarik <ptesarik@suse.cz>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Josh Boyer <jwboyer@fedoraproject.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                  |    3 +
 arch/arm/Kconfig              |    1 +
 arch/ia64/Kconfig             |    1 +
 arch/m68k/Kconfig             |    1 +
 arch/mips/Kconfig             |    1 +
 arch/powerpc/Kconfig          |    1 +
 arch/s390/Kconfig             |    1 +
 arch/sh/Kconfig               |    1 +
 arch/tile/Kconfig             |    1 +
 arch/x86/Kconfig              |    3 +-
 arch/x86/boot/header.S        |    2 +-
 arch/x86/include/asm/kdebug.h |    2 +-
 arch/x86/kernel/Makefile      |    4 +-
 arch/x86/kernel/kvmclock.c    |    4 +-
 arch/x86/kernel/reboot.c      |    4 +-
 arch/x86/kernel/setup.c       |    2 +-
 arch/x86/kernel/vmlinux.lds.S |    2 +-
 arch/x86/kvm/vmx.c            |    8 +-
 arch/x86/platform/efi/efi.c   |    4 +-
 arch/x86/platform/uv/uv_nmi.c |    6 +-
 drivers/firmware/efi/Kconfig  |    2 +-
 drivers/pci/pci-driver.c      |    2 +-
 include/linux/kexec.h         |    6 +-
 init/initramfs.c              |    4 +-
 kernel/Makefile               |    1 +
 kernel/events/core.c          |    2 +-
 kernel/kexec.c                | 1495 +---------------------------------------
 kernel/kexec_core.c           | 1511 +++++++++++++++++++++++++++++++++++++++++
 kernel/ksysfs.c               |    6 +-
 kernel/printk/printk.c        |    2 +-
 kernel/reboot.c               |    2 +-
 kernel/sysctl.c               |    2 +-
 32 files changed, 1560 insertions(+), 1527 deletions(-)
 create mode 100644 kernel/kexec_core.c

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 8f3564930580..4e949e58b192 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -2,6 +2,9 @@
 # General architecture dependent options
 #
 
+config KEXEC_CORE
+	bool
+
 config OPROFILE
 	tristate "OProfile system profiling"
 	depends on PROFILING
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0d1b717e1eca..72ad724c67ae 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2020,6 +2020,7 @@ config KEXEC
 	bool "Kexec system call (EXPERIMENTAL)"
 	depends on (!SMP || PM_SLEEP_SMP)
 	depends on !CPU_V7M
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 42a91a7aa2b0..eb0249e37981 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -518,6 +518,7 @@ source "drivers/sn/Kconfig"
 config KEXEC
 	bool "kexec system call"
 	depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 2dd8f63bfbbb..498b567f007b 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -95,6 +95,7 @@ config MMU_SUN3
 config KEXEC
 	bool "kexec system call"
 	depends on M68KCLASSIC
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 752acca8de1f..e3aa5b0b4ef1 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2597,6 +2597,7 @@ source "kernel/Kconfig.preempt"
 
 config KEXEC
 	bool "Kexec system call"
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b447918b9e2c..9a7057ec2154 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -420,6 +420,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
 config KEXEC
 	bool "kexec system call"
 	depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 4827870f7a6d..1d57000b1b24 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -48,6 +48,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 
 config KEXEC
 	def_bool y
+	select KEXEC_CORE
 
 config AUDIT_ARCH
 	def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..d514df7e04dd 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -602,6 +602,7 @@ source kernel/Kconfig.hz
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
 	depends on SUPERH32 && MMU
+	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 2ba12d761723..106c21bd7f44 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -205,6 +205,7 @@ source "kernel/Kconfig.hz"
 
 config KEXEC
 	bool "kexec system call"
+	select KEXEC_CORE
 	---help---
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc0d73eac047..7aef2d52daa0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1754,6 +1754,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call"
+	select KEXEC_CORE
 	---help---
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
@@ -1770,8 +1771,8 @@ config KEXEC
 
 config KEXEC_FILE
 	bool "kexec file based system call"
+	select KEXEC_CORE
 	select BUILD_BIN2C
-	depends on KEXEC
 	depends on X86_64
 	depends on CRYPTO=y
 	depends on CRYPTO_SHA256=y
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 16ef02596db2..2d6b309c8e9a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -414,7 +414,7 @@ xloadflags:
 # define XLF23 0
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
 # define XLF4 XLF_EFI_KEXEC
 #else
 # define XLF4 0
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 32ce71375b21..b130d59406fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
 extern void __show_regs(struct pt_regs *regs, int all);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 extern int in_crash_kexec;
 #else
 /* no crash dump is ever in progress if no crash kernel can be kexec'd */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9ffdf25e5b86..b1b78ffe01d0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH)		+= livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE)	+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)	+= kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-y				+= kprobes/
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 49487b488061..2c7aafa70702 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
  * kind of shutdown from our side, we unregister the clock by writting anything
  * that does not have the 'enable' bit set in the msr
  */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
 	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
 	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
 	machine_ops.shutdown  = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
 	kvm_get_preset_lpj();
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 86db4bcd7ce5..02693dd9a079 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
 	.emergency_restart = native_machine_emergency_restart,
 	.restart = native_machine_restart,
 	.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	.crash_shutdown = native_machine_crash_shutdown,
 #endif
 };
@@ -703,7 +703,7 @@ void machine_halt(void)
 	machine_ops.halt();
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 void machine_crash_shutdown(struct pt_regs *regs)
 {
 	machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index baadbf90a7c5..fdb7f2a2d328 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -478,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
  * --------- Crashkernel reservation ------------------------------
  */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..74e4bf11f562 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
 
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <asm/kexec.h>
 
 . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 148ea2016022..d01986832afc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This bitmap is used to indicate whether the vmclear
  * operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
 #else
 static inline void crash_enable_local_vmclear(int cpu) { }
 static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
 {
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
 	if (r)
 		return r;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
 			   crash_vmclear_local_loaded_vmcss);
 #endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
 
 static void __exit vmx_exit(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
 	synchronize_rcu();
 #endif
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e4308fe6afe8..1db84c0758b7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
 
 static void __init save_runtime_map(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	efi_memory_desc_t *md;
 	void *tmp, *p, *q = NULL;
 	int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
 
 static void __init kexec_enter_virtual_mode(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	efi_memory_desc_t *md;
 	void *p;
 
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 020c101c255f..5c9f63fa6abf 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
 	touch_nmi_watchdog();
 }
 
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
 static atomic_t uv_nmi_kexec_failed;
 static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 	uv_nmi_sync_exit(0);
 }
 
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
 static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
 	if (master)
 		pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
 }
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_KGDB
 #ifdef CONFIG_KGDB_KDB
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 54071c148340..84533e02fbf8 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -43,7 +43,7 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE
 
 config EFI_RUNTIME_MAP
 	bool "Export efi runtime maps to sysfs"
-	depends on X86 && EFI && KEXEC
+	depends on X86 && EFI && KEXEC_CORE
 	default y
 	help
 	  Export efi runtime memory maps to /sys/firmware/efi/runtime-map.
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 52a880ca1768..dd652f2ae03d 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -467,7 +467,7 @@ static void pci_device_shutdown(struct device *dev)
 	pci_msi_shutdown(pci_dev);
 	pci_msix_shutdown(pci_dev);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	/*
 	 * If this is a kexec reboot, turn off Bus Master bit on the
 	 * device to tell it to not continue to do DMA. Don't touch
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ab150ade0d18..d140b1e9faa7 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -16,7 +16,7 @@
 
 #include <uapi/linux/kexec.h>
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <linux/list.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
@@ -329,13 +329,13 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
 int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 					unsigned int relsec);
 
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #define kexec_in_progress false
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 #endif /* !defined(__ASSEBMLY__) */
 
diff --git a/init/initramfs.c b/init/initramfs.c
index ad1bd7787bbb..b32ad7d97ac9 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -526,14 +526,14 @@ extern unsigned long __initramfs_size;
 
 static void __init free_initrd(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
 	unsigned long crashk_end   = (unsigned long)__va(crashk_res.end);
 #endif
 	if (do_retain_initrd)
 		goto skip;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	/*
 	 * If the initrd region is overlapped with crashkernel reserved region,
 	 * free only memory that is not part of crashkernel region.
diff --git a/kernel/Makefile b/kernel/Makefile
index 1b4890af5a65..d4988410b410 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e8183895691c..f548f69c4299 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9094,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
 	mutex_unlock(&swhash->hlist_mutex);
 }
 
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
 	struct remove_event re = { .detach_group = true };
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2d73ecfa5505..4c5edc357923 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,148 +1,23 @@
 /*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
  *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
 
-#define pr_fmt(fmt)	"kexec: " fmt
-
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/kexec.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-#include <linux/highmem.h>
 #include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
 #include <linux/vmalloc.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
+#include <linux/slab.h>
 
-#include <crypto/hash.h>
-#include <crypto/sha.h>
 #include "kexec_internal.h"
 
-DEFINE_MUTEX(kexec_mutex);
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
-	/*
-	 * If crash_kexec_post_notifiers is enabled, don't run
-	 * crash_kexec() here yet, which must be run after panic
-	 * notifiers in panic().
-	 */
-	if (crash_kexec_post_notifiers)
-		return 0;
-	/*
-	 * There are 4 panic() calls in do_exit() path, each of which
-	 * corresponds to each of these 4 conditions.
-	 */
-	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
-		return 1;
-	return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses.  On processors
- * where you can disable the MMU this is trivial, and easy.  For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place.  This means I can only support memory whose
- * physical address can fit in an unsigned long.  In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages.  As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it).  The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- *  - allocating a page table with the control code buffer identity
- *    mapped, to simplify machine_kexec and make kexec_on_panic more
- *    reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
-
-static struct page *kimage_alloc_page(struct kimage *image,
-				       gfp_t gfp_mask,
-				       unsigned long dest);
-
 static int copy_user_segment_list(struct kimage *image,
 				  unsigned long nr_segments,
 				  struct kexec_segment __user *segments)
@@ -160,123 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
 	return ret;
 }
 
-int sanity_check_segment_list(struct kimage *image)
-{
-	int result, i;
-	unsigned long nr_segments = image->nr_segments;
-
-	/*
-	 * Verify we have good destination addresses.  The caller is
-	 * responsible for making certain we don't attempt to load
-	 * the new image into invalid or reserved areas of RAM.  This
-	 * just verifies it is an address we can use.
-	 *
-	 * Since the kernel does everything in page size chunks ensure
-	 * the destination addresses are page aligned.  Too many
-	 * special cases crop of when we don't do this.  The most
-	 * insidious is getting overlapping destination addresses
-	 * simply because addresses are changed to page size
-	 * granularity.
-	 */
-	result = -EADDRNOTAVAIL;
-	for (i = 0; i < nr_segments; i++) {
-		unsigned long mstart, mend;
-
-		mstart = image->segment[i].mem;
-		mend   = mstart + image->segment[i].memsz;
-		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
-			return result;
-		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-			return result;
-	}
-
-	/* Verify our destination addresses do not overlap.
-	 * If we alloed overlapping destination addresses
-	 * through very weird things can happen with no
-	 * easy explanation as one segment stops on another.
-	 */
-	result = -EINVAL;
-	for (i = 0; i < nr_segments; i++) {
-		unsigned long mstart, mend;
-		unsigned long j;
-
-		mstart = image->segment[i].mem;
-		mend   = mstart + image->segment[i].memsz;
-		for (j = 0; j < i; j++) {
-			unsigned long pstart, pend;
-			pstart = image->segment[j].mem;
-			pend   = pstart + image->segment[j].memsz;
-			/* Do the segments overlap ? */
-			if ((mend > pstart) && (mstart < pend))
-				return result;
-		}
-	}
-
-	/* Ensure our buffer sizes are strictly less than
-	 * our memory sizes.  This should always be the case,
-	 * and it is easier to check up front than to be surprised
-	 * later on.
-	 */
-	result = -EINVAL;
-	for (i = 0; i < nr_segments; i++) {
-		if (image->segment[i].bufsz > image->segment[i].memsz)
-			return result;
-	}
-
-	/*
-	 * Verify we have good destination addresses.  Normally
-	 * the caller is responsible for making certain we don't
-	 * attempt to load the new image into invalid or reserved
-	 * areas of RAM.  But crash kernels are preloaded into a
-	 * reserved area of ram.  We must ensure the addresses
-	 * are in the reserved area otherwise preloading the
-	 * kernel could corrupt things.
-	 */
-
-	if (image->type == KEXEC_TYPE_CRASH) {
-		result = -EADDRNOTAVAIL;
-		for (i = 0; i < nr_segments; i++) {
-			unsigned long mstart, mend;
-
-			mstart = image->segment[i].mem;
-			mend = mstart + image->segment[i].memsz - 1;
-			/* Ensure we are within the crash kernel limits */
-			if ((mstart < crashk_res.start) ||
-			    (mend > crashk_res.end))
-				return result;
-		}
-	}
-
-	return 0;
-}
-
-struct kimage *do_kimage_alloc_init(void)
-{
-	struct kimage *image;
-
-	/* Allocate a controlling structure */
-	image = kzalloc(sizeof(*image), GFP_KERNEL);
-	if (!image)
-		return NULL;
-
-	image->head = 0;
-	image->entry = &image->head;
-	image->last_entry = &image->head;
-	image->control_page = ~0; /* By default this does not apply */
-	image->type = KEXEC_TYPE_DEFAULT;
-
-	/* Initialize the list of control pages */
-	INIT_LIST_HEAD(&image->control_pages);
-
-	/* Initialize the list of destination pages */
-	INIT_LIST_HEAD(&image->dest_pages);
-
-	/* Initialize the list of unusable pages */
-	INIT_LIST_HEAD(&image->unusable_pages);
-
-	return image;
-}
-
 static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 			     unsigned long nr_segments,
 			     struct kexec_segment __user *segments,
@@ -343,597 +101,6 @@ out_free_image:
 	return ret;
 }
 
-int kimage_is_destination_range(struct kimage *image,
-					unsigned long start,
-					unsigned long end)
-{
-	unsigned long i;
-
-	for (i = 0; i < image->nr_segments; i++) {
-		unsigned long mstart, mend;
-
-		mstart = image->segment[i].mem;
-		mend = mstart + image->segment[i].memsz;
-		if ((end > mstart) && (start < mend))
-			return 1;
-	}
-
-	return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
-	struct page *pages;
-
-	pages = alloc_pages(gfp_mask, order);
-	if (pages) {
-		unsigned int count, i;
-		pages->mapping = NULL;
-		set_page_private(pages, order);
-		count = 1 << order;
-		for (i = 0; i < count; i++)
-			SetPageReserved(pages + i);
-	}
-
-	return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
-	unsigned int order, count, i;
-
-	order = page_private(page);
-	count = 1 << order;
-	for (i = 0; i < count; i++)
-		ClearPageReserved(page + i);
-	__free_pages(page, order);
-}
-
-void kimage_free_page_list(struct list_head *list)
-{
-	struct list_head *pos, *next;
-
-	list_for_each_safe(pos, next, list) {
-		struct page *page;
-
-		page = list_entry(pos, struct page, lru);
-		list_del(&page->lru);
-		kimage_free_pages(page);
-	}
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
-							unsigned int order)
-{
-	/* Control pages are special, they are the intermediaries
-	 * that are needed while we copy the rest of the pages
-	 * to their final resting place.  As such they must
-	 * not conflict with either the destination addresses
-	 * or memory the kernel is already using.
-	 *
-	 * The only case where we really need more than one of
-	 * these are for architectures where we cannot disable
-	 * the MMU and must instead generate an identity mapped
-	 * page table for all of the memory.
-	 *
-	 * At worst this runs in O(N) of the image size.
-	 */
-	struct list_head extra_pages;
-	struct page *pages;
-	unsigned int count;
-
-	count = 1 << order;
-	INIT_LIST_HEAD(&extra_pages);
-
-	/* Loop while I can allocate a page and the page allocated
-	 * is a destination page.
-	 */
-	do {
-		unsigned long pfn, epfn, addr, eaddr;
-
-		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
-		if (!pages)
-			break;
-		pfn   = page_to_pfn(pages);
-		epfn  = pfn + count;
-		addr  = pfn << PAGE_SHIFT;
-		eaddr = epfn << PAGE_SHIFT;
-		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
-			      kimage_is_destination_range(image, addr, eaddr)) {
-			list_add(&pages->lru, &extra_pages);
-			pages = NULL;
-		}
-	} while (!pages);
-
-	if (pages) {
-		/* Remember the allocated page... */
-		list_add(&pages->lru, &image->control_pages);
-
-		/* Because the page is already in it's destination
-		 * location we will never allocate another page at
-		 * that address.  Therefore kimage_alloc_pages
-		 * will not return it (again) and we don't need
-		 * to give it an entry in image->segment[].
-		 */
-	}
-	/* Deal with the destination pages I have inadvertently allocated.
-	 *
-	 * Ideally I would convert multi-page allocations into single
-	 * page allocations, and add everything to image->dest_pages.
-	 *
-	 * For now it is simpler to just free the pages.
-	 */
-	kimage_free_page_list(&extra_pages);
-
-	return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
-						      unsigned int order)
-{
-	/* Control pages are special, they are the intermediaries
-	 * that are needed while we copy the rest of the pages
-	 * to their final resting place.  As such they must
-	 * not conflict with either the destination addresses
-	 * or memory the kernel is already using.
-	 *
-	 * Control pages are also the only pags we must allocate
-	 * when loading a crash kernel.  All of the other pages
-	 * are specified by the segments and we just memcpy
-	 * into them directly.
-	 *
-	 * The only case where we really need more than one of
-	 * these are for architectures where we cannot disable
-	 * the MMU and must instead generate an identity mapped
-	 * page table for all of the memory.
-	 *
-	 * Given the low demand this implements a very simple
-	 * allocator that finds the first hole of the appropriate
-	 * size in the reserved memory region, and allocates all
-	 * of the memory up to and including the hole.
-	 */
-	unsigned long hole_start, hole_end, size;
-	struct page *pages;
-
-	pages = NULL;
-	size = (1 << order) << PAGE_SHIFT;
-	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
-	hole_end   = hole_start + size - 1;
-	while (hole_end <= crashk_res.end) {
-		unsigned long i;
-
-		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
-			break;
-		/* See if I overlap any of the segments */
-		for (i = 0; i < image->nr_segments; i++) {
-			unsigned long mstart, mend;
-
-			mstart = image->segment[i].mem;
-			mend   = mstart + image->segment[i].memsz - 1;
-			if ((hole_end >= mstart) && (hole_start <= mend)) {
-				/* Advance the hole to the end of the segment */
-				hole_start = (mend + (size - 1)) & ~(size - 1);
-				hole_end   = hole_start + size - 1;
-				break;
-			}
-		}
-		/* If I don't overlap any segments I have found my hole! */
-		if (i == image->nr_segments) {
-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
-			break;
-		}
-	}
-	if (pages)
-		image->control_page = hole_end;
-
-	return pages;
-}
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
-					 unsigned int order)
-{
-	struct page *pages = NULL;
-
-	switch (image->type) {
-	case KEXEC_TYPE_DEFAULT:
-		pages = kimage_alloc_normal_control_pages(image, order);
-		break;
-	case KEXEC_TYPE_CRASH:
-		pages = kimage_alloc_crash_control_pages(image, order);
-		break;
-	}
-
-	return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
-	if (*image->entry != 0)
-		image->entry++;
-
-	if (image->entry == image->last_entry) {
-		kimage_entry_t *ind_page;
-		struct page *page;
-
-		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
-		if (!page)
-			return -ENOMEM;
-
-		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
-		image->entry = ind_page;
-		image->last_entry = ind_page +
-				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
-	}
-	*image->entry = entry;
-	image->entry++;
-	*image->entry = 0;
-
-	return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
-				   unsigned long destination)
-{
-	int result;
-
-	destination &= PAGE_MASK;
-	result = kimage_add_entry(image, destination | IND_DESTINATION);
-
-	return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
-	int result;
-
-	page &= PAGE_MASK;
-	result = kimage_add_entry(image, page | IND_SOURCE);
-
-	return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
-	/* Walk through and free any extra destination pages I may have */
-	kimage_free_page_list(&image->dest_pages);
-
-	/* Walk through and free any unusable pages I have cached */
-	kimage_free_page_list(&image->unusable_pages);
-
-}
-void kimage_terminate(struct kimage *image)
-{
-	if (*image->entry != 0)
-		image->entry++;
-
-	*image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
-	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
-		ptr = (entry & IND_INDIRECTION) ? \
-			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
-	struct page *page;
-
-	page = pfn_to_page(entry >> PAGE_SHIFT);
-	kimage_free_pages(page);
-}
-
-void kimage_free(struct kimage *image)
-{
-	kimage_entry_t *ptr, entry;
-	kimage_entry_t ind = 0;
-
-	if (!image)
-		return;
-
-	kimage_free_extra_pages(image);
-	for_each_kimage_entry(image, ptr, entry) {
-		if (entry & IND_INDIRECTION) {
-			/* Free the previous indirection page */
-			if (ind & IND_INDIRECTION)
-				kimage_free_entry(ind);
-			/* Save this indirection page until we are
-			 * done with it.
-			 */
-			ind = entry;
-		} else if (entry & IND_SOURCE)
-			kimage_free_entry(entry);
-	}
-	/* Free the final indirection page */
-	if (ind & IND_INDIRECTION)
-		kimage_free_entry(ind);
-
-	/* Handle any machine specific cleanup */
-	machine_kexec_cleanup(image);
-
-	/* Free the kexec control pages... */
-	kimage_free_page_list(&image->control_pages);
-
-	/*
-	 * Free up any temporary buffers allocated. This might hit if
-	 * error occurred much later after buffer allocation.
-	 */
-	if (image->file_mode)
-		kimage_file_post_load_cleanup(image);
-
-	kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
-					unsigned long page)
-{
-	kimage_entry_t *ptr, entry;
-	unsigned long destination = 0;
-
-	for_each_kimage_entry(image, ptr, entry) {
-		if (entry & IND_DESTINATION)
-			destination = entry & PAGE_MASK;
-		else if (entry & IND_SOURCE) {
-			if (page == destination)
-				return ptr;
-			destination += PAGE_SIZE;
-		}
-	}
-
-	return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
-					gfp_t gfp_mask,
-					unsigned long destination)
-{
-	/*
-	 * Here we implement safeguards to ensure that a source page
-	 * is not copied to its destination page before the data on
-	 * the destination page is no longer useful.
-	 *
-	 * To do this we maintain the invariant that a source page is
-	 * either its own destination page, or it is not a
-	 * destination page at all.
-	 *
-	 * That is slightly stronger than required, but the proof
-	 * that no problems will not occur is trivial, and the
-	 * implementation is simply to verify.
-	 *
-	 * When allocating all pages normally this algorithm will run
-	 * in O(N) time, but in the worst case it will run in O(N^2)
-	 * time.   If the runtime is a problem the data structures can
-	 * be fixed.
-	 */
-	struct page *page;
-	unsigned long addr;
-
-	/*
-	 * Walk through the list of destination pages, and see if I
-	 * have a match.
-	 */
-	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
-		if (addr == destination) {
-			list_del(&page->lru);
-			return page;
-		}
-	}
-	page = NULL;
-	while (1) {
-		kimage_entry_t *old;
-
-		/* Allocate a page, if we run out of memory give up */
-		page = kimage_alloc_pages(gfp_mask, 0);
-		if (!page)
-			return NULL;
-		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
-				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
-			list_add(&page->lru, &image->unusable_pages);
-			continue;
-		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
-
-		/* If it is the destination page we want use it */
-		if (addr == destination)
-			break;
-
-		/* If the page is not a destination page use it */
-		if (!kimage_is_destination_range(image, addr,
-						  addr + PAGE_SIZE))
-			break;
-
-		/*
-		 * I know that the page is someones destination page.
-		 * See if there is already a source page for this
-		 * destination page.  And if so swap the source pages.
-		 */
-		old = kimage_dst_used(image, addr);
-		if (old) {
-			/* If so move it */
-			unsigned long old_addr;
-			struct page *old_page;
-
-			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
-			copy_highpage(page, old_page);
-			*old = addr | (*old & ~PAGE_MASK);
-
-			/* The old page I have found cannot be a
-			 * destination page, so return it if it's
-			 * gfp_flags honor the ones passed in.
-			 */
-			if (!(gfp_mask & __GFP_HIGHMEM) &&
-			    PageHighMem(old_page)) {
-				kimage_free_pages(old_page);
-				continue;
-			}
-			addr = old_addr;
-			page = old_page;
-			break;
-		} else {
-			/* Place the page on the destination list I
-			 * will use it later.
-			 */
-			list_add(&page->lru, &image->dest_pages);
-		}
-	}
-
-	return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
-					 struct kexec_segment *segment)
-{
-	unsigned long maddr;
-	size_t ubytes, mbytes;
-	int result;
-	unsigned char __user *buf = NULL;
-	unsigned char *kbuf = NULL;
-
-	result = 0;
-	if (image->file_mode)
-		kbuf = segment->kbuf;
-	else
-		buf = segment->buf;
-	ubytes = segment->bufsz;
-	mbytes = segment->memsz;
-	maddr = segment->mem;
-
-	result = kimage_set_destination(image, maddr);
-	if (result < 0)
-		goto out;
-
-	while (mbytes) {
-		struct page *page;
-		char *ptr;
-		size_t uchunk, mchunk;
-
-		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
-		if (!page) {
-			result  = -ENOMEM;
-			goto out;
-		}
-		result = kimage_add_page(image, page_to_pfn(page)
-								<< PAGE_SHIFT);
-		if (result < 0)
-			goto out;
-
-		ptr = kmap(page);
-		/* Start with a clear page */
-		clear_page(ptr);
-		ptr += maddr & ~PAGE_MASK;
-		mchunk = min_t(size_t, mbytes,
-				PAGE_SIZE - (maddr & ~PAGE_MASK));
-		uchunk = min(ubytes, mchunk);
-
-		/* For file based kexec, source pages are in kernel memory */
-		if (image->file_mode)
-			memcpy(ptr, kbuf, uchunk);
-		else
-			result = copy_from_user(ptr, buf, uchunk);
-		kunmap(page);
-		if (result) {
-			result = -EFAULT;
-			goto out;
-		}
-		ubytes -= uchunk;
-		maddr  += mchunk;
-		if (image->file_mode)
-			kbuf += mchunk;
-		else
-			buf += mchunk;
-		mbytes -= mchunk;
-	}
-out:
-	return result;
-}
-
-static int kimage_load_crash_segment(struct kimage *image,
-					struct kexec_segment *segment)
-{
-	/* For crash dumps kernels we simply copy the data from
-	 * user space to it's destination.
-	 * We do things a page at a time for the sake of kmap.
-	 */
-	unsigned long maddr;
-	size_t ubytes, mbytes;
-	int result;
-	unsigned char __user *buf = NULL;
-	unsigned char *kbuf = NULL;
-
-	result = 0;
-	if (image->file_mode)
-		kbuf = segment->kbuf;
-	else
-		buf = segment->buf;
-	ubytes = segment->bufsz;
-	mbytes = segment->memsz;
-	maddr = segment->mem;
-	while (mbytes) {
-		struct page *page;
-		char *ptr;
-		size_t uchunk, mchunk;
-
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
-		if (!page) {
-			result  = -ENOMEM;
-			goto out;
-		}
-		ptr = kmap(page);
-		ptr += maddr & ~PAGE_MASK;
-		mchunk = min_t(size_t, mbytes,
-				PAGE_SIZE - (maddr & ~PAGE_MASK));
-		uchunk = min(ubytes, mchunk);
-		if (mchunk > uchunk) {
-			/* Zero the trailing part of the page */
-			memset(ptr + uchunk, 0, mchunk - uchunk);
-		}
-
-		/* For file based kexec, source pages are in kernel memory */
-		if (image->file_mode)
-			memcpy(ptr, kbuf, uchunk);
-		else
-			result = copy_from_user(ptr, buf, uchunk);
-		kexec_flush_icache_page(page);
-		kunmap(page);
-		if (result) {
-			result = -EFAULT;
-			goto out;
-		}
-		ubytes -= uchunk;
-		maddr  += mchunk;
-		if (image->file_mode)
-			kbuf += mchunk;
-		else
-			buf += mchunk;
-		mbytes -= mchunk;
-	}
-out:
-	return result;
-}
-
-int kimage_load_segment(struct kimage *image,
-				struct kexec_segment *segment)
-{
-	int result = -ENOMEM;
-
-	switch (image->type) {
-	case KEXEC_TYPE_DEFAULT:
-		result = kimage_load_normal_segment(image, segment);
-		break;
-	case KEXEC_TYPE_CRASH:
-		result = kimage_load_crash_segment(image, segment);
-		break;
-	}
-
-	return result;
-}
-
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
  *
@@ -954,9 +121,6 @@ int kimage_load_segment(struct kimage *image,
  * kexec does not sync, or unmount filesystems so if you need
  * that to happen you need to do that yourself.
  */
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
 
 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1051,18 +215,6 @@ out:
 	return result;
 }
 
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
 		       compat_ulong_t, nr_segments,
@@ -1101,646 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
 	return sys_kexec_load(entry, nr_segments, ksegments, flags);
 }
 #endif
-
-void crash_kexec(struct pt_regs *regs)
-{
-	/* Take the kexec_mutex here to prevent sys_kexec_load
-	 * running on one cpu from replacing the crash kernel
-	 * we are using after a panic on a different cpu.
-	 *
-	 * If the crash kernel was not located in a fixed area
-	 * of memory the xchg(&kexec_crash_image) would be
-	 * sufficient.  But since I reuse the memory...
-	 */
-	if (mutex_trylock(&kexec_mutex)) {
-		if (kexec_crash_image) {
-			struct pt_regs fixed_regs;
-
-			crash_setup_regs(&fixed_regs, regs);
-			crash_save_vmcoreinfo();
-			machine_crash_shutdown(&fixed_regs);
-			machine_kexec(kexec_crash_image);
-		}
-		mutex_unlock(&kexec_mutex);
-	}
-}
-
-size_t crash_get_memory_size(void)
-{
-	size_t size = 0;
-	mutex_lock(&kexec_mutex);
-	if (crashk_res.end != crashk_res.start)
-		size = resource_size(&crashk_res);
-	mutex_unlock(&kexec_mutex);
-	return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
-					   unsigned long end)
-{
-	unsigned long addr;
-
-	for (addr = begin; addr < end; addr += PAGE_SIZE)
-		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
-	int ret = 0;
-	unsigned long start, end;
-	unsigned long old_size;
-	struct resource *ram_res;
-
-	mutex_lock(&kexec_mutex);
-
-	if (kexec_crash_image) {
-		ret = -ENOENT;
-		goto unlock;
-	}
-	start = crashk_res.start;
-	end = crashk_res.end;
-	old_size = (end == 0) ? 0 : end - start + 1;
-	if (new_size >= old_size) {
-		ret = (new_size == old_size) ? 0 : -EINVAL;
-		goto unlock;
-	}
-
-	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
-	if (!ram_res) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
-
-	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
-	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
-	crash_map_reserved_pages();
-	crash_free_reserved_phys_range(end, crashk_res.end);
-
-	if ((start == end) && (crashk_res.parent != NULL))
-		release_resource(&crashk_res);
-
-	ram_res->start = end;
-	ram_res->end = crashk_res.end;
-	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
-	ram_res->name = "System RAM";
-
-	crashk_res.end = end - 1;
-
-	insert_resource(&iomem_resource, ram_res);
-	crash_unmap_reserved_pages();
-
-unlock:
-	mutex_unlock(&kexec_mutex);
-	return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-			    size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) + 3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
-
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
-
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= nr_cpu_ids))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that, so there is no need to invent something new.
-	 */
-	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
-	if (!buf)
-		return;
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
-			      &prstatus, sizeof(prstatus));
-	final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
-	/* Allocate memory for saving cpu registers. */
-	crash_notes = alloc_percpu(note_buf_t);
-	if (!crash_notes) {
-		pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- *   crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
-					unsigned long long system_ram,
-					unsigned long long *crash_size,
-					unsigned long long *crash_base)
-{
-	char *cur = cmdline, *tmp;
-
-	/* for each entry of the comma-separated list */
-	do {
-		unsigned long long start, end = ULLONG_MAX, size;
-
-		/* get the start of the range */
-		start = memparse(cur, &tmp);
-		if (cur == tmp) {
-			pr_warn("crashkernel: Memory value expected\n");
-			return -EINVAL;
-		}
-		cur = tmp;
-		if (*cur != '-') {
-			pr_warn("crashkernel: '-' expected\n");
-			return -EINVAL;
-		}
-		cur++;
-
-		/* if no ':' is here, than we read the end */
-		if (*cur != ':') {
-			end = memparse(cur, &tmp);
-			if (cur == tmp) {
-				pr_warn("crashkernel: Memory value expected\n");
-				return -EINVAL;
-			}
-			cur = tmp;
-			if (end <= start) {
-				pr_warn("crashkernel: end <= start\n");
-				return -EINVAL;
-			}
-		}
-
-		if (*cur != ':') {
-			pr_warn("crashkernel: ':' expected\n");
-			return -EINVAL;
-		}
-		cur++;
-
-		size = memparse(cur, &tmp);
-		if (cur == tmp) {
-			pr_warn("Memory value expected\n");
-			return -EINVAL;
-		}
-		cur = tmp;
-		if (size >= system_ram) {
-			pr_warn("crashkernel: invalid size\n");
-			return -EINVAL;
-		}
-
-		/* match ? */
-		if (system_ram >= start && system_ram < end) {
-			*crash_size = size;
-			break;
-		}
-	} while (*cur++ == ',');
-
-	if (*crash_size > 0) {
-		while (*cur && *cur != ' ' && *cur != '@')
-			cur++;
-		if (*cur == '@') {
-			cur++;
-			*crash_base = memparse(cur, &tmp);
-			if (cur == tmp) {
-				pr_warn("Memory value expected after '@'\n");
-				return -EINVAL;
-			}
-		}
-	}
-
-	return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- *	crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
-					   unsigned long long *crash_size,
-					   unsigned long long *crash_base)
-{
-	char *cur = cmdline;
-
-	*crash_size = memparse(cmdline, &cur);
-	if (cmdline == cur) {
-		pr_warn("crashkernel: memory value expected\n");
-		return -EINVAL;
-	}
-
-	if (*cur == '@')
-		*crash_base = memparse(cur+1, &cur);
-	else if (*cur != ' ' && *cur != '\0') {
-		pr_warn("crashkernel: unrecognized char\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
-	[SUFFIX_HIGH] = ",high",
-	[SUFFIX_LOW]  = ",low",
-	[SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix"  crashkernel command lines like
- *
- *	crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
-					   unsigned long long	*crash_size,
-					   const char *suffix)
-{
-	char *cur = cmdline;
-
-	*crash_size = memparse(cmdline, &cur);
-	if (cmdline == cur) {
-		pr_warn("crashkernel: memory value expected\n");
-		return -EINVAL;
-	}
-
-	/* check with suffix */
-	if (strncmp(cur, suffix, strlen(suffix))) {
-		pr_warn("crashkernel: unrecognized char\n");
-		return -EINVAL;
-	}
-	cur += strlen(suffix);
-	if (*cur != ' ' && *cur != '\0') {
-		pr_warn("crashkernel: unrecognized char\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
-			     const char *name,
-			     const char *suffix)
-{
-	char *p = cmdline, *ck_cmdline = NULL;
-
-	/* find crashkernel and use the last one if there are more */
-	p = strstr(p, name);
-	while (p) {
-		char *end_p = strchr(p, ' ');
-		char *q;
-
-		if (!end_p)
-			end_p = p + strlen(p);
-
-		if (!suffix) {
-			int i;
-
-			/* skip the one with any known suffix */
-			for (i = 0; suffix_tbl[i]; i++) {
-				q = end_p - strlen(suffix_tbl[i]);
-				if (!strncmp(q, suffix_tbl[i],
-					     strlen(suffix_tbl[i])))
-					goto next;
-			}
-			ck_cmdline = p;
-		} else {
-			q = end_p - strlen(suffix);
-			if (!strncmp(q, suffix, strlen(suffix)))
-				ck_cmdline = p;
-		}
-next:
-		p = strstr(p+1, name);
-	}
-
-	if (!ck_cmdline)
-		return NULL;
-
-	return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base,
-			     const char *name,
-			     const char *suffix)
-{
-	char	*first_colon, *first_space;
-	char	*ck_cmdline;
-
-	BUG_ON(!crash_size || !crash_base);
-	*crash_size = 0;
-	*crash_base = 0;
-
-	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
-	if (!ck_cmdline)
-		return -EINVAL;
-
-	ck_cmdline += strlen(name);
-
-	if (suffix)
-		return parse_crashkernel_suffix(ck_cmdline, crash_size,
-				suffix);
-	/*
-	 * if the commandline contains a ':', then that's the extended
-	 * syntax -- if not, it must be the classic syntax
-	 */
-	first_colon = strchr(ck_cmdline, ':');
-	first_space = strchr(ck_cmdline, ' ');
-	if (first_colon && (!first_space || first_colon < first_space))
-		return parse_crashkernel_mem(ck_cmdline, system_ram,
-				crash_size, crash_base);
-
-	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-					"crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
-	u32 *buf = vmcoreinfo_note;
-
-	if (!vmcoreinfo_size)
-		return;
-	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
-			      vmcoreinfo_size);
-	final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
-	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
-	update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
-	va_list args;
-	char buf[0x50];
-	size_t r;
-
-	va_start(args, fmt);
-	r = vscnprintf(buf, sizeof(buf), fmt, args);
-	va_end(args);
-
-	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
-	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
-	vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
-	return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
-	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
-	VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
-	VMCOREINFO_SYMBOL(init_uts_ns);
-	VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
-	VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
-	VMCOREINFO_SYMBOL(_stext);
-	VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-	VMCOREINFO_SYMBOL(mem_map);
-	VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
-	VMCOREINFO_SYMBOL(mem_section);
-	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
-	VMCOREINFO_STRUCT_SIZE(mem_section);
-	VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
-	VMCOREINFO_STRUCT_SIZE(page);
-	VMCOREINFO_STRUCT_SIZE(pglist_data);
-	VMCOREINFO_STRUCT_SIZE(zone);
-	VMCOREINFO_STRUCT_SIZE(free_area);
-	VMCOREINFO_STRUCT_SIZE(list_head);
-	VMCOREINFO_SIZE(nodemask_t);
-	VMCOREINFO_OFFSET(page, flags);
-	VMCOREINFO_OFFSET(page, _count);
-	VMCOREINFO_OFFSET(page, mapping);
-	VMCOREINFO_OFFSET(page, lru);
-	VMCOREINFO_OFFSET(page, _mapcount);
-	VMCOREINFO_OFFSET(page, private);
-	VMCOREINFO_OFFSET(pglist_data, node_zones);
-	VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
-	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
-	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
-	VMCOREINFO_OFFSET(pglist_data, node_id);
-	VMCOREINFO_OFFSET(zone, free_area);
-	VMCOREINFO_OFFSET(zone, vm_stat);
-	VMCOREINFO_OFFSET(zone, spanned_pages);
-	VMCOREINFO_OFFSET(free_area, free_list);
-	VMCOREINFO_OFFSET(list_head, next);
-	VMCOREINFO_OFFSET(list_head, prev);
-	VMCOREINFO_OFFSET(vmap_area, va_start);
-	VMCOREINFO_OFFSET(vmap_area, list);
-	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
-	log_buf_kexec_setup();
-	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
-	VMCOREINFO_NUMBER(NR_FREE_PAGES);
-	VMCOREINFO_NUMBER(PG_lru);
-	VMCOREINFO_NUMBER(PG_private);
-	VMCOREINFO_NUMBER(PG_swapcache);
-	VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
-	VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
-	VMCOREINFO_NUMBER(PG_head_mask);
-	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
-	VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
-	arch_crash_save_vmcoreinfo();
-	update_vmcoreinfo_note();
-
-	return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable.  If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
-	int error = 0;
-
-	if (!mutex_trylock(&kexec_mutex))
-		return -EBUSY;
-	if (!kexec_image) {
-		error = -EINVAL;
-		goto Unlock;
-	}
-
-#ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context) {
-		lock_system_sleep();
-		pm_prepare_console();
-		error = freeze_processes();
-		if (error) {
-			error = -EBUSY;
-			goto Restore_console;
-		}
-		suspend_console();
-		error = dpm_suspend_start(PMSG_FREEZE);
-		if (error)
-			goto Resume_console;
-		/* At this point, dpm_suspend_start() has been called,
-		 * but *not* dpm_suspend_end(). We *must* call
-		 * dpm_suspend_end() now.  Otherwise, drivers for
-		 * some devices (e.g. interrupt controllers) become
-		 * desynchronized with the actual state of the
-		 * hardware at resume time, and evil weirdness ensues.
-		 */
-		error = dpm_suspend_end(PMSG_FREEZE);
-		if (error)
-			goto Resume_devices;
-		error = disable_nonboot_cpus();
-		if (error)
-			goto Enable_cpus;
-		local_irq_disable();
-		error = syscore_suspend();
-		if (error)
-			goto Enable_irqs;
-	} else
-#endif
-	{
-		kexec_in_progress = true;
-		kernel_restart_prepare(NULL);
-		migrate_to_reboot_cpu();
-
-		/*
-		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
-		 * no further code needs to use CPU hotplug (which is true in
-		 * the reboot case). However, the kexec path depends on using
-		 * CPU hotplug again; so re-enable it here.
-		 */
-		cpu_hotplug_enable();
-		pr_emerg("Starting new kernel\n");
-		machine_shutdown();
-	}
-
-	machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context) {
-		syscore_resume();
- Enable_irqs:
-		local_irq_enable();
- Enable_cpus:
-		enable_nonboot_cpus();
-		dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
-		dpm_resume_end(PMSG_RESTORE);
- Resume_console:
-		resume_console();
-		thaw_processes();
- Restore_console:
-		pm_restore_console();
-		unlock_system_sleep();
-	}
-#endif
-
- Unlock:
-	mutex_unlock(&kexec_mutex);
-	return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..9aa25c034b2e
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1511 @@
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt)	"kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+	/*
+	 * If crash_kexec_post_notifiers is enabled, don't run
+	 * crash_kexec() here yet, which must be run after panic
+	 * notifiers in panic().
+	 */
+	if (crash_kexec_post_notifiers)
+		return 0;
+	/*
+	 * There are 4 panic() calls in do_exit() path, each of which
+	 * corresponds to each of these 4 conditions.
+	 */
+	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+		return 1;
+	return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+				       gfp_t gfp_mask,
+				       unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+	int result, i;
+	unsigned long nr_segments = image->nr_segments;
+
+	/*
+	 * Verify we have good destination addresses.  The caller is
+	 * responsible for making certain we don't attempt to load
+	 * the new image into invalid or reserved areas of RAM.  This
+	 * just verifies it is an address we can use.
+	 *
+	 * Since the kernel does everything in page size chunks ensure
+	 * the destination addresses are page aligned.  Too many
+	 * special cases crop of when we don't do this.  The most
+	 * insidious is getting overlapping destination addresses
+	 * simply because addresses are changed to page size
+	 * granularity.
+	 */
+	result = -EADDRNOTAVAIL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+			return result;
+		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+			return result;
+	}
+
+	/* Verify our destination addresses do not overlap.
+	 * If we alloed overlapping destination addresses
+	 * through very weird things can happen with no
+	 * easy explanation as one segment stops on another.
+	 */
+	result = -EINVAL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		unsigned long j;
+
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		for (j = 0; j < i; j++) {
+			unsigned long pstart, pend;
+
+			pstart = image->segment[j].mem;
+			pend   = pstart + image->segment[j].memsz;
+			/* Do the segments overlap ? */
+			if ((mend > pstart) && (mstart < pend))
+				return result;
+		}
+	}
+
+	/* Ensure our buffer sizes are strictly less than
+	 * our memory sizes.  This should always be the case,
+	 * and it is easier to check up front than to be surprised
+	 * later on.
+	 */
+	result = -EINVAL;
+	for (i = 0; i < nr_segments; i++) {
+		if (image->segment[i].bufsz > image->segment[i].memsz)
+			return result;
+	}
+
+	/*
+	 * Verify we have good destination addresses.  Normally
+	 * the caller is responsible for making certain we don't
+	 * attempt to load the new image into invalid or reserved
+	 * areas of RAM.  But crash kernels are preloaded into a
+	 * reserved area of ram.  We must ensure the addresses
+	 * are in the reserved area otherwise preloading the
+	 * kernel could corrupt things.
+	 */
+
+	if (image->type == KEXEC_TYPE_CRASH) {
+		result = -EADDRNOTAVAIL;
+		for (i = 0; i < nr_segments; i++) {
+			unsigned long mstart, mend;
+
+			mstart = image->segment[i].mem;
+			mend = mstart + image->segment[i].memsz - 1;
+			/* Ensure we are within the crash kernel limits */
+			if ((mstart < crashk_res.start) ||
+			    (mend > crashk_res.end))
+				return result;
+		}
+	}
+
+	return 0;
+}
+
+struct kimage *do_kimage_alloc_init(void)
+{
+	struct kimage *image;
+
+	/* Allocate a controlling structure */
+	image = kzalloc(sizeof(*image), GFP_KERNEL);
+	if (!image)
+		return NULL;
+
+	image->head = 0;
+	image->entry = &image->head;
+	image->last_entry = &image->head;
+	image->control_page = ~0; /* By default this does not apply */
+	image->type = KEXEC_TYPE_DEFAULT;
+
+	/* Initialize the list of control pages */
+	INIT_LIST_HEAD(&image->control_pages);
+
+	/* Initialize the list of destination pages */
+	INIT_LIST_HEAD(&image->dest_pages);
+
+	/* Initialize the list of unusable pages */
+	INIT_LIST_HEAD(&image->unusable_pages);
+
+	return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+					unsigned long start,
+					unsigned long end)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		unsigned long mstart, mend;
+
+		mstart = image->segment[i].mem;
+		mend = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend))
+			return 1;
+	}
+
+	return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+	struct page *pages;
+
+	pages = alloc_pages(gfp_mask, order);
+	if (pages) {
+		unsigned int count, i;
+
+		pages->mapping = NULL;
+		set_page_private(pages, order);
+		count = 1 << order;
+		for (i = 0; i < count; i++)
+			SetPageReserved(pages + i);
+	}
+
+	return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+	unsigned int order, count, i;
+
+	order = page_private(page);
+	count = 1 << order;
+	for (i = 0; i < count; i++)
+		ClearPageReserved(page + i);
+	__free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+	struct list_head *pos, *next;
+
+	list_for_each_safe(pos, next, list) {
+		struct page *page;
+
+		page = list_entry(pos, struct page, lru);
+		list_del(&page->lru);
+		kimage_free_pages(page);
+	}
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+							unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place.  As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * At worst this runs in O(N) of the image size.
+	 */
+	struct list_head extra_pages;
+	struct page *pages;
+	unsigned int count;
+
+	count = 1 << order;
+	INIT_LIST_HEAD(&extra_pages);
+
+	/* Loop while I can allocate a page and the page allocated
+	 * is a destination page.
+	 */
+	do {
+		unsigned long pfn, epfn, addr, eaddr;
+
+		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+		if (!pages)
+			break;
+		pfn   = page_to_pfn(pages);
+		epfn  = pfn + count;
+		addr  = pfn << PAGE_SHIFT;
+		eaddr = epfn << PAGE_SHIFT;
+		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+			      kimage_is_destination_range(image, addr, eaddr)) {
+			list_add(&pages->lru, &extra_pages);
+			pages = NULL;
+		}
+	} while (!pages);
+
+	if (pages) {
+		/* Remember the allocated page... */
+		list_add(&pages->lru, &image->control_pages);
+
+		/* Because the page is already in it's destination
+		 * location we will never allocate another page at
+		 * that address.  Therefore kimage_alloc_pages
+		 * will not return it (again) and we don't need
+		 * to give it an entry in image->segment[].
+		 */
+	}
+	/* Deal with the destination pages I have inadvertently allocated.
+	 *
+	 * Ideally I would convert multi-page allocations into single
+	 * page allocations, and add everything to image->dest_pages.
+	 *
+	 * For now it is simpler to just free the pages.
+	 */
+	kimage_free_page_list(&extra_pages);
+
+	return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+						      unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place.  As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * Control pages are also the only pags we must allocate
+	 * when loading a crash kernel.  All of the other pages
+	 * are specified by the segments and we just memcpy
+	 * into them directly.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * Given the low demand this implements a very simple
+	 * allocator that finds the first hole of the appropriate
+	 * size in the reserved memory region, and allocates all
+	 * of the memory up to and including the hole.
+	 */
+	unsigned long hole_start, hole_end, size;
+	struct page *pages;
+
+	pages = NULL;
+	size = (1 << order) << PAGE_SHIFT;
+	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+	hole_end   = hole_start + size - 1;
+	while (hole_end <= crashk_res.end) {
+		unsigned long i;
+
+		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+			break;
+		/* See if I overlap any of the segments */
+		for (i = 0; i < image->nr_segments; i++) {
+			unsigned long mstart, mend;
+
+			mstart = image->segment[i].mem;
+			mend   = mstart + image->segment[i].memsz - 1;
+			if ((hole_end >= mstart) && (hole_start <= mend)) {
+				/* Advance the hole to the end of the segment */
+				hole_start = (mend + (size - 1)) & ~(size - 1);
+				hole_end   = hole_start + size - 1;
+				break;
+			}
+		}
+		/* If I don't overlap any segments I have found my hole! */
+		if (i == image->nr_segments) {
+			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			break;
+		}
+	}
+	if (pages)
+		image->control_page = hole_end;
+
+	return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
+{
+	struct page *pages = NULL;
+
+	switch (image->type) {
+	case KEXEC_TYPE_DEFAULT:
+		pages = kimage_alloc_normal_control_pages(image, order);
+		break;
+	case KEXEC_TYPE_CRASH:
+		pages = kimage_alloc_crash_control_pages(image, order);
+		break;
+	}
+
+	return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+	if (*image->entry != 0)
+		image->entry++;
+
+	if (image->entry == image->last_entry) {
+		kimage_entry_t *ind_page;
+		struct page *page;
+
+		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+		if (!page)
+			return -ENOMEM;
+
+		ind_page = page_address(page);
+		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		image->entry = ind_page;
+		image->last_entry = ind_page +
+				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+	}
+	*image->entry = entry;
+	image->entry++;
+	*image->entry = 0;
+
+	return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+				   unsigned long destination)
+{
+	int result;
+
+	destination &= PAGE_MASK;
+	result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+	return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+	int result;
+
+	page &= PAGE_MASK;
+	result = kimage_add_entry(image, page | IND_SOURCE);
+
+	return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+	/* Walk through and free any extra destination pages I may have */
+	kimage_free_page_list(&image->dest_pages);
+
+	/* Walk through and free any unusable pages I have cached */
+	kimage_free_page_list(&image->unusable_pages);
+
+}
+void kimage_terminate(struct kimage *image)
+{
+	if (*image->entry != 0)
+		image->entry++;
+
+	*image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+		ptr = (entry & IND_INDIRECTION) ? \
+			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+	struct page *page;
+
+	page = pfn_to_page(entry >> PAGE_SHIFT);
+	kimage_free_pages(page);
+}
+
+void kimage_free(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+	kimage_entry_t ind = 0;
+
+	if (!image)
+		return;
+
+	kimage_free_extra_pages(image);
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_INDIRECTION) {
+			/* Free the previous indirection page */
+			if (ind & IND_INDIRECTION)
+				kimage_free_entry(ind);
+			/* Save this indirection page until we are
+			 * done with it.
+			 */
+			ind = entry;
+		} else if (entry & IND_SOURCE)
+			kimage_free_entry(entry);
+	}
+	/* Free the final indirection page */
+	if (ind & IND_INDIRECTION)
+		kimage_free_entry(ind);
+
+	/* Handle any machine specific cleanup */
+	machine_kexec_cleanup(image);
+
+	/* Free the kexec control pages... */
+	kimage_free_page_list(&image->control_pages);
+
+	/*
+	 * Free up any temporary buffers allocated. This might hit if
+	 * error occurred much later after buffer allocation.
+	 */
+	if (image->file_mode)
+		kimage_file_post_load_cleanup(image);
+
+	kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+					unsigned long page)
+{
+	kimage_entry_t *ptr, entry;
+	unsigned long destination = 0;
+
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_DESTINATION)
+			destination = entry & PAGE_MASK;
+		else if (entry & IND_SOURCE) {
+			if (page == destination)
+				return ptr;
+			destination += PAGE_SIZE;
+		}
+	}
+
+	return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+					gfp_t gfp_mask,
+					unsigned long destination)
+{
+	/*
+	 * Here we implement safeguards to ensure that a source page
+	 * is not copied to its destination page before the data on
+	 * the destination page is no longer useful.
+	 *
+	 * To do this we maintain the invariant that a source page is
+	 * either its own destination page, or it is not a
+	 * destination page at all.
+	 *
+	 * That is slightly stronger than required, but the proof
+	 * that no problems will not occur is trivial, and the
+	 * implementation is simply to verify.
+	 *
+	 * When allocating all pages normally this algorithm will run
+	 * in O(N) time, but in the worst case it will run in O(N^2)
+	 * time.   If the runtime is a problem the data structures can
+	 * be fixed.
+	 */
+	struct page *page;
+	unsigned long addr;
+
+	/*
+	 * Walk through the list of destination pages, and see if I
+	 * have a match.
+	 */
+	list_for_each_entry(page, &image->dest_pages, lru) {
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+		if (addr == destination) {
+			list_del(&page->lru);
+			return page;
+		}
+	}
+	page = NULL;
+	while (1) {
+		kimage_entry_t *old;
+
+		/* Allocate a page, if we run out of memory give up */
+		page = kimage_alloc_pages(gfp_mask, 0);
+		if (!page)
+			return NULL;
+		/* If the page cannot be used file it away */
+		if (page_to_pfn(page) >
+				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+			list_add(&page->lru, &image->unusable_pages);
+			continue;
+		}
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+
+		/* If it is the destination page we want use it */
+		if (addr == destination)
+			break;
+
+		/* If the page is not a destination page use it */
+		if (!kimage_is_destination_range(image, addr,
+						  addr + PAGE_SIZE))
+			break;
+
+		/*
+		 * I know that the page is someones destination page.
+		 * See if there is already a source page for this
+		 * destination page.  And if so swap the source pages.
+		 */
+		old = kimage_dst_used(image, addr);
+		if (old) {
+			/* If so move it */
+			unsigned long old_addr;
+			struct page *old_page;
+
+			old_addr = *old & PAGE_MASK;
+			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			copy_highpage(page, old_page);
+			*old = addr | (*old & ~PAGE_MASK);
+
+			/* The old page I have found cannot be a
+			 * destination page, so return it if it's
+			 * gfp_flags honor the ones passed in.
+			 */
+			if (!(gfp_mask & __GFP_HIGHMEM) &&
+			    PageHighMem(old_page)) {
+				kimage_free_pages(old_page);
+				continue;
+			}
+			addr = old_addr;
+			page = old_page;
+			break;
+		}
+		/* Place the page on the destination list, to be used later */
+		list_add(&page->lru, &image->dest_pages);
+	}
+
+	return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+					 struct kexec_segment *segment)
+{
+	unsigned long maddr;
+	size_t ubytes, mbytes;
+	int result;
+	unsigned char __user *buf = NULL;
+	unsigned char *kbuf = NULL;
+
+	result = 0;
+	if (image->file_mode)
+		kbuf = segment->kbuf;
+	else
+		buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+
+	result = kimage_set_destination(image, maddr);
+	if (result < 0)
+		goto out;
+
+	while (mbytes) {
+		struct page *page;
+		char *ptr;
+		size_t uchunk, mchunk;
+
+		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+		if (!page) {
+			result  = -ENOMEM;
+			goto out;
+		}
+		result = kimage_add_page(image, page_to_pfn(page)
+								<< PAGE_SHIFT);
+		if (result < 0)
+			goto out;
+
+		ptr = kmap(page);
+		/* Start with a clear page */
+		clear_page(ptr);
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = min_t(size_t, mbytes,
+				PAGE_SIZE - (maddr & ~PAGE_MASK));
+		uchunk = min(ubytes, mchunk);
+
+		/* For file based kexec, source pages are in kernel memory */
+		if (image->file_mode)
+			memcpy(ptr, kbuf, uchunk);
+		else
+			result = copy_from_user(ptr, buf, uchunk);
+		kunmap(page);
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+		ubytes -= uchunk;
+		maddr  += mchunk;
+		if (image->file_mode)
+			kbuf += mchunk;
+		else
+			buf += mchunk;
+		mbytes -= mchunk;
+	}
+out:
+	return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+					struct kexec_segment *segment)
+{
+	/* For crash dumps kernels we simply copy the data from
+	 * user space to it's destination.
+	 * We do things a page at a time for the sake of kmap.
+	 */
+	unsigned long maddr;
+	size_t ubytes, mbytes;
+	int result;
+	unsigned char __user *buf = NULL;
+	unsigned char *kbuf = NULL;
+
+	result = 0;
+	if (image->file_mode)
+		kbuf = segment->kbuf;
+	else
+		buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+	while (mbytes) {
+		struct page *page;
+		char *ptr;
+		size_t uchunk, mchunk;
+
+		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		if (!page) {
+			result  = -ENOMEM;
+			goto out;
+		}
+		ptr = kmap(page);
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = min_t(size_t, mbytes,
+				PAGE_SIZE - (maddr & ~PAGE_MASK));
+		uchunk = min(ubytes, mchunk);
+		if (mchunk > uchunk) {
+			/* Zero the trailing part of the page */
+			memset(ptr + uchunk, 0, mchunk - uchunk);
+		}
+
+		/* For file based kexec, source pages are in kernel memory */
+		if (image->file_mode)
+			memcpy(ptr, kbuf, uchunk);
+		else
+			result = copy_from_user(ptr, buf, uchunk);
+		kexec_flush_icache_page(page);
+		kunmap(page);
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+		ubytes -= uchunk;
+		maddr  += mchunk;
+		if (image->file_mode)
+			kbuf += mchunk;
+		else
+			buf += mchunk;
+		mbytes -= mchunk;
+	}
+out:
+	return result;
+}
+
+int kimage_load_segment(struct kimage *image,
+				struct kexec_segment *segment)
+{
+	int result = -ENOMEM;
+
+	switch (image->type) {
+	case KEXEC_TYPE_DEFAULT:
+		result = kimage_load_normal_segment(image, segment);
+		break;
+	case KEXEC_TYPE_CRASH:
+		result = kimage_load_crash_segment(image, segment);
+		break;
+	}
+
+	return result;
+}
+
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+int kexec_load_disabled;
+
+void crash_kexec(struct pt_regs *regs)
+{
+	/* Take the kexec_mutex here to prevent sys_kexec_load
+	 * running on one cpu from replacing the crash kernel
+	 * we are using after a panic on a different cpu.
+	 *
+	 * If the crash kernel was not located in a fixed area
+	 * of memory the xchg(&kexec_crash_image) would be
+	 * sufficient.  But since I reuse the memory...
+	 */
+	if (mutex_trylock(&kexec_mutex)) {
+		if (kexec_crash_image) {
+			struct pt_regs fixed_regs;
+
+			crash_setup_regs(&fixed_regs, regs);
+			crash_save_vmcoreinfo();
+			machine_crash_shutdown(&fixed_regs);
+			machine_kexec(kexec_crash_image);
+		}
+		mutex_unlock(&kexec_mutex);
+	}
+}
+
+size_t crash_get_memory_size(void)
+{
+	size_t size = 0;
+
+	mutex_lock(&kexec_mutex);
+	if (crashk_res.end != crashk_res.start)
+		size = resource_size(&crashk_res);
+	mutex_unlock(&kexec_mutex);
+	return size;
+}
+
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+					   unsigned long end)
+{
+	unsigned long addr;
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE)
+		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+	int ret = 0;
+	unsigned long start, end;
+	unsigned long old_size;
+	struct resource *ram_res;
+
+	mutex_lock(&kexec_mutex);
+
+	if (kexec_crash_image) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+	start = crashk_res.start;
+	end = crashk_res.end;
+	old_size = (end == 0) ? 0 : end - start + 1;
+	if (new_size >= old_size) {
+		ret = (new_size == old_size) ? 0 : -EINVAL;
+		goto unlock;
+	}
+
+	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+	if (!ram_res) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+	crash_map_reserved_pages();
+	crash_free_reserved_phys_range(end, crashk_res.end);
+
+	if ((start == end) && (crashk_res.parent != NULL))
+		release_resource(&crashk_res);
+
+	ram_res->start = end;
+	ram_res->end = crashk_res.end;
+	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+	ram_res->name = "System RAM";
+
+	crashk_res.end = end - 1;
+
+	insert_resource(&iomem_resource, ram_res);
+	crash_unmap_reserved_pages();
+
+unlock:
+	mutex_unlock(&kexec_mutex);
+	return ret;
+}
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= nr_cpu_ids))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+			      &prstatus, sizeof(prstatus));
+	final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+	/* Allocate memory for saving cpu registers. */
+	crash_notes = alloc_percpu(note_buf_t);
+	if (!crash_notes) {
+		pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ *   crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+					unsigned long long system_ram,
+					unsigned long long *crash_size,
+					unsigned long long *crash_base)
+{
+	char *cur = cmdline, *tmp;
+
+	/* for each entry of the comma-separated list */
+	do {
+		unsigned long long start, end = ULLONG_MAX, size;
+
+		/* get the start of the range */
+		start = memparse(cur, &tmp);
+		if (cur == tmp) {
+			pr_warn("crashkernel: Memory value expected\n");
+			return -EINVAL;
+		}
+		cur = tmp;
+		if (*cur != '-') {
+			pr_warn("crashkernel: '-' expected\n");
+			return -EINVAL;
+		}
+		cur++;
+
+		/* if no ':' is here, than we read the end */
+		if (*cur != ':') {
+			end = memparse(cur, &tmp);
+			if (cur == tmp) {
+				pr_warn("crashkernel: Memory value expected\n");
+				return -EINVAL;
+			}
+			cur = tmp;
+			if (end <= start) {
+				pr_warn("crashkernel: end <= start\n");
+				return -EINVAL;
+			}
+		}
+
+		if (*cur != ':') {
+			pr_warn("crashkernel: ':' expected\n");
+			return -EINVAL;
+		}
+		cur++;
+
+		size = memparse(cur, &tmp);
+		if (cur == tmp) {
+			pr_warn("Memory value expected\n");
+			return -EINVAL;
+		}
+		cur = tmp;
+		if (size >= system_ram) {
+			pr_warn("crashkernel: invalid size\n");
+			return -EINVAL;
+		}
+
+		/* match ? */
+		if (system_ram >= start && system_ram < end) {
+			*crash_size = size;
+			break;
+		}
+	} while (*cur++ == ',');
+
+	if (*crash_size > 0) {
+		while (*cur && *cur != ' ' && *cur != '@')
+			cur++;
+		if (*cur == '@') {
+			cur++;
+			*crash_base = memparse(cur, &tmp);
+			if (cur == tmp) {
+				pr_warn("Memory value expected after '@'\n");
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ *	crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+					   unsigned long long *crash_size,
+					   unsigned long long *crash_base)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	if (cmdline == cur) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	if (*cur == '@')
+		*crash_base = memparse(cur+1, &cur);
+	else if (*cur != ' ' && *cur != '\0') {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW  1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+	[SUFFIX_HIGH] = ",high",
+	[SUFFIX_LOW]  = ",low",
+	[SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix"  crashkernel command lines like
+ *
+ *	crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+					   unsigned long long	*crash_size,
+					   const char *suffix)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	if (cmdline == cur) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* check with suffix */
+	if (strncmp(cur, suffix, strlen(suffix))) {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+	cur += strlen(suffix);
+	if (*cur != ' ' && *cur != '\0') {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+			     const char *name,
+			     const char *suffix)
+{
+	char *p = cmdline, *ck_cmdline = NULL;
+
+	/* find crashkernel and use the last one if there are more */
+	p = strstr(p, name);
+	while (p) {
+		char *end_p = strchr(p, ' ');
+		char *q;
+
+		if (!end_p)
+			end_p = p + strlen(p);
+
+		if (!suffix) {
+			int i;
+
+			/* skip the one with any known suffix */
+			for (i = 0; suffix_tbl[i]; i++) {
+				q = end_p - strlen(suffix_tbl[i]);
+				if (!strncmp(q, suffix_tbl[i],
+					     strlen(suffix_tbl[i])))
+					goto next;
+			}
+			ck_cmdline = p;
+		} else {
+			q = end_p - strlen(suffix);
+			if (!strncmp(q, suffix, strlen(suffix)))
+				ck_cmdline = p;
+		}
+next:
+		p = strstr(p+1, name);
+	}
+
+	if (!ck_cmdline)
+		return NULL;
+
+	return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base,
+			     const char *name,
+			     const char *suffix)
+{
+	char	*first_colon, *first_space;
+	char	*ck_cmdline;
+
+	BUG_ON(!crash_size || !crash_base);
+	*crash_size = 0;
+	*crash_base = 0;
+
+	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+	if (!ck_cmdline)
+		return -EINVAL;
+
+	ck_cmdline += strlen(name);
+
+	if (suffix)
+		return parse_crashkernel_suffix(ck_cmdline, crash_size,
+				suffix);
+	/*
+	 * if the commandline contains a ':', then that's the extended
+	 * syntax -- if not, it must be the classic syntax
+	 */
+	first_colon = strchr(ck_cmdline, ':');
+	first_space = strchr(ck_cmdline, ' ');
+	if (first_colon && (!first_space || first_colon < first_space))
+		return parse_crashkernel_mem(ck_cmdline, system_ram,
+				crash_size, crash_base);
+
+	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+					"crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+static void update_vmcoreinfo_note(void)
+{
+	u32 *buf = vmcoreinfo_note;
+
+	if (!vmcoreinfo_size)
+		return;
+	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+			      vmcoreinfo_size);
+	final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+	update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+	va_list args;
+	char buf[0x50];
+	size_t r;
+
+	va_start(args, fmt);
+	r = vscnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+
+	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+	vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+	return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+	VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+	VMCOREINFO_SYMBOL(init_uts_ns);
+	VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+	VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+	VMCOREINFO_SYMBOL(_stext);
+	VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+	VMCOREINFO_SYMBOL(mem_map);
+	VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+	VMCOREINFO_SYMBOL(mem_section);
+	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+	VMCOREINFO_STRUCT_SIZE(mem_section);
+	VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+	VMCOREINFO_STRUCT_SIZE(page);
+	VMCOREINFO_STRUCT_SIZE(pglist_data);
+	VMCOREINFO_STRUCT_SIZE(zone);
+	VMCOREINFO_STRUCT_SIZE(free_area);
+	VMCOREINFO_STRUCT_SIZE(list_head);
+	VMCOREINFO_SIZE(nodemask_t);
+	VMCOREINFO_OFFSET(page, flags);
+	VMCOREINFO_OFFSET(page, _count);
+	VMCOREINFO_OFFSET(page, mapping);
+	VMCOREINFO_OFFSET(page, lru);
+	VMCOREINFO_OFFSET(page, _mapcount);
+	VMCOREINFO_OFFSET(page, private);
+	VMCOREINFO_OFFSET(pglist_data, node_zones);
+	VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+	VMCOREINFO_OFFSET(pglist_data, node_id);
+	VMCOREINFO_OFFSET(zone, free_area);
+	VMCOREINFO_OFFSET(zone, vm_stat);
+	VMCOREINFO_OFFSET(zone, spanned_pages);
+	VMCOREINFO_OFFSET(free_area, free_list);
+	VMCOREINFO_OFFSET(list_head, next);
+	VMCOREINFO_OFFSET(list_head, prev);
+	VMCOREINFO_OFFSET(vmap_area, va_start);
+	VMCOREINFO_OFFSET(vmap_area, list);
+	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+	log_buf_kexec_setup();
+	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+	VMCOREINFO_NUMBER(NR_FREE_PAGES);
+	VMCOREINFO_NUMBER(PG_lru);
+	VMCOREINFO_NUMBER(PG_private);
+	VMCOREINFO_NUMBER(PG_swapcache);
+	VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+	VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+	VMCOREINFO_NUMBER(PG_head_mask);
+	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLBFS
+	VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+	arch_crash_save_vmcoreinfo();
+	update_vmcoreinfo_note();
+
+	return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable.  If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+	int error = 0;
+
+	if (!mutex_trylock(&kexec_mutex))
+		return -EBUSY;
+	if (!kexec_image) {
+		error = -EINVAL;
+		goto Unlock;
+	}
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
+		lock_system_sleep();
+		pm_prepare_console();
+		error = freeze_processes();
+		if (error) {
+			error = -EBUSY;
+			goto Restore_console;
+		}
+		suspend_console();
+		error = dpm_suspend_start(PMSG_FREEZE);
+		if (error)
+			goto Resume_console;
+		/* At this point, dpm_suspend_start() has been called,
+		 * but *not* dpm_suspend_end(). We *must* call
+		 * dpm_suspend_end() now.  Otherwise, drivers for
+		 * some devices (e.g. interrupt controllers) become
+		 * desynchronized with the actual state of the
+		 * hardware at resume time, and evil weirdness ensues.
+		 */
+		error = dpm_suspend_end(PMSG_FREEZE);
+		if (error)
+			goto Resume_devices;
+		error = disable_nonboot_cpus();
+		if (error)
+			goto Enable_cpus;
+		local_irq_disable();
+		error = syscore_suspend();
+		if (error)
+			goto Enable_irqs;
+	} else
+#endif
+	{
+		kexec_in_progress = true;
+		kernel_restart_prepare(NULL);
+		migrate_to_reboot_cpu();
+
+		/*
+		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+		 * no further code needs to use CPU hotplug (which is true in
+		 * the reboot case). However, the kexec path depends on using
+		 * CPU hotplug again; so re-enable it here.
+		 */
+		cpu_hotplug_enable();
+		pr_emerg("Starting new kernel\n");
+		machine_shutdown();
+	}
+
+	machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
+		syscore_resume();
+ Enable_irqs:
+		local_irq_enable();
+ Enable_cpus:
+		enable_nonboot_cpus();
+		dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+		dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+		resume_console();
+		thaw_processes();
+ Restore_console:
+		pm_restore_console();
+		unlock_system_sleep();
+	}
+#endif
+
+ Unlock:
+	mutex_unlock(&kexec_mutex);
+	return error;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
 KERNEL_ATTR_RW(profiling);
 #endif
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static ssize_t kexec_loaded_show(struct kobject *kobj,
 				 struct kobj_attribute *attr, char *buf)
 {
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_PROFILING
 	&profiling_attr.attr,
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	&kexec_loaded_attr.attr,
 	&kexec_crash_loaded_attr.attr,
 	&kexec_crash_size_attr.attr,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..8f0324ef72ab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
 	.release = devkmsg_release,
 };
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This appends the listed symbols to /proc/vmcore
  *
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		kernel_restart(buffer);
 		break;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	case LINUX_REBOOT_CMD_KEXEC:
 		ret = kernel_kexec();
 		break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..715cc57cc66a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 	{
 		.procname	= "kexec_load_disabled",
 		.data		= &kexec_load_disabled,
-- 
cgit v1.2.3-70-g09d2


From 1fcfd8db7f82fa1f533a6f0e4155614ff4144d56 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 9 Sep 2015 15:39:29 -0700
Subject: mm, mpx: add "vm_flags_t vm_flags" arg to do_mmap_pgoff()

Add the additional "vm_flags_t vm_flags" argument to do_mmap_pgoff(),
rename it to do_mmap(), and re-introduce do_mmap_pgoff() as a simple
wrapper on top of do_mmap().  Perhaps we should update the callers of
do_mmap_pgoff() and kill it later.

This way mpx_mmap() can simply call do_mmap(vm_flags => VM_MPX) and do not
play with vm internals.

After this change mmap_region() has a single user outside of mmap.c,
arch/tile/mm/elf.c:arch_setup_additional_pages().  It would be nice to
change arch/tile/ and unexport mmap_region().

[kirill@shutemov.name: fix build]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/mpx.c  | 51 +++++++--------------------------------------------
 include/linux/mm.h | 12 ++++++++++--
 mm/mmap.c          | 10 ++++------
 mm/nommu.c         | 19 ++++++++++---------
 4 files changed, 31 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index db1b0bc5017c..134948b0926f 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
  */
 static unsigned long mpx_mmap(unsigned long len)
 {
-	unsigned long ret;
-	unsigned long addr, pgoff;
 	struct mm_struct *mm = current->mm;
-	vm_flags_t vm_flags;
-	struct vm_area_struct *vma;
+	unsigned long addr, populate;
 
 	/* Only bounds table can be allocated here */
 	if (len != mpx_bt_size_bytes(mm))
 		return -EINVAL;
 
 	down_write(&mm->mmap_sem);
-
-	/* Too many mappings? */
-	if (mm->map_count > sysctl_max_map_count) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	/* Obtain the address to map to. we verify (or select) it and ensure
-	 * that it represents a valid section of the address space.
-	 */
-	addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
-	if (addr & ~PAGE_MASK) {
-		ret = addr;
-		goto out;
-	}
-
-	vm_flags = VM_READ | VM_WRITE | VM_MPX |
-			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
-	/* Set pgoff according to addr for anon_vma */
-	pgoff = addr >> PAGE_SHIFT;
-
-	ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
-	if (IS_ERR_VALUE(ret))
-		goto out;
-
-	vma = find_vma(mm, ret);
-	if (!vma) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	if (vm_flags & VM_LOCKED) {
-		up_write(&mm->mmap_sem);
-		mm_populate(ret, len);
-		return ret;
-	}
-
-out:
+	addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
 	up_write(&mm->mmap_sem);
-	return ret;
+	if (populate)
+		mm_populate(addr, populate);
+
+	return addr;
 }
 
 enum reg_type {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f25a957bf0ab..fda728e3c27d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1873,11 +1873,19 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+extern unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot, unsigned long flags,
-	unsigned long pgoff, unsigned long *populate);
+	vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+static inline unsigned long
+do_mmap_pgoff(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long prot, unsigned long flags,
+	unsigned long pgoff, unsigned long *populate)
+{
+	return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
+}
+
 #ifdef CONFIG_MMU
 extern int __mm_populate(unsigned long addr, unsigned long len,
 			 int ignore_errors);
diff --git a/mm/mmap.c b/mm/mmap.c
index b6be3249f0a9..c739d6db7193 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1260,14 +1260,12 @@ static inline int mlock_future_check(struct mm_struct *mm,
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
-
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long prot,
-			unsigned long flags, unsigned long pgoff,
-			unsigned long *populate)
+			unsigned long flags, vm_flags_t vm_flags,
+			unsigned long pgoff, unsigned long *populate)
 {
 	struct mm_struct *mm = current->mm;
-	vm_flags_t vm_flags;
 
 	*populate = 0;
 
@@ -1311,7 +1309,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	 * to. we assume access permissions have been handled by the open
 	 * of the memory object, so we don't do any here.
 	 */
-	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+	vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
 	if (flags & MAP_LOCKED)
diff --git a/mm/nommu.c b/mm/nommu.c
index 1cc0709fcaa5..ab14a2014dea 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1233,18 +1233,19 @@ enomem:
 /*
  * handle mapping creation for uClinux
  */
-unsigned long do_mmap_pgoff(struct file *file,
-			    unsigned long addr,
-			    unsigned long len,
-			    unsigned long prot,
-			    unsigned long flags,
-			    unsigned long pgoff,
-			    unsigned long *populate)
+unsigned long do_mmap(struct file *file,
+			unsigned long addr,
+			unsigned long len,
+			unsigned long prot,
+			unsigned long flags,
+			vm_flags_t vm_flags,
+			unsigned long pgoff,
+			unsigned long *populate)
 {
 	struct vm_area_struct *vma;
 	struct vm_region *region;
 	struct rb_node *rb;
-	unsigned long capabilities, vm_flags, result;
+	unsigned long capabilities, result;
 	int ret;
 
 	*populate = 0;
@@ -1262,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 
 	/* we've determined that we can make the mapping, now translate what we
 	 * now know into VMA flags */
-	vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+	vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
 
 	/* we're going to need to record the mapping */
 	region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
-- 
cgit v1.2.3-70-g09d2


From 5b25b13ab08f616efd566347d809b4ece54570d1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 11 Sep 2015 13:07:39 -0700
Subject: sys_membarrier(): system-wide memory barrier (generic, x86)

Here is an implementation of a new system call, sys_membarrier(), which
executes a memory barrier on all threads running on the system.  It is
implemented by calling synchronize_sched().  It can be used to
distribute the cost of user-space memory barriers asymmetrically by
transforming pairs of memory barriers into pairs consisting of
sys_membarrier() and a compiler barrier.  For synchronization primitives
that distinguish between read-side and write-side (e.g.  userspace RCU
[1], rwlocks), the read-side can be accelerated significantly by moving
the bulk of the memory barrier overhead to the write-side.

The existing applications of which I am aware that would be improved by
this system call are as follows:

* Through Userspace RCU library (http://urcu.so)
  - DNS server (Knot DNS) https://www.knot-dns.cz/
  - Network sniffer (http://netsniff-ng.org/)
  - Distributed object storage (https://sheepdog.github.io/sheepdog/)
  - User-space tracing (http://lttng.org)
  - Network storage system (https://www.gluster.org/)
  - Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf)
  - Financial software (https://lkml.org/lkml/2015/3/23/189)

Those projects use RCU in userspace to increase read-side speed and
scalability compared to locking.  Especially in the case of RCU used by
libraries, sys_membarrier can speed up the read-side by moving the bulk of
the memory barrier cost to synchronize_rcu().

* Direct users of sys_membarrier
  - core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198)

Microsoft core dotnet GC developers are planning to use the mprotect()
side-effect of issuing memory barriers through IPIs as a way to implement
Windows FlushProcessWriteBuffers() on Linux.  They are referring to
sys_membarrier in their github thread, specifically stating that
sys_membarrier() is what they are looking for.

To explain the benefit of this scheme, let's introduce two example threads:

Thread A (non-frequent, e.g. executing liburcu synchronize_rcu())
Thread B (frequent, e.g. executing liburcu
rcu_read_lock()/rcu_read_unlock())

In a scheme where all smp_mb() in thread A are ordering memory accesses
with respect to smp_mb() present in Thread B, we can change each
smp_mb() within Thread A into calls to sys_membarrier() and each
smp_mb() within Thread B into compiler barriers "barrier()".

Before the change, we had, for each smp_mb() pairs:

Thread A                    Thread B
previous mem accesses       previous mem accesses
smp_mb()                    smp_mb()
following mem accesses      following mem accesses

After the change, these pairs become:

Thread A                    Thread B
prev mem accesses           prev mem accesses
sys_membarrier()            barrier()
follow mem accesses         follow mem accesses

As we can see, there are two possible scenarios: either Thread B memory
accesses do not happen concurrently with Thread A accesses (1), or they
do (2).

1) Non-concurrent Thread A vs Thread B accesses:

Thread A                    Thread B
prev mem accesses
sys_membarrier()
follow mem accesses
                            prev mem accesses
                            barrier()
                            follow mem accesses

In this case, thread B accesses will be weakly ordered. This is OK,
because at that point, thread A is not particularly interested in
ordering them with respect to its own accesses.

2) Concurrent Thread A vs Thread B accesses

Thread A                    Thread B
prev mem accesses           prev mem accesses
sys_membarrier()            barrier()
follow mem accesses         follow mem accesses

In this case, thread B accesses, which are ensured to be in program
order thanks to the compiler barrier, will be "upgraded" to full
smp_mb() by synchronize_sched().

* Benchmarks

On Intel Xeon E5405 (8 cores)
(one thread is calling sys_membarrier, the other 7 threads are busy
looping)

1000 non-expedited sys_membarrier calls in 33s =3D 33 milliseconds/call.

* User-space user of this system call: Userspace RCU library

Both the signal-based and the sys_membarrier userspace RCU schemes
permit us to remove the memory barrier from the userspace RCU
rcu_read_lock() and rcu_read_unlock() primitives, thus significantly
accelerating them. These memory barriers are replaced by compiler
barriers on the read-side, and all matching memory barriers on the
write-side are turned into an invocation of a memory barrier on all
active threads in the process. By letting the kernel perform this
synchronization rather than dumbly sending a signal to every process
threads (as we currently do), we diminish the number of unnecessary wake
ups and only issue the memory barriers on active threads. Non-running
threads do not need to execute such barrier anyway, because these are
implied by the scheduler context switches.

Results in liburcu:

Operations in 10s, 6 readers, 2 writers:

memory barriers in reader:    1701557485 reads, 2202847 writes
signal-based scheme:          9830061167 reads,    6700 writes
sys_membarrier:               9952759104 reads,     425 writes
sys_membarrier (dyn. check):  7970328887 reads,     425 writes

The dynamic sys_membarrier availability check adds some overhead to
the read-side compared to the signal-based scheme, but besides that,
sys_membarrier slightly outperforms the signal-based scheme. However,
this non-expedited sys_membarrier implementation has a much slower grace
period than signal and memory barrier schemes.

Besides diminishing the number of wake-ups, one major advantage of the
membarrier system call over the signal-based scheme is that it does not
need to reserve a signal. This plays much more nicely with libraries,
and with processes injected into for tracing purposes, for which we
cannot expect that signals will be unused by the application.

An expedited version of this system call can be added later on to speed
up the grace period. Its implementation will likely depend on reading
the cpu_curr()->mm without holding each CPU's rq lock.

This patch adds the system call to x86 and to asm-generic.

[1] http://urcu.so

membarrier(2) man page:

MEMBARRIER(2)              Linux Programmer's Manual             MEMBARRIER(2)

NAME
       membarrier - issue memory barriers on a set of threads

SYNOPSIS
       #include <linux/membarrier.h>

       int membarrier(int cmd, int flags);

DESCRIPTION
       The cmd argument is one of the following:

       MEMBARRIER_CMD_QUERY
              Query  the  set  of  supported commands. It returns a bitmask of
              supported commands.

       MEMBARRIER_CMD_SHARED
              Execute a memory barrier on all threads running on  the  system.
              Upon  return from system call, the caller thread is ensured that
              all running threads have passed through a state where all memory
              accesses  to  user-space  addresses  match program order between
              entry to and return from the system  call  (non-running  threads
              are de facto in such a state). This covers threads from all pro=E2=80=90
              cesses running on the system.  This command returns 0.

       The flags argument needs to be 0. For future extensions.

       All memory accesses performed  in  program  order  from  each  targeted
       thread is guaranteed to be ordered with respect to sys_membarrier(). If
       we use the semantic "barrier()" to represent a compiler barrier forcing
       memory  accesses  to  be performed in program order across the barrier,
       and smp_mb() to represent explicit memory barriers forcing full  memory
       ordering  across  the barrier, we have the following ordering table for
       each pair of barrier(), sys_membarrier() and smp_mb():

       The pair ordering is detailed as (O: ordered, X: not ordered):

                              barrier()   smp_mb() sys_membarrier()
              barrier()          X           X            O
              smp_mb()           X           O            O
              sys_membarrier()   O           O            O

RETURN VALUE
       On success, these system calls return zero.  On error, -1 is  returned,
       and errno is set appropriately. For a given command, with flags
       argument set to 0, this system call is guaranteed to always return the
       same value until reboot.

ERRORS
       ENOSYS System call is not implemented.

       EINVAL Invalid arguments.

Linux                             2015-04-15                     MEMBARRIER(2)

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS                            |  8 +++++
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 include/linux/syscalls.h               |  2 ++
 include/uapi/asm-generic/unistd.h      |  4 ++-
 include/uapi/linux/Kbuild              |  1 +
 include/uapi/linux/membarrier.h        | 53 +++++++++++++++++++++++++++
 init/Kconfig                           | 12 +++++++
 kernel/Makefile                        |  1 +
 kernel/membarrier.c                    | 66 ++++++++++++++++++++++++++++++++++
 kernel/sys_ni.c                        |  3 ++
 11 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/membarrier.h
 create mode 100644 kernel/membarrier.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 310da4295c70..e77bc84dc580 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6789,6 +6789,14 @@ W:	http://www.mellanox.com
 Q:	http://patchwork.ozlabs.org/project/netdev/list/
 F:	drivers/net/ethernet/mellanox/mlxsw/
 
+MEMBARRIER SUPPORT
+M:	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+F:	kernel/membarrier.c
+F:	include/uapi/linux/membarrier.h
+
 MEMORY MANAGEMENT
 L:	linux-mm@kvack.org
 W:	http://www.linux-mm.org
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 477bfa6db370..7663c455b9f6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -381,3 +381,4 @@
 372	i386	recvmsg			sys_recvmsg			compat_sys_recvmsg
 373	i386	shutdown		sys_shutdown
 374	i386	userfaultfd		sys_userfaultfd
+375	i386	membarrier		sys_membarrier
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 81c490634db9..278842fdf1f6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -330,6 +330,7 @@
 321	common	bpf			sys_bpf
 322	64	execveat		stub_execveat
 323	common	userfaultfd		sys_userfaultfd
+324	common	membarrier		sys_membarrier
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 08001317aee7..a460e2ef2843 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -885,4 +885,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
 			const char __user *const __user *argv,
 			const char __user *const __user *envp, int flags);
 
+asmlinkage long sys_membarrier(int cmd, int flags);
+
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9b1a04..8da542a2874d 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
 __SYSCALL(__NR_bpf, sys_bpf)
 #define __NR_execveat 281
 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_membarrier 282
+__SYSCALL(__NR_membarrier, sys_membarrier)
 
 #undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 70ff1d9abf0d..f7b2db44eb4b 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -252,6 +252,7 @@ header-y += mdio.h
 header-y += media.h
 header-y += media-bus-format.h
 header-y += mei.h
+header-y += membarrier.h
 header-y += memfd.h
 header-y += mempolicy.h
 header-y += meye.h
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
new file mode 100644
index 000000000000..e0b108bd2624
--- /dev/null
+++ b/include/uapi/linux/membarrier.h
@@ -0,0 +1,53 @@
+#ifndef _UAPI_LINUX_MEMBARRIER_H
+#define _UAPI_LINUX_MEMBARRIER_H
+
+/*
+ * linux/membarrier.h
+ *
+ * membarrier system call API
+ *
+ * Copyright (c) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * enum membarrier_cmd - membarrier system call command
+ * @MEMBARRIER_CMD_QUERY:   Query the set of supported commands. It returns
+ *                          a bitmask of valid commands.
+ * @MEMBARRIER_CMD_SHARED:  Execute a memory barrier on all running threads.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This covers threads from all processes
+ *                          running on the system. This command returns 0.
+ *
+ * Command to be passed to the membarrier system call. The commands need to
+ * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
+ * the value 0.
+ */
+enum membarrier_cmd {
+	MEMBARRIER_CMD_QUERY = 0,
+	MEMBARRIER_CMD_SHARED = (1 << 0),
+};
+
+#endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/init/Kconfig b/init/Kconfig
index 02da9f1fd9df..c24b6f767bf0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1602,6 +1602,18 @@ config PCI_QUIRKS
 	  bugs/quirks. Disable this only if your target machine is
 	  unaffected by PCI quirks.
 
+config MEMBARRIER
+	bool "Enable membarrier() system call" if EXPERT
+	default y
+	help
+	  Enable the membarrier() system call that allows issuing memory
+	  barriers across all running threads, which can be used to distribute
+	  the cost of user-space memory barriers asymmetrically by transforming
+	  pairs of memory barriers into pairs consisting of membarrier() and a
+	  compiler barrier.
+
+	  If unsure, say Y.
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/Makefile b/kernel/Makefile
index d4988410b410..53abf008ecb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
new file mode 100644
index 000000000000..536c727a56e9
--- /dev/null
+++ b/kernel/membarrier.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd:   Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, or if the command argument is invalid,
+ * this system call returns -EINVAL. For a given command, with flags argument
+ * set to 0, this system call is guaranteed to always return the same value
+ * until reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ *                        barrier()   smp_mb() sys_membarrier()
+ *        barrier()          X           X            O
+ *        smp_mb()           X           O            O
+ *        sys_membarrier()   O           O            O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+	if (unlikely(flags))
+		return -EINVAL;
+	switch (cmd) {
+	case MEMBARRIER_CMD_QUERY:
+		return MEMBARRIER_CMD_BITMASK;
+	case MEMBARRIER_CMD_SHARED:
+		if (num_online_cpus() > 1)
+			synchronize_sched();
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 03c3875d9958..a02decf15583 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -245,3 +245,6 @@ cond_syscall(sys_bpf);
 
 /* execveat */
 cond_syscall(sys_execveat);
+
+/* membarrier */
+cond_syscall(sys_membarrier);
-- 
cgit v1.2.3-70-g09d2


From 6798a8caaf64fa68b9ab2044e070fe4545034e03 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 11 Sep 2015 13:07:48 -0700
Subject: fs/seq_file: convert int seq_vprint/seq_printf/etc... returns to void

The seq_<foo> function return values were frequently misused.

See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to
     seq_has_overflowed() and make public")

All uses of these return values have been removed, so convert the
return types to void.

Miscellanea:

o Move seq_put_decimal_<type> and seq_escape prototypes closer the
  other seq_vprintf prototypes
o Reorder seq_putc and seq_puts to return early on overflow
o Add argument names to seq_vprintf and seq_printf
o Update the seq_escape kernel-doc
o Convert a couple of leading spaces to tabs in seq_escape

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/iommu/omap-iommu-debug.c |  3 +-
 fs/nsfs.c                        |  3 +-
 fs/seq_file.c                    | 70 ++++++++++++++++++----------------------
 include/linux/seq_file.h         | 19 +++++------
 4 files changed, 45 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/omap-iommu-debug.c b/drivers/iommu/omap-iommu-debug.c
index 0717aa96ce39..9bc20e2119a3 100644
--- a/drivers/iommu/omap-iommu-debug.c
+++ b/drivers/iommu/omap-iommu-debug.c
@@ -135,8 +135,9 @@ __dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
 static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr,
 			     struct seq_file *s)
 {
-	return seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram,
+	seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram,
 			  (cr->cam & MMU_CAM_P) ? 1 : 0);
+	return 0;
 }
 
 static size_t omap_dump_tlb_entries(struct omap_iommu *obj, struct seq_file *s)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index e4905fbf3396..8f20d6016e20 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -142,7 +142,8 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
 	struct inode *inode = d_inode(dentry);
 	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
 
-	return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+	seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+	return 0;
 }
 
 static const struct super_operations nsfs_ops = {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 263b125dbcf4..225586e141ca 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -372,16 +372,16 @@ EXPORT_SYMBOL(seq_release);
  *	@esc:	set of characters that need escaping
  *
  *	Puts string into buffer, replacing each occurrence of character from
- *	@esc with usual octal escape.  Returns 0 in case of success, -1 - in
- *	case of overflow.
+ *	@esc with usual octal escape.
+ *	Use seq_has_overflowed() to check for errors.
  */
-int seq_escape(struct seq_file *m, const char *s, const char *esc)
+void seq_escape(struct seq_file *m, const char *s, const char *esc)
 {
 	char *end = m->buf + m->size;
-        char *p;
+	char *p;
 	char c;
 
-        for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
+	for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
 		if (!strchr(esc, c)) {
 			*p++ = c;
 			continue;
@@ -394,14 +394,13 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)
 			continue;
 		}
 		seq_set_overflow(m);
-		return -1;
-        }
+		return;
+	}
 	m->count = p - m->buf;
-        return 0;
 }
 EXPORT_SYMBOL(seq_escape);
 
-int seq_vprintf(struct seq_file *m, const char *f, va_list args)
+void seq_vprintf(struct seq_file *m, const char *f, va_list args)
 {
 	int len;
 
@@ -409,24 +408,20 @@ int seq_vprintf(struct seq_file *m, const char *f, va_list args)
 		len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
 		if (m->count + len < m->size) {
 			m->count += len;
-			return 0;
+			return;
 		}
 	}
 	seq_set_overflow(m);
-	return -1;
 }
 EXPORT_SYMBOL(seq_vprintf);
 
-int seq_printf(struct seq_file *m, const char *f, ...)
+void seq_printf(struct seq_file *m, const char *f, ...)
 {
-	int ret;
 	va_list args;
 
 	va_start(args, f);
-	ret = seq_vprintf(m, f, args);
+	seq_vprintf(m, f, args);
 	va_end(args);
-
-	return ret;
 }
 EXPORT_SYMBOL(seq_printf);
 
@@ -664,26 +659,25 @@ int seq_open_private(struct file *filp, const struct seq_operations *ops,
 }
 EXPORT_SYMBOL(seq_open_private);
 
-int seq_putc(struct seq_file *m, char c)
+void seq_putc(struct seq_file *m, char c)
 {
-	if (m->count < m->size) {
-		m->buf[m->count++] = c;
-		return 0;
-	}
-	return -1;
+	if (m->count >= m->size)
+		return;
+
+	m->buf[m->count++] = c;
 }
 EXPORT_SYMBOL(seq_putc);
 
-int seq_puts(struct seq_file *m, const char *s)
+void seq_puts(struct seq_file *m, const char *s)
 {
 	int len = strlen(s);
-	if (m->count + len < m->size) {
-		memcpy(m->buf + m->count, s, len);
-		m->count += len;
-		return 0;
+
+	if (m->count + len >= m->size) {
+		seq_set_overflow(m);
+		return;
 	}
-	seq_set_overflow(m);
-	return -1;
+	memcpy(m->buf + m->count, s, len);
+	m->count += len;
 }
 EXPORT_SYMBOL(seq_puts);
 
@@ -694,8 +688,8 @@ EXPORT_SYMBOL(seq_puts);
  * This routine is very quick when you show lots of numbers.
  * In usual cases, it will be better to use seq_printf(). It's easier to read.
  */
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
-			unsigned long long num)
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+			 unsigned long long num)
 {
 	int len;
 
@@ -707,35 +701,33 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter,
 
 	if (num < 10) {
 		m->buf[m->count++] = num + '0';
-		return 0;
+		return;
 	}
 
 	len = num_to_str(m->buf + m->count, m->size - m->count, num);
 	if (!len)
 		goto overflow;
 	m->count += len;
-	return 0;
+	return;
+
 overflow:
 	seq_set_overflow(m);
-	return -1;
 }
 EXPORT_SYMBOL(seq_put_decimal_ull);
 
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
-			long long num)
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
 {
 	if (num < 0) {
 		if (m->count + 3 >= m->size) {
 			seq_set_overflow(m);
-			return -1;
+			return;
 		}
 		if (delimiter)
 			m->buf[m->count++] = delimiter;
 		num = -num;
 		delimiter = '-';
 	}
-	return seq_put_decimal_ull(m, delimiter, num);
-
+	seq_put_decimal_ull(m, delimiter, num);
 }
 EXPORT_SYMBOL(seq_put_decimal_ll);
 
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index adeadbd6d7bf..dde00defbaa5 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -114,13 +114,18 @@ int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 loff_t seq_lseek(struct file *, loff_t, int);
 int seq_release(struct inode *, struct file *);
-int seq_escape(struct seq_file *, const char *, const char *);
-int seq_putc(struct seq_file *m, char c);
-int seq_puts(struct seq_file *m, const char *s);
 int seq_write(struct seq_file *seq, const void *data, size_t len);
 
-__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
-__printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
+__printf(2, 0)
+void seq_vprintf(struct seq_file *m, const char *fmt, va_list args);
+__printf(2, 3)
+void seq_printf(struct seq_file *m, const char *fmt, ...);
+void seq_putc(struct seq_file *m, char c);
+void seq_puts(struct seq_file *m, const char *s);
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+			 unsigned long long num);
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num);
+void seq_escape(struct seq_file *m, const char *s, const char *esc);
 
 void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
 		  int rowsize, int groupsize, const void *buf, size_t len,
@@ -138,10 +143,6 @@ int single_release(struct inode *, struct file *);
 void *__seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_release_private(struct inode *, struct file *);
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
-			unsigned long long num);
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
-			long long num);
 
 static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
 {
-- 
cgit v1.2.3-70-g09d2


From 10fbd36e362a0f367e34a7cd876a81295d8fc5ca Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 27 May 2015 15:32:15 -0700
Subject: blk: rq_data_dir() should not return a boolean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rq_data_dir() returns either READ or WRITE (0 == READ, 1 == WRITE), not
a boolean value.

Now, admittedly the "!= 0" doesn't really change the value (0 stays as
zero, 1 stays as one), but it's not only redundant, it confuses gcc, and
causes gcc to warn about the construct

    switch (rq_data_dir(req)) {
        case READ:
            ...
        case WRITE:
            ...

that we have in a few drivers.

Now, the gcc warning is silly and stupid (it seems to warn not about the
switch value having a different type from the case statements, but about
_any_ boolean switch value), but in this case the code itself is silly
and stupid too, so let's just change it, and get rid of warnings like
this:

  drivers/block/hd.c: In function ‘hd_request’:
  drivers/block/hd.c:630:11: warning: switch condition has boolean value [-Wswitch-bool]
     switch (rq_data_dir(req)) {

The odd '!= 0' came in when "cmd_flags" got turned into a "u64" in
commit 5953316dbf90 ("block: make rq->cmd_flags be 64-bit") and is
presumably because the old code (that just did a logical 'and' with 1)
would then end up making the type of rq_data_dir() be u64 too.

But if we want to retain the old regular integer type, let's just cast
the result to 'int' rather than use that rather odd '!= 0'.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 708923b9b623..38a5ff772a37 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -584,7 +584,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
-#define rq_data_dir(rq)		(((rq)->cmd_flags & 1) != 0)
+#define rq_data_dir(rq)		((int)((rq)->cmd_flags & 1))
 
 /*
  * Driver can handle struct request, if it either has an old style
-- 
cgit v1.2.3-70-g09d2