summaryrefslogtreecommitdiff
path: root/mm/memory_hotplug.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-01-09 11:18:47 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-01-09 11:18:47 -0800
commitfb46e22a9e3863e08aef8815df9f17d0f4b9aede (patch)
tree83e052911fa8d8d90bcf9de2796e17e19040613f /mm/memory_hotplug.c
parentd30e51aa7b1f6fa7dd78d4598d1e4c047fcc3fb9 (diff)
parent5e0a760b44417f7cadd79de2204d6247109558a0 (diff)
Merge tag 'mm-stable-2024-01-08-15-31' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: "Many singleton patches against the MM code. The patch series which are included in this merge do the following: - Peng Zhang has done some mapletree maintainance work in the series 'maple_tree: add mt_free_one() and mt_attr() helpers' 'Some cleanups of maple tree' - In the series 'mm: use memmap_on_memory semantics for dax/kmem' Vishal Verma has altered the interworking between memory-hotplug and dax/kmem so that newly added 'device memory' can more easily have its memmap placed within that newly added memory. - Matthew Wilcox continues folio-related work (including a few fixes) in the patch series 'Add folio_zero_tail() and folio_fill_tail()' 'Make folio_start_writeback return void' 'Fix fault handler's handling of poisoned tail pages' 'Convert aops->error_remove_page to ->error_remove_folio' 'Finish two folio conversions' 'More swap folio conversions' - Kefeng Wang has also contributed folio-related work in the series 'mm: cleanup and use more folio in page fault' - Jim Cromie has improved the kmemleak reporting output in the series 'tweak kmemleak report format'. - In the series 'stackdepot: allow evicting stack traces' Andrey Konovalov to permits clients (in this case KASAN) to cause eviction of no longer needed stack traces. - Charan Teja Kalla has fixed some accounting issues in the page allocator's atomic reserve calculations in the series 'mm: page_alloc: fixes for high atomic reserve caluculations'. - Dmitry Rokosov has added to the samples/ dorectory some sample code for a userspace memcg event listener application. See the series 'samples: introduce cgroup events listeners'. - Some mapletree maintanance work from Liam Howlett in the series 'maple_tree: iterator state changes'. - Nhat Pham has improved zswap's approach to writeback in the series 'workload-specific and memory pressure-driven zswap writeback'. - DAMON/DAMOS feature and maintenance work from SeongJae Park in the series 'mm/damon: let users feed and tame/auto-tune DAMOS' 'selftests/damon: add Python-written DAMON functionality tests' 'mm/damon: misc updates for 6.8' - Yosry Ahmed has improved memcg's stats flushing in the series 'mm: memcg: subtree stats flushing and thresholds'. - In the series 'Multi-size THP for anonymous memory' Ryan Roberts has added a runtime opt-in feature to transparent hugepages which improves performance by allocating larger chunks of memory during anonymous page faults. - Matthew Wilcox has also contributed some cleanup and maintenance work against eh buffer_head code int he series 'More buffer_head cleanups'. - Suren Baghdasaryan has done work on Andrea Arcangeli's series 'userfaultfd move option'. UFFDIO_MOVE permits userspace heap compaction algorithms to move userspace's pages around rather than UFFDIO_COPY'a alloc/copy/free. - Stefan Roesch has developed a 'KSM Advisor', in the series 'mm/ksm: Add ksm advisor'. This is a governor which tunes KSM's scanning aggressiveness in response to userspace's current needs. - Chengming Zhou has optimized zswap's temporary working memory use in the series 'mm/zswap: dstmem reuse optimizations and cleanups'. - Matthew Wilcox has performed some maintenance work on the writeback code, both code and within filesystems. The series is 'Clean up the writeback paths'. - Andrey Konovalov has optimized KASAN's handling of alloc and free stack traces for secondary-level allocators, in the series 'kasan: save mempool stack traces'. - Andrey also performed some KASAN maintenance work in the series 'kasan: assorted clean-ups'. - David Hildenbrand has gone to town on the rmap code. Cleanups, more pte batching, folio conversions and more. See the series 'mm/rmap: interface overhaul'. - Kinsey Ho has contributed some maintenance work on the MGLRU code in the series 'mm/mglru: Kconfig cleanup'. - Matthew Wilcox has contributed lruvec page accounting code cleanups in the series 'Remove some lruvec page accounting functions'" * tag 'mm-stable-2024-01-08-15-31' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (361 commits) mm, treewide: rename MAX_ORDER to MAX_PAGE_ORDER mm, treewide: introduce NR_PAGE_ORDERS selftests/mm: add separate UFFDIO_MOVE test for PMD splitting selftests/mm: skip test if application doesn't has root privileges selftests/mm: conform test to TAP format output selftests: mm: hugepage-mmap: conform to TAP format output selftests/mm: gup_test: conform test to TAP format output mm/selftests: hugepage-mremap: conform test to TAP format output mm/vmstat: move pgdemote_* out of CONFIG_NUMA_BALANCING mm: zsmalloc: return -ENOSPC rather than -EINVAL in zs_malloc while size is too large mm/memcontrol: remove __mod_lruvec_page_state() mm/khugepaged: use a folio more in collapse_file() slub: use a folio in __kmalloc_large_node slub: use folio APIs in free_large_kmalloc() slub: use alloc_pages_node() in alloc_slab_page() mm: remove inc/dec lruvec page state functions mm: ratelimit stat flush from workingset shrinker kasan: stop leaking stack trace handles mm/mglru: remove CONFIG_TRANSPARENT_HUGEPAGE mm/mglru: add dummy pmd_dirty() ...
Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r--mm/memory_hotplug.c219
1 files changed, 140 insertions, 79 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7a5fc89a8652..b3c0ff52bb72 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -645,7 +645,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
unsigned long pfn;
/*
- * Online the pages in MAX_ORDER aligned chunks. The callback might
+ * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might
* decide to not expose all pages to the buddy (e.g., expose them
* later). We account all pages as being online and belonging to this
* zone ("present").
@@ -660,12 +660,13 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* Free to online pages in the largest chunks alignment allows.
*
* __ffs() behaviour is undefined for 0. start == 0 is
- * MAX_ORDER-aligned, Set order to MAX_ORDER for the case.
+ * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for
+ * the case.
*/
if (pfn)
- order = min_t(int, MAX_ORDER, __ffs(pfn));
+ order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn));
else
- order = MAX_ORDER;
+ order = MAX_PAGE_ORDER;
(*online_page_callback)(pfn_to_page(pfn), order);
pfn += (1UL << order);
@@ -1380,6 +1381,85 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
return arch_supports_memmap_on_memory(vmemmap_size);
}
+static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
+{
+ unsigned long memblock_size = memory_block_size_bytes();
+ u64 cur_start;
+
+ /*
+ * For memmap_on_memory, the altmaps were added on a per-memblock
+ * basis; we have to process each individual memory block.
+ */
+ for (cur_start = start; cur_start < start + size;
+ cur_start += memblock_size) {
+ struct vmem_altmap *altmap = NULL;
+ struct memory_block *mem;
+
+ mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start)));
+ if (WARN_ON_ONCE(!mem))
+ continue;
+
+ altmap = mem->altmap;
+ mem->altmap = NULL;
+
+ remove_memory_block_devices(cur_start, memblock_size);
+
+ arch_remove_memory(cur_start, memblock_size, altmap);
+
+ /* Verify that all vmemmap pages have actually been freed. */
+ WARN(altmap->alloc, "Altmap not fully unmapped");
+ kfree(altmap);
+ }
+}
+
+static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
+ u64 start, u64 size)
+{
+ unsigned long memblock_size = memory_block_size_bytes();
+ u64 cur_start;
+ int ret;
+
+ for (cur_start = start; cur_start < start + size;
+ cur_start += memblock_size) {
+ struct mhp_params params = { .pgprot =
+ pgprot_mhp(PAGE_KERNEL) };
+ struct vmem_altmap mhp_altmap = {
+ .base_pfn = PHYS_PFN(cur_start),
+ .end_pfn = PHYS_PFN(cur_start + memblock_size - 1),
+ };
+
+ mhp_altmap.free = memory_block_memmap_on_memory_pages();
+ params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
+ GFP_KERNEL);
+ if (!params.altmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* call arch's memory hotadd */
+ ret = arch_add_memory(nid, cur_start, memblock_size, &params);
+ if (ret < 0) {
+ kfree(params.altmap);
+ goto out;
+ }
+
+ /* create memory block devices after memory was added */
+ ret = create_memory_block_devices(cur_start, memblock_size,
+ params.altmap, group);
+ if (ret) {
+ arch_remove_memory(cur_start, memblock_size, NULL);
+ kfree(params.altmap);
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ if (ret && cur_start != start)
+ remove_memory_blocks_and_altmaps(start, cur_start - start);
+ return ret;
+}
+
/*
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
* and online/offline operations (triggered e.g. by sysfs).
@@ -1390,10 +1470,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
enum memblock_flags memblock_flags = MEMBLOCK_NONE;
- struct vmem_altmap mhp_altmap = {
- .base_pfn = PHYS_PFN(res->start),
- .end_pfn = PHYS_PFN(res->end),
- };
struct memory_group *group = NULL;
u64 start, size;
bool new_node = false;
@@ -1436,30 +1512,22 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
/*
* Self hosted memmap array
*/
- if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
- if (mhp_supports_memmap_on_memory(size)) {
- mhp_altmap.free = memory_block_memmap_on_memory_pages();
- params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
- if (!params.altmap) {
- ret = -ENOMEM;
- goto error;
- }
+ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
+ mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
+ ret = create_altmaps_and_memory_blocks(nid, group, start, size);
+ if (ret)
+ goto error;
+ } else {
+ ret = arch_add_memory(nid, start, size, &params);
+ if (ret < 0)
+ goto error;
- memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
+ /* create memory block devices after memory was added */
+ ret = create_memory_block_devices(start, size, NULL, group);
+ if (ret) {
+ arch_remove_memory(start, size, params.altmap);
+ goto error;
}
- /* fallback to not using altmap */
- }
-
- /* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, &params);
- if (ret < 0)
- goto error_free;
-
- /* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size, params.altmap, group);
- if (ret) {
- arch_remove_memory(start, size, params.altmap);
- goto error_free;
}
if (new_node) {
@@ -1496,8 +1564,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
-error_free:
- kfree(params.altmap);
error:
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
@@ -2067,17 +2133,13 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}
-static int test_has_altmap_cb(struct memory_block *mem, void *arg)
+static int count_memory_range_altmaps_cb(struct memory_block *mem, void *arg)
{
- struct memory_block **mem_ptr = (struct memory_block **)arg;
- /*
- * return the memblock if we have altmap
- * and break callback.
- */
- if (mem->altmap) {
- *mem_ptr = mem;
- return 1;
- }
+ u64 *num_altmaps = (u64 *)arg;
+
+ if (mem->altmap)
+ *num_altmaps += 1;
+
return 0;
}
@@ -2151,11 +2213,29 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
+static int memory_blocks_have_altmaps(u64 start, u64 size)
+{
+ u64 num_memblocks = size / memory_block_size_bytes();
+ u64 num_altmaps = 0;
+
+ if (!mhp_memmap_on_memory())
+ return 0;
+
+ walk_memory_blocks(start, size, &num_altmaps,
+ count_memory_range_altmaps_cb);
+
+ if (num_altmaps == 0)
+ return 0;
+
+ if (WARN_ON_ONCE(num_memblocks != num_altmaps))
+ return -EINVAL;
+
+ return 1;
+}
+
static int __ref try_remove_memory(u64 start, u64 size)
{
- struct memory_block *mem;
- int rc = 0, nid = NUMA_NO_NODE;
- struct vmem_altmap *altmap = NULL;
+ int rc, nid = NUMA_NO_NODE;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -2172,45 +2252,26 @@ static int __ref try_remove_memory(u64 start, u64 size)
if (rc)
return rc;
- /*
- * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
- * the same granularity it was added - a single memory block.
- */
- if (mhp_memmap_on_memory()) {
- rc = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
- if (rc) {
- if (size != memory_block_size_bytes()) {
- pr_warn("Refuse to remove %#llx - %#llx,"
- "wrong granularity\n",
- start, start + size);
- return -EINVAL;
- }
- altmap = mem->altmap;
- /*
- * Mark altmap NULL so that we can add a debug
- * check on memblock free.
- */
- mem->altmap = NULL;
- }
- }
-
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
- /*
- * Memory block device removal under the device_hotplug_lock is
- * a barrier against racing online attempts.
- */
- remove_memory_block_devices(start, size);
-
mem_hotplug_begin();
- arch_remove_memory(start, size, altmap);
-
- /* Verify that all vmemmap pages have actually been freed. */
- if (altmap) {
- WARN(altmap->alloc, "Altmap not fully unmapped");
- kfree(altmap);
+ rc = memory_blocks_have_altmaps(start, size);
+ if (rc < 0) {
+ mem_hotplug_done();
+ return rc;
+ } else if (!rc) {
+ /*
+ * Memory block device removal under the device_hotplug_lock is
+ * a barrier against racing online attempts.
+ * No altmaps present, do the removal directly
+ */
+ remove_memory_block_devices(start, size);
+ arch_remove_memory(start, size, NULL);
+ } else {
+ /* all memblocks in the range have altmaps */
+ remove_memory_blocks_and_altmaps(start, size);
}
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {