From cfe3236d32d07bf261760472fee5d7029eb02779 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 2 Mar 2023 19:58:29 +0800 Subject: mm: huge_memory: convert __do_huge_pmd_anonymous_page() to use a folio Patch series "mm: remove cgroup_throttle_swaprate() completely", v2. Convert all the caller functions of cgroup_throttle_swaprate() to use folios, and use folio_throttle_swaprate(), which allows us to remove cgroup_throttle_swaprate() completely. This patch (of 7): Convert from page to folio within __do_huge_pmd_anonymous_page(); since we need the precise page that is to be stored at this PTE in the folio, the function still keeps a page as its parameter. Link: https://lkml.kernel.org/r/20230302115835.105364-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230302115835.105364-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 032fb0ef9cd1..0c1df321ad64 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -656,19 +656,20 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio = page_folio(page); pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; - VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) { - put_page(page); + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } - cgroup_throttle_swaprate(page, gfp); + folio_throttle_swaprate(folio, gfp); pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { @@ -678,11 +679,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, clear_huge_page(page, vmf->address, HPAGE_PMD_NR); /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() * write.
*/ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { @@ -697,7 +698,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); - put_page(page); + folio_put(folio); pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); @@ -706,8 +707,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_new_anon_rmap(folio, vma, haddr); + folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); @@ -724,7 +725,7 @@ unlock_release: release: if (pgtable) pte_free(vma->vm_mm, pgtable); - put_page(page); + folio_put(folio); return ret; } -- cgit v1.2.3-70-g09d2 From 3c556d2425b04054e22045d4ef7d34f163b7a71a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 15 Mar 2023 13:16:42 -0400 Subject: mm/thp: rename TRANSPARENT_HUGEPAGE_NEVER_DAX to _UNSUPPORTED TRANSPARENT_HUGEPAGE_NEVER_DAX has nothing to do with DAX. It's set when has_transparent_hugepage() returns false, checked in hugepage_vma_check() and will disable THP completely if false. Rename it to TRANSPARENT_HUGEPAGE_UNSUPPORTED to reflect its real purpose. [peterx@redhat.com: fix comment, per David] Link: https://lkml.kernel.org/r/ZBMzQW674oHQJV7F@x1n Link: https://lkml.kernel.org/r/20230315171642.1244625-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Aneesh Kumar K.V Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 +- mm/huge_memory.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 70bd867eba94..9a3a3af2dd80 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -79,7 +79,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, } enum transparent_hugepage_flag { - TRANSPARENT_HUGEPAGE_NEVER_DAX, + TRANSPARENT_HUGEPAGE_UNSUPPORTED, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0c1df321ad64..cbf095e7f059 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -88,7 +88,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, /* * If the hardware/firmware marked hugepage support disabled. */ - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) return false; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ @@ -460,11 +460,7 @@ static int __init hugepage_init(void) struct kobject *hugepage_kobj; if (!has_transparent_hugepage()) { - /* - * Hardware doesn't support hugepages, hence disable - * DAX PMD support. 
- */ - transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX; + transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED; return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 1fb130b226a6385362899381e0025ba413cb27e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:34:04 +0100 Subject: mm: don't look at xarray value entries in split_huge_pages_in_file Patch series "return an ERR_PTR from __filemap_get_folio", v3. __filemap_get_folio and its wrappers can return NULL for three different conditions, which in some cases requires the caller to reverse engineer the decision making. This is fixed by returning an ERR_PTR instead of NULL and thus transporting the reason for the failure. But to make that work we first need to ensure that no xa_value special case is returned and thus remove the FGP_ENTRY flag. It turns out that flag is barely used and can usually be dealt with in a better way. This patch (of 7): split_huge_pages_in_file never wants to do anything with the special value entries. Switch to using filemap_get_folio to not even see them. Link: https://lkml.kernel.org/r/20230307143410.28031-1-hch@lst.de Link: https://lkml.kernel.org/r/20230307143410.28031-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Cc: Andreas Gruenbacher Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- mm/huge_memory.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cbf095e7f059..70008dd7f215 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3089,11 +3089,10 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, mapping = candidate->f_mapping; for (index = off_start; index < off_end; index += nr_pages) { - struct folio *folio = __filemap_get_folio(mapping, index, - FGP_ENTRY, 0); + struct folio *folio = filemap_get_folio(mapping, index); nr_pages = 1; - if (xa_is_value(folio) || !folio) + if (!folio) continue; if (!folio_test_large(folio)) -- cgit v1.2.3-70-g09d2 From 66dabbb65d673aef40dd17bf62c042be8f6d4a4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:34:10 +0100 Subject: mm: return an ERR_PTR from __filemap_get_folio Instead of returning NULL for all errors, distinguish between: - no entry found and not asked to allocate (-ENOENT) - failed to allocate memory (-ENOMEM) - would block (-EAGAIN) so that callers don't have to guess the error based on the passed-in flags. Also pass the error through the direct callers: filemap_get_folio, filemap_lock_folio, filemap_grab_folio and filemap_get_incore_folio.
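As a minimal caller-side sketch (not part of this series; the helper name and flag combination are made up for illustration), the failure reason now travels with the pointer instead of being guessed from a bare NULL:

#include <linux/err.h>
#include <linux/pagemap.h>

/* Illustrative only: look up (or create) a locked folio and propagate the
 * precise error: -ENOENT means no entry and no FGP_CREAT, -ENOMEM means
 * allocation failed, -EAGAIN means FGP_NOWAIT would have blocked. */
static int example_lookup(struct address_space *mapping, pgoff_t index)
{
        struct folio *folio;

        folio = __filemap_get_folio(mapping, index,
                                    FGP_LOCK | FGP_CREAT | FGP_NOWAIT,
                                    mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);  /* -ENOMEM or -EAGAIN with FGP_CREAT set */

        /* ... operate on the locked folio ... */
        folio_unlock(folio);
        folio_put(folio);
        return 0;
}

Callers of the plain wrappers such as filemap_get_folio(), which can only ever see -ENOENT, simply test IS_ERR() where they previously tested for NULL, as the hunks below show.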
[hch@lst.de: fix null-pointer deref] Link: https://lkml.kernel.org/r/20230310070023.GA13563@lst.de Link: https://lkml.kernel.org/r/20230310043137.GA1624890@u2004 Link: https://lkml.kernel.org/r/20230307143410.28031-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Ryusuke Konishi [nilfs2] Cc: Andreas Gruenbacher Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- fs/afs/dir.c | 10 +++++----- fs/afs/dir_edit.c | 2 +- fs/afs/write.c | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/move_extent.c | 8 ++++---- fs/hugetlbfs/inode.c | 2 +- fs/iomap/buffered-io.c | 11 ++--------- fs/netfs/buffered_read.c | 4 ++-- fs/nfs/file.c | 4 ++-- fs/nilfs2/page.c | 6 +++--- include/linux/pagemap.h | 11 ++++++----- mm/filemap.c | 14 ++++++++------ mm/folio-compat.c | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 6 ++++-- mm/memcontrol.c | 2 +- mm/mincore.c | 2 +- mm/shmem.c | 4 ++-- mm/swap_state.c | 17 ++++++++++------- mm/swapfile.c | 4 ++-- mm/truncate.c | 15 ++++++++------- 21 files changed, 67 insertions(+), 65 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 82690d1dd49a..f92b9e62d567 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -319,16 +319,16 @@ expand: struct folio *folio; folio = filemap_get_folio(mapping, i); - if (!folio) { + if (IS_ERR(folio)) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) afs_stat_v(dvnode, n_inval); - - ret = -ENOMEM; folio = __filemap_get_folio(mapping, i, FGP_LOCK | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto error; + } folio_attach_private(folio, (void *)1); folio_unlock(folio); } @@ -524,7 +524,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, */ folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, FGP_ACCESSED, 0); - if (!folio) { + if (IS_ERR(folio)) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 0ab7752d1b75..f0eddccbdd95 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); else if (folio && !folio_test_private(folio)) folio_attach_private(folio, (void *)1); diff --git a/fs/afs/write.c b/fs/afs/write.c index 571f3b9a417e..c822d6006033 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping, _debug("kill %lx (to %lx)", index, last); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } @@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, _debug("redirty %llx @%llx", len, start); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf0b7dea4900..d7973743417b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5395,7 +5395,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); - if (!folio) + if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); diff --git a/fs/ext4/move_extent.c 
b/fs/ext4/move_extent.c index 2de9829aed63..7bf6d069199c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -141,18 +141,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, flags = memalloc_nofs_save(); folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, mapping_gfp_mask(mapping[0])); - if (!folio[0]) { + if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); - return -ENOMEM; + return PTR_ERR(folio[0]); } folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!folio[1]) { + if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); - return -ENOMEM; + return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9062da6da567..702d79639c0d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, struct folio *folio; folio = filemap_lock_folio(mapping, idx); - if (!folio) + if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6f4c97a6d7e9..96bb56c203f4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) { unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; - struct folio *folio; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; - folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); - if (folio) - return folio; - - if (iter->flags & IOMAP_NOWAIT) - return ERR_PTR(-EAGAIN); - return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(iomap_get_folio); @@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, start_byte >> PAGE_SHIFT); - if (!folio) { + if (IS_ERR(folio)) { start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + PAGE_SIZE; continue; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7679a68e8193..209726a9cfdb 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -350,8 +350,8 @@ int netfs_write_begin(struct netfs_inode *ctx, retry: folio = __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. 
*/ diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 893625eacab9..1d03406e6c03 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -336,8 +336,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, start: folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); *pagep = &folio->page; ret = nfs_flush_incompatible(file, folio); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 41ccd43cd979..5cf30827f244 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -259,10 +259,10 @@ repeat: NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); dfolio = filemap_grab_folio(dmap, folio->index); - if (unlikely(!dfolio)) { + if (unlikely(IS_ERR(dfolio))) { /* No empty page is added to the page cache */ - err = -ENOMEM; folio_unlock(folio); + err = PTR_ERR(dfolio); break; } if (unlikely(!folio_buffers(folio))) @@ -311,7 +311,7 @@ repeat: folio_lock(folio); dfolio = filemap_lock_folio(dmap, index); - if (dfolio) { + if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); nilfs_copy_page(&dfolio->page, &folio->page, 0); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 306a0f63cea8..fdcd595d2294 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -520,7 +520,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned with an increased refcount. * - * Otherwise, %NULL is returned. + * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for + * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_get_folio(struct address_space *mapping, pgoff_t index) @@ -537,8 +538,8 @@ static inline struct folio *filemap_get_folio(struct address_space *mapping, * present, it is returned locked with an increased refcount. * * Context: May sleep. - * Return: A folio or %NULL if there is no folio in the cache for this - * index. Will not return a shadow, swap or DAX entry. + * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for + * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_lock_folio(struct address_space *mapping, pgoff_t index) @@ -555,8 +556,8 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping, * a new folio is created. The folio is locked, marked as accessed, and * returned. * - * Return: A found or created folio. NULL if no folio is found and failed to - * create a folio. + * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found + * and failed to create a folio. */ static inline struct folio *filemap_grab_folio(struct address_space *mapping, pgoff_t index) diff --git a/mm/filemap.c b/mm/filemap.c index ac161b50f5bc..a34abfe8c654 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1907,7 +1907,7 @@ out: * * If there is a page cache page, it is returned with an increased refcount. * - * Return: The found folio or %NULL otherwise. + * Return: The found folio or an ERR_PTR() otherwise. 
*/ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp) @@ -1925,7 +1925,7 @@ repeat: if (fgp_flags & FGP_NOWAIT) { if (!folio_trylock(folio)) { folio_put(folio); - return NULL; + return ERR_PTR(-EAGAIN); } } else { folio_lock(folio); @@ -1964,7 +1964,7 @@ no_page: folio = filemap_alloc_folio(gfp, 0); if (!folio) - return NULL; + return ERR_PTR(-ENOMEM); if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; @@ -1989,6 +1989,8 @@ no_page: folio_unlock(folio); } + if (!folio) + return ERR_PTR(-ENOENT); return folio; } EXPORT_SYMBOL(__filemap_get_folio); @@ -3258,7 +3260,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) * Do we have something in the page cache already? */ folio = filemap_get_folio(mapping, index); - if (likely(folio)) { + if (likely(!IS_ERR(folio))) { /* * We found the page, so try async readahead before waiting for * the lock. @@ -3287,7 +3289,7 @@ retry_find: folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); - if (!folio) { + if (IS_ERR(folio)) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); @@ -3638,7 +3640,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { folio = filemap_alloc_folio(gfp, 0); if (!folio) return ERR_PTR(-ENOMEM); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 1754daa85d35..2511c055a35f 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -97,7 +97,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); - if (!folio) + if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 70008dd7f215..2d860e70fe88 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3092,7 +3092,7 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, struct folio *folio = filemap_get_folio(mapping, index); nr_pages = 1; - if (!folio) + if (IS_ERR(folio)) continue; if (!folio_test_large(folio)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 07abcb6eb203..712e32b38295 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5780,7 +5780,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, */ new_folio = false; folio = filemap_lock_folio(mapping, idx); - if (!folio) { + if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -6071,6 +6071,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma_end_reservation(h, vma, haddr); pagecache_folio = filemap_lock_folio(mapping, idx); + if (IS_ERR(pagecache_folio)) + pagecache_folio = NULL; } ptl = huge_pte_lock(h, mm, ptep); @@ -6182,7 +6184,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (is_continue) { ret = -EFAULT; folio = filemap_lock_folio(mapping, idx); - if (!folio) + if (IS_ERR(folio)) goto out; folio_in_pagecache = true; } else if (!*pagep) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13ec89c45389..0524add35cae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5705,7 +5705,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /* shmem/tmpfs may report page out on swap: account for that too. 
*/ index = linear_page_index(vma, addr); folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); - if (!folio) + if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } diff --git a/mm/mincore.c b/mm/mincore.c index d359650b0f75..2d5be013a25a 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -61,7 +61,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ folio = filemap_get_incore_folio(mapping, index); - if (folio) { + if (!IS_ERR(folio)) { present = folio_test_uptodate(folio); folio_put(folio); } diff --git a/mm/shmem.c b/mm/shmem.c index 93cb39852a16..fa6e38f2f55f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -605,7 +605,7 @@ next: index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; folio = filemap_get_folio(inode->i_mapping, index); - if (!folio) + if (IS_ERR(folio)) goto drop; /* No huge page at the end of the file: nothing to split */ @@ -3214,7 +3214,7 @@ static const char *shmem_get_link(struct dentry *dentry, if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); - if (!folio) + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 92234f4b51d2..b76a65ac28b3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -336,7 +336,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, struct folio *folio; folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); - if (folio) { + if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -366,6 +366,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } + } else { + folio = NULL; } return folio; @@ -388,23 +390,24 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct swap_info_struct *si; struct folio *folio = filemap_get_entry(mapping, index); + if (!folio) + return ERR_PTR(-ENOENT); if (!xa_is_value(folio)) - goto out; + return folio; if (!shmem_mapping(mapping)) - return NULL; + return ERR_PTR(-ENOENT); swp = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. 
*/ if (non_swap_entry(swp)) - return NULL; + return ERR_PTR(-ENOENT); /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) - return NULL; + return ERR_PTR(-ENOENT); index = swp_offset(swp); folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); -out: return folio; } @@ -431,7 +434,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); - if (folio) + if (!IS_ERR(folio)) return folio_file_page(folio, swp_offset(entry)); /* diff --git a/mm/swapfile.c b/mm/swapfile.c index c1b97436f811..00b3e46becad 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -136,7 +136,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, int ret = 0; folio = filemap_get_folio(swap_address_space(entry), offset); - if (!folio) + if (IS_ERR(folio)) return 0; /* * When this function is called from scan_swap_map_slots() and it's @@ -2095,7 +2095,7 @@ retry: entry = swp_entry(type, i); folio = filemap_get_folio(swap_address_space(entry), i); - if (!folio) + if (IS_ERR(folio)) continue; /* diff --git a/mm/truncate.c b/mm/truncate.c index 7b4ea4c4a46b..86de31ed4d32 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -375,7 +375,7 @@ void truncate_inode_pages_range(struct address_space *mapping, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); - if (folio) { + if (!IS_ERR(folio)) { same_folio = lend < folio_pos(folio) + folio_size(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio->index + folio_nr_pages(folio); @@ -387,14 +387,15 @@ void truncate_inode_pages_range(struct address_space *mapping, folio = NULL; } - if (!same_folio) + if (!same_folio) { folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, FGP_LOCK, 0); - if (folio) { - if (!truncate_inode_partial_folio(folio, lstart, lend)) - end = folio->index; - folio_unlock(folio); - folio_put(folio); + if (!IS_ERR(folio)) { + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); + } } index = start; -- cgit v1.2.3-70-g09d2 From 23baf831a32c04f9a968812511540b1b3e648bf5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Mar 2023 14:31:33 +0300 Subject: mm, treewide: redefine MAX_ORDER sanely MAX_ORDER is currently defined as the number of orders the page allocator supports: a user can ask the buddy allocator for page orders between 0 and MAX_ORDER-1. This definition is counter-intuitive and has led to a number of bugs all over the kernel. Change the definition of MAX_ORDER to be inclusive: the range of orders a user can ask from the buddy allocator is now 0..MAX_ORDER. [kirill@shutemov.name: fix min() warning] Link: https://lkml.kernel.org/r/20230315153800.32wib3n5rickolvh@box [akpm@linux-foundation.org: fix another min_t warning] [kirill@shutemov.name: fixups per Zi Yan] Link: https://lkml.kernel.org/r/20230316232144.b7ic4cif4kjiabws@box.shutemov.name [akpm@linux-foundation.org: fix underlining in docs] Link: https://lore.kernel.org/oe-kbuild-all/202303191025.VRCTk6mP-lkp@intel.com/ Link: https://lkml.kernel.org/r/20230315113133.11326-11-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Michael Ellerman [powerpc] Cc: "Kirill A.
Shutemov" Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 6 ++-- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/arc/Kconfig | 4 +-- arch/arm/Kconfig | 9 ++---- arch/arm/configs/imx_v6_v7_defconfig | 2 +- arch/arm/configs/milbeaut_m10v_defconfig | 2 +- arch/arm/configs/oxnas_v6_defconfig | 2 +- arch/arm/configs/pxa_defconfig | 2 +- arch/arm/configs/sama7_defconfig | 2 +- arch/arm/configs/sp7021_defconfig | 2 +- arch/arm64/Kconfig | 27 ++++++++---------- arch/arm64/include/asm/sparsemem.h | 2 +- arch/arm64/kvm/hyp/include/nvhe/gfp.h | 2 +- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 10 +++---- arch/csky/Kconfig | 2 +- arch/ia64/Kconfig | 8 +++--- arch/ia64/include/asm/sparsemem.h | 4 +-- arch/ia64/mm/hugetlbpage.c | 2 +- arch/loongarch/Kconfig | 15 ++++------ arch/m68k/Kconfig.cpu | 5 +--- arch/mips/Kconfig | 19 ++++++------- arch/nios2/Kconfig | 7 ++--- arch/powerpc/Kconfig | 27 ++++++++---------- arch/powerpc/configs/85xx/ge_imp3a_defconfig | 2 +- arch/powerpc/configs/fsl-emb-nonhw.config | 2 +- arch/powerpc/mm/book3s64/iommu_api.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/sh/configs/ecovec24_defconfig | 2 +- arch/sh/mm/Kconfig | 17 +++++------ arch/sparc/Kconfig | 5 +--- arch/sparc/kernel/pci_sun4v.c | 2 +- arch/sparc/kernel/traps_64.c | 2 +- arch/sparc/mm/tsb.c | 4 +-- arch/um/kernel/um_arch.c | 4 +-- arch/xtensa/Kconfig | 5 +--- drivers/base/regmap/regmap-debugfs.c | 8 +++--- drivers/block/floppy.c | 2 +- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/crypto/hisilicon/sgl.c | 6 ++-- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/selftests/huge_pages.c | 2 +- drivers/gpu/drm/ttm/ttm_pool.c | 22 +++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 +- drivers/iommu/dma-iommu.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 4 +-- drivers/md/dm-bufio.c | 2 +- drivers/misc/genwqe/card_dev.c | 2 +- drivers/misc/genwqe/card_utils.c | 4 +-- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- drivers/net/ethernet/ibm/ibmvnic.h | 2 +- drivers/video/fbdev/hyperv_fb.c | 4 +-- drivers/video/fbdev/vermilion/vermilion.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- drivers/virtio/virtio_mem.c | 12 ++++---- fs/ramfs/file-nommu.c | 2 +- include/drm/ttm/ttm_pool.h | 2 +- include/linux/hugetlb.h | 2 +- include/linux/mmzone.h | 10 +++---- include/linux/pageblock-flags.h | 4 +-- include/linux/slab.h | 6 ++-- kernel/crash_core.c | 2 +- kernel/dma/pool.c | 6 ++-- kernel/events/ring_buffer.c | 6 ++-- mm/Kconfig | 10 +++---- mm/compaction.c | 8 +++--- mm/debug_vm_pgtable.c | 4 +-- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 +-- mm/kmsan/init.c | 6 ++-- mm/memblock.c | 2 +- mm/memory_hotplug.c | 4 +-- mm/page_alloc.c | 38 ++++++++++++------------- mm/page_isolation.c | 12 ++++---- mm/page_owner.c | 6 ++-- mm/page_reporting.c | 6 ++-- mm/shuffle.h | 2 +- mm/slab.c | 2 +- mm/slub.c | 6 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 14 ++++----- net/smc/smc_ib.c | 2 +- security/integrity/ima/ima_crypto.c | 2 +- tools/testing/memblock/linux/mmzone.h | 6 ++-- 84 files changed, 223 insertions(+), 253 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 86fd88492870..c18d94fa6470 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -172,7 +172,7 @@ variables. Offset of the free_list's member. 
This value is used to compute the number of free pages. -Each zone has a free_area structure array called free_area[MAX_ORDER]. +Each zone has a free_area structure array called free_area[MAX_ORDER + 1]. The free_list represents a linked list of free page blocks. (list_head, next|prev) @@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific information. Makedumpfile gets the start address of the vmalloc region from this. -(zone.free_area, MAX_ORDER) ---------------------------- +(zone.free_area, MAX_ORDER + 1) +------------------------------- Free areas descriptor. User-space tools use this value to iterate the free_area ranges. MAX_ORDER is used by the zone buddy allocator. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6221a1d057dd..50da4f26fad5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3969,7 +3969,7 @@ [KNL] Minimal page reporting order Format: Adjust the minimal page reporting order. The page - reporting is disabled when it exceeds (MAX_ORDER-1). + reporting is disabled when it exceeds MAX_ORDER. panic= [KNL] Kernel behaviour on panic: delay timeout > 0: seconds before rebooting diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index d9a13ccf89a3..ab6d701365bb 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -556,7 +556,7 @@ endmenu # "ARC Architecture Configuration" config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "12" if ARC_HUGEPAGE_16M - default "11" + default "11" if ARC_HUGEPAGE_16M + default "10" source "kernel/power/Kconfig" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e24a9820e12f..929e646e84b9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1355,9 +1355,9 @@ config ARM_MODULE_PLTS config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "12" if SOC_AM33XX - default "9" if SA1111 - default "11" + default "11" if SOC_AM33XX + default "8" if SA1111 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -1366,9 +1366,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. 
- config ALIGNMENT_TRAP def_bool CPU_CP15_MMU select HAVE_PROC_CPU if PROC_FS diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 6dc6fed12af8..345a67e67dbd 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -31,7 +31,7 @@ CONFIG_SOC_VF610=y CONFIG_SMP=y CONFIG_ARM_PSCI=y CONFIG_HIGHMEM=y -CONFIG_ARCH_FORCE_MAX_ORDER=14 +CONFIG_ARCH_FORCE_MAX_ORDER=13 CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" CONFIG_KEXEC=y CONFIG_CPU_FREQ=y diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig index bd29e5012cb0..385ad0f391a8 100644 --- a/arch/arm/configs/milbeaut_m10v_defconfig +++ b/arch/arm/configs/milbeaut_m10v_defconfig @@ -26,7 +26,7 @@ CONFIG_THUMB2_KERNEL=y # CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11 is not set # CONFIG_ARM_PATCH_IDIV is not set CONFIG_HIGHMEM=y -CONFIG_ARCH_FORCE_MAX_ORDER=12 +CONFIG_ARCH_FORCE_MAX_ORDER=11 CONFIG_SECCOMP=y CONFIG_KEXEC=y CONFIG_EFI=y diff --git a/arch/arm/configs/oxnas_v6_defconfig b/arch/arm/configs/oxnas_v6_defconfig index 70a67b3fc91b..90779812c6dd 100644 --- a/arch/arm/configs/oxnas_v6_defconfig +++ b/arch/arm/configs/oxnas_v6_defconfig @@ -12,7 +12,7 @@ CONFIG_ARCH_OXNAS=y CONFIG_MACH_OX820=y CONFIG_SMP=y CONFIG_NR_CPUS=16 -CONFIG_ARCH_FORCE_MAX_ORDER=12 +CONFIG_ARCH_FORCE_MAX_ORDER=11 CONFIG_SECCOMP=y CONFIG_ARM_APPENDED_DTB=y CONFIG_ARM_ATAG_DTB_COMPAT=y diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index e656d3af2266..b46e39369dbb 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -20,7 +20,7 @@ CONFIG_PXA_SHARPSL=y CONFIG_MACH_AKITA=y CONFIG_MACH_BORZOI=y CONFIG_AEABI=y -CONFIG_ARCH_FORCE_MAX_ORDER=9 +CONFIG_ARCH_FORCE_MAX_ORDER=8 CONFIG_CMDLINE="root=/dev/ram0 ro" CONFIG_KEXEC=y CONFIG_CPU_FREQ=y diff --git a/arch/arm/configs/sama7_defconfig b/arch/arm/configs/sama7_defconfig index 0d964c613d71..954112041403 100644 --- a/arch/arm/configs/sama7_defconfig +++ b/arch/arm/configs/sama7_defconfig @@ -19,7 +19,7 @@ CONFIG_ATMEL_CLOCKSOURCE_TCB=y # CONFIG_CACHE_L2X0 is not set # CONFIG_ARM_PATCH_IDIV is not set # CONFIG_CPU_SW_DOMAIN_PAN is not set -CONFIG_ARCH_FORCE_MAX_ORDER=15 +CONFIG_ARCH_FORCE_MAX_ORDER=14 CONFIG_UACCESS_WITH_MEMCPY=y # CONFIG_ATAGS is not set CONFIG_CMDLINE="console=ttyS0,115200 earlyprintk ignore_loglevel" diff --git a/arch/arm/configs/sp7021_defconfig b/arch/arm/configs/sp7021_defconfig index 5bca2eb59b86..c6448ac860b6 100644 --- a/arch/arm/configs/sp7021_defconfig +++ b/arch/arm/configs/sp7021_defconfig @@ -17,7 +17,7 @@ CONFIG_ARCH_SUNPLUS=y # CONFIG_VDSO is not set CONFIG_SMP=y CONFIG_THUMB2_KERNEL=y -CONFIG_ARCH_FORCE_MAX_ORDER=12 +CONFIG_ARCH_FORCE_MAX_ORDER=11 CONFIG_VFP=y CONFIG_NEON=y CONFIG_MODULES=y diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1023e896d46b..cb5c6aa3254e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1476,22 +1476,22 @@ config XEN # include/linux/mmzone.h requires the following to be true: # -# MAX_ORDER - 1 + PAGE_SHIFT <= SECTION_SIZE_BITS +# MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS # -# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS + 1 - PAGE_SHIFT: +# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT: # # | SECTION_SIZE_BITS | PAGE_SHIFT | max MAX_ORDER | default MAX_ORDER | # ----+-------------------+--------------+-----------------+--------------------+ -# 4K | 27 | 12 | 16 | 11 | -# 16K | 27 | 14 | 14 | 12 | -# 64K | 29 | 16 | 14 | 14 | +# 4K | 27 | 12 
| 15 | 10 | +# 16K | 27 | 14 | 13 | 11 | +# 64K | 29 | 16 | 13 | 13 | config ARCH_FORCE_MAX_ORDER int "Maximum zone order" if ARM64_4K_PAGES || ARM64_16K_PAGES - default "14" if ARM64_64K_PAGES - range 12 14 if ARM64_16K_PAGES - default "12" if ARM64_16K_PAGES - range 11 16 if ARM64_4K_PAGES - default "11" + default "13" if ARM64_64K_PAGES + range 11 13 if ARM64_16K_PAGES + default "11" if ARM64_16K_PAGES + range 10 15 if ARM64_4K_PAGES + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -1500,14 +1500,11 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - We make sure that we can allocate up to a HugePage size for each configuration. Hence we have : - MAX_ORDER = (PMD_SHIFT - PAGE_SHIFT) + 1 => PAGE_SHIFT - 2 + MAX_ORDER = PMD_SHIFT - PAGE_SHIFT => PAGE_SHIFT - 3 - However for 4K, we choose a higher default value, 11 as opposed to 10, giving us + However for 4K, we choose a higher default value, 10 as opposed to 9, giving us 4M allocations matching the default size used by generic code. config UNMAP_KERNEL_AT_EL0 diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h index 4b73463423c3..5f5437621029 100644 --- a/arch/arm64/include/asm/sparsemem.h +++ b/arch/arm64/include/asm/sparsemem.h @@ -10,7 +10,7 @@ /* * Section size must be at least 512MB for 64K base * page size config. Otherwise it will be less than - * (MAX_ORDER - 1) and the build process will fail. + * MAX_ORDER and the build process will fail. */ #ifdef CONFIG_ARM64_64K_PAGES #define SECTION_SIZE_BITS 29 diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 0a048dc06a7d..fe5472a184a3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -16,7 +16,7 @@ struct hyp_pool { * API at EL2. */ hyp_spinlock_t lock; - struct list_head free_area[MAX_ORDER]; + struct list_head free_area[MAX_ORDER + 1]; phys_addr_t range_start; phys_addr_t range_end; unsigned short max_order; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 803ba3222e75..b1e392186a0f 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -110,7 +110,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. 
*/ p->order = HYP_NO_ORDER; - for (; (order + 1) < pool->max_order; order++) { + for (; (order + 1) <= pool->max_order; order++) { buddy = __find_buddy_avail(pool, p, order); if (!buddy) break; @@ -203,9 +203,9 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) hyp_spin_lock(&pool->lock); /* Look for a high-enough-order page */ - while (i < pool->max_order && list_empty(&pool->free_area[i])) + while (i <= pool->max_order && list_empty(&pool->free_area[i])) i++; - if (i >= pool->max_order) { + if (i > pool->max_order) { hyp_spin_unlock(&pool->lock); return NULL; } @@ -228,8 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, int i; hyp_spin_lock_init(&pool->lock); - pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT)); - for (i = 0; i < pool->max_order; i++) + pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); + for (i = 0; i <= pool->max_order; i++) INIT_LIST_HEAD(&pool->free_area[i]); pool->range_start = phys; pool->range_end = phys + (nr_pages << PAGE_SHIFT); diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index dba02da6fa34..c694fac43bed 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -334,7 +334,7 @@ config HIGHMEM config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "11" + default "10" config DRAM_BASE hex "DRAM start addr (the same with memory-section in dts)" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index d7e4a24e8644..0d2f41fa56ee 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -202,10 +202,10 @@ config IA64_CYCLONE If you're unsure, answer N. config ARCH_FORCE_MAX_ORDER - int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE - range 11 17 if !HUGETLB_PAGE - default "17" if HUGETLB_PAGE - default "11" + int "MAX_ORDER (10 - 16)" if !HUGETLB_PAGE + range 10 16 if !HUGETLB_PAGE + default "16" if HUGETLB_PAGE + default "10" config SMP bool "Symmetric multi-processing support" diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h index 84e8ce387b69..a58f8b466d96 100644 --- a/arch/ia64/include/asm/sparsemem.h +++ b/arch/ia64/include/asm/sparsemem.h @@ -12,9 +12,9 @@ #define SECTION_SIZE_BITS (30) #define MAX_PHYSMEM_BITS (50) #ifdef CONFIG_ARCH_FORCE_MAX_ORDER -#if ((CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS) +#if (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT > SECTION_SIZE_BITS) #undef SECTION_SIZE_BITS -#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT) +#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT) #endif #endif diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 380d2f3966c9..e8dd4323fb86 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -170,7 +170,7 @@ static int __init hugetlb_setup_sz(char *str) size = memparse(str, &str); if (*str || !is_power_of_2(size) || !(tr_pages & size) || size <= PAGE_SIZE || - size >= (1UL << PAGE_SHIFT << MAX_ORDER)) { + size > (1UL << PAGE_SHIFT << MAX_ORDER)) { printk(KERN_WARNING "Invalid huge page size specified\n"); return 1; } diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 7fd51257e0ed..272a3a12c98d 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -420,12 +420,12 @@ config NODES_SHIFT config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 14 64 if PAGE_SIZE_64KB - default "14" if PAGE_SIZE_64KB - range 12 64 if PAGE_SIZE_16KB - default "12" if PAGE_SIZE_16KB - range 11 64 - default "11" + range 13 63 if PAGE_SIZE_64KB + default "13" if 
PAGE_SIZE_64KB + range 11 63 if PAGE_SIZE_16KB + default "11" if PAGE_SIZE_16KB + range 10 63 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -434,9 +434,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - The page size is not necessarily 4KB. Keep this in mind when choosing a value for this option. diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index 9380f6e3bb66..c9df6572133f 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -400,7 +400,7 @@ config SINGLE_MEMORY_CHUNK config ARCH_FORCE_MAX_ORDER int "Maximum zone order" if ADVANCED depends on !SINGLE_MEMORY_CHUNK - default "11" + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -413,9 +413,6 @@ config ARCH_FORCE_MAX_ORDER value also defines the minimal size of the hole that allows freeing unused memory map. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - config 060_WRITETHROUGH bool "Use write-through caching for 68060 supervisor accesses" depends on ADVANCED && M68060 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index e2f3ca73f40d..3e8b765b8c7b 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2137,14 +2137,14 @@ endchoice config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 14 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB - default "14" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB - range 13 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB - default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB - range 12 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB - default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB - range 0 64 - default "11" + range 13 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB + default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB + range 12 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB + default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB + range 11 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB + default "11" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB + range 0 63 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -2153,9 +2153,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - The page size is not necessarily 4KB. Keep this in mind when choosing a value for this option. diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index a582f72104f3..89708b95978c 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -46,8 +46,8 @@ source "kernel/Kconfig.hz" config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 9 20 - default "11" + range 8 19 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -56,9 +56,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. 
- This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - endmenu source "arch/nios2/platform/Kconfig.platform" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 49c6d36b2b3e..24d56536b269 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -897,18 +897,18 @@ config DATA_SHIFT config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 8 9 if PPC64 && PPC_64K_PAGES - default "9" if PPC64 && PPC_64K_PAGES - range 13 13 if PPC64 && !PPC_64K_PAGES - default "13" if PPC64 && !PPC_64K_PAGES - range 9 64 if PPC32 && PPC_16K_PAGES - default "9" if PPC32 && PPC_16K_PAGES - range 7 64 if PPC32 && PPC_64K_PAGES - default "7" if PPC32 && PPC_64K_PAGES - range 5 64 if PPC32 && PPC_256K_PAGES - default "5" if PPC32 && PPC_256K_PAGES - range 11 64 - default "11" + range 7 8 if PPC64 && PPC_64K_PAGES + default "8" if PPC64 && PPC_64K_PAGES + range 12 12 if PPC64 && !PPC_64K_PAGES + default "12" if PPC64 && !PPC_64K_PAGES + range 8 63 if PPC32 && PPC_16K_PAGES + default "8" if PPC32 && PPC_16K_PAGES + range 6 63 if PPC32 && PPC_64K_PAGES + default "6" if PPC32 && PPC_64K_PAGES + range 4 63 if PPC32 && PPC_256K_PAGES + default "4" if PPC32 && PPC_256K_PAGES + range 10 63 + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -917,9 +917,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - The page size is not necessarily 4KB. For example, on 64-bit systems, 64KB pages can be enabled via CONFIG_PPC_64K_PAGES. Keep this in mind when choosing a value for this option. 
diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig b/arch/powerpc/configs/85xx/ge_imp3a_defconfig index ea719898b581..6cb7e90d52c1 100644 --- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig +++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig @@ -30,7 +30,7 @@ CONFIG_PREEMPT=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_BINFMT_MISC=m CONFIG_MATH_EMULATION=y -CONFIG_ARCH_FORCE_MAX_ORDER=17 +CONFIG_ARCH_FORCE_MAX_ORDER=16 CONFIG_PCI=y CONFIG_PCIEPORTBUS=y CONFIG_PCI_MSI=y diff --git a/arch/powerpc/configs/fsl-emb-nonhw.config b/arch/powerpc/configs/fsl-emb-nonhw.config index ab8a8c4530d9..3009b0efaf34 100644 --- a/arch/powerpc/configs/fsl-emb-nonhw.config +++ b/arch/powerpc/configs/fsl-emb-nonhw.config @@ -41,7 +41,7 @@ CONFIG_FIXED_PHY=y CONFIG_FONT_8x16=y CONFIG_FONT_8x8=y CONFIG_FONTS=y -CONFIG_ARCH_FORCE_MAX_ORDER=13 +CONFIG_ARCH_FORCE_MAX_ORDER=12 CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAME_WARN=1024 CONFIG_FTL=y diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 7fcfba162e0d..81d7185e2ae8 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } mmap_read_lock(mm); - chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + chunk = (1UL << (PAGE_SHIFT + MAX_ORDER)) / sizeof(struct vm_area_struct *); chunk = min(chunk, entries); for (entry = 0; entry < entries; entry += chunk) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f1ba8d1e8c1a..b900933507da 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -615,7 +615,7 @@ void __init gigantic_hugetlb_cma_reserve(void) order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; if (order) { - VM_WARN_ON(order < MAX_ORDER); + VM_WARN_ON(order <= MAX_ORDER); hugetlb_cma_reserve(order); } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4f6e20a35aa1..5a81f106068e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1740,7 +1740,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. */ - const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER); /* * We create the default window as big as we can. 
The constraint is diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig index b52e14ccb450..4d655e8d4d74 100644 --- a/arch/sh/configs/ecovec24_defconfig +++ b/arch/sh/configs/ecovec24_defconfig @@ -8,7 +8,7 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH7724=y -CONFIG_ARCH_FORCE_MAX_ORDER=12 +CONFIG_ARCH_FORCE_MAX_ORDER=11 CONFIG_MEMORY_SIZE=0x10000000 CONFIG_FLATMEM_MANUAL=y CONFIG_SH_ECOVEC=y diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 411fdc0901f7..40271090bd7d 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -20,13 +20,13 @@ config PAGE_OFFSET config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - range 9 64 if PAGE_SIZE_16KB - default "9" if PAGE_SIZE_16KB - range 7 64 if PAGE_SIZE_64KB - default "7" if PAGE_SIZE_64KB - range 11 64 - default "14" if !MMU - default "11" + range 8 63 if PAGE_SIZE_16KB + default "8" if PAGE_SIZE_16KB + range 6 63 if PAGE_SIZE_64KB + default "6" if PAGE_SIZE_64KB + range 10 63 + default "13" if !MMU + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -35,9 +35,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. - The page size is not necessarily 4KB. Keep this in mind when choosing a value for this option. diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 84437a4c6545..e3242bf5a8df 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -271,7 +271,7 @@ config ARCH_SPARSEMEM_DEFAULT config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "13" + default "12" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -280,9 +280,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 13 means that the largest free memory block is 2^12 pages. - if SPARC64 || COMPILE_TEST source "kernel/power/Kconfig" endif diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 384480971805..7d91ca6aa675 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -193,7 +193,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, size = IO_PAGE_ALIGN(size); order = get_order(size); - if (unlikely(order >= MAX_ORDER)) + if (unlikely(order > MAX_ORDER)) return NULL; npages = size >> IO_PAGE_SHIFT; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 5b4de4a89dec..08ffd17d5ec3 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void) /* Now allocate error trap reporting scoreboard. 
*/ sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info)); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { if ((PAGE_SIZE << order) >= sz) break; } diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index dba8dffe2113..5e2931a18409 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -402,8 +402,8 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss) unsigned long new_rss_limit; gfp_t gfp_flags; - if (max_tsb_size > (PAGE_SIZE << (MAX_ORDER - 1))) - max_tsb_size = (PAGE_SIZE << (MAX_ORDER - 1)); + if (max_tsb_size > PAGE_SIZE << MAX_ORDER) + max_tsb_size = PAGE_SIZE << MAX_ORDER; new_cache_index = 0; for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) { diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 5e5a9c8e0e5d..8dcda617b8bf 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -368,10 +368,10 @@ int __init linux_main(int argc, char **argv) max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; /* - * Zones have to begin on a 1 << MAX_ORDER-1 page boundary, + * Zones have to begin on a 1 << MAX_ORDER page boundary, * so this makes sure that's true for highmem */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER - 1)) - 1); + max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1); if (physmem_size + iomem_size > max_physmem) { highmem = physmem_size + iomem_size - max_physmem; physmem_size -= highmem; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index bcb0c5d2abc2..3eee334ba873 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -773,7 +773,7 @@ config HIGHMEM config ARCH_FORCE_MAX_ORDER int "Maximum zone order" - default "11" + default "10" help The kernel memory allocator divides physically contiguous memory blocks into "zones", where each zone is a power of two number of @@ -782,9 +782,6 @@ config ARCH_FORCE_MAX_ORDER blocks of physically contiguous memory, then you may need to increase this value. - This config option is actually maximum order plus one. For example, - a value of 11 means that the largest free memory block is 2^10 pages. 
- endmenu menu "Power management options" diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index 817eda2075aa..c491fabe3617 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -226,8 +226,8 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from, if (*ppos < 0 || !count) return -EINVAL; - if (count > (PAGE_SIZE << (MAX_ORDER - 1))) - count = PAGE_SIZE << (MAX_ORDER - 1); + if (count > (PAGE_SIZE << MAX_ORDER)) + count = PAGE_SIZE << MAX_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) @@ -373,8 +373,8 @@ static ssize_t regmap_reg_ranges_read_file(struct file *file, if (*ppos < 0 || !count) return -EINVAL; - if (count > (PAGE_SIZE << (MAX_ORDER - 1))) - count = PAGE_SIZE << (MAX_ORDER - 1); + if (count > (PAGE_SIZE << MAX_ORDER)) + count = PAGE_SIZE << MAX_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 90d2dfb6448e..cec2c20f5e59 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3079,7 +3079,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr) } } -#define MAX_LEN (1UL << (MAX_ORDER - 1) << PAGE_SHIFT) +#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT) static int raw_cmd_copyin(int cmd, void __user *param, struct floppy_raw_cmd **rcmd) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index e2f25926eb51..bf095baca244 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -886,7 +886,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp) /* * The length of the ID shouldn't be assumed by software since * it may change in the future. The allocation size is limited - * to 1 << (PAGE_SHIFT + MAX_ORDER - 1) by the page allocator. + * to 1 << (PAGE_SHIFT + MAX_ORDER) by the page allocator. * If the allocation fails, simply return ENOMEM rather than * warning in the kernel log. */ diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c index 09586a837b1e..3df7a256e919 100644 --- a/drivers/crypto/hisilicon/sgl.c +++ b/drivers/crypto/hisilicon/sgl.c @@ -70,11 +70,11 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev, HISI_ACC_SGL_ALIGN_SIZE); /* - * the pool may allocate a block of memory of size PAGE_SIZE * 2^(MAX_ORDER - 1), + * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_ORDER, * block size may exceed 2^31 on ia64, so the max of block size is 2^31 */ - block_size = 1 << (PAGE_SHIFT + MAX_ORDER <= 32 ? - PAGE_SHIFT + MAX_ORDER - 1 : 31); + block_size = 1 << (PAGE_SHIFT + MAX_ORDER < 32 ? 
+ PAGE_SHIFT + MAX_ORDER : 31); sgl_num_per_block = block_size / sgl_size; block_num = count / sgl_num_per_block; remain_sgl = count % sgl_num_per_block; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index eae9e9f6d3bf..6bc26b4b06b8 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -36,7 +36,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj) struct sg_table *st; struct scatterlist *sg; unsigned int npages; /* restricted by sg_alloc_table */ - int max_order = MAX_ORDER - 1; + int max_order = MAX_ORDER; unsigned int max_segment; gfp_t gfp; diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c index defece0bcb81..99f39a5feca1 100644 --- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c @@ -115,7 +115,7 @@ static int get_huge_pages(struct drm_i915_gem_object *obj) do { struct page *page; - GEM_BUG_ON(order >= MAX_ORDER); + GEM_BUG_ON(order > MAX_ORDER); page = alloc_pages(GFP | __GFP_ZERO, order); if (!page) goto err; diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index aa116a7bbae3..6c8585abe08d 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644); static atomic_long_t allocated_pages; -static struct ttm_pool_type global_write_combined[MAX_ORDER]; -static struct ttm_pool_type global_uncached[MAX_ORDER]; +static struct ttm_pool_type global_write_combined[MAX_ORDER + 1]; +static struct ttm_pool_type global_uncached[MAX_ORDER + 1]; -static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER]; -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER]; +static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1]; +static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1]; static spinlock_t shrinker_lock; static struct list_head shrinker_list; @@ -405,7 +405,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt, else gfp_flags |= GFP_HIGHUSER; - for (order = min_t(unsigned int, MAX_ORDER - 1, __fls(num_pages)); + for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages)); num_pages; order = min_t(unsigned int, order, __fls(num_pages))) { struct ttm_pool_type *pt; @@ -542,7 +542,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, if (use_dma_alloc) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j < MAX_ORDER; ++j) + for (j = 0; j <= MAX_ORDER; ++j) ttm_pool_type_init(&pool->caching[i].orders[j], pool, i, j); } @@ -562,7 +562,7 @@ void ttm_pool_fini(struct ttm_pool *pool) if (pool->use_dma_alloc) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j < MAX_ORDER; ++j) + for (j = 0; j <= MAX_ORDER; ++j) ttm_pool_type_fini(&pool->caching[i].orders[j]); } @@ -616,7 +616,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m) unsigned int i; seq_puts(m, "\t "); - for (i = 0; i < MAX_ORDER; ++i) + for (i = 0; i <= MAX_ORDER; ++i) seq_printf(m, " ---%2u---", i); seq_puts(m, "\n"); } @@ -627,7 +627,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt, { unsigned int i; - for (i = 0; i < MAX_ORDER; ++i) + for (i = 0; i <= MAX_ORDER; ++i) seq_printf(m, " %8u", ttm_pool_type_count(&pt[i])); seq_puts(m, "\n"); } @@ -736,7 +736,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) spin_lock_init(&shrinker_lock); 
INIT_LIST_HEAD(&shrinker_list); - for (i = 0; i < MAX_ORDER; ++i) { + for (i = 0; i <= MAX_ORDER; ++i) { ttm_pool_type_init(&global_write_combined[i], NULL, ttm_write_combined, i); ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i); @@ -769,7 +769,7 @@ void ttm_pool_mgr_fini(void) { unsigned int i; - for (i = 0; i < MAX_ORDER; ++i) { + for (i = 0; i <= MAX_ORDER; ++i) { ttm_pool_type_fini(&global_write_combined[i]); ttm_pool_type_fini(&global_uncached[i]); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 8d772ea8a583..b574c58a3487 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -182,7 +182,7 @@ #ifdef CONFIG_CMA_ALIGNMENT #define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT) #else -#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER - 1) +#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER) #endif /* diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index ac996fd6bd9c..7a9f0b0bddbd 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -736,7 +736,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, struct page **pages; unsigned int i = 0, nid = dev_to_node(dev); - order_mask &= GENMASK(MAX_ORDER - 1, 0); + order_mask &= GENMASK(MAX_ORDER, 0); if (!order_mask) return NULL; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 586271b8aa39..85790b870877 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2440,8 +2440,8 @@ static bool its_parse_indirect_baser(struct its_node *its, * feature is not supported by hardware. */ new_order = max_t(u32, get_order(esz << ids), new_order); - if (new_order >= MAX_ORDER) { - new_order = MAX_ORDER - 1; + if (new_order > MAX_ORDER) { + new_order = MAX_ORDER; ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz); pr_warn("ITS@%pa: %s Table too large, reduce ids %llu->%u\n", &its->phys_base, its_base_type_string[type], diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cf077f9b30c3..733053c2eaa0 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -408,7 +408,7 @@ static void __cache_size_refresh(void) * If the allocation may fail we use __get_free_pages. Memory fragmentation * won't have a fatal effect here, but it just causes flushes of some other * buffers and more I/O will be performed. Don't use __get_free_pages if it - * always fails (i.e. order >= MAX_ORDER). + * always fails (i.e. order > MAX_ORDER). * * If the allocation shouldn't fail we use __vmalloc. 
This is only for the * initial reserve allocation, so there's no risk of wasting all vmalloc diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index d0e27438a73c..55fc5b80e649 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -443,7 +443,7 @@ static int genwqe_mmap(struct file *filp, struct vm_area_struct *vma) if (vsize == 0) return -EINVAL; - if (get_order(vsize) >= MAX_ORDER) + if (get_order(vsize) > MAX_ORDER) return -ENOMEM; dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL); diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index ac29698d085a..1c798d6b2dfb 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -210,7 +210,7 @@ u32 genwqe_crc32(u8 *buff, size_t len, u32 init) void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size, dma_addr_t *dma_handle) { - if (get_order(size) >= MAX_ORDER) + if (get_order(size) > MAX_ORDER) return NULL; return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle, @@ -308,7 +308,7 @@ int genwqe_alloc_sync_sgl(struct genwqe_dev *cd, struct genwqe_sgl *sgl, sgl->write = write; sgl->sgl_size = genwqe_sgl_size(sgl->nr_pages); - if (get_order(sgl->sgl_size) >= MAX_ORDER) { + if (get_order(sgl->sgl_size) > MAX_ORDER) { dev_err(&pci_dev->dev, "[%s] err: too much memory requested!\n", __func__); return ret; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 25be7f8ac7cd..3973ca6adf4c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1041,7 +1041,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) return; order = get_order(alloc_size); - if (order >= MAX_ORDER) { + if (order > MAX_ORDER) { if (net_ratelimit()) dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); return; diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index b35c9b6f913b..4e18b4cefa97 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -75,7 +75,7 @@ * pool for the 4MB. Thus the 16 Rx and Tx queues require 32 * 5 = 160 * plus 16 for the TSO pools for a total of 176 LTB mappings per VNIC. 
*/ -#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << (MAX_ORDER - 1)) * PAGE_SIZE)) +#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << MAX_ORDER) * PAGE_SIZE)) #define IBMVNIC_ONE_LTB_SIZE min((u32)(8 << 20), IBMVNIC_ONE_LTB_MAX) #define IBMVNIC_LTB_SET_SIZE (38 << 20) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index ec3f6cf05f8c..34781dec3856 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -946,7 +946,7 @@ static phys_addr_t hvfb_get_phymem(struct hv_device *hdev, if (request_size == 0) return -1; - if (order < MAX_ORDER) { + if (order <= MAX_ORDER) { /* Call alloc_pages if the size is less than 2^MAX_ORDER */ page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); if (!page) @@ -977,7 +977,7 @@ static void hvfb_release_phymem(struct hv_device *hdev, { unsigned int order = get_order(size); - if (order < MAX_ORDER) + if (order <= MAX_ORDER) __free_pages(pfn_to_page(paddr >> PAGE_SHIFT), order); else dma_free_coherent(&hdev->device, diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c index 0374ee6b6d03..32e74e02a02f 100644 --- a/drivers/video/fbdev/vermilion/vermilion.c +++ b/drivers/video/fbdev/vermilion/vermilion.c @@ -197,7 +197,7 @@ static int vmlfb_alloc_vram(struct vml_info *vinfo, va = &vinfo->vram[i]; order = 0; - while (requested > (PAGE_SIZE << order) && order < MAX_ORDER) + while (requested > (PAGE_SIZE << order) && order <= MAX_ORDER) order++; err = vmlfb_alloc_vram_area(va, order, 0); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 3f78a3a1eb75..5b15936a5214 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -33,7 +33,7 @@ #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ __GFP_NOMEMALLOC) /* The order of free page blocks to report to host */ -#define VIRTIO_BALLOON_HINT_BLOCK_ORDER (MAX_ORDER - 1) +#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER /* The size of a free page block in bytes */ #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 0c2892ec6817..835f6cc2fb66 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1120,13 +1120,13 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn, */ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) { - unsigned long order = MAX_ORDER - 1; + unsigned long order = MAX_ORDER; unsigned long i; /* * We might get called for ranges that don't cover properly aligned - * MAX_ORDER - 1 pages; however, we can only online properly aligned - * pages with an order of MAX_ORDER - 1 at maximum. + * MAX_ORDER pages; however, we can only online properly aligned + * pages with an order of MAX_ORDER at maximum. */ while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) order--; @@ -1237,9 +1237,9 @@ static void virtio_mem_online_page(struct virtio_mem *vm, bool do_online; /* - * We can get called with any order up to MAX_ORDER - 1. If our - * subblock size is smaller than that and we have a mixture of plugged - * and unplugged subblocks within such a page, we have to process in + * We can get called with any order up to MAX_ORDER. If our subblock + * size is smaller than that and we have a mixture of plugged and + * unplugged subblocks within such a page, we have to process in * smaller granularity. In that case we'll adjust the order exactly once * within the loop. 
*/ diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2f67516bb9bf..9fbb9b5256f7 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); - if (unlikely(order >= MAX_ORDER)) + if (unlikely(order > MAX_ORDER)) return -EFBIG; ret = inode_newsize_ok(inode, newsize); diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h index ef09b23d29e3..8ce14f9d202a 100644 --- a/include/drm/ttm/ttm_pool.h +++ b/include/drm/ttm/ttm_pool.h @@ -72,7 +72,7 @@ struct ttm_pool { bool use_dma32; struct { - struct ttm_pool_type orders[MAX_ORDER]; + struct ttm_pool_type orders[MAX_ORDER + 1]; } caching[TTM_NUM_CACHING_TYPES]; }; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7c977d234aba..8fb7d91cd0b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -818,7 +818,7 @@ static inline unsigned huge_page_shift(struct hstate *h) static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) >= MAX_ORDER; + return huge_page_order(h) > MAX_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bf8786d45b31..35b11cc210d9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -26,11 +26,11 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER -#define MAX_ORDER 11 +#define MAX_ORDER 10 #else #define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif -#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) +#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed @@ -93,7 +93,7 @@ static inline bool migratetype_is_mergeable(int mt) } #define for_each_migratetype_order(order, type) \ - for (order = 0; order < MAX_ORDER; order++) \ + for (order = 0; order <= MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; @@ -922,7 +922,7 @@ struct zone { CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ - struct free_area free_area[MAX_ORDER]; + struct free_area free_area[MAX_ORDER + 1]; /* zone flags, see below */ unsigned long flags; @@ -1745,7 +1745,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) -#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 5f1ae07d724b..e83c4c095041 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,14 +41,14 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. 
*/ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1) +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order (MAX_ORDER-1) +#define pageblock_order MAX_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 45af70315a94..aa4575ef2965 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -284,7 +284,7 @@ static inline unsigned int arch_slab_minalign(void) * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif @@ -292,7 +292,7 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLUB #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif @@ -305,7 +305,7 @@ static inline unsigned int arch_slab_minalign(void) * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 755f5f08ab38..90ce1dfd591c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -474,7 +474,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vmap_area, va_start); VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 4d40dcce7604..1acec2e22827 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, void *addr; int ret = -ENOMEM; - /* Cannot allocate larger than MAX_ORDER-1 */ - order = min(get_order(pool_size), MAX_ORDER-1); + /* Cannot allocate larger than MAX_ORDER */ + order = min(get_order(pool_size), MAX_ORDER); do { pool_size = 1 << (PAGE_SHIFT + order); @@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void) /* * If coherent_pool was not used on the command line, default the pool - * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER. 
*/ if (!atomic_pool_size) { unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index d6bbdb7830b2..a0433f37b024 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -609,8 +609,8 @@ static struct page *rb_alloc_aux_page(int node, int order) { struct page *page; - if (order >= MAX_ORDER) - order = MAX_ORDER - 1; + if (order > MAX_ORDER) + order = MAX_ORDER; do { page = alloc_pages_node(node, PERF_AUX_GFP, order); @@ -814,7 +814,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) + if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER) goto fail; node = (cpu == -1) ? cpu : cpu_to_node(cpu); diff --git a/mm/Kconfig b/mm/Kconfig index ca98b2072df5..969286ab14a1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -346,9 +346,9 @@ config SHUFFLE_PAGE_ALLOCATOR the presence of a memory-side-cache. There are also incidental security benefits as it reduces the predictability of page allocations to compliment SLAB_FREELIST_RANDOM, but the - default granularity of shuffling on the "MAX_ORDER - 1" i.e, - 10th order of pages is selected based on cache utilization - benefits on x86. + default granularity of shuffling on the MAX_ORDER i.e, 10th + order of pages is selected based on cache utilization benefits + on x86. While the randomization improves cache utilization it may negatively impact workloads on platforms without a cache. For @@ -666,8 +666,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available on a platform. - Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be - clamped down to MAX_ORDER - 1. + Note that the pageblock_order cannot exceed MAX_ORDER and will be + clamped down to MAX_ORDER. config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA diff --git a/mm/compaction.c b/mm/compaction.c index 5a9501e0ae01..709136556b9e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -583,7 +583,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (likely(order < MAX_ORDER)) { + if (likely(order <= MAX_ORDER)) { blockpfn += (1UL << order) - 1; cursor += (1UL << order) - 1; } @@ -938,7 +938,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * a valid page order. Consider only values in the * valid order range to prevent low_pfn overflow. */ - if (freepage_order > 0 && freepage_order < MAX_ORDER) + if (freepage_order > 0 && freepage_order <= MAX_ORDER) low_pfn += (1UL << freepage_order) - 1; continue; } @@ -954,7 +954,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); - if (likely(order < MAX_ORDER)) + if (likely(order <= MAX_ORDER)) low_pfn += (1UL << order) - 1; goto isolate_fail; } @@ -2124,7 +2124,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) /* Direct compactor: Is a suitable page free? 
*/ ret = COMPACT_NO_SUITABLE_PAGE; - for (order = cc->order; order < MAX_ORDER; order++) { + for (order = cc->order; order <= MAX_ORDER; order++) { struct free_area *area = &cc->zone->free_area[order]; bool can_steal; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 4362021b1ce7..c54177aabebd 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1086,7 +1086,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) struct page *page = NULL; #ifdef CONFIG_CONTIG_ALLOC - if (order >= MAX_ORDER) { + if (order > MAX_ORDER) { page = alloc_contig_pages((1 << order), GFP_KERNEL, first_online_node, NULL); if (page) { @@ -1096,7 +1096,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) } #endif - if (order < MAX_ORDER) + if (order <= MAX_ORDER) page = alloc_pages(GFP_KERNEL, order); return page; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2d860e70fe88..1df386d13d24 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -467,7 +467,7 @@ static int __init hugepage_init(void) /* * hugepages can't be allocated by the buddy allocator */ - MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER); + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER); /* * we use page->mapping and page->index in second tail page * as list_head: assuming THP order >= 2 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 712e32b38295..9122e50ae02a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2090,7 +2090,7 @@ pgoff_t hugetlb_basepage_index(struct page *page) pgoff_t index = page_index(page_head); unsigned long compound_idx; - if (compound_order(page_head) >= MAX_ORDER) + if (compound_order(page_head) > MAX_ORDER) compound_idx = page_to_pfn(page) - page_to_pfn(page_head); else compound_idx = page - page_head; @@ -4497,7 +4497,7 @@ static int __init default_hugepagesz_setup(char *s) * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge - * page size is gigantic (>= MAX_ORDER), then the pages must be + * page size is gigantic (> MAX_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index 7fb794242fad..ffedf4dbc49d 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void) struct metadata_page_pair { struct page *shadow, *origin; }; -static struct metadata_page_pair held_back[MAX_ORDER] __initdata; +static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata; /* * Eager metadata allocation. When the memblock allocator is freeing pages to @@ -211,8 +211,8 @@ static void kmsan_memblock_discard(void) * order=N-1, * - repeat. 
*/ - collect.order = MAX_ORDER - 1; - for (int i = MAX_ORDER - 1; i >= 0; i--) { + collect.order = MAX_ORDER; + for (int i = MAX_ORDER; i >= 0; i--) { if (held_back[i].shadow) smallstack_push(&collect, held_back[i].shadow); if (held_back[i].origin) diff --git a/mm/memblock.c b/mm/memblock.c index 25fd0626a9e7..7911224b1ed3 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2043,7 +2043,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) int order; while (start < end) { - order = min(MAX_ORDER - 1UL, __ffs(start)); + order = min_t(int, MAX_ORDER, __ffs(start)); while (start + (1UL << order) > end) order--; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index db3b270254f1..c8f0a8c2d049 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -596,7 +596,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) unsigned long pfn; /* - * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might + * Online the pages in MAX_ORDER aligned chunks. The callback might * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). @@ -605,7 +605,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * this and the first chunk to online will be pageblock_nr_pages. */ for (pfn = start_pfn; pfn < end_pfn;) { - int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + int order = min_t(int, MAX_ORDER, __ffs(pfn)); (*online_page_callback)(pfn_to_page(pfn), order); pfn += (1UL << order); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0936bde1d486..c3e49d028a7a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1063,7 +1063,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, unsigned long higher_page_pfn; struct page *higher_page; - if (order >= MAX_ORDER - 2) + if (order >= MAX_ORDER - 1) return false; higher_page_pfn = buddy_pfn & pfn; @@ -1118,7 +1118,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER - 1) { + while (order < MAX_ORDER) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -2499,7 +2499,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, struct page *page; /* Find a page of the appropriate size in the preferred list */ - for (current_order = order; current_order < MAX_ORDER; ++current_order) { + for (current_order = order; current_order <= MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); page = get_page_from_free_area(area, migratetype); if (!page) @@ -2871,7 +2871,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, continue; spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { struct free_area *area = &(zone->free_area[order]); page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); @@ -2955,7 +2955,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, * approximates finding the pageblock with the most free pages, which * would be too costly to do exactly. 
*/ - for (current_order = MAX_ORDER - 1; current_order >= min_order; + for (current_order = MAX_ORDER; current_order >= min_order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2981,7 +2981,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, return false; find_smallest: - for (current_order = order; current_order < MAX_ORDER; + for (current_order = order; current_order <= MAX_ORDER; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2994,7 +2994,7 @@ find_smallest: * This should not happen - we already found a suitable fallback * when looking for the largest page. */ - VM_BUG_ON(current_order == MAX_ORDER); + VM_BUG_ON(current_order > MAX_ORDER); do_steal: page = get_page_from_free_area(area, fallback_mt); @@ -3955,7 +3955,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; /* For a high-order request, check at least one suitable page is free */ - for (o = order; o < MAX_ORDER; o++) { + for (o = order; o <= MAX_ORDER; o++) { struct free_area *area = &z->free_area[o]; int mt; @@ -5475,7 +5475,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, * There are several places where we assume that the order value is sane * so bail out early if the request is out of bound. */ - if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp)) + if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)) return NULL; gfp &= gfp_allowed_mask; @@ -6205,8 +6205,8 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i for_each_populated_zone(zone) { unsigned int order; - unsigned long nr[MAX_ORDER], flags, total = 0; - unsigned char types[MAX_ORDER]; + unsigned long nr[MAX_ORDER + 1], flags, total = 0; + unsigned char types[MAX_ORDER + 1]; if (zone_idx(zone) > max_zone_idx) continue; @@ -6216,7 +6216,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; int type; @@ -6230,7 +6230,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i } } spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { printk(KERN_CONT "%lu*%lukB ", nr[order], K(1UL) << order); if (nr[order]) @@ -7581,7 +7581,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_ORDER - 1; + unsigned int order = MAX_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -9076,7 +9076,7 @@ void *__init alloc_large_system_hash(const char *tablename, else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); - } else if (get_order(size) >= MAX_ORDER || hashdist) { + } else if (get_order(size) > MAX_ORDER || hashdist) { table = vmalloc_huge(size, gfp_flags); virt = true; if (table) @@ -9290,7 +9290,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, order = 0; outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { - if (++order >= MAX_ORDER) { + if (++order > MAX_ORDER) { outer_start = start; break; } @@ -9540,7 +9540,7 @@ bool 
is_free_buddy_page(struct page *page) unsigned long pfn = page_to_pfn(page); unsigned int order; - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); if (PageBuddy(page_head) && @@ -9548,7 +9548,7 @@ bool is_free_buddy_page(struct page *page) break; } - return order < MAX_ORDER; + return order <= MAX_ORDER; } EXPORT_SYMBOL(is_free_buddy_page); @@ -9599,7 +9599,7 @@ bool take_page_off_buddy(struct page *page) bool ret = false; spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); int page_order = buddy_order(page_head); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 47fbc1696466..c6f3605e37ab 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -226,7 +226,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) */ if (PageBuddy(page)) { order = buddy_order(page); - if (order >= pageblock_order && order < MAX_ORDER - 1) { + if (order >= pageblock_order && order < MAX_ORDER) { buddy = find_buddy_page_pfn(page, page_to_pfn(page), order, NULL); if (buddy && !is_migrate_isolate_page(buddy)) { @@ -290,11 +290,11 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * isolate_single_pageblock() * @migratetype: migrate type to set in error recovery. * - * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one + * Free and in-use pages can be as big as MAX_ORDER and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same * time, free page accounting can go wrong. For example, in the case of - * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks. - * [ MAX_ORDER-1 ] + * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pagelbocks. + * [ MAX_ORDER ] * [ pageblock0 | pageblock1 ] * When either pageblock is isolated, if it is a free page, the page is not * split into separate migratetype lists, which is supposed to; if it is an @@ -451,7 +451,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, * the free page to the right migratetype list. * * head_pfn is not used here as a hugetlb page order - * can be bigger than MAX_ORDER-1, but after it is + * can be bigger than MAX_ORDER, but after it is * freed, the free page order is not. Use pfn within * the range to find the head of the free page. 
*/ @@ -459,7 +459,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, outer_pfn = pfn; while (!PageBuddy(pfn_to_page(outer_pfn))) { /* stop if we cannot find the free page */ - if (++order >= MAX_ORDER) + if (++order > MAX_ORDER) goto failed; outer_pfn &= ~0UL << order; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 220cdeddc295..31169b3e7f06 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -315,7 +315,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, unsigned long freepage_order; freepage_order = buddy_order_unsafe(page); - if (freepage_order < MAX_ORDER) + if (freepage_order <= MAX_ORDER) pfn += (1UL << freepage_order) - 1; continue; } @@ -549,7 +549,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); - if (freepage_order < MAX_ORDER) + if (freepage_order <= MAX_ORDER) pfn += (1UL << freepage_order) - 1; continue; } @@ -657,7 +657,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageBuddy(page)) { unsigned long order = buddy_order_unsafe(page); - if (order > 0 && order < MAX_ORDER) + if (order > 0 && order <= MAX_ORDER) pfn += (1UL << order) - 1; continue; } diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 275b466de37b..b021f482a4cb 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -20,7 +20,7 @@ static int page_order_update_notify(const char *val, const struct kernel_param * * If param is set beyond this limit, order is set to default * pageblock_order value */ - return param_set_uint_minmax(val, kp, 0, MAX_ORDER-1); + return param_set_uint_minmax(val, kp, 0, MAX_ORDER); } static const struct kernel_param_ops page_reporting_param_ops = { @@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev, return err; /* Process each free list starting from lowest order/mt */ - for (order = page_reporting_order; order < MAX_ORDER; order++) { + for (order = page_reporting_order; order <= MAX_ORDER; order++) { for (mt = 0; mt < MIGRATE_TYPES; mt++) { /* We do not pull pages from the isolate free list */ if (is_migrate_isolate(mt)) @@ -370,7 +370,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) */ if (page_reporting_order == -1) { - if (prdev->order > 0 && prdev->order < MAX_ORDER) + if (prdev->order > 0 && prdev->order <= MAX_ORDER) page_reporting_order = prdev->order; else page_reporting_order = pageblock_order; diff --git a/mm/shuffle.h b/mm/shuffle.h index cec62984f7d3..a6bdf54f96f1 100644 --- a/mm/shuffle.h +++ b/mm/shuffle.h @@ -4,7 +4,7 @@ #define _MM_SHUFFLE_H #include -#define SHUFFLE_ORDER (MAX_ORDER-1) +#define SHUFFLE_ORDER MAX_ORDER #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); diff --git a/mm/slab.c b/mm/slab.c index edbe722fb906..6b7c172158e5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str) { get_option(&str, &slab_max_order); slab_max_order = slab_max_order < 0 ? 0 : - min(slab_max_order, MAX_ORDER - 1); + min(slab_max_order, MAX_ORDER); slab_max_order_set = true; return 1; diff --git a/mm/slub.c b/mm/slub.c index 32eb6b50fe18..f49d669ff604 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4171,8 +4171,8 @@ static inline int calculate_order(unsigned int size) /* * Doh this slab cannot be placed using slub_max_order. 
*/ - order = calc_slab_order(size, 1, MAX_ORDER - 1, 1); - if (order < MAX_ORDER) + order = calc_slab_order(size, 1, MAX_ORDER, 1); + if (order <= MAX_ORDER) return order; return -ENOSYS; } @@ -4697,7 +4697,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, (int *)&slub_max_order); - slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1); + slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER); return 1; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8faac4310cb5..98719e72b5e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7002,7 +7002,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. */ - BUILD_BUG_ON(MAX_ORDER > S8_MAX); + BUILD_BUG_ON(MAX_ORDER >= S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); diff --git a/mm/vmstat.c b/mm/vmstat.c index 1ea6a5ce1c41..b7307627772d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1055,7 +1055,7 @@ static void fill_contig_page_info(struct zone *zone, info->free_blocks_total = 0; info->free_blocks_suitable = 0; - for (order = 0; order < MAX_ORDER; order++) { + for (order = 0; order <= MAX_ORDER; order++) { unsigned long blocks; /* @@ -1088,7 +1088,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in { unsigned long requested = 1UL << order; - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (WARN_ON_ONCE(order > MAX_ORDER)) return 0; if (!info->free_blocks_total) @@ -1462,7 +1462,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, int order; seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) + for (order = 0; order <= MAX_ORDER; ++order) /* * Access to nr_free is lockless as nr_free is used only for * printing purposes. Use data_race to avoid KCSAN warning. 
@@ -1491,7 +1491,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, pgdat->node_id, zone->name, migratetype_names[mtype]); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order <= MAX_ORDER; ++order) { unsigned long freecount = 0; struct free_area *area; struct list_head *curr; @@ -1531,7 +1531,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg) /* Print header */ seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); - for (order = 0; order < MAX_ORDER; ++order) + for (order = 0; order <= MAX_ORDER; ++order) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); @@ -2153,7 +2153,7 @@ static void unusable_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order <= MAX_ORDER; ++order) { fill_contig_page_info(zone, order, &info); index = unusable_free_index(order, &info); seq_printf(m, "%d.%03d ", index / 1000, index % 1000); @@ -2205,7 +2205,7 @@ static void extfrag_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { + for (order = 0; order <= MAX_ORDER; ++order) { fill_contig_page_info(zone, order, &info); index = __fragmentation_index(order, &info); seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 854772dd52fd..9b66d6aeeb1a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -843,7 +843,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) goto out; /* the calculated number of cq entries fits to mlx5 cq allocation */ cqe_size_order = cache_line_size() == 128 ? 7 : 6; - smc_order = MAX_ORDER - cqe_size_order - 1; + smc_order = MAX_ORDER - cqe_size_order; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 64499056648a..51ad29940f05 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -38,7 +38,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp) size = memparse(val, NULL); order = get_order(size); - if (order >= MAX_ORDER) + if (order > MAX_ORDER) return -EINVAL; ima_maxorder = order; ima_bufsize = PAGE_SIZE << order; diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index e65f89b12f1c..134f8eab0768 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -17,10 +17,10 @@ enum zone_type { }; #define MAX_NR_ZONES __MAX_NR_ZONES -#define MAX_ORDER 11 -#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) +#define MAX_ORDER 10 +#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) -#define pageblock_order (MAX_ORDER - 1) +#define pageblock_order MAX_ORDER #define pageblock_nr_pages BIT(pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) -- cgit v1.2.3-70-g09d2 From 7b806d229ef151c2997c041b791dfa89593e0e1b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 12 Mar 2023 23:40:14 +0000 Subject: mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This functionality's sole user, the drm ttm module, removed support 
for it in commit 0d979509539e ("drm/ttm: remove ttm_bo_vm_insert_huge()") as the whole approach is currently unworkable without a PMD/PUD special bit and updates to GUP. Link: https://lkml.kernel.org/r/604c2ad79659d4b8a6e3e1611c6219d5d3233988.1678661628.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Thomas Hellström Cc: Aaron Tomlin Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Heiko Carstens Cc: Huacai Chen Cc: Marcelo Tosatti Cc: Peter Xu Cc: "Russell King (Oracle)" Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 39 ++------------------------------------- mm/huge_memory.c | 31 +++++++++++++------------------ 2 files changed, 15 insertions(+), 55 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9a3a3af2dd80..20284387b841 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,44 +39,9 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); -vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, - pgprot_t pgprot, bool write); -/** - * vmf_insert_pfn_pmd - insert a pmd size pfn - * @vmf: Structure describing the fault - * @pfn: pfn to insert - * @pgprot: page protection to use - * @write: whether it's a write fault - * - * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. - * - * Return: vm_fault_t value. - */ -static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, - bool write) -{ - return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write); -} -vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, - pgprot_t pgprot, bool write); - -/** - * vmf_insert_pfn_pud - insert a pud size pfn - * @vmf: Structure describing the fault - * @pfn: pfn to insert - * @pgprot: page protection to use - * @write: whether it's a write fault - * - * Insert a pud size pfn. See vmf_insert_pfn() for additional info. - * - * Return: vm_fault_t value. - */ -static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, - bool write) -{ - return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write); -} +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1df386d13d24..81a5689806af 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -885,23 +885,20 @@ out_unlock: } /** - * vmf_insert_pfn_pmd_prot - insert a pmd size pfn + * vmf_insert_pfn_pmd - insert a pmd size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert - * @pgprot: page protection to use * @write: whether it's a write fault * - * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and - * also consult the vmf_insert_mixed_prot() documentation when - * @pgprot != @vmf->vma->vm_page_prot. + * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. 
*/ -vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, - pgprot_t pgprot, bool write) +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { unsigned long addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; + pgprot_t pgprot = vma->vm_page_prot; pgtable_t pgtable = NULL; /* @@ -929,7 +926,7 @@ vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot); +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) @@ -940,9 +937,10 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) } static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) + pud_t *pud, pfn_t pfn, bool write) { struct mm_struct *mm = vma->vm_mm; + pgprot_t prot = vma->vm_page_prot; pud_t entry; spinlock_t *ptl; @@ -976,23 +974,20 @@ out_unlock: } /** - * vmf_insert_pfn_pud_prot - insert a pud size pfn + * vmf_insert_pfn_pud - insert a pud size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert - * @pgprot: page protection to use * @write: whether it's a write fault * - * Insert a pud size pfn. See vmf_insert_pfn() for additional info and - * also consult the vmf_insert_mixed_prot() documentation when - * @pgprot != @vmf->vma->vm_page_prot. + * Insert a pud size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ -vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, - pgprot_t pgprot, bool write) +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { unsigned long addr = vmf->address & PUD_MASK; struct vm_area_struct *vma = vmf->vma; + pgprot_t pgprot = vma->vm_page_prot; /* * If we had pud_special, we could avoid all these restrictions, @@ -1010,10 +1005,10 @@ vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, track_pfn_insert(vma, &pgprot, pfn); - insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); + insert_pfn_pud(vma, addr, vmf->pud, pfn, write); return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot); +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3-70-g09d2 From 27da93d8e6d5633ac065c9316afc0f0240303c0a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 5 Apr 2023 18:02:36 +0200 Subject: mm/userfaultfd: don't consider uffd-wp bit of writable migration entries If we end up with a writable migration entry that has the uffd-wp bit set, we already messed up: the source PTE/PMD was writable, which means we could have modified the page without notifying uffd first. Setting the uffd-wp bit always implies converting migration entries to !writable migration entries. Commit 8f34f1eac382 ("mm/userfaultfd: fix uffd-wp special cases for fork()") documents that "3. Forget to carry over uffd-wp bit for a write migration huge pmd entry", but it doesn't really say why that should be relevant. So let's remove that code to avoid hiding an eventual underlying issue (in the future, we might want to warn when creating writable migration entries that have the uffd-wp bit set -- or even better when turning a PTE writable that still has the uffd-wp bit set). 
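As a rough, illustrative sketch only (not part of this patch; the helper name and placement are invented here), such a debug check on the PTE side could look like:

#include <linux/swapops.h>
#include <linux/bug.h>

/*
 * Illustrative only: warn if a swap PTE encodes a writable migration
 * entry that still carries the uffd-wp bit. Per the reasoning above,
 * wp-protecting a range must already have downgraded the migration
 * entry to !writable, so this combination indicates a bug elsewhere.
 */
static inline void check_uffd_wp_migration_pte(pte_t pte)
{
	swp_entry_t entry = pte_to_swp_entry(pte);

	if (is_migration_entry(entry))
		WARN_ON_ONCE(is_writable_migration_entry(entry) &&
			     pte_swp_uffd_wp(pte));
}

The hunks below then simply stop carrying the uffd-wp bit over for writable migration entries, rather than papering over such a bug.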
This now matches the handling for hugetlb migration entries in hugetlb_change_protection(). In copy_huge_pmd()/copy_nonpresent_pte()/copy_hugetlb_page_range(), we still transfer the uffd-bit also for writable migration entries, but simply because we have unified handling for "writable" and "readable-exclusive" migration entries, and we care about transferring the uffd-wp bit for the latter. Link: https://lkml.kernel.org/r/20230405160236.587705-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Peter Xu Cc: Muhammad Usama Anjum Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 -- mm/mprotect.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03d78901a7a7..0dbce09916c4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1845,8 +1845,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); - if (pmd_swp_uffd_wp(*pmd)) - newpmd = pmd_swp_mkuffd_wp(newpmd); } else { newpmd = *pmd; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 204194155863..92d3d3ca390a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -223,8 +223,6 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); } else if (is_writable_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See -- cgit v1.2.3-70-g09d2 From 3c811f7883c4ee5a34ba4354381bde062888dd31 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 11 Apr 2023 16:25:10 +0200 Subject: mm/migrate: revert "mm/migrate: fix wrongly apply write bit after mkdirty on sparc64" This reverts commit 96a9c287e25d ("mm/migrate: fix wrongly apply write bit after mkdirty on sparc64"). Now that sparc64 mkdirty handling is fixed and no longer sets a PTE/PMD writable that shouldn't be writable, let's revert the temporary fix. The mkdirty mm selftest still passes with this change on sparc64. Note that loongarch handling was fixed in commit bf2f34a506e6 ("LoongArch: Set _PAGE_DIRTY only if _PAGE_WRITE is set in {pmd,pte}_mkdirty()"). Link: https://lkml.kernel.org/r/20230411142512.438404-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: David S. 
Miller Cc: Hugh Dickins Cc: Peter Xu Cc: Sam Ravnborg Cc: Shuah Khan Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 ++---- mm/migrate.c | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0dbce09916c4..00a7a98f2ba2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3276,6 +3276,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); + if (is_writable_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) @@ -3283,10 +3285,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) /* NOTE: this may contain setting soft-dirty on some archs */ if (PageDirty(new) && is_migration_entry_dirty(entry)) pmde = pmd_mkdirty(pmde); - if (is_writable_migration_entry(entry)) - pmde = maybe_pmd_mkwrite(pmde, vma); - else - pmde = pmd_wrprotect(pmde); if (PageAnon(new)) { rmap_t rmap_flags = RMAP_COMPOUND; diff --git a/mm/migrate.c b/mm/migrate.c index afe21c48dc6e..41f2154d66ad 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -225,8 +225,6 @@ static bool remove_migration_pte(struct folio *folio, pte = maybe_mkwrite(pte, vma); else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); - else - pte = pte_wrprotect(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; -- cgit v1.2.3-70-g09d2 From 5436d6556937de6236ffa1829550d13702569dab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 11 Apr 2023 16:25:11 +0200 Subject: mm/huge_memory: revert "Partly revert "mm/thp: carry over dirty bit when thp splits on pmd"" This reverts commit 624a2c94f5b7 ("Partly revert "mm/thp: carry over dirty bit when thp splits on pmd"") and the fixup in commit e833bc503405 ("mm/thp: re-apply mkdirty for small pages after split"). Now that sparc64 mkdirty handling is fixed and no longer sets a PTE/PMD writable that shouldn't be writable, let's revert the temporary fix and remove the stale comment. The mkdirty mm selftest still passes with this change on sparc64. Note that loongarch handling was fixed in commit bf2f34a506e6 ("LoongArch: Set _PAGE_DIRTY only if _PAGE_WRITE is set in {pmd,pte}_mkdirty()") Link: https://lkml.kernel.org/r/20230411142512.438404-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: David S. Miller Cc: Hugh Dickins Cc: Peter Xu Cc: Sam Ravnborg Cc: Shuah Khan Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/huge_memory.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 00a7a98f2ba2..5700b1c727b6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2236,18 +2236,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = maybe_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); + if (!write) + entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); /* NOTE: this may set soft-dirty too on some archs */ if (dirty) entry = pte_mkdirty(entry); - /* - * NOTE: this needs to happen after pte_mkdirty, - * because some archs (sparc64, loongarch) could - * set hw write bit when mkdirty. 
- */ - if (!write) - entry = pte_wrprotect(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) -- cgit v1.2.3-70-g09d2 From 1462c52e9f2b99e72022ed8979bfc969894bb3da Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 11 Apr 2023 16:25:12 +0200 Subject: mm/huge_memory: conditionally call maybe_mkwrite() and drop pte_wrprotect() in __split_huge_pmd_locked() No need to call maybe_mkwrite() to then wrprotect if the source PMD was not writable. It's worth nothing that this now allows for PTEs to be writable even if the source PMD was not writable: if vma->vm_page_prot includes write permissions. As documented in commit 931298e103c2 ("mm/userfaultfd: rely on vma->vm_page_prot in uffd_wp_range()"), any mechanism that intends to have pages wrprotected (COW, writenotify, mprotect, uffd-wp, softdirty, ...) has to properly adjust vma->vm_page_prot upfront, to not include write permissions. If vma->vm_page_prot includes write permissions, the PTE/PMD can be writable as default. This now mimics the handling in mm/migrate.c:remove_migration_pte() and in mm/huge_memory.c:remove_migration_pmd(), which has been in place for a long time (except that 96a9c287e25d ("mm/migrate: fix wrongly apply write bit after mkdirty on sparc64") temporarily changed it). Link: https://lkml.kernel.org/r/20230411142512.438404-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: David S. Miller Cc: Hugh Dickins Cc: Peter Xu Cc: Sam Ravnborg Cc: Shuah Khan Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/huge_memory.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5700b1c727b6..c23fa39dec92 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2233,11 +2233,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mkuffd_wp(entry); } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); - entry = maybe_mkwrite(entry, vma); + if (write) + entry = maybe_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); - if (!write) - entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); /* NOTE: this may set soft-dirty too on some archs */ -- cgit v1.2.3-70-g09d2 From f3ebdf042df4e08bab1d5f8bf1c4b959d8741c10 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 18 Apr 2023 16:21:13 +0200 Subject: mm: don't check VMA write permissions if the PTE/PMD indicates write permissions Staring at the comment "Recheck VMA as permissions can change since migration started" in remove_migration_pte() can result in confusion, because if the source PTE/PMD indicates write permissions, then there should be no need to check VMA write permissions when restoring migration entries or PTE-mapping a PMD. Commit d3cb8bf6081b ("mm: migrate: Close race between migration completion and mprotect") introduced the maybe_mkwrite() handling in remove_migration_pte() in 2014, stating that a race between mprotect() and migration finishing would be possible, and that we could end up with a writable PTE that should be readable. However, mprotect() code first updates vma->vm_flags / vma->vm_page_prot and then walks the page tables to (a) set all present writable PTEs to read-only and (b) convert all writable migration entries to readable migration entries. While walking the page tables and modifying the entries, migration code has to grab the PT locks to synchronize against concurrent page table modifications. 
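To make that concrete, here is a deliberately simplified sketch (an assumption-laden illustration, loosely modelled on remove_migration_pte(); not the actual diff below) of how the restored PTE's writability can be derived from the migration entry alone, read under that same PT lock:

#include <linux/mm.h>
#include <linux/swapops.h>

/*
 * Illustrative only: the restored PTE becomes writable iff the
 * migration entry was writable; there is no re-check of VMA write
 * permissions.
 */
static pte_t migration_entry_to_pte(struct folio *folio,
				    struct vm_area_struct *vma,
				    swp_entry_t entry)
{
	pte_t pte = mk_pte(&folio->page, READ_ONCE(vma->vm_page_prot));

	if (is_writable_migration_entry(entry))
		pte = pte_mkwrite(pte);	/* source PTE was writable */
	/* readable(-exclusive) entries simply stay read-only */

	return pte;
}

With that picture in mind: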
Assuming migration would find a writable migration entry (while holding the PT lock) and replace it with a writable present PTE, surely mprotect() code didn't stumble over the writable migration entry yet (converting it into a readable migration entry) and would instead wait for the PT lock to convert the now present writable PTE into a read-only PTE. As mprotect() didn't finish yet, the behavior is just like migration didn't happen: a writable PTE will be converted to a read-only PTE. So it's fine to rely on the writability information in the source PTE/PMD and not recheck against the VMA as long as we're holding the PT lock to synchronize with anyone who concurrently wants to downgrade write permissions (like mprotect()) by first adjusting vma->vm_flags / vma->vm_page_prot to then walk over the page tables to adjust the page table entries. Running test cases that should reveal such races -- mprotect(PROT_READ) racing with page migration or THP splitting -- for multiple hours did not reveal an issue with this cleanup. Link: https://lkml.kernel.org/r/20230418142113.439494-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Kirill A. Shutemov Reviewed-by: Alistair Popple Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- mm/migrate.c | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c23fa39dec92..624671aaa60d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2234,7 +2234,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); if (write) - entry = maybe_mkwrite(entry, vma); + entry = pte_mkwrite(entry); if (anon_exclusive) SetPageAnonExclusive(page + i); if (!young) @@ -3271,7 +3271,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) - pmde = maybe_pmd_mkwrite(pmde, vma); + pmde = pmd_mkwrite(pmde); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) diff --git a/mm/migrate.c b/mm/migrate.c index 5d95e09b1618..02cace7955d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -213,16 +213,13 @@ static bool remove_migration_pte(struct folio *folio, if (pte_swp_soft_dirty(*pvmw.pte)) pte = pte_mksoft_dirty(pte); - /* - * Recheck VMA as permissions can change since migration started - */ entry = pte_to_swp_entry(*pvmw.pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) - pte = maybe_mkwrite(pte, vma); + pte = pte_mkwrite(pte); else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); -- cgit v1.2.3-70-g09d2