summaryrefslogtreecommitdiff
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c562
1 files changed, 430 insertions, 132 deletions
diff --git a/mm/memory.c b/mm/memory.c
index cda2c12c500b..2366578015ad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -666,17 +666,16 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
return NULL;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
unsigned long pfn = pmd_pfn(pmd);
- /*
- * There is no pmd_special() but there may be special pmds, e.g.
- * in a direct-access (dax) mapping, so let's just replicate the
- * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
- */
+ /* Currently it's only used for huge pfnmaps */
+ if (unlikely(pmd_special(pmd)))
+ return NULL;
+
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
@@ -927,8 +926,11 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* We have a prealloc page, all good! Take it
* over and copy the page & arm it.
*/
+
+ if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma))
+ return -EHWPOISON;
+
*prealloc = NULL;
- copy_user_highpage(&new_folio->page, page, addr, src_vma);
__folio_mark_uptodate(new_folio);
folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE);
folio_add_lru_vma(new_folio, dst_vma);
@@ -1167,8 +1169,9 @@ again:
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
+ * If copy failed due to hwpoison in source page, break out.
*/
- if (unlikely(ret == -EAGAIN))
+ if (unlikely(ret == -EAGAIN || ret == -EHWPOISON))
break;
if (unlikely(prealloc)) {
/*
@@ -1198,7 +1201,7 @@ again:
goto out;
}
entry.val = 0;
- } else if (ret == -EBUSY) {
+ } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) {
goto out;
} else if (ret == -EAGAIN) {
prealloc = folio_prealloc(src_mm, src_vma, addr, false);
@@ -4001,6 +4004,194 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
+static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
+ swp_entry_t entry;
+
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+ vmf->address, false);
+ if (!folio)
+ return NULL;
+
+ entry = pte_to_swp_entry(vmf->orig_pte);
+ if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
+ GFP_KERNEL, entry)) {
+ folio_put(folio);
+ return NULL;
+ }
+
+ return folio;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = swp_swap_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+ * While allocating a large folio and doing swap_read_folio, which is
+ * the case the being faulted pte doesn't have swapcache. We need to
+ * ensure all PTEs have no cache as well, otherwise, we might go to
+ * swap devices while the content is in swapcache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
+/*
+ * Check if the PTEs within a range are contiguous swap entries
+ * and have consistent swapcache, zeromap.
+ */
+static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
+{
+ unsigned long addr;
+ swp_entry_t entry;
+ int idx;
+ pte_t pte;
+
+ addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+ idx = (vmf->address - addr) / PAGE_SIZE;
+ pte = ptep_get(ptep);
+
+ if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
+ return false;
+ entry = pte_to_swp_entry(pte);
+ if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
+ return false;
+
+ /*
+ * swap_read_folio() can't handle the case a large folio is hybridly
+ * from different backends. And they are likely corner cases. Similar
+ * things might be added once zswap support large folios.
+ */
+ if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
+ return false;
+ if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
+ return false;
+
+ return true;
+}
+
+static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
+ unsigned long addr,
+ unsigned long orders)
+{
+ int order, nr;
+
+ order = highest_order(orders);
+
+ /*
+ * To swap in a THP with nr pages, we require that its first swap_offset
+ * is aligned with that number, as it was when the THP was swapped out.
+ * This helps filter out most invalid entries.
+ */
+ while (orders) {
+ nr = 1 << order;
+ if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
+ break;
+ order = next_order(&orders, order);
+ }
+
+ return orders;
+}
+
+static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long orders;
+ struct folio *folio;
+ unsigned long addr;
+ swp_entry_t entry;
+ spinlock_t *ptl;
+ pte_t *pte;
+ gfp_t gfp;
+ int order;
+
+ /*
+ * If uffd is active for the vma we need per-page fault fidelity to
+ * maintain the uffd semantics.
+ */
+ if (unlikely(userfaultfd_armed(vma)))
+ goto fallback;
+
+ /*
+ * A large swapped out folio could be partially or fully in zswap. We
+ * lack handling for such cases, so fallback to swapping in order-0
+ * folio.
+ */
+ if (!zswap_never_enabled())
+ goto fallback;
+
+ entry = pte_to_swp_entry(vmf->orig_pte);
+ /*
+ * Get a list of all the (large) orders below PMD_ORDER that are enabled
+ * and suitable for swapping THP.
+ */
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+ TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+ orders = thp_swap_suitable_orders(swp_offset(entry),
+ vmf->address, orders);
+
+ if (!orders)
+ goto fallback;
+
+ pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address & PMD_MASK, &ptl);
+ if (unlikely(!pte))
+ goto fallback;
+
+ /*
+ * For do_swap_page, find the highest order where the aligned range is
+ * completely swap entries with contiguous swap offsets.
+ */
+ order = highest_order(orders);
+ while (orders) {
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
+ break;
+ order = next_order(&orders, order);
+ }
+
+ pte_unmap_unlock(pte, ptl);
+
+ /* Try allocating the highest of the remaining orders. */
+ gfp = vma_thp_gfp_mask(vma);
+ while (orders) {
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ if (folio) {
+ if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
+ gfp, entry))
+ return folio;
+ folio_put(folio);
+ }
+ order = next_order(&orders, order);
+ }
+
+fallback:
+ return __alloc_swap_folio(vmf);
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
+{
+ return false;
+}
+
+static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+{
+ return __alloc_swap_folio(vmf);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4089,35 +4280,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
- /*
- * Prevent parallel swapin from proceeding with
- * the cache flag. Otherwise, another thread may
- * finish swapin first, free the entry, and swapout
- * reusing the same entry. It's undetectable as
- * pte_same() returns true due to entry reuse.
- */
- if (swapcache_prepare(entry)) {
- /* Relax a bit to prevent rapid repeated page faults */
- schedule_timeout_uninterruptible(1);
- goto out;
- }
- need_clear_cache = true;
-
/* skip swapcache */
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
- vma, vmf->address, false);
- page = &folio->page;
+ folio = alloc_swap_folio(vmf);
if (folio) {
__folio_set_locked(folio);
__folio_set_swapbacked(folio);
- if (mem_cgroup_swapin_charge_folio(folio,
- vma->vm_mm, GFP_KERNEL,
- entry)) {
- ret = VM_FAULT_OOM;
+ nr_pages = folio_nr_pages(folio);
+ if (folio_test_large(folio))
+ entry.val = ALIGN_DOWN(entry.val, nr_pages);
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread
+ * may finish swapin first, free the entry, and
+ * swapout reusing the same entry. It's
+ * undetectable as pte_same() returns true due
+ * to entry reuse.
+ */
+ if (swapcache_prepare(entry, nr_pages)) {
+ /*
+ * Relax a bit to prevent rapid
+ * repeated page faults.
+ */
+ schedule_timeout_uninterruptible(1);
goto out_page;
}
- mem_cgroup_swapin_uncharge_swap(entry);
+ need_clear_cache = true;
+
+ mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
@@ -4131,10 +4321,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio->private = NULL;
}
} else {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
vmf);
- if (page)
- folio = page_folio(page);
swapcache = folio;
}
@@ -4155,6 +4343,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+ page = folio_file_page(folio, swp_offset(entry));
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
@@ -4224,6 +4413,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_nomap;
}
+ /* allocated large folios for SWP_SYNCHRONOUS_IO */
+ if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
+ unsigned long nr = folio_nr_pages(folio);
+ unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
+ unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
+ pte_t *folio_ptep = vmf->pte - idx;
+ pte_t folio_pte = ptep_get(folio_ptep);
+
+ if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
+ swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
+ goto out_nomap;
+
+ page_idx = idx;
+ address = folio_start;
+ ptep = folio_ptep;
+ goto check_folio;
+ }
+
nr_pages = 1;
page_idx = 0;
address = vmf->address;
@@ -4355,11 +4562,12 @@ check_folio:
folio_add_lru_vma(folio, vma);
} else if (!folio_test_anon(folio)) {
/*
- * We currently only expect small !anon folios, which are either
- * fully exclusive or fully shared. If we ever get large folios
- * here, we have to be careful.
+ * We currently only expect small !anon folios which are either
+ * fully exclusive or fully shared, or new allocated large
+ * folios which are fully exclusive. If we ever get large
+ * folios within swapcache here, we have to be careful.
*/
- VM_WARN_ON_ONCE(folio_test_large(folio));
+ VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
} else {
@@ -4402,7 +4610,7 @@ unlock:
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
if (need_clear_cache)
- swapcache_clear(si, entry);
+ swapcache_clear(si, entry, nr_pages);
if (si)
put_swap_device(si);
return ret;
@@ -4418,7 +4626,7 @@ out_release:
folio_put(swapcache);
}
if (need_clear_cache)
- swapcache_clear(si, entry);
+ swapcache_clear(si, entry, nr_pages);
if (si)
put_swap_device(si);
return ret;
@@ -4612,9 +4820,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
folio_ref_add(folio, nr_pages - 1);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
-#endif
folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
setpte:
@@ -5109,10 +5315,14 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
if (ret & VM_FAULT_DONE_COW)
return ret;
- copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) {
+ ret = VM_FAULT_HWPOISON;
+ goto unlock;
+ }
__folio_mark_uptodate(folio);
ret |= finish_fault(vmf);
+unlock:
unlock_page(vmf->page);
put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -5217,16 +5427,46 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
- unsigned long addr, int page_nid, int *flags)
+int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
+ unsigned long addr, int *flags,
+ bool writable, int *last_cpupid)
{
struct vm_area_struct *vma = vmf->vma;
+ /*
+ * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
+ * much anyway since they can be in shared cache state. This misses
+ * the case where a mapping is writable but the process never writes
+ * to it but pte_write gets cleared during protection updates and
+ * pte_dirty has unpredictable behaviour between PTE scan updates,
+ * background writeback, dirty balancing and application behaviour.
+ */
+ if (!writable)
+ *flags |= TNF_NO_GROUP;
+
+ /*
+ * Flag if the folio is shared between multiple address spaces. This
+ * is later used when determining whether to group tasks together
+ */
+ if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
+ *flags |= TNF_SHARED;
+ /*
+ * For memory tiering mode, cpupid of slow memory page is used
+ * to record page access time. So use default value.
+ */
+ if (folio_use_access_time(folio))
+ *last_cpupid = (-1 & LAST_CPUPID_MASK);
+ else
+ *last_cpupid = folio_last_cpupid(folio);
+
/* Record the current PID acceesing VMA */
vma_set_access_pid_bit(vma);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == numa_node_id()) {
+#ifdef CONFIG_NUMA_BALANCING
+ count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
+#endif
+ if (folio_nid(folio) == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
*flags |= TNF_FAULT_LOCAL;
}
@@ -5328,36 +5568,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
if (!folio || folio_is_zone_device(folio))
goto out_map;
- /*
- * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
- * much anyway since they can be in shared cache state. This misses
- * the case where a mapping is writable but the process never writes
- * to it but pte_write gets cleared during protection updates and
- * pte_dirty has unpredictable behaviour between PTE scan updates,
- * background writeback, dirty balancing and application behaviour.
- */
- if (!writable)
- flags |= TNF_NO_GROUP;
-
- /*
- * Flag if the folio is shared between multiple address spaces. This
- * is later used when determining whether to group tasks together
- */
- if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
- flags |= TNF_SHARED;
-
nid = folio_nid(folio);
nr_pages = folio_nr_pages(folio);
- /*
- * For memory tiering mode, cpupid of slow memory page is used
- * to record page access time. So use default value.
- */
- if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
- !node_is_toptier(nid))
- last_cpupid = (-1 & LAST_CPUPID_MASK);
- else
- last_cpupid = folio_last_cpupid(folio);
- target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
+
+ target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
+ writable, &last_cpupid);
if (target_nid == NUMA_NO_NODE)
goto out_map;
if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
@@ -6013,10 +6228,6 @@ retry:
if (!vma_start_read(vma))
goto inval;
- /* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
-
/* Check if the VMA got isolated after we found it */
if (vma->detached) {
vma_end_read(vma);
@@ -6024,6 +6235,16 @@ retry:
/* The area was replaced with another one */
goto retry;
}
+ /*
+ * At this point, we have a stable reference to a VMA: The VMA is
+ * locked and we know it hasn't already been isolated.
+ * From here on, we can access the VMA without worrying about which
+ * fields are accessible for RCU readers.
+ */
+
+ /* Check since vm_start/vm_end might change before we lock the VMA */
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
rcu_read_unlock();
return vma;
@@ -6108,78 +6329,155 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
+static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
+ spinlock_t *lock, pte_t *ptep,
+ pgprot_t pgprot, unsigned long pfn_base,
+ unsigned long addr_mask, bool writable,
+ bool special)
+{
+ args->lock = lock;
+ args->ptep = ptep;
+ args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+ args->pgprot = pgprot;
+ args->writable = writable;
+ args->special = special;
+}
+
+static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_LOCKDEP
+ struct address_space *mapping = vma->vm_file->f_mapping;
+
+ if (mapping)
+ lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) ||
+ lockdep_is_held(&vma->vm_mm->mmap_lock));
+ else
+ lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
+#endif
+}
+
/**
- * follow_pte - look up PTE at a user virtual address
- * @vma: the memory mapping
- * @address: user virtual address
- * @ptepp: location to store found PTE
- * @ptlp: location to store the lock for the PTE
+ * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
+ * @args: Pointer to struct @follow_pfnmap_args
+ *
+ * The caller needs to setup args->vma and args->address to point to the
+ * virtual address as the target of such lookup. On a successful return,
+ * the results will be put into other output fields.
*
- * On a successful return, the pointer to the PTE is stored in @ptepp;
- * the corresponding lock is taken and its location is stored in @ptlp.
+ * After the caller finished using the fields, the caller must invoke
+ * another follow_pfnmap_end() to proper releases the locks and resources
+ * of such look up request.
*
- * The contents of the PTE are only stable until @ptlp is released using
- * pte_unmap_unlock(). This function will fail if the PTE is non-present.
- * Present PTEs may include PTEs that map refcounted pages, such as
- * anonymous folios in COW mappings.
+ * During the start() and end() calls, the results in @args will be valid
+ * as proper locks will be held. After the end() is called, all the fields
+ * in @follow_pfnmap_args will be invalid to be further accessed. Further
+ * use of such information after end() may require proper synchronizations
+ * by the caller with page table updates, otherwise it can create a
+ * security bug.
*
- * Callers must be careful when relying on PTE content after
- * pte_unmap_unlock(). Especially if the PTE maps a refcounted page,
- * callers must protect against invalidation with MMU notifiers; otherwise
- * access to the PFN at a later point in time can trigger use-after-free.
+ * If the PTE maps a refcounted page, callers are responsible to protect
+ * against invalidation with MMU notifiers; otherwise access to the PFN at
+ * a later point in time can trigger use-after-free.
*
* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
- * should be taken for read.
+ * should be taken for read, and the mmap semaphore cannot be released
+ * before the end() is invoked.
*
* This function must not be used to modify PTE content.
*
- * Return: zero on success, -ve otherwise.
+ * Return: zero on success, negative otherwise.
*/
-int follow_pte(struct vm_area_struct *vma, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+int follow_pfnmap_start(struct follow_pfnmap_args *args)
{
+ struct vm_area_struct *vma = args->vma;
+ unsigned long address = args->address;
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep;
+ spinlock_t *lock;
+ pgd_t *pgdp;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pfnmap_lockdep_assert(vma);
- mmap_assert_locked(mm);
if (unlikely(address < vma->vm_start || address >= vma->vm_end))
goto out;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
-
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+retry:
+ pgdp = pgd_offset(mm, address);
+ if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
goto out;
- p4d = p4d_offset(pgd, address);
- if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+ p4dp = p4d_offset(pgdp, address);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
goto out;
- pud = pud_offset(p4d, address);
- if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ pudp = pud_offset(p4dp, address);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
goto out;
+ if (pud_leaf(pud)) {
+ lock = pud_lock(mm, pudp);
+ if (!unlikely(pud_leaf(pud))) {
+ spin_unlock(lock);
+ goto retry;
+ }
+ pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
+ pud_pfn(pud), PUD_MASK, pud_write(pud),
+ pud_special(pud));
+ return 0;
+ }
- pmd = pmd_offset(pud, address);
- VM_BUG_ON(pmd_trans_huge(*pmd));
+ pmdp = pmd_offset(pudp, address);
+ pmd = pmdp_get_lockless(pmdp);
+ if (pmd_leaf(pmd)) {
+ lock = pmd_lock(mm, pmdp);
+ if (!unlikely(pmd_leaf(pmd))) {
+ spin_unlock(lock);
+ goto retry;
+ }
+ pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
+ pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
+ pmd_special(pmd));
+ return 0;
+ }
- ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+ ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
if (!ptep)
goto out;
- if (!pte_present(ptep_get(ptep)))
+ pte = ptep_get(ptep);
+ if (!pte_present(pte))
goto unlock;
- *ptepp = ptep;
+ pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
+ pte_pfn(pte), PAGE_MASK, pte_write(pte),
+ pte_special(pte));
return 0;
unlock:
- pte_unmap_unlock(ptep, *ptlp);
+ pte_unmap_unlock(ptep, lock);
out:
return -EINVAL;
}
-EXPORT_SYMBOL_GPL(follow_pte);
+EXPORT_SYMBOL_GPL(follow_pfnmap_start);
+
+/**
+ * follow_pfnmap_end(): End a follow_pfnmap_start() process
+ * @args: Pointer to struct @follow_pfnmap_args
+ *
+ * Must be used in pair of follow_pfnmap_start(). See the start() function
+ * above for more information.
+ */
+void follow_pfnmap_end(struct follow_pfnmap_args *args)
+{
+ if (args->lock)
+ spin_unlock(args->lock);
+ if (args->ptep)
+ pte_unmap(args->ptep);
+}
+EXPORT_SYMBOL_GPL(follow_pfnmap_end);
#ifdef CONFIG_HAVE_IOREMAP_PROT
/**
@@ -6200,34 +6498,34 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
- pte_t *ptep, pte;
- spinlock_t *ptl;
int offset = offset_in_page(addr);
int ret = -EINVAL;
+ bool writable;
+ struct follow_pfnmap_args args = { .vma = vma, .address = addr };
retry:
- if (follow_pte(vma, addr, &ptep, &ptl))
+ if (follow_pfnmap_start(&args))
return -EINVAL;
- pte = ptep_get(ptep);
- pte_unmap_unlock(ptep, ptl);
-
- prot = pgprot_val(pte_pgprot(pte));
- phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+ prot = pgprot_val(args.pgprot);
+ phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
+ writable = args.writable;
+ follow_pfnmap_end(&args);
- if ((write & FOLL_WRITE) && !pte_write(pte))
+ if ((write & FOLL_WRITE) && !writable)
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (!maddr)
return -ENOMEM;
- if (follow_pte(vma, addr, &ptep, &ptl))
+ if (follow_pfnmap_start(&args))
goto out_unmap;
- if (!pte_same(pte, ptep_get(ptep))) {
- pte_unmap_unlock(ptep, ptl);
+ if ((prot != pgprot_val(args.pgprot)) ||
+ (phys_addr != (args.pfn << PAGE_SHIFT)) ||
+ (writable != args.writable)) {
+ follow_pfnmap_end(&args);
iounmap(maddr);
-
goto retry;
}
@@ -6236,7 +6534,7 @@ retry:
else
memcpy_fromio(buf, maddr + offset, len);
ret = len;
- pte_unmap_unlock(ptep, ptl);
+ follow_pfnmap_end(&args);
out_unmap:
iounmap(maddr);
@@ -6587,7 +6885,7 @@ long copy_folio_from_user(struct folio *dst_folio,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS
static struct kmem_cache *page_ptl_cachep;