diff options
Diffstat (limited to 'fs/dax.c')
| -rw-r--r-- | fs/dax.c | 382 | 
1 files changed, 165 insertions, 217 deletions
| @@ -42,6 +42,9 @@  #define DAX_WAIT_TABLE_BITS 12  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) +/* The 'colour' (ie low bits) within a PMD of a page offset.  */ +#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1) +  static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];  static int __init init_dax_wait_table(void) @@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void)  }  fs_initcall(init_dax_wait_table); +/* + * We use lowest available bit in exceptional entry for locking, one bit for + * the entry size (PMD) and two more to tell us if the entry is a zero page or + * an empty entry that is just used for locking.  In total four special bits. + * + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE + * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem + * block allocation. + */ +#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4) +#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT) +#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) + +static unsigned long dax_radix_sector(void *entry) +{ +	return (unsigned long)entry >> RADIX_DAX_SHIFT; +} + +static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +{ +	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | +			((unsigned long)sector << RADIX_DAX_SHIFT) | +			RADIX_DAX_ENTRY_LOCK); +} + +static unsigned int dax_radix_order(void *entry) +{ +	if ((unsigned long)entry & RADIX_DAX_PMD) +		return PMD_SHIFT - PAGE_SHIFT; +	return 0; +} +  static int dax_is_pmd_entry(void *entry)  {  	return (unsigned long)entry & RADIX_DAX_PMD; @@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry)  static int dax_is_zero_entry(void *entry)  { -	return (unsigned long)entry & RADIX_DAX_HZP; +	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;  }  static int dax_is_empty_entry(void *entry) @@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,  	 * the range covered by the PMD map to the same bit lock.  	 */  	if (dax_is_pmd_entry(entry)) -		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); +		index &= ~PG_PMD_COLOUR;  	key->mapping = mapping;  	key->entry_start = index; @@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo  }  /* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree.  This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ +static void dax_wake_mapping_entry_waiter(struct address_space *mapping, +		pgoff_t index, void *entry, bool wake_all) +{ +	struct exceptional_entry_key key; +	wait_queue_head_t *wq; + +	wq = dax_entry_waitqueue(mapping, index, entry, &key); + +	/* +	 * Checking for locked entry and prepare_to_wait_exclusive() happens +	 * under mapping->tree_lock, ditto for entry handling in our callers. +	 * So at this point all tasks that could have seen our entry locked +	 * must be in the waitqueue and the following check will see them. +	 */ +	if (waitqueue_active(wq)) +		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); +} + +/*   * Check whether the given slot is locked. The function must be called with   * mapping->tree_lock held   */ @@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,  	for (;;) {  		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,  					  &slot); -		if (!entry || !radix_tree_exceptional_entry(entry) || +		if (!entry || +		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||  		    !slot_locked(mapping, slot)) {  			if (slotp)  				*slotp = slot; @@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,  }  static void put_locked_mapping_entry(struct address_space *mapping, -				     pgoff_t index, void *entry) +		pgoff_t index)  { -	if (!radix_tree_exceptional_entry(entry)) { -		unlock_page(entry); -		put_page(entry); -	} else { -		dax_unlock_mapping_entry(mapping, index); -	} +	dax_unlock_mapping_entry(mapping, index);  }  /* @@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,  static void put_unlocked_mapping_entry(struct address_space *mapping,  				       pgoff_t index, void *entry)  { -	if (!radix_tree_exceptional_entry(entry)) +	if (!entry)  		return;  	/* We have to wake up next waiter for the radix tree entry lock */ @@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,  }  /* - * Find radix tree entry at given index. If it points to a page, return with - * the page locked. If it points to the exceptional entry, return with the - * radix tree entry locked. If the radix tree doesn't contain given index, - * create empty exceptional entry for the index and return with it locked. + * Find radix tree entry at given index. If it points to an exceptional entry, + * return it with the radix tree entry locked. If the radix tree doesn't + * contain given index, create an empty exceptional entry for the index and + * return with it locked.   *   * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will   * either return that locked entry or will return an error.  This error will - * happen if there are any 4k entries (either zero pages or DAX entries) - * within the 2MiB range that we are requesting. + * happen if there are any 4k entries within the 2MiB range that we are + * requesting.   *   * We always favor 4k entries over 2MiB entries. There isn't a flow where we   * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB @@ -276,18 +334,21 @@ restart:  	spin_lock_irq(&mapping->tree_lock);  	entry = get_unlocked_mapping_entry(mapping, index, &slot); +	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { +		entry = ERR_PTR(-EIO); +		goto out_unlock; +	} +  	if (entry) {  		if (size_flag & RADIX_DAX_PMD) { -			if (!radix_tree_exceptional_entry(entry) || -			    dax_is_pte_entry(entry)) { +			if (dax_is_pte_entry(entry)) {  				put_unlocked_mapping_entry(mapping, index,  						entry);  				entry = ERR_PTR(-EEXIST);  				goto out_unlock;  			}  		} else { /* trying to grab a PTE entry */ -			if (radix_tree_exceptional_entry(entry) && -			    dax_is_pmd_entry(entry) && +			if (dax_is_pmd_entry(entry) &&  			    (dax_is_zero_entry(entry) ||  			     dax_is_empty_entry(entry))) {  				pmd_downgrade = true; @@ -321,7 +382,7 @@ restart:  				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);  		if (err) {  			if (pmd_downgrade) -				put_locked_mapping_entry(mapping, index, entry); +				put_locked_mapping_entry(mapping, index);  			return ERR_PTR(err);  		}  		spin_lock_irq(&mapping->tree_lock); @@ -371,52 +432,12 @@ restart:  		spin_unlock_irq(&mapping->tree_lock);  		return entry;  	} -	/* Normal page in radix tree? */ -	if (!radix_tree_exceptional_entry(entry)) { -		struct page *page = entry; - -		get_page(page); -		spin_unlock_irq(&mapping->tree_lock); -		lock_page(page); -		/* Page got truncated? Retry... */ -		if (unlikely(page->mapping != mapping)) { -			unlock_page(page); -			put_page(page); -			goto restart; -		} -		return page; -	}  	entry = lock_slot(mapping, slot);   out_unlock:  	spin_unlock_irq(&mapping->tree_lock);  	return entry;  } -/* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree.  This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. - */ -void dax_wake_mapping_entry_waiter(struct address_space *mapping, -		pgoff_t index, void *entry, bool wake_all) -{ -	struct exceptional_entry_key key; -	wait_queue_head_t *wq; - -	wq = dax_entry_waitqueue(mapping, index, entry, &key); - -	/* -	 * Checking for locked entry and prepare_to_wait_exclusive() happens -	 * under mapping->tree_lock, ditto for entry handling in our callers. -	 * So at this point all tasks that could have seen our entry locked -	 * must be in the waitqueue and the following check will see them. -	 */ -	if (waitqueue_active(wq)) -		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); -} -  static int __dax_invalidate_mapping_entry(struct address_space *mapping,  					  pgoff_t index, bool trunc)  { @@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,  	spin_lock_irq(&mapping->tree_lock);  	entry = get_unlocked_mapping_entry(mapping, index, NULL); -	if (!entry || !radix_tree_exceptional_entry(entry)) +	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))  		goto out;  	if (!trunc &&  	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || @@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,  	return __dax_invalidate_mapping_entry(mapping, index, false);  } -/* - * The user has performed a load from a hole in the file.  Allocating - * a new page in the file would cause excessive storage usage for - * workloads with sparse files.  We allocate a page cache page instead. - * We'll kick it out of the page cache if it's ever written to, - * otherwise it will simply fall out of the page cache under memory - * pressure without ever having been dirtied. - */ -static int dax_load_hole(struct address_space *mapping, void **entry, -			 struct vm_fault *vmf) -{ -	struct inode *inode = mapping->host; -	struct page *page; -	int ret; - -	/* Hole page already exists? Return it...  */ -	if (!radix_tree_exceptional_entry(*entry)) { -		page = *entry; -		goto finish_fault; -	} - -	/* This will replace locked radix tree entry with a hole page */ -	page = find_or_create_page(mapping, vmf->pgoff, -				   vmf->gfp_mask | __GFP_ZERO); -	if (!page) { -		ret = VM_FAULT_OOM; -		goto out; -	} - -finish_fault: -	vmf->page = page; -	ret = finish_fault(vmf); -	vmf->page = NULL; -	*entry = page; -	if (!ret) { -		/* Grab reference for PTE that is now referencing the page */ -		get_page(page); -		ret = VM_FAULT_NOPAGE; -	} -out: -	trace_dax_load_hole(inode, vmf, ret); -	return ret; -} -  static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,  		sector_t sector, size_t size, struct page *to,  		unsigned long vaddr) @@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,  				      unsigned long flags)  {  	struct radix_tree_root *page_tree = &mapping->page_tree; -	int error = 0; -	bool hole_fill = false;  	void *new_entry;  	pgoff_t index = vmf->pgoff;  	if (vmf->flags & FAULT_FLAG_WRITE)  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -	/* Replacing hole page with block mapping? */ -	if (!radix_tree_exceptional_entry(entry)) { -		hole_fill = true; -		/* -		 * Unmap the page now before we remove it from page cache below. -		 * The page is locked so it cannot be faulted in again. -		 */ -		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, -				    PAGE_SIZE, 0); -		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); -		if (error) -			return ERR_PTR(error); -	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { -		/* replacing huge zero page with PMD block mapping */ -		unmap_mapping_range(mapping, -			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); +	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { +		/* we are replacing a zero page with block mapping */ +		if (dax_is_pmd_entry(entry)) +			unmap_mapping_range(mapping, +					(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, +					PMD_SIZE, 0); +		else /* pte entry */ +			unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, +					PAGE_SIZE, 0);  	}  	spin_lock_irq(&mapping->tree_lock);  	new_entry = dax_radix_locked_entry(sector, flags); -	if (hole_fill) { -		__delete_from_page_cache(entry, NULL); -		/* Drop pagecache reference */ -		put_page(entry); -		error = __radix_tree_insert(page_tree, index, -				dax_radix_order(new_entry), new_entry); -		if (error) { -			new_entry = ERR_PTR(error); -			goto unlock; -		} -		mapping->nrexceptional++; -	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { +	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {  		/*  		 * Only swap our new entry into the radix tree if the current  		 * entry is a zero page or an empty entry.  If a normal PTE or @@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,  		WARN_ON_ONCE(ret != entry);  		__radix_tree_replace(page_tree, node, slot,  				     new_entry, NULL, NULL); +		entry = new_entry;  	} +  	if (vmf->flags & FAULT_FLAG_WRITE)  		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); - unlock: +  	spin_unlock_irq(&mapping->tree_lock); -	if (hole_fill) { -		radix_tree_preload_end(); -		/* -		 * We don't need hole page anymore, it has been replaced with -		 * locked radix tree entry now. -		 */ -		if (mapping->a_ops->freepage) -			mapping->a_ops->freepage(entry); -		unlock_page(entry); -		put_page(entry); -	} -	return new_entry; +	return entry;  }  static inline unsigned long @@ -646,11 +594,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,  	pte_t pte, *ptep = NULL;  	pmd_t *pmdp = NULL;  	spinlock_t *ptl; -	bool changed;  	i_mmap_lock_read(mapping);  	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { -		unsigned long address; +		unsigned long address, start, end;  		cond_resched(); @@ -658,8 +605,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,  			continue;  		address = pgoff_address(index, vma); -		changed = false; -		if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) + +		/* +		 * Note because we provide start/end to follow_pte_pmd it will +		 * call mmu_notifier_invalidate_range_start() on our behalf +		 * before taking any lock. +		 */ +		if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))  			continue;  		if (pmdp) { @@ -676,7 +628,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,  			pmd = pmd_wrprotect(pmd);  			pmd = pmd_mkclean(pmd);  			set_pmd_at(vma->vm_mm, address, pmdp, pmd); -			changed = true; +			mmu_notifier_invalidate_range(vma->vm_mm, start, end);  unlock_pmd:  			spin_unlock(ptl);  #endif @@ -691,13 +643,12 @@ unlock_pmd:  			pte = pte_wrprotect(pte);  			pte = pte_mkclean(pte);  			set_pte_at(vma->vm_mm, address, ptep, pte); -			changed = true; +			mmu_notifier_invalidate_range(vma->vm_mm, start, end);  unlock_pte:  			pte_unmap_unlock(ptep, ptl);  		} -		if (changed) -			mmu_notifier_invalidate_page(vma->vm_mm, address); +		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);  	}  	i_mmap_unlock_read(mapping);  } @@ -724,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev,  	spin_lock_irq(&mapping->tree_lock);  	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);  	/* Entry got punched out / reallocated? */ -	if (!entry2 || !radix_tree_exceptional_entry(entry2)) +	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))  		goto put_unlocked;  	/*  	 * Entry got reallocated elsewhere? No need to writeback. We have to @@ -796,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev,  	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);   dax_unlock:  	dax_read_unlock(id); -	put_locked_mapping_entry(mapping, index, entry); +	put_locked_mapping_entry(mapping, index);  	return ret;   put_unlocked: @@ -871,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);  static int dax_insert_mapping(struct address_space *mapping,  		struct block_device *bdev, struct dax_device *dax_dev, -		sector_t sector, size_t size, void **entryp, +		sector_t sector, size_t size, void *entry,  		struct vm_area_struct *vma, struct vm_fault *vmf)  {  	unsigned long vaddr = vmf->address; -	void *entry = *entryp;  	void *ret, *kaddr;  	pgoff_t pgoff;  	int id, rc; @@ -896,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping,  	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);  	if (IS_ERR(ret))  		return PTR_ERR(ret); -	*entryp = ret;  	trace_dax_insert_mapping(mapping->host, vmf, ret); -	return vm_insert_mixed(vma, vaddr, pfn); +	if (vmf->flags & FAULT_FLAG_WRITE) +		return vm_insert_mixed_mkwrite(vma, vaddr, pfn); +	else +		return vm_insert_mixed(vma, vaddr, pfn);  } -/** - * dax_pfn_mkwrite - handle first write to DAX page - * @vmf: The description of the fault +/* + * The user has performed a load from a hole in the file.  Allocating a new + * page in the file would cause excessive storage usage for workloads with + * sparse files.  Instead we insert a read-only mapping of the 4k zero page. + * If this page is ever written to we will re-fault and change the mapping to + * point to real DAX storage instead.   */ -int dax_pfn_mkwrite(struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, +			 struct vm_fault *vmf)  { -	struct file *file = vmf->vma->vm_file; -	struct address_space *mapping = file->f_mapping;  	struct inode *inode = mapping->host; -	void *entry, **slot; -	pgoff_t index = vmf->pgoff; +	unsigned long vaddr = vmf->address; +	int ret = VM_FAULT_NOPAGE; +	struct page *zero_page; +	void *entry2; -	spin_lock_irq(&mapping->tree_lock); -	entry = get_unlocked_mapping_entry(mapping, index, &slot); -	if (!entry || !radix_tree_exceptional_entry(entry)) { -		if (entry) -			put_unlocked_mapping_entry(mapping, index, entry); -		spin_unlock_irq(&mapping->tree_lock); -		trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); -		return VM_FAULT_NOPAGE; +	zero_page = ZERO_PAGE(0); +	if (unlikely(!zero_page)) { +		ret = VM_FAULT_OOM; +		goto out;  	} -	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); -	entry = lock_slot(mapping, slot); -	spin_unlock_irq(&mapping->tree_lock); -	/* -	 * If we race with somebody updating the PTE and finish_mkwrite_fault() -	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry -	 * the fault in either case. -	 */ -	finish_mkwrite_fault(vmf); -	put_locked_mapping_entry(mapping, index, entry); -	trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); -	return VM_FAULT_NOPAGE; + +	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, +			RADIX_DAX_ZERO_PAGE); +	if (IS_ERR(entry2)) { +		ret = VM_FAULT_SIGBUS; +		goto out; +	} + +	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); +out: +	trace_dax_load_hole(inode, vmf, ret); +	return ret;  } -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);  static bool dax_range_is_aligned(struct block_device *bdev,  				 unsigned int offset, unsigned int length) @@ -1056,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,  		if (map_len > end - pos)  			map_len = end - pos; +		/* +		 * The userspace address for the memory copy has already been +		 * validated via access_ok() in either vfs_read() or +		 * vfs_write(), depending on which operation we are doing. +		 */  		if (iov_iter_rw(iter) == WRITE)  			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,  					map_len, iter); @@ -1220,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,  			major = VM_FAULT_MAJOR;  		}  		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, -				sector, PAGE_SIZE, &entry, vmf->vma, vmf); +				sector, PAGE_SIZE, entry, vmf->vma, vmf);  		/* -EBUSY is fine, somebody else faulted on the same PTE */  		if (error == -EBUSY)  			error = 0; @@ -1228,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,  	case IOMAP_UNWRITTEN:  	case IOMAP_HOLE:  		if (!(vmf->flags & FAULT_FLAG_WRITE)) { -			vmf_ret = dax_load_hole(mapping, &entry, vmf); +			vmf_ret = dax_load_hole(mapping, entry, vmf);  			goto finish_iomap;  		}  		/*FALLTHRU*/ @@ -1255,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,  		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);  	}   unlock_entry: -	put_locked_mapping_entry(mapping, vmf->pgoff, entry); +	put_locked_mapping_entry(mapping, vmf->pgoff);   out:  	trace_dax_pte_fault_done(inode, vmf, vmf_ret);  	return vmf_ret;  }  #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up - * more often than one might expect in the below functions. - */ -#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1) -  static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, -		loff_t pos, void **entryp) +		loff_t pos, void *entry)  {  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;  	const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1280,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,  	void *ret = NULL, *kaddr;  	long length = 0;  	pgoff_t pgoff; -	pfn_t pfn; +	pfn_t pfn = {};  	int id;  	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) @@ -1300,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,  		goto unlock_fallback;  	dax_read_unlock(id); -	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, +	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,  			RADIX_DAX_PMD);  	if (IS_ERR(ret))  		goto fallback; -	*entryp = ret;  	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);  	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, @@ -1318,7 +1267,7 @@ fallback:  }  static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, -		void **entryp) +		void *entry)  {  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;  	unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1333,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,  	if (unlikely(!zero_page))  		goto fallback; -	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, -			RADIX_DAX_PMD | RADIX_DAX_HZP); +	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, +			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);  	if (IS_ERR(ret))  		goto fallback; -	*entryp = ret;  	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);  	if (!pmd_none(*(vmf->pmd))) { @@ -1413,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,  		goto fallback;  	/* -	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX -	 * PMD or a HZP entry.  If it can't (because a 4k page is already in -	 * the tree, for instance), it will return -EEXIST and we just fall -	 * back to 4k entries. +	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a +	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page +	 * is already in the tree, for instance), it will return -EEXIST and +	 * we just fall back to 4k entries.  	 */  	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);  	if (IS_ERR(entry)) @@ -1449,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,  	switch (iomap.type) {  	case IOMAP_MAPPED: -		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); +		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);  		break;  	case IOMAP_UNWRITTEN:  	case IOMAP_HOLE:  		if (WARN_ON_ONCE(write))  			break; -		result = dax_pmd_load_hole(vmf, &iomap, &entry); +		result = dax_pmd_load_hole(vmf, &iomap, entry);  		break;  	default:  		WARN_ON_ONCE(1); @@ -1478,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,  				&iomap);  	}   unlock_entry: -	put_locked_mapping_entry(mapping, pgoff, entry); +	put_locked_mapping_entry(mapping, pgoff);   fallback:  	if (result == VM_FAULT_FALLBACK) {  		split_huge_pmd(vma, vmf->pmd, vmf->address); | 
