diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-28 10:28:11 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-28 10:28:11 -0700 |
commit | 6e17c6de3ddf3073741d9c91a796ee696914d8a0 (patch) | |
tree | 2c425707f78642625dbe2c824c7fded2021e3dc7 /fs | |
parent | 6aeadf7896bff4ca230702daba8788455e6b866e (diff) | |
parent | acc72d59c7509540c27c49625cb4b5a8db1f1a84 (diff) |
Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton:
- Yosry Ahmed brought back some cgroup v1 stats in OOM logs
- Yosry has also eliminated cgroup's atomic rstat flushing
- Nhat Pham adds the new cachestat() syscall. It provides userspace
with the ability to query pagecache status - a similar concept to
mincore() but more powerful and with improved usability
- Mel Gorman provides more optimizations for compaction, reducing the
prevalence of page rescanning
- Lorenzo Stoakes has done some maintanance work on the
get_user_pages() interface
- Liam Howlett continues with cleanups and maintenance work to the
maple tree code. Peng Zhang also does some work on maple tree
- Johannes Weiner has done some cleanup work on the compaction code
- David Hildenbrand has contributed additional selftests for
get_user_pages()
- Thomas Gleixner has contributed some maintenance and optimization
work for the vmalloc code
- Baolin Wang has provided some compaction cleanups,
- SeongJae Park continues maintenance work on the DAMON code
- Huang Ying has done some maintenance on the swap code's usage of
device refcounting
- Christoph Hellwig has some cleanups for the filemap/directio code
- Ryan Roberts provides two patch series which yield some
rationalization of the kernel's access to pte entries - use the
provided APIs rather than open-coding accesses
- Lorenzo Stoakes has some fixes to the interaction between pagecache
and directio access to file mappings
- John Hubbard has a series of fixes to the MM selftesting code
- ZhangPeng continues the folio conversion campaign
- Hugh Dickins has been working on the pagetable handling code, mainly
with a view to reducing the load on the mmap_lock
- Catalin Marinas has reduced the arm64 kmalloc() minimum alignment
from 128 to 8
- Domenico Cerasuolo has improved the zswap reclaim mechanism by
reorganizing the LRU management
- Matthew Wilcox provides some fixups to make gfs2 work better with the
buffer_head code
- Vishal Moola also has done some folio conversion work
- Matthew Wilcox has removed the remnants of the pagevec code - their
functionality is migrated over to struct folio_batch
* tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits)
mm/hugetlb: remove hugetlb_set_page_subpool()
mm: nommu: correct the range of mmap_sem_read_lock in task_mem()
hugetlb: revert use of page_cache_next_miss()
Revert "page cache: fix page_cache_next/prev_miss off by one"
mm/vmscan: fix root proactive reclaim unthrottling unbalanced node
mm: memcg: rename and document global_reclaim()
mm: kill [add|del]_page_to_lru_list()
mm: compaction: convert to use a folio in isolate_migratepages_block()
mm: zswap: fix double invalidate with exclusive loads
mm: remove unnecessary pagevec includes
mm: remove references to pagevec
mm: rename invalidate_mapping_pagevec to mapping_try_invalidate
mm: remove struct pagevec
net: convert sunrpc from pagevec to folio_batch
i915: convert i915_gpu_error to use a folio_batch
pagevec: rename fbatch_count()
mm: remove check_move_unevictable_pages()
drm: convert drm_gem_put_pages() to use a folio_batch
i915: convert shmem_sg_free_table() to use a folio_batch
scatterlist: add sg_set_folio()
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/afs/write.c | 16 | ||||
-rw-r--r-- | fs/btrfs/file.c | 6 | ||||
-rw-r--r-- | fs/buffer.c | 259 | ||||
-rw-r--r-- | fs/ceph/file.c | 6 | ||||
-rw-r--r-- | fs/direct-io.c | 10 | ||||
-rw-r--r-- | fs/exec.c | 2 | ||||
-rw-r--r-- | fs/ext4/file.c | 11 | ||||
-rw-r--r-- | fs/ext4/inode.c | 4 | ||||
-rw-r--r-- | fs/f2fs/file.c | 3 | ||||
-rw-r--r-- | fs/fs-writeback.c | 16 | ||||
-rw-r--r-- | fs/fuse/file.c | 45 | ||||
-rw-r--r-- | fs/gfs2/aops.c | 69 | ||||
-rw-r--r-- | fs/gfs2/aops.h | 2 | ||||
-rw-r--r-- | fs/gfs2/file.c | 6 | ||||
-rw-r--r-- | fs/hugetlbfs/inode.c | 13 | ||||
-rw-r--r-- | fs/iomap/buffered-io.c | 9 | ||||
-rw-r--r-- | fs/iomap/direct-io.c | 88 | ||||
-rw-r--r-- | fs/libfs.c | 41 | ||||
-rw-r--r-- | fs/nfs/file.c | 6 | ||||
-rw-r--r-- | fs/ntfs/aops.c | 2 | ||||
-rw-r--r-- | fs/ntfs/file.c | 2 | ||||
-rw-r--r-- | fs/ntfs3/file.c | 3 | ||||
-rw-r--r-- | fs/proc/task_mmu.c | 77 | ||||
-rw-r--r-- | fs/proc/task_nommu.c | 6 | ||||
-rw-r--r-- | fs/ramfs/inode.c | 2 | ||||
-rw-r--r-- | fs/reiserfs/inode.c | 9 | ||||
-rw-r--r-- | fs/userfaultfd.c | 62 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 6 | ||||
-rw-r--r-- | fs/zonefs/file.c | 4 |
29 files changed, 364 insertions, 421 deletions
diff --git a/fs/afs/write.c b/fs/afs/write.c index 8750b99c3f56..9c7fd6fd8095 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -465,7 +465,7 @@ static void afs_extend_writeback(struct address_space *mapping, bool caching, unsigned int *_len) { - struct pagevec pvec; + struct folio_batch fbatch; struct folio *folio; unsigned long priv; unsigned int psize, filler = 0; @@ -476,7 +476,7 @@ static void afs_extend_writeback(struct address_space *mapping, unsigned int i; XA_STATE(xas, &mapping->i_pages, index); - pagevec_init(&pvec); + folio_batch_init(&fbatch); do { /* Firstly, we gather up a batch of contiguous dirty pages @@ -535,7 +535,7 @@ static void afs_extend_writeback(struct address_space *mapping, stop = false; index += folio_nr_pages(folio); - if (!pagevec_add(&pvec, &folio->page)) + if (!folio_batch_add(&fbatch, folio)) break; if (stop) break; @@ -545,14 +545,14 @@ static void afs_extend_writeback(struct address_space *mapping, xas_pause(&xas); rcu_read_unlock(); - /* Now, if we obtained any pages, we can shift them to being + /* Now, if we obtained any folios, we can shift them to being * writable and mark them for caching. */ - if (!pagevec_count(&pvec)) + if (!folio_batch_count(&fbatch)) break; - for (i = 0; i < pagevec_count(&pvec); i++) { - folio = page_folio(pvec.pages[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio); if (!folio_clear_dirty_for_io(folio)) @@ -565,7 +565,7 @@ static void afs_extend_writeback(struct address_space *mapping, folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } while (!stop); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ba5b0c9f2bbd..fd03e689a6be 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1145,7 +1145,6 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return -EAGAIN; - current->backing_dev_info = inode_to_bdi(inode); ret = file_remove_privs(file); if (ret) return ret; @@ -1165,10 +1164,8 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t end_pos = round_up(pos + count, fs_info->sectorsize); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); - if (ret) { - current->backing_dev_info = NULL; + if (ret) return ret; - } } return 0; @@ -1682,7 +1679,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, num_written = num_sync; } - current->backing_dev_info = NULL; return num_written; } diff --git a/fs/buffer.c b/fs/buffer.c index 93c7446d9221..cdd100273450 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -194,19 +194,19 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) pgoff_t index; struct buffer_head *bh; struct buffer_head *head; - struct page *page; + struct folio *folio; int all_mapped = 1; static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); - page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); - if (!page) + folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio)) goto out; spin_lock(&bd_mapping->private_lock); - if (!page_has_buffers(page)) + head = folio_buffers(folio); + if (!head) goto out_unlock; - head = page_buffers(page); bh = head; do { if (!buffer_mapped(bh)) @@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) } out_unlock: spin_unlock(&bd_mapping->private_lock); - put_page(page); + folio_put(folio); out: return ret; } @@ -906,8 +906,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, } EXPORT_SYMBOL_GPL(alloc_page_buffers); -static inline void -link_dev_buffers(struct page *page, struct buffer_head *head) +static inline void link_dev_buffers(struct folio *folio, + struct buffer_head *head) { struct buffer_head *bh, *tail; @@ -917,7 +917,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_private(page, head); + folio_attach_private(folio, head); } static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) @@ -933,15 +933,14 @@ static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) } /* - * Initialise the state of a blockdev page's buffers. + * Initialise the state of a blockdev folio's buffers. */ -static sector_t -init_page_buffers(struct page *page, struct block_device *bdev, - sector_t block, int size) +static sector_t folio_init_buffers(struct folio *folio, + struct block_device *bdev, sector_t block, int size) { - struct buffer_head *head = page_buffers(page); + struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh = head; - int uptodate = PageUptodate(page); + bool uptodate = folio_test_uptodate(folio); sector_t end_block = blkdev_max_block(bdev, size); do { @@ -975,7 +974,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, pgoff_t index, int size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; - struct page *page; + struct folio *folio; struct buffer_head *bh; sector_t end_block; int ret = 0; @@ -991,42 +990,37 @@ grow_dev_page(struct block_device *bdev, sector_t block, */ gfp_mask |= __GFP_NOFAIL; - page = find_or_create_page(inode->i_mapping, index, gfp_mask); - - BUG_ON(!PageLocked(page)); + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask); - if (page_has_buffers(page)) { - bh = page_buffers(page); + bh = folio_buffers(folio); + if (bh) { if (bh->b_size == size) { - end_block = init_page_buffers(page, bdev, - (sector_t)index << sizebits, - size); + end_block = folio_init_buffers(folio, bdev, + (sector_t)index << sizebits, size); goto done; } - if (!try_to_free_buffers(page_folio(page))) + if (!try_to_free_buffers(folio)) goto failed; } - /* - * Allocate some buffers for this page - */ - bh = alloc_page_buffers(page, size, true); + bh = folio_alloc_buffers(folio, size, true); /* - * Link the page to the buffers and initialise them. Take the + * Link the folio to the buffers and initialise them. Take the * lock to be atomic wrt __find_get_block(), which does not - * run under the page lock. + * run under the folio lock. */ spin_lock(&inode->i_mapping->private_lock); - link_dev_buffers(page, bh); - end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, - size); + link_dev_buffers(folio, bh); + end_block = folio_init_buffers(folio, bdev, + (sector_t)index << sizebits, size); spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; failed: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } @@ -1763,7 +1757,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ -int __block_write_full_page(struct inode *inode, struct page *page, +int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler) { @@ -1775,14 +1769,14 @@ int __block_write_full_page(struct inode *inode, struct page *page, int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); - head = folio_create_buffers(page_folio(page), inode, + head = folio_create_buffers(folio, inode, (1 << BH_Dirty) | (1 << BH_Uptodate)); /* * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. + * then we just miss that fact, and the folio stays dirty. * * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. @@ -1792,7 +1786,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, blocksize = bh->b_size; bbits = block_size_bits(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); last_block = (i_size_read(inode) - 1) >> bbits; /* @@ -1803,7 +1797,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (block > last_block) { /* * mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a + * this folio can be outside i_size when there is a * truncate in progress. */ /* @@ -1833,7 +1827,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, continue; /* * If it's a fully non-blocking write attempt and we cannot - * lock the buffer then redirty the page. Note that this can + * lock the buffer then redirty the folio. Note that this can * potentially cause a busy-wait loop from writeback threads * and kswapd activity, but those code paths have their own * higher-level throttling. @@ -1841,7 +1835,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); continue; } if (test_clear_buffer_dirty(bh)) { @@ -1852,11 +1846,11 @@ int __block_write_full_page(struct inode *inode, struct page *page, } while ((bh = bh->b_this_page) != head); /* - * The page and its buffers are protected by PageWriteback(), so we can - * drop the bh refcounts early. + * The folio and its buffers are protected by the writeback flag, + * so we can drop the bh refcounts early. */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; @@ -1866,20 +1860,20 @@ int __block_write_full_page(struct inode *inode, struct page *page, } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); err = 0; done: if (nr_underway == 0) { /* - * The page was marked dirty, but the buffers were + * The folio was marked dirty, but the buffers were * clean. Someone wrote them back by hand with * write_dirty_buffer/submit_bh. A rare case. */ - end_page_writeback(page); + folio_end_writeback(folio); /* - * The page and buffer_heads can be released at any time from + * The folio and buffer_heads can be released at any time from * here on. */ } @@ -1890,7 +1884,7 @@ recover: * ENOSPC, or some other error. We may already have added some * blocks to the file, so we need to write these out to avoid * exposing stale data. - * The page is currently locked and not marked for writeback + * The folio is currently locked and not marked for writeback */ bh = head; /* Recovery: lock and submit the mapped buffers */ @@ -1902,15 +1896,15 @@ recover: } else { /* * The buffer may have been set dirty during - * attachment to a dirty page. + * attachment to a dirty folio. */ clear_buffer_dirty(bh); } } while ((bh = bh->b_this_page) != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - mapping_set_error(page->mapping, err); - set_page_writeback(page); + folio_set_error(folio); + BUG_ON(folio_test_writeback(folio)); + mapping_set_error(folio->mapping, err); + folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { @@ -1920,39 +1914,40 @@ recover: } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); goto done; } -EXPORT_SYMBOL(__block_write_full_page); +EXPORT_SYMBOL(__block_write_full_folio); /* - * If a page has any new buffers, zero them out here, and mark them uptodate + * If a folio has any new buffers, zero them out here, and mark them uptodate * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. */ -void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) { - unsigned int block_start, block_end; + size_t block_start, block_end; struct buffer_head *head, *bh; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) + BUG_ON(!folio_test_locked(folio)); + head = folio_buffers(folio); + if (!head) return; - bh = head = page_buffers(page); + bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { - unsigned start, size; + if (!folio_test_uptodate(folio)) { + size_t start, xend; start = max(from, block_start); - size = min(to, block_end) - start; + xend = min(to, block_end); - zero_user(page, start, size); + folio_zero_segment(folio, start, xend); set_buffer_uptodate(bh); } @@ -1965,7 +1960,7 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) bh = bh->b_this_page; } while (bh != head); } -EXPORT_SYMBOL(page_zero_new_buffers); +EXPORT_SYMBOL(folio_zero_new_buffers); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, @@ -2103,7 +2098,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) - page_zero_new_buffers(&folio->page, from, to); + folio_zero_new_buffers(folio, from, to); return err; } @@ -2115,15 +2110,15 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static int __block_commit_write(struct inode *inode, struct page *page, - unsigned from, unsigned to) +static int __block_commit_write(struct inode *inode, struct folio *folio, + size_t from, size_t to) { - unsigned block_start, block_end; - int partial = 0; + size_t block_start, block_end; + bool partial = false; unsigned blocksize; struct buffer_head *bh, *head; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); blocksize = bh->b_size; block_start = 0; @@ -2131,7 +2126,7 @@ static int __block_commit_write(struct inode *inode, struct page *page, block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) - partial = 1; + partial = true; } else { set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -2146,11 +2141,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, /* * If this is a partial write which happened to make all buffers * uptodate then we can optimize away a bogus read_folio() for - * the next read(). Here we 'discover' whether the page went + * the next read(). Here we 'discover' whether the folio went * uptodate as a result of this (potentially partial) write. */ if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } @@ -2187,10 +2182,9 @@ int block_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; - unsigned start; - - start = pos & (PAGE_SIZE - 1); + size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { /* @@ -2202,18 +2196,18 @@ int block_write_end(struct file *file, struct address_space *mapping, * read_folio might come in and destroy our partial write. * * Do the simplest thing, and just treat any short write to a - * non uptodate page as a zero-length write, and force the + * non uptodate folio as a zero-length write, and force the * caller to redo the whole thing. */ - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) copied = 0; - page_zero_new_buffers(page, start+copied, start+len); + folio_zero_new_buffers(folio, start+copied, start+len); } - flush_dcache_page(page); + flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(inode, page, start, start+copied); + __block_commit_write(inode, folio, start, start + copied); return copied; } @@ -2536,8 +2530,9 @@ EXPORT_SYMBOL(cont_write_begin); int block_commit_write(struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; - __block_commit_write(inode,page,from,to); + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; + __block_commit_write(inode, folio, from, to); return 0; } EXPORT_SYMBOL(block_commit_write); @@ -2563,38 +2558,37 @@ EXPORT_SYMBOL(block_commit_write); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vma->vm_file); unsigned long end; loff_t size; int ret; - lock_page(page); + folio_lock(folio); size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || - (page_offset(page) > size)) { + if ((folio->mapping != inode->i_mapping) || + (folio_pos(folio) >= size)) { /* We overload EFAULT to mean page got truncated */ ret = -EFAULT; goto out_unlock; } - /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_SHIFT) > size) - end = size & ~PAGE_MASK; - else - end = PAGE_SIZE; + end = folio_size(folio); + /* folio is wholly or partially inside EOF */ + if (folio_pos(folio) + end > size) + end = size - folio_pos(folio); - ret = __block_write_begin(page, 0, end, get_block); + ret = __block_write_begin_int(folio, 0, end, get_block, NULL); if (!ret) - ret = block_commit_write(page, 0, end); + ret = __block_commit_write(inode, folio, 0, end); if (unlikely(ret < 0)) goto out_unlock; - set_page_dirty(page); - wait_for_stable_page(page); + folio_mark_dirty(folio); + folio_wait_stable(folio); return 0; out_unlock: - unlock_page(page); + folio_unlock(folio); return ret; } EXPORT_SYMBOL(block_page_mkwrite); @@ -2603,17 +2597,16 @@ int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize; sector_t iblock; - unsigned length, pos; + size_t offset, length, pos; struct inode *inode = mapping->host; - struct page *page; + struct folio *folio; struct buffer_head *bh; int err = 0; blocksize = i_blocksize(inode); - length = offset & (blocksize - 1); + length = from & (blocksize - 1); /* Block boundary? Nothing to do */ if (!length) @@ -2622,15 +2615,18 @@ int block_truncate_page(struct address_space *mapping, length = blocksize - length; iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); - page = grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + bh = folio_buffers(folio); + if (!bh) { + folio_create_empty_buffers(folio, blocksize, 0); + bh = folio_buffers(folio); + } /* Find the buffer that contains "offset" */ - bh = page_buffers(page); + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -2649,7 +2645,7 @@ int block_truncate_page(struct address_space *mapping, } /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { @@ -2659,12 +2655,12 @@ int block_truncate_page(struct address_space *mapping, goto unlock; } - zero_user(page, offset, length); + folio_zero_range(folio, offset, length); mark_buffer_dirty(bh); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } @@ -2676,33 +2672,32 @@ EXPORT_SYMBOL(block_truncate_page); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc) { - struct inode * const inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - /* Is the page fully inside i_size? */ - if (page->index < end_index) - return __block_write_full_page(inode, page, get_block, wbc, + /* Is the folio fully inside i_size? */ + if (folio_pos(folio) + folio_size(folio) <= i_size) + return __block_write_full_folio(inode, folio, get_block, wbc, end_buffer_async_write); - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - unlock_page(page); + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); return 0; /* don't care */ } /* - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and + * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_segment(page, offset, PAGE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, - end_buffer_async_write); + folio_zero_segment(folio, offset_in_folio(folio, i_size), + folio_size(folio)); + return __block_write_full_folio(inode, folio, get_block, wbc, + end_buffer_async_write); } EXPORT_SYMBOL(block_write_full_page); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4285f6cb5d3b..b1925232dc08 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1854,9 +1854,6 @@ retry_snap: else ceph_start_io_write(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - if (iocb->ki_flags & IOCB_APPEND) { err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); if (err < 0) @@ -1957,8 +1954,6 @@ retry_snap: * can not run at the same time */ written = generic_perform_write(iocb, from); - if (likely(written >= 0)) - iocb->ki_pos = pos + written; ceph_end_io_write(inode); } @@ -2003,7 +1998,6 @@ out: ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); - current->backing_dev_info = NULL; return written ? written : err; } diff --git a/fs/direct-io.c b/fs/direct-io.c index 2ceb378b93c0..7bc494ee56b9 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -297,14 +297,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) * zeros from unwritten extents. */ if (flags & DIO_COMPLETE_INVALIDATE && - ret > 0 && dio_op == REQ_OP_WRITE && - dio->inode->i_mapping->nrpages) { - err = invalidate_inode_pages2_range(dio->inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + ret - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(dio->iocb->ki_filp); - } + ret > 0 && dio_op == REQ_OP_WRITE) + kiocb_invalidate_post_direct_write(dio->iocb, ret); inode_dio_end(dio->inode); diff --git a/fs/exec.c b/fs/exec.c index a466e797c8e2..25c65b64544b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -220,7 +220,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, - &page, NULL, NULL); + &page, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return NULL; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e8261900f4f3..6a16d07965f9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -296,18 +296,13 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, if (ret <= 0) goto out; - current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; out: inode_unlock(inode); - if (likely(ret > 0)) { - iocb->ki_pos += ret; - ret = generic_write_sync(iocb, ret); - } - - return ret; + if (unlikely(ret <= 0)) + return ret; + return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 02de439bf1f0..9ca583360166 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1093,7 +1093,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - page_zero_new_buffers(&folio->page, from, to); + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1339,7 +1339,7 @@ static int ext4_write_end(struct file *file, } /* - * This is a private version of page_zero_new_buffers() which doesn't + * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3fce122997ca..2435111a8532 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4543,12 +4543,9 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; - current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; if (ret > 0) { - iocb->ki_pos += ret; f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_IO, ret); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae4e51e91ee3..aca4b4811394 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2024,7 +2024,6 @@ static long wb_writeback(struct bdi_writeback *wb, struct blk_plug plug; blk_start_plug(&plug); - spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -2049,6 +2048,9 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !wb_over_bg_thresh(wb)) break; + + spin_lock(&wb->list_lock); + /* * Kupdate and background works are special and we want to * include all inodes that need writing. Livelock avoidance is @@ -2078,13 +2080,19 @@ static long wb_writeback(struct bdi_writeback *wb, * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ - if (progress) + if (progress) { + spin_unlock(&wb->list_lock); continue; + } + /* * No more inodes for IO, bail */ - if (list_empty(&wb->b_more_io)) + if (list_empty(&wb->b_more_io)) { + spin_unlock(&wb->list_lock); break; + } + /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise @@ -2096,9 +2104,7 @@ static long wb_writeback(struct bdi_writeback *wb, spin_unlock(&wb->list_lock); /* This function drops i_lock... */ inode_sleep_on_writeback(inode); - spin_lock(&wb->list_lock); } - spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work->nr_pages; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4553124f5406..bc4115288eec 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1280,13 +1280,13 @@ static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, max_pages); } -static ssize_t fuse_perform_write(struct kiocb *iocb, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + loff_t pos = iocb->ki_pos; int err = 0; ssize_t res = 0; @@ -1329,7 +1329,10 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, fuse_write_update_attr(inode, pos, res); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - return res > 0 ? res : err; + if (!res) + return err; + iocb->ki_pos += res; + return res; } static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1337,11 +1340,9 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; ssize_t written = 0; - ssize_t written_buffered = 0; struct inode *inode = mapping->host; ssize_t err; struct fuse_conn *fc = get_fuse_conn(inode); - loff_t endbyte = 0; if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ @@ -1362,9 +1363,6 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(iocb, from); if (err <= 0) goto out; @@ -1378,38 +1376,15 @@ writethrough: goto out; if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos = iocb->ki_pos; written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) goto out; - - pos += written; - - written_buffered = fuse_perform_write(iocb, mapping, from, pos); - if (written_buffered < 0) { - err = written_buffered; - goto out; - } - endbyte = pos + written_buffered - 1; - - err = filemap_write_and_wait_range(file->f_mapping, pos, - endbyte); - if (err) - goto out; - - invalidate_mapping_pages(file->f_mapping, - pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); - - written += written_buffered; - iocb->ki_pos = pos + written_buffered; + written = direct_write_fallback(iocb, from, written, + fuse_perform_write(iocb, from)); } else { - written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); - if (written >= 0) - iocb->ki_pos += written; + written = fuse_perform_write(iocb, from); } out: - current->backing_dev_info = NULL; inode_unlock(inode); if (written > 0) written = generic_write_sync(iocb, written); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index a5f4be6b9213..1c407eba1e30 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -38,13 +38,13 @@ void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, - unsigned int from, unsigned int len) + size_t from, size_t len) { struct buffer_head *head = folio_buffers(folio); unsigned int bsize = head->b_size; struct buffer_head *bh; - unsigned int to = from + len; - unsigned int start, end; + size_t to = from + len; + size_t start, end; for (bh = head, start = 0; bh != head || !start; bh = bh->b_this_page, start = end) { @@ -82,61 +82,61 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, } /** - * gfs2_write_jdata_page - gfs2 jdata-specific version of block_write_full_page - * @page: The page to write + * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page + * @folio: The folio to write * @wbc: The writeback control * * This is the same as calling block_write_full_page, but it also * writes pages outside of i_size */ -static int gfs2_write_jdata_page(struct page *page, +static int gfs2_write_jdata_folio(struct folio *folio, struct writeback_control *wbc) { - struct inode * const inode = page->mapping->host; + struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; /* - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and + * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - offset = i_size & (PAGE_SIZE - 1); - if (page->index == end_index && offset) - zero_user_segment(page, offset, PAGE_SIZE); + if (folio_pos(folio) < i_size && + i_size < folio_pos(folio) + folio_size(folio)) + folio_zero_segment(folio, offset_in_folio(folio, i_size), + folio_size(folio)); - return __block_write_full_page(inode, page, gfs2_get_block_noalloc, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc, + wbc, end_buffer_async_write); } /** - * __gfs2_jdata_writepage - The core of jdata writepage - * @page: The page to write + * __gfs2_jdata_write_folio - The core of jdata writepage + * @folio: The folio to write * @wbc: The writeback control * * This is shared between writepage and writepages and implements the * core of the writepage operation. If a transaction is required then - * PageChecked will have been set and the transaction will have + * the checked flag will have been set and the transaction will have * already been started before this is called. */ - -static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +static int __gfs2_jdata_write_folio(struct folio *folio, + struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - if (PageChecked(page)) { - ClearPageChecked(page); - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - BIT(BH_Dirty)|BIT(BH_Uptodate)); + if (folio_test_checked(folio)) { + folio_clear_checked(folio); + if (!folio_buffers(folio)) { + folio_create_empty_buffers(folio, + inode->i_sb->s_blocksize, + BIT(BH_Dirty)|BIT(BH_Uptodate)); } - gfs2_trans_add_databufs(ip, page_folio(page), 0, PAGE_SIZE); + gfs2_trans_add_databufs(ip, folio, 0, folio_size(folio)); } - return gfs2_write_jdata_page(page, wbc); + return gfs2_write_jdata_folio(folio, wbc); } /** @@ -150,20 +150,21 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; - if (PageChecked(page) || current->journal_info) + if (folio_test_checked(folio) || current->journal_info) goto out_ignore; - return __gfs2_jdata_writepage(page, wbc); + return __gfs2_jdata_write_folio(folio, wbc); out_ignore: - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); out: - unlock_page(page); + folio_unlock(folio); return 0; } @@ -255,7 +256,7 @@ continue_unlock: trace_wbc_writepage(wbc, inode_to_bdi(inode)); - ret = __gfs2_jdata_writepage(&folio->page, wbc); + ret = __gfs2_jdata_write_folio(folio, wbc); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { folio_unlock(folio); diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h index 09db1914425e..f08322ef41cf 100644 --- a/fs/gfs2/aops.h +++ b/fs/gfs2/aops.h @@ -10,6 +10,6 @@ extern void adjust_fs_space(struct inode *inode); extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, - unsigned int from, unsigned int len); + size_t from, size_t len); #endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 1d679a3178ff..f146447eac63 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1052,15 +1052,11 @@ retry: goto out_unlock; } - current->backing_dev_info = inode_to_bdi(inode); pagefault_disable(); ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); pagefault_enable(); - current->backing_dev_info = NULL; - if (ret > 0) { - iocb->ki_pos += ret; + if (ret > 0) written += ret; - } if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ecfdfb2529a3..7b17ccfa039d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -821,7 +821,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, */ struct folio *folio; unsigned long addr; - bool present; cond_resched(); @@ -834,9 +833,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, break; } - /* Set numa allocation policy based on index */ - hugetlb_set_vma_policy(&pseudo_vma, inode, index); - /* addr is the offset within the file (zero based) */ addr = index * hpage_size; @@ -845,12 +841,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - rcu_read_lock(); - present = page_cache_next_miss(mapping, index, 1) != index; - rcu_read_unlock(); - if (present) { + folio = filemap_get_folio(mapping, index); + if (!IS_ERR(folio)) { + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - hugetlb_drop_vma_policy(&pseudo_vma); continue; } @@ -862,6 +856,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ + hugetlb_set_vma_policy(&pseudo_vma, inode, index); folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0edab9deae2a..a4fa81af60d9 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -864,16 +864,19 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, .len = iov_iter_count(i), .flags = IOMAP_WRITE, }; - int ret; + ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); - if (iter.pos == iocb->ki_pos) + + if (unlikely(ret < 0)) return ret; - return iter.pos - iocb->ki_pos; + ret = iter.pos - iocb->ki_pos; + iocb->ki_pos += ret; + return ret; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 08873f0627dd..ea3b868c8355 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -81,7 +81,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; - struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; ssize_t ret = dio->error; @@ -94,7 +93,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (offset + ret > dio->i_size && !(dio->flags & IOMAP_DIO_WRITE)) ret = dio->i_size - offset; - iocb->ki_pos += ret; } /* @@ -109,30 +107,25 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && - (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { - int err; - err = invalidate_inode_pages2_range(inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + dio->size - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(iocb->ki_filp); - } + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); - /* - * If this is a DSYNC write, make sure we push it to stable storage now - * that we've written data. - */ - if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) - ret = generic_write_sync(iocb, ret); - if (ret > 0) - ret += dio->done_before; + if (ret > 0) { + iocb->ki_pos += ret; + /* + * If this is a DSYNC write, make sure we push it to stable + * storage now that we've written data. + */ + if (dio->flags & IOMAP_DIO_NEED_SYNC) + ret = generic_write_sync(iocb, ret); + if (ret > 0) + ret += dio->done_before; + } trace_iomap_dio_complete(iocb, dio->error, ret); kfree(dio); - return ret; } EXPORT_SYMBOL_GPL(iomap_dio_complete); @@ -478,7 +471,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before) { - struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); struct iomap_iter iomi = { .inode = inode, @@ -487,11 +479,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, .flags = IOMAP_DIRECT, .private = private, }; - loff_t end = iomi.pos + iomi.len - 1, ret = 0; bool wait_for_completion = is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT); struct blk_plug plug; struct iomap_dio *dio; + loff_t ret = 0; trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before); @@ -515,31 +507,29 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.waiter = current; dio->submit.poll_bio = NULL; + if (iocb->ki_flags & IOCB_NOWAIT) + iomi.flags |= IOMAP_NOWAIT; + if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) goto out_free_dio; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iomi.pos, - end)) { - ret = -EAGAIN; - goto out_free_dio; - } - iomi.flags |= IOMAP_NOWAIT; - } - if (user_backed_iter(iter)) dio->flags |= IOMAP_DIO_DIRTY; + + ret = kiocb_write_and_wait(iocb, iomi.len); + if (ret) + goto out_free_dio; } else { iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, iomi.pos, end)) { - ret = -EAGAIN; + if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { + ret = -EAGAIN; + if (iomi.pos >= dio->i_size || + iomi.pos + iomi.len > dio->i_size) goto out_free_dio; - } - iomi.flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_OVERWRITE_ONLY; } /* for data sync or sync, we need sync completion processing */ @@ -555,31 +545,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_SYNC)) dio->flags |= IOMAP_DIO_WRITE_FUA; } - } - if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { - ret = -EAGAIN; - if (iomi.pos >= dio->i_size || - iomi.pos + iomi.len > dio->i_size) - goto out_free_dio; - iomi.flags |= IOMAP_OVERWRITE_ONLY; - } - - ret = filemap_write_and_wait_range(mapping, iomi.pos, end); - if (ret) - goto out_free_dio; - - if (iov_iter_rw(iter) == WRITE) { /* * Try to invalidate cache pages for the range we are writing. * If this invalidation fails, let the caller fall back to * buffered I/O. */ - if (invalidate_inode_pages2_range(mapping, - iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) { - trace_iomap_dio_invalidate_fail(inode, iomi.pos, - iomi.len); - ret = -ENOTBLK; + ret = kiocb_invalidate_pages(iocb, iomi.len); + if (ret) { + if (ret != -EAGAIN) { + trace_iomap_dio_invalidate_fail(inode, iomi.pos, + iomi.len); + ret = -ENOTBLK; + } goto out_free_dio; } diff --git a/fs/libfs.c b/fs/libfs.c index 89cf614a3271..5b851315eeed 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1613,3 +1613,44 @@ u64 inode_query_iversion(struct inode *inode) return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); + +ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, + ssize_t direct_written, ssize_t buffered_written) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos - buffered_written; + loff_t end = iocb->ki_pos - 1; + int err; + + /* + * If the buffered write fallback returned an error, we want to return + * the number of bytes which were written by direct I/O, or the error + * code if that was zero. + * + * Note that this differs from normal direct-io semantics, which will + * return -EFOO even if some bytes were written. + */ + if (unlikely(buffered_written < 0)) { + if (direct_written) + return direct_written; + return buffered_written; + } + + /* + * We need to ensure that the page cache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + err = filemap_write_and_wait_range(mapping, pos, end); + if (err < 0) { + /* + * We don't know how much we wrote, so just return the number of + * bytes which were direct-written + */ + if (direct_written) + return direct_written; + return err; + } + invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + return direct_written + buffered_written; +} +EXPORT_SYMBOL_GPL(direct_write_fallback); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3855f3ce8d2d..79b1b3fcd3fc 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -669,17 +669,13 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); - if (result > 0) { - current->backing_dev_info = inode_to_bdi(inode); + if (result > 0) result = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; - } nfs_end_io_write(inode); if (result <= 0) goto out; written = result; - iocb->ki_pos += written; nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); if (mntflags & NFS_MOUNT_WRITE_EAGER) { diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index e8aeba124a95..4e158bce4192 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -526,7 +526,7 @@ err_out: * * Return 0 on success and -errno on error. * - * Based on ntfs_read_block() and __block_write_full_page(). + * Based on ntfs_read_block() and __block_write_full_folio(). */ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) { diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index e5e0ed58670b..cbc545999cfe 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1911,11 +1911,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(vi); /* We can write back this queue in page reclaim. */ - current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); - current->backing_dev_info = NULL; inode_unlock(vi); iocb->ki_pos += written; if (likely(written > 0)) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 036efd85f60c..9be3e8edf4f3 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -849,7 +849,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (!pages) return -ENOMEM; - current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) goto out; @@ -1022,8 +1021,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) out: kfree(pages); - current->backing_dev_info = NULL; - if (err < 0) return err; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 420510f6a545..507cd4e59d07 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -538,13 +538,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool migration = false, young = false, dirty = false; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - young = pte_young(*pte); - dirty = pte_dirty(*pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + young = pte_young(ptent); + dirty = pte_dirty(ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { int mapcount; @@ -631,14 +632,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, goto out; } - if (pmd_trans_unstable(pmd)) - goto out; - /* - * The mmap_lock held all the way back in m_start() is what - * keeps khugepaged out of here and from collapsing things - * in here. - */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); @@ -735,11 +733,12 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; struct page *page = NULL; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); @@ -1108,7 +1107,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; @@ -1191,12 +1190,13 @@ out: return 0; } - if (pmd_trans_unstable(pmd)) - return 0; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); @@ -1538,9 +1538,6 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, spin_unlock(ptl); return err; } - - if (pmd_trans_unstable(pmdp)) - return 0; #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* @@ -1548,10 +1545,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, * goes beyond vma->vm_end. */ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return err; + } for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1689,23 +1690,23 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* watch out for wraparound */ start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { + unsigned long end; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); mmap_read_unlock(mm); + + end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); + if (end >= start_vaddr && end < mm->task_size) + end_vaddr = end; } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; - /* - * The odds are that this will stop walking way - * before end_vaddr, because the length of the - * user buffer is tracked in "pm", and the walk - * will stop when we hit the end of the buffer. - */ ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; @@ -1887,16 +1888,18 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, spin_unlock(ptl); return 0; } - - if (pmd_trans_unstable(pmd)) - return 0; #endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } do { - struct page *page = can_gather_numa_stats(*pte, vma, addr); + pte_t ptent = ptep_get(pte); + struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 0ec35072a8e5..2c8b62265981 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -51,7 +51,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) sbytes += kobjsize(mm); else bytes += kobjsize(mm); - + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else @@ -69,13 +69,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) bytes += kobjsize(current); /* includes kernel stack */ + mmap_read_unlock(mm); + seq_printf(m, "Mem:\t%8lu bytes\n" "Slack:\t%8lu bytes\n" "Shared:\t%8lu bytes\n", bytes, slack, sbytes); - - mmap_read_unlock(mm); } unsigned long task_vsize(struct mm_struct *mm) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 5ba580c78835..fef477c78107 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -278,7 +278,7 @@ int ramfs_init_fs_context(struct fs_context *fc) return 0; } -static void ramfs_kill_sb(struct super_block *sb) +void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); kill_litter_super(sb); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d8debbb6105f..77bd3b27059f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2506,7 +2506,7 @@ out: /* * mason@suse.com: updated in 2.5.54 to follow the same general io - * start/recovery path as __block_write_full_page, along with special + * start/recovery path as __block_write_full_folio, along with special * code to handle reiserfs tails. */ static int reiserfs_write_full_page(struct page *page, @@ -2872,6 +2872,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; int ret = 0; int update_sd = 0; @@ -2887,12 +2888,12 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, start = pos & (PAGE_SIZE - 1); if (unlikely(copied < len)) { - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) copied = 0; - page_zero_new_buffers(page, start + copied, start + len); + folio_zero_new_buffers(folio, start + copied, start + len); } - flush_dcache_page(page); + flush_dcache_folio(folio); reiserfs_commit_page(inode, page, start, start + copied); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4e800bb7d2ab..7cecd49e078b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -335,6 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; + pte_t ptent; bool ret = true; mmap_assert_locked(mm); @@ -349,20 +350,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); - /* - * READ_ONCE must function as a barrier with narrower scope - * and it must be equivalent to: - * _pmd = *pmd; barrier(); - * - * This is to deal with the instability (as in - * pmd_trans_unstable) of the pmd. - */ - _pmd = READ_ONCE(*pmd); +again: + _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) goto out; ret = false; - if (!pmd_present(_pmd)) + if (!pmd_present(_pmd) || pmd_devmap(_pmd)) goto out; if (pmd_trans_huge(_pmd)) { @@ -371,19 +365,20 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, goto out; } - /* - * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it - * and use the standard pte_offset_map() instead of parsing _pmd. - */ pte = pte_offset_map(pmd, address); + if (!pte) { + ret = true; + goto again; + } /* * Lockless access: we're in a wait_event so it's ok if it * changes under us. PTE markers should be handled the same as none * ptes here. */ - if (pte_none_mostly(*pte)) + ptent = ptep_get(pte); + if (pte_none_mostly(ptent)) ret = true; - if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) ret = true; pte_unmap(pte); @@ -857,31 +852,26 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, +int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *unmaps) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *vma; - - for_each_vma_range(vmi, vma, end) { - struct userfaultfd_unmap_ctx *unmap_ctx; - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || - has_unmap_ctx(ctx, unmaps, start, end)) - continue; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + return 0; - unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); - if (!unmap_ctx) - return -ENOMEM; + unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); + if (!unmap_ctx) + return -ENOMEM; - userfaultfd_ctx_get(ctx); - atomic_inc(&ctx->mmap_changing); - unmap_ctx->ctx = ctx; - unmap_ctx->start = start; - unmap_ctx->end = end; - list_add_tail(&unmap_ctx->list, unmaps); - } + userfaultfd_ctx_get(ctx); + atomic_inc(&ctx->mmap_changing); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); return 0; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 08d632668e94..0c6671eb11d6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -745,14 +745,9 @@ write_retry: if (ret) goto out; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, &xfs_buffered_write_iomap_ops); - if (likely(ret >= 0)) - iocb->ki_pos += ret; /* * If we hit a space limit, try to free up some lingering preallocated @@ -781,7 +776,6 @@ write_retry: goto write_retry; } - current->backing_dev_info = NULL; out: if (iolock) xfs_iunlock(ip, iolock); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 1451e7b48669..92c9aaae3663 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -644,9 +644,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, goto inode_unlock; ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); - if (ret > 0) - iocb->ki_pos += ret; - else if (ret == -EIO) + if (ret == -EIO) zonefs_io_error(inode, true); inode_unlock: |