Diffstat (limited to 'fs')
96 files changed, 1463 insertions, 875 deletions
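The recurring change across the fs/9p, fs/afs and fs/ceph hunks below is the switch from an anonymous { struct inode vfs_inode; struct netfs_i_context netfs_ctx; } pair to a single embedded struct netfs_inode, with netfs_i_context_init()/netfs_i_cookie() callers updated to pass the netfs_inode rather than the bare inode, so every ->vfs_inode / ->netfs_ctx access becomes ->netfs.inode / ->netfs.*. Below is a minimal sketch of the resulting pattern, using a hypothetical myfs_inode/MYFS_I() in place of the real v9fs_inode/afs_vnode types; the helper signatures are taken from how the diff itself calls them.

#include <linux/fs.h>
#include <linux/netfs.h>

/* Hypothetical filesystem inode, mirroring what fs/9p and fs/afs do in this
 * diff: struct netfs_inode (which itself wraps struct inode) is embedded
 * first, and code that used to touch ->vfs_inode or ->netfs_ctx now goes
 * through ->netfs.inode / ->netfs.* instead. */
struct myfs_inode {
	struct netfs_inode netfs;	/* netfs context and VFS inode */
	unsigned long my_flags;		/* per-fs state, illustrative only */
};

static inline struct myfs_inode *MYFS_I(const struct inode *inode)
{
	/* container_of() now goes through the embedded netfs.inode, as in
	 * the updated V9FS_I() and AFS_FS_I() in this diff. */
	return container_of(inode, struct myfs_inode, netfs.inode);
}

static inline struct inode *myfs_vfs_inode(struct myfs_inode *mi)
{
	return &mi->netfs.inode;	/* previously &mi->vfs_inode */
}

/* Called once the VFS inode fields are set up, analogous to
 * v9fs_set_netfs_context() and afs_set_netfs_context() below. */
static void myfs_set_netfs_context(struct inode *inode,
				   const struct netfs_request_ops *ops)
{
	netfs_inode_init(&MYFS_I(inode)->netfs, ops);
}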
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 1c8dc696d516..cebba4eaa0b5 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -62,12 +62,12 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)  	version = cpu_to_le32(v9inode->qid.version);  	path = cpu_to_le64(v9inode->qid.path);  	v9ses = v9fs_inode2v9ses(inode); -	v9inode->netfs_ctx.cache = +	v9inode->netfs.cache =  		fscache_acquire_cookie(v9fs_session_cache(v9ses),  				       0,  				       &path, sizeof(path),  				       &version, sizeof(version), -				       i_size_read(&v9inode->vfs_inode)); +				       i_size_read(&v9inode->netfs.inode));  	p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",  		 inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 79df61fe0e59..baf2b152229e 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -152,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,  	const unsigned char **wnames, *uname;  	int i, n, l, clone, access;  	struct v9fs_session_info *v9ses; -	struct p9_fid *fid, *old_fid = NULL; +	struct p9_fid *fid, *old_fid;  	v9ses = v9fs_dentry2v9ses(dentry);  	access = v9ses->flags & V9FS_ACCESS_MASK; @@ -194,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,  		if (IS_ERR(fid))  			return fid; +		refcount_inc(&fid->count);  		v9fs_fid_add(dentry->d_sb->s_root, fid);  	}  	/* If we are root ourself just return that */ -	if (dentry->d_sb->s_root == dentry) { -		refcount_inc(&fid->count); +	if (dentry->d_sb->s_root == dentry)  		return fid; -	}  	/*  	 * Do a multipath walk with attached root.  	 * When walking parent we need to make sure we @@ -212,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,  		fid = ERR_PTR(n);  		goto err_out;  	} +	old_fid = fid;  	clone = 1;  	i = 0;  	while (i < n) { @@ -221,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,  		 * walk to ensure none of the patch component change  		 */  		fid = p9_client_walk(fid, l, &wnames[i], clone); +		/* non-cloning walk will return the same fid */ +		if (fid != old_fid) { +			p9_client_clunk(old_fid); +			old_fid = fid; +		}  		if (IS_ERR(fid)) { -			if (old_fid) { -				/* -				 * If we fail, clunk fid which are mapping -				 * to path component and not the last component -				 * of the path. 
-				 */ -				p9_client_clunk(old_fid); -			}  			kfree(wnames);  			goto err_out;  		} -		old_fid = fid;  		i += l;  		clone = 0;  	} diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e28ddf763b3b..0129de2ea31a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -625,7 +625,7 @@ static void v9fs_inode_init_once(void *foo)  	struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;  	memset(&v9inode->qid, 0, sizeof(v9inode->qid)); -	inode_init_once(&v9inode->vfs_inode); +	inode_init_once(&v9inode->netfs.inode);  }  /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index ec0e8df3b2eb..6acabc2e7dc9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -109,11 +109,7 @@ struct v9fs_session_info {  #define V9FS_INO_INVALID_ATTR 0x01  struct v9fs_inode { -	struct { -		/* These must be contiguous */ -		struct inode	vfs_inode;	/* the VFS's inode record */ -		struct netfs_i_context netfs_ctx; /* Netfslib context */ -	}; +	struct netfs_inode netfs; /* Netfslib context and vfs inode */  	struct p9_qid qid;  	unsigned int cache_validity;  	struct p9_fid *writeback_fid; @@ -122,13 +118,13 @@ struct v9fs_inode {  static inline struct v9fs_inode *V9FS_I(const struct inode *inode)  { -	return container_of(inode, struct v9fs_inode, vfs_inode); +	return container_of(inode, struct v9fs_inode, netfs.inode);  }  static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)  {  #ifdef CONFIG_9P_FSCACHE -	return netfs_i_cookie(&v9inode->vfs_inode); +	return netfs_i_cookie(&v9inode->netfs);  #else  	return NULL;  #endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 8ce82ff1e40a..d0833fa69faf 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -58,21 +58,33 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)   */  static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)  { +	struct inode *inode = file_inode(file); +	struct v9fs_inode *v9inode = V9FS_I(inode);  	struct p9_fid *fid = file->private_data; +	BUG_ON(!fid); + +	/* we might need to read from a fid that was opened write-only +	 * for read-modify-write of page cache, use the writeback fid +	 * for that */ +	if (rreq->origin == NETFS_READ_FOR_WRITE && +			(fid->mode & O_ACCMODE) == O_WRONLY) { +		fid = v9inode->writeback_fid; +		BUG_ON(!fid); +	} +  	refcount_inc(&fid->count);  	rreq->netfs_priv = fid;  	return 0;  }  /** - * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request - * @mapping: unused mapping of request to cleanup - * @priv: private data to cleanup, a fid, guaranted non-null. 
+ * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq + * @rreq: The I/O request to clean up   */ -static void v9fs_req_cleanup(struct address_space *mapping, void *priv) +static void v9fs_free_request(struct netfs_io_request *rreq)  { -	struct p9_fid *fid = priv; +	struct p9_fid *fid = rreq->netfs_priv;  	p9_client_clunk(fid);  } @@ -94,9 +106,9 @@ static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)  const struct netfs_request_ops v9fs_req_ops = {  	.init_request		= v9fs_init_request, +	.free_request		= v9fs_free_request,  	.begin_cache_operation	= v9fs_begin_cache_operation,  	.issue_read		= v9fs_issue_read, -	.cleanup		= v9fs_req_cleanup,  };  /** @@ -140,7 +152,7 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,  	    transferred_or_error != -ENOBUFS) {  		version = cpu_to_le32(v9inode->qid.version);  		fscache_invalidate(v9fs_inode_cookie(v9inode), &version, -				   i_size_read(&v9inode->vfs_inode), 0); +				   i_size_read(&v9inode->netfs.inode), 0);  	}  } @@ -274,7 +286,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,  	 * file.  We need to do this before we get a lock on the page in case  	 * there's more than one writer competing for the same cache block.  	 */ -	retval = netfs_write_begin(filp, mapping, pos, len, &folio, fsdata); +	retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);  	if (retval < 0)  		return retval; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 55367ecb9442..3d8297714772 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -234,7 +234,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)  	v9inode->writeback_fid = NULL;  	v9inode->cache_validity = 0;  	mutex_init(&v9inode->v_mutex); -	return &v9inode->vfs_inode; +	return &v9inode->netfs.inode;  }  /** @@ -252,7 +252,8 @@ void v9fs_free_inode(struct inode *inode)   */  static void v9fs_set_netfs_context(struct inode *inode)  { -	netfs_i_context_init(inode, &v9fs_req_ops); +	struct v9fs_inode *v9inode = V9FS_I(inode); +	netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);  }  int v9fs_init_inode(struct v9fs_session_info *v9ses, @@ -1250,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry,  		return ERR_PTR(-ECHILD);  	v9ses = v9fs_dentry2v9ses(dentry); -	fid = v9fs_fid_lookup(dentry); +	if (!v9fs_proto_dotu(v9ses)) +		return ERR_PTR(-EBADF); +  	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); +	fid = v9fs_fid_lookup(dentry);  	if (IS_ERR(fid))  		return ERR_CAST(fid); -	if (!v9fs_proto_dotu(v9ses)) -		return ERR_PTR(-EBADF); -  	st = p9_client_stat(fid);  	p9_client_clunk(fid);  	if (IS_ERR(st)) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index d17502a738a9..b6eb1160296c 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,  	if (IS_ERR(ofid)) {  		err = PTR_ERR(ofid);  		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); +		p9_client_clunk(dfid);  		goto out;  	} @@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,  	if (err) {  		p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",  			 err); +		p9_client_clunk(dfid);  		goto error;  	}  	err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), @@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,  	if (err < 0) {  		p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat 
%d\n",  			 err); +		p9_client_clunk(dfid);  		goto error;  	}  	v9fs_invalidate_inode_attr(dir); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1b4d5809808d..a484fa642808 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work)  {  	struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work); -	unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false); +	unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);  }  void afs_server_init_callback_work(struct work_struct *work) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 79f6b74336d2..56ae5cd5184f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -109,7 +109,7 @@ struct afs_lookup_cookie {   */  static void afs_dir_read_cleanup(struct afs_read *req)  { -	struct address_space *mapping = req->vnode->vfs_inode.i_mapping; +	struct address_space *mapping = req->vnode->netfs.inode.i_mapping;  	struct folio *folio;  	pgoff_t last = req->nr_pages - 1; @@ -153,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio,  		block = kmap_local_folio(folio, offset);  		if (block->hdr.magic != AFS_DIR_MAGIC) {  			printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", -			       __func__, dvnode->vfs_inode.i_ino, +			       __func__, dvnode->netfs.inode.i_ino,  			       pos, offset, size, ntohs(block->hdr.magic));  			trace_afs_dir_check_failed(dvnode, pos + offset, i_size);  			kunmap_local(block); @@ -183,7 +183,7 @@ error:  static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)  {  	union afs_xdr_dir_block *block; -	struct address_space *mapping = dvnode->vfs_inode.i_mapping; +	struct address_space *mapping = dvnode->netfs.inode.i_mapping;  	struct folio *folio;  	pgoff_t last = req->nr_pages - 1;  	size_t offset, size; @@ -217,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)   */  static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)  { -	struct address_space *mapping = dvnode->vfs_inode.i_mapping; +	struct address_space *mapping = dvnode->netfs.inode.i_mapping;  	struct folio *folio;  	pgoff_t last = req->nr_pages - 1;  	int ret = 0; @@ -269,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file)  static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)  	__acquires(&dvnode->validate_lock)  { -	struct address_space *mapping = dvnode->vfs_inode.i_mapping; +	struct address_space *mapping = dvnode->netfs.inode.i_mapping;  	struct afs_read *req;  	loff_t i_size;  	int nr_pages, i; @@ -287,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)  	req->cleanup = afs_dir_read_cleanup;  expand: -	i_size = i_size_read(&dvnode->vfs_inode); +	i_size = i_size_read(&dvnode->netfs.inode);  	if (i_size < 2048) {  		ret = afs_bad(dvnode, afs_file_error_dir_small);  		goto error; @@ -305,7 +305,7 @@ expand:  	req->actual_len = i_size; /* May change */  	req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */  	req->data_version = dvnode->status.data_version; /* May change */ -	iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages, +	iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages,  			0, i_size);  	req->iter = &req->def_iter; @@ -897,7 +897,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,  out_op:  	if (op->error == 0) { -		inode = &op->file[1].vnode->vfs_inode; +		inode = 
&op->file[1].vnode->netfs.inode;  		op->file[1].vnode = NULL;  	} @@ -1139,7 +1139,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)  	afs_stat_v(dir, n_reval);  	/* search the directory for this vnode */ -	ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version); +	ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version);  	switch (ret) {  	case 0:  		/* the filename maps to something */ @@ -1170,7 +1170,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)  			_debug("%pd: file deleted (uq %u -> %u I:%u)",  			       dentry, fid.unique,  			       vnode->fid.unique, -			       vnode->vfs_inode.i_generation); +			       vnode->netfs.inode.i_generation);  			goto not_found;  		}  		goto out_valid; @@ -1368,7 +1368,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)  	if (d_really_is_positive(dentry)) {  		struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); -		clear_nlink(&vnode->vfs_inode); +		clear_nlink(&vnode->netfs.inode);  		set_bit(AFS_VNODE_DELETED, &vnode->flags);  		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);  		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -1487,8 +1487,8 @@ static void afs_dir_remove_link(struct afs_operation *op)  		/* Already done */  	} else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {  		write_seqlock(&vnode->cb_lock); -		drop_nlink(&vnode->vfs_inode); -		if (vnode->vfs_inode.i_nlink == 0) { +		drop_nlink(&vnode->netfs.inode); +		if (vnode->netfs.inode.i_nlink == 0) {  			set_bit(AFS_VNODE_DELETED, &vnode->flags);  			__afs_break_callback(vnode, afs_cb_break_for_unlink);  		} @@ -1504,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op)  			op->error = ret;  	} -	_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error); +	_debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);  }  static void afs_unlink_success(struct afs_operation *op) @@ -1680,8 +1680,8 @@ static void afs_link_success(struct afs_operation *op)  	afs_update_dentry_version(op, dvp, op->dentry);  	if (op->dentry_2->d_parent == op->dentry->d_parent)  		afs_update_dentry_version(op, dvp, op->dentry_2); -	ihold(&vp->vnode->vfs_inode); -	d_instantiate(op->dentry, &vp->vnode->vfs_inode); +	ihold(&vp->vnode->netfs.inode); +	d_instantiate(op->dentry, &vp->vnode->netfs.inode);  }  static void afs_link_put(struct afs_operation *op) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index d98e109ecee9..0ab7752d1b75 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,   */  static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)  { -	struct address_space *mapping = vnode->vfs_inode.i_mapping; +	struct address_space *mapping = vnode->netfs.inode.i_mapping;  	struct folio *folio;  	folio = __filemap_get_folio(mapping, index, @@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,  	_enter(",,{%d,%s},", name->len, name->name); -	i_size = i_size_read(&vnode->vfs_inode); +	i_size = i_size_read(&vnode->netfs.inode);  	if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||  	    (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {  		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -336,7 +336,7 @@ found_space:  	if (b < AFS_DIR_BLOCKS_WITH_CTR)  		meta->meta.alloc_ctrs[b] -= need_slots; -	inode_inc_iversion_raw(&vnode->vfs_inode); +	inode_inc_iversion_raw(&vnode->netfs.inode);  	afs_stat_v(vnode, n_dir_cr);  	_debug("Insert %s in %u[%u]", name->name, b, 
slot); @@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,  	_enter(",,{%d,%s},", name->len, name->name); -	i_size = i_size_read(&vnode->vfs_inode); +	i_size = i_size_read(&vnode->netfs.inode);  	if (i_size < AFS_DIR_BLOCK_SIZE ||  	    i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||  	    (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { @@ -463,7 +463,7 @@ found_dirent:  	if (b < AFS_DIR_BLOCKS_WITH_CTR)  		meta->meta.alloc_ctrs[b] += need_slots; -	inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version); +	inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);  	afs_stat_v(vnode, n_dir_rm);  	_debug("Remove %s from %u[%u]", name->name, b, slot); diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 45cfd50a9521..bb5807e87fa4 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,  			goto out;  	} while (!d_is_negative(sdentry)); -	ihold(&vnode->vfs_inode); +	ihold(&vnode->netfs.inode);  	ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key);  	switch (ret) { @@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,  		d_drop(sdentry);  	} -	iput(&vnode->vfs_inode); +	iput(&vnode->netfs.inode);  	dput(sdentry);  out:  	_leave(" = %d", ret); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index f120bcb8bf73..d7d9402ff718 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)  	/* there shouldn't be an existing inode */  	BUG_ON(!(inode->i_state & I_NEW)); -	netfs_i_context_init(inode, NULL); +	netfs_inode_init(&vnode->netfs, NULL);  	inode->i_size		= 0;  	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;  	if (root) { diff --git a/fs/afs/file.c b/fs/afs/file.c index a8e8832179e4..42118a4f3383 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -194,7 +194,7 @@ int afs_release(struct inode *inode, struct file *file)  		afs_put_wb_key(af->wb);  	if ((file->f_mode & FMODE_WRITE)) { -		i_size = i_size_read(&vnode->vfs_inode); +		i_size = i_size_read(&vnode->netfs.inode);  		afs_set_cache_aux(vnode, &aux);  		fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size);  	} else { @@ -325,7 +325,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)  	fsreq->iter	= &fsreq->def_iter;  	iov_iter_xarray(&fsreq->def_iter, READ, -			&fsreq->vnode->vfs_inode.i_mapping->i_pages, +			&fsreq->vnode->netfs.inode.i_mapping->i_pages,  			fsreq->pos, fsreq->len);  	afs_fetch_data(fsreq->vnode, fsreq); @@ -382,17 +382,17 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,  	return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? 
-ESTALE : 0;  } -static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) +static void afs_free_request(struct netfs_io_request *rreq)  { -	key_put(netfs_priv); +	key_put(rreq->netfs_priv);  }  const struct netfs_request_ops afs_req_ops = {  	.init_request		= afs_init_request, +	.free_request		= afs_free_request,  	.begin_cache_operation	= afs_begin_cache_operation,  	.check_write_begin	= afs_check_write_begin,  	.issue_read		= afs_issue_read, -	.cleanup		= afs_priv_cleanup,  };  int afs_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index d222dfbe976b..7a3803ce3a22 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op)  	if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)  		clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);  	if (op->file[0].put_vnode) -		iput(&op->file[0].vnode->vfs_inode); +		iput(&op->file[0].vnode->netfs.inode);  	if (op->file[1].put_vnode) -		iput(&op->file[1].vnode->vfs_inode); +		iput(&op->file[1].vnode->netfs.inode);  	if (op->more_files) {  		for (i = 0; i < op->nr_files - 2; i++)  			if (op->more_files[i].put_vnode) -				iput(&op->more_files[i].vnode->vfs_inode); +				iput(&op->more_files[i].vnode->netfs.inode);  		kfree(op->more_files);  	} diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 30b066299d39..64dab70d4a4f 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren   */  static void afs_set_netfs_context(struct afs_vnode *vnode)  { -	netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops); +	netfs_inode_init(&vnode->netfs, &afs_req_ops);  }  /* @@ -96,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,  	inode->i_flags |= S_NOATIME;  	inode->i_uid = make_kuid(&init_user_ns, status->owner);  	inode->i_gid = make_kgid(&init_user_ns, status->group); -	set_nlink(&vnode->vfs_inode, status->nlink); +	set_nlink(&vnode->netfs.inode, status->nlink);  	switch (status->type) {  	case AFS_FTYPE_FILE: @@ -139,7 +139,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,  	afs_set_netfs_context(vnode);  	vnode->invalid_before	= status->data_version; -	inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); +	inode_set_iversion_raw(&vnode->netfs.inode, status->data_version);  	if (!vp->scb.have_cb) {  		/* it's a symlink we just created (the fileserver @@ -163,7 +163,7 @@ static void afs_apply_status(struct afs_operation *op,  {  	struct afs_file_status *status = &vp->scb.status;  	struct afs_vnode *vnode = vp->vnode; -	struct inode *inode = &vnode->vfs_inode; +	struct inode *inode = &vnode->netfs.inode;  	struct timespec64 t;  	umode_t mode;  	bool data_changed = false; @@ -246,7 +246,7 @@ static void afs_apply_status(struct afs_operation *op,  		 * idea of what the size should be that's not the same as  		 * what's on the server.  		 
*/ -		vnode->netfs_ctx.remote_i_size = status->size; +		vnode->netfs.remote_i_size = status->size;  		if (change_size) {  			afs_set_i_size(vnode, status->size);  			inode->i_ctime = t; @@ -289,7 +289,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v  		 */  		if (vp->scb.status.abort_code == VNOVNODE) {  			set_bit(AFS_VNODE_DELETED, &vnode->flags); -			clear_nlink(&vnode->vfs_inode); +			clear_nlink(&vnode->netfs.inode);  			__afs_break_callback(vnode, afs_cb_break_for_deleted);  			op->flags &= ~AFS_OPERATION_DIR_CONFLICT;  		} @@ -306,8 +306,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v  		if (vp->scb.have_cb)  			afs_apply_callback(op, vp);  	} else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) { -		drop_nlink(&vnode->vfs_inode); -		if (vnode->vfs_inode.i_nlink == 0) { +		drop_nlink(&vnode->netfs.inode); +		if (vnode->netfs.inode.i_nlink == 0) {  			set_bit(AFS_VNODE_DELETED, &vnode->flags);  			__afs_break_callback(vnode, afs_cb_break_for_deleted);  		} @@ -326,7 +326,7 @@ static void afs_fetch_status_success(struct afs_operation *op)  	struct afs_vnode *vnode = vp->vnode;  	int ret; -	if (vnode->vfs_inode.i_state & I_NEW) { +	if (vnode->netfs.inode.i_state & I_NEW) {  		ret = afs_inode_init_from_status(op, vp, vnode);  		op->error = ret;  		if (ret == 0) @@ -430,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)  	struct afs_vnode_cache_aux aux;  	if (vnode->status.type != AFS_FTYPE_FILE) { -		vnode->netfs_ctx.cache = NULL; +		vnode->netfs.cache = NULL;  		return;  	} @@ -457,7 +457,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)  struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)  {  	struct afs_vnode_param *dvp = &op->file[0]; -	struct super_block *sb = dvp->vnode->vfs_inode.i_sb; +	struct super_block *sb = dvp->vnode->netfs.inode.i_sb;  	struct afs_vnode *vnode;  	struct inode *inode;  	int ret; @@ -582,10 +582,10 @@ static void afs_zap_data(struct afs_vnode *vnode)  	/* nuke all the non-dirty pages that aren't locked, mapped or being  	 * written back in a regular file and completely discard the pages in a  	 * directory or symlink */ -	if (S_ISREG(vnode->vfs_inode.i_mode)) -		invalidate_remote_inode(&vnode->vfs_inode); +	if (S_ISREG(vnode->netfs.inode.i_mode)) +		invalidate_remote_inode(&vnode->netfs.inode);  	else -		invalidate_inode_pages2(vnode->vfs_inode.i_mapping); +		invalidate_inode_pages2(vnode->netfs.inode.i_mapping);  }  /* @@ -683,8 +683,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)  	       key_serial(key));  	if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { -		if (vnode->vfs_inode.i_nlink) -			clear_nlink(&vnode->vfs_inode); +		if (vnode->netfs.inode.i_nlink) +			clear_nlink(&vnode->netfs.inode);  		goto valid;  	} @@ -745,7 +745,8 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,  	_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); -	if (!(query_flags & AT_STATX_DONT_SYNC) && +	if (vnode->volume && +	    !(query_flags & AT_STATX_DONT_SYNC) &&  	    !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {  		key = afs_request_key(vnode->volume->cell);  		if (IS_ERR(key)) @@ -826,7 +827,7 @@ void afs_evict_inode(struct inode *inode)  static void afs_setattr_success(struct afs_operation *op)  {  	struct afs_vnode_param *vp = &op->file[0]; -	struct inode *inode = &vp->vnode->vfs_inode; +	struct inode *inode = &vp->vnode->netfs.inode;  	loff_t old_i_size 
= i_size_read(inode);  	op->setattr.old_i_size = old_i_size; @@ -843,7 +844,7 @@ static void afs_setattr_success(struct afs_operation *op)  static void afs_setattr_edit_file(struct afs_operation *op)  {  	struct afs_vnode_param *vp = &op->file[0]; -	struct inode *inode = &vp->vnode->vfs_inode; +	struct inode *inode = &vp->vnode->netfs.inode;  	if (op->setattr.attr->ia_valid & ATTR_SIZE) {  		loff_t size = op->setattr.attr->ia_size; @@ -875,7 +876,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,  		ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH;  	struct afs_operation *op;  	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); -	struct inode *inode = &vnode->vfs_inode; +	struct inode *inode = &vnode->netfs.inode;  	loff_t i_size;  	int ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a30995901266..a6f25d9e75b5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -619,12 +619,7 @@ enum afs_lock_state {   * leak from one inode to another.   */  struct afs_vnode { -	struct { -		/* These must be contiguous */ -		struct inode	vfs_inode;	/* the VFS's inode record */ -		struct netfs_i_context netfs_ctx; /* Netfslib context */ -	}; - +	struct netfs_inode	netfs;		/* Netfslib context and vfs inode */  	struct afs_volume	*volume;	/* volume on which vnode resides */  	struct afs_fid		fid;		/* the file identifier for this inode */  	struct afs_file_status	status;		/* AFS status info for this file */ @@ -675,7 +670,7 @@ struct afs_vnode {  static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)  {  #ifdef CONFIG_AFS_FSCACHE -	return netfs_i_cookie(&vnode->vfs_inode); +	return netfs_i_cookie(&vnode->netfs);  #else  	return NULL;  #endif @@ -685,7 +680,7 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode,  				       struct fscache_cookie *cookie)  {  #ifdef CONFIG_AFS_FSCACHE -	vnode->netfs_ctx.cache = cookie; +	vnode->netfs.cache = cookie;  #endif  } @@ -892,7 +887,7 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl  	afs_set_cache_aux(vnode, &aux);  	fscache_invalidate(afs_vnode_cache(vnode), &aux, -			   i_size_read(&vnode->vfs_inode), flags); +			   i_size_read(&vnode->netfs.inode), flags);  }  /* @@ -1217,7 +1212,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode)  static inline struct afs_net *afs_v2net(struct afs_vnode *vnode)  { -	return afs_i2net(&vnode->vfs_inode); +	return afs_i2net(&vnode->netfs.inode);  }  static inline struct afs_net *afs_sock2net(struct sock *sk) @@ -1593,12 +1588,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *);   */  static inline struct afs_vnode *AFS_FS_I(struct inode *inode)  { -	return container_of(inode, struct afs_vnode, vfs_inode); +	return container_of(inode, struct afs_vnode, netfs.inode);  }  static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)  { -	return &vnode->vfs_inode; +	return &vnode->netfs.inode;  }  /* @@ -1621,8 +1616,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op,   */  static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size)  { -	i_size_write(&vnode->vfs_inode, size); -	vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1; +	i_size_write(&vnode->netfs.inode, size); +	vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1;  }  /* diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fea195b0b27..95d713074dc8 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -659,7 +659,7 @@ static void afs_i_init_once(void *_vnode)  	struct afs_vnode 
*vnode = _vnode;  	memset(vnode, 0, sizeof(*vnode)); -	inode_init_once(&vnode->vfs_inode); +	inode_init_once(&vnode->netfs.inode);  	mutex_init(&vnode->io_lock);  	init_rwsem(&vnode->validate_lock);  	spin_lock_init(&vnode->wb_lock); @@ -700,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb)  	init_rwsem(&vnode->rmdir_lock);  	INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work); -	_leave(" = %p", &vnode->vfs_inode); -	return &vnode->vfs_inode; +	_leave(" = %p", &vnode->netfs.inode); +	return &vnode->netfs.inode;  }  static void afs_free_inode(struct inode *inode) diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 94a3d247924b..cc665cef0abe 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -9,8 +9,7 @@  #include <linux/slab.h>  #include "internal.h" -unsigned __read_mostly afs_volume_gc_delay = 10; -unsigned __read_mostly afs_volume_record_life = 60 * 60; +static unsigned __read_mostly afs_volume_record_life = 60 * 60;  /*   * Insert a volume into a cell.  If there's an existing volume record, that is diff --git a/fs/afs/write.c b/fs/afs/write.c index 2236b2165e37..2c885b22de34 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -60,7 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,  	 * file.  We need to do this before we get a lock on the page in case  	 * there's more than one writer competing for the same cache block.  	 */ -	ret = netfs_write_begin(file, mapping, pos, len, &folio, fsdata); +	ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);  	if (ret < 0)  		return ret; @@ -146,10 +146,10 @@ int afs_write_end(struct file *file, struct address_space *mapping,  	write_end_pos = pos + copied; -	i_size = i_size_read(&vnode->vfs_inode); +	i_size = i_size_read(&vnode->netfs.inode);  	if (write_end_pos > i_size) {  		write_seqlock(&vnode->cb_lock); -		i_size = i_size_read(&vnode->vfs_inode); +		i_size = i_size_read(&vnode->netfs.inode);  		if (write_end_pos > i_size)  			afs_set_i_size(vnode, write_end_pos);  		write_sequnlock(&vnode->cb_lock); @@ -257,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,   */  static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)  { -	struct address_space *mapping = vnode->vfs_inode.i_mapping; +	struct address_space *mapping = vnode->netfs.inode.i_mapping;  	struct folio *folio;  	pgoff_t end; @@ -354,7 +354,6 @@ static const struct afs_operation_ops afs_store_data_operation = {  static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,  			  bool laundering)  { -	struct netfs_i_context *ictx = &vnode->netfs_ctx;  	struct afs_operation *op;  	struct afs_wb_key *wbk = NULL;  	loff_t size = iov_iter_count(iter); @@ -385,9 +384,9 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t  	op->store.write_iter = iter;  	op->store.pos = pos;  	op->store.size = size; -	op->store.i_size = max(pos + size, ictx->remote_i_size); +	op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);  	op->store.laundering = laundering; -	op->mtime = vnode->vfs_inode.i_mtime; +	op->mtime = vnode->netfs.inode.i_mtime;  	op->flags |= AFS_OPERATION_UNINTR;  	op->ops = &afs_store_data_operation; @@ -554,7 +553,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,  	struct iov_iter iter;  	unsigned long priv;  	unsigned int offset, to, len, max_len; -	loff_t i_size = i_size_read(&vnode->vfs_inode); +	loff_t i_size = i_size_read(&vnode->netfs.inode);  	
bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);  	bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));  	long count = wbc->nr_to_write; @@ -845,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)  	_enter("{%llx:%llu},{%zu},",  	       vnode->fid.vid, vnode->fid.vnode, count); -	if (IS_SWAPFILE(&vnode->vfs_inode)) { +	if (IS_SWAPFILE(&vnode->netfs.inode)) {  		printk(KERN_INFO  		       "AFS: Attempt to write to active swap file!\n");  		return -EBUSY; @@ -958,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)  	/* Discard unused keys */  	spin_lock(&vnode->wb_lock); -	if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) && -	    !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) { +	if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) && +	    !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) {  		list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) {  			if (refcount_read(&wbk->usage) == 1)  				list_move(&wbk->vnode_link, &graveyard); @@ -1034,6 +1033,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode,  			       bool caching)  {  	fscache_write_to_cache(afs_vnode_cache(vnode), -			       vnode->vfs_inode.i_mapping, start, len, i_size, +			       vnode->netfs.inode.i_mapping, start, len, i_size,  			       afs_write_to_cache_done, vnode, caching);  } diff --git a/fs/attr.c b/fs/attr.c index 66899b6e9bd8..dbe996b0dedf 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -61,9 +61,15 @@ static bool chgrp_ok(struct user_namespace *mnt_userns,  		     const struct inode *inode, kgid_t gid)  {  	kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); -	if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && -	    (in_group_p(gid) || gid_eq(gid, inode->i_gid))) -		return true; +	if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { +		kgid_t mapped_gid; + +		if (gid_eq(gid, inode->i_gid)) +			return true; +		mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid); +		if (in_group_p(mapped_gid)) +			return true; +	}  	if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))  		return true;  	if (gid_eq(kgid, INVALID_GID) && @@ -123,12 +129,20 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,  	/* Make sure a caller can chmod. */  	if (ia_valid & ATTR_MODE) { +		kgid_t mapped_gid; +  		if (!inode_owner_or_capable(mnt_userns, inode))  			return -EPERM; + +		if (ia_valid & ATTR_GID) +			mapped_gid = mapped_kgid_fs(mnt_userns, +						i_user_ns(inode), attr->ia_gid); +		else +			mapped_gid = i_gid_into_mnt(mnt_userns, inode); +  		/* Also check the setgid bit! */ -               if (!in_group_p((ia_valid & ATTR_GID) ? 
attr->ia_gid : -                                i_gid_into_mnt(mnt_userns, inode)) && -                    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) +		if (!in_group_p(mapped_gid) && +		    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))  			attr->ia_mode &= ~S_ISGID;  	} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 3ac668ace50a..35e0e860cc0b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -104,6 +104,7 @@ struct btrfs_block_group {  	unsigned int relocating_repair:1;  	unsigned int chunk_item_inserted:1;  	unsigned int zone_is_active:1; +	unsigned int zoned_data_reloc_ongoing:1;  	int disk_cache_state; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e49b1a0c071..415bf1823fb3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1330,6 +1330,8 @@ struct btrfs_replace_extent_info {  	 * existing extent into a file range.  	 */  	bool is_new_extent; +	/* Indicate if we should update the inode's mtime and ctime. */ +	bool update_times;  	/* Meaningful only if is_new_extent is true. */  	int qgroup_reserved;  	/* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 89e94ea2fef5..4ba005c41983 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4632,6 +4632,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)  	int ret;  	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + +	/* +	 * We may have the reclaim task running and relocating a data block group, +	 * in which case it may create delayed iputs. So stop it before we park +	 * the cleaner kthread otherwise we can get new delayed iputs after +	 * parking the cleaner, and that can make the async reclaim task to hang +	 * if it's waiting for delayed iputs to complete, since the cleaner is +	 * parked and can not run delayed iputs - this will make us hang when +	 * trying to stop the async reclaim task. +	 */ +	cancel_work_sync(&fs_info->reclaim_bgs_work);  	/*  	 * We don't want the cleaner to start new transactions, add more delayed  	 * iputs, etc. while we're closing. We can't use kthread_stop() yet @@ -4672,8 +4683,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)  	cancel_work_sync(&fs_info->async_data_reclaim_work);  	cancel_work_sync(&fs_info->preempt_reclaim_work); -	cancel_work_sync(&fs_info->reclaim_bgs_work); -  	/* Cancel or finish ongoing discard work */  	btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0867c5cd6e01..4157ecc27d4b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,  	       block_group->start == fs_info->data_reloc_bg ||  	       fs_info->data_reloc_bg == 0); -	if (block_group->ro) { +	if (block_group->ro || block_group->zoned_data_reloc_ongoing) {  		ret = 1;  		goto out;  	} @@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,  out:  	if (ret && ffe_ctl->for_treelog)  		fs_info->treelog_bg = 0; -	if (ret && ffe_ctl->for_data_reloc) +	if (ret && ffe_ctl->for_data_reloc && +	    fs_info->data_reloc_bg == block_group->start) { +		/* +		 * Do not allow further allocations from this block group. +		 * Compared to increasing the ->ro, setting the +		 * ->zoned_data_reloc_ongoing flag still allows nocow +		 *  writers to come in. See btrfs_inc_nocow_writers(). +		 * +		 * We need to disable an allocation to avoid an allocation of +		 * regular (non-relocation data) extent. 
With mix of relocation +		 * extents and regular extents, we can dispatch WRITE commands +		 * (for relocation extents) and ZONE APPEND commands (for +		 * regular extents) at the same time to the same zone, which +		 * easily break the write pointer. +		 */ +		block_group->zoned_data_reloc_ongoing = 1;  		fs_info->data_reloc_bg = 0; +	}  	spin_unlock(&fs_info->relocation_bg_lock);  	spin_unlock(&fs_info->treelog_bg_lock);  	spin_unlock(&block_group->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f6b544ae616..04e36343da3a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5241,13 +5241,14 @@ int extent_writepages(struct address_space *mapping,  	 */  	btrfs_zoned_data_reloc_lock(BTRFS_I(inode));  	ret = extent_write_cache_pages(mapping, wbc, &epd); -	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));  	ASSERT(ret <= 0);  	if (ret < 0) { +		btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));  		end_write_bio(&epd, ret);  		return ret;  	}  	flush_write_bio(&epd); +	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));  	return ret;  } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1fd827b99c1b..9dfde1af8a64 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	 */  	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); -	if (ret != BTRFS_NO_LOG_SYNC) { +	if (ret == BTRFS_NO_LOG_SYNC) { +		ret = btrfs_end_transaction(trans); +		goto out; +	} + +	/* We successfully logged the inode, attempt to sync the log. */ +	if (!ret) { +		ret = btrfs_sync_log(trans, root, &ctx);  		if (!ret) { -			ret = btrfs_sync_log(trans, root, &ctx); -			if (!ret) { -				ret = btrfs_end_transaction(trans); -				goto out; -			} -		} -		if (!full_sync) { -			ret = btrfs_wait_ordered_range(inode, start, len); -			if (ret) { -				btrfs_end_transaction(trans); -				goto out; -			} +			ret = btrfs_end_transaction(trans); +			goto out;  		} -		ret = btrfs_commit_transaction(trans); -	} else { +	} + +	/* +	 * At this point we need to commit the transaction because we had +	 * btrfs_need_log_full_commit() or some other error. +	 * +	 * If we didn't do a full sync we have to stop the trans handle, wait on +	 * the ordered extents, start it again and commit the transaction.  If +	 * we attempt to wait on the ordered extents here we could deadlock with +	 * something like fallocate() that is holding the extent lock trying to +	 * start a transaction while some other thread is trying to commit the +	 * transaction while we (fsync) are currently holding the transaction +	 * open. +	 */ +	if (!full_sync) {  		ret = btrfs_end_transaction(trans); +		if (ret) +			goto out; +		ret = btrfs_wait_ordered_range(inode, start, len); +		if (ret) +			goto out; + +		/* +		 * This is safe to use here because we're only interested in +		 * making sure the transaction that had the ordered extents is +		 * committed.  We aren't waiting on anything past this point, +		 * we're purely getting the transaction and committing it. +		 */ +		trans = btrfs_attach_transaction_barrier(root); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); + +			/* +			 * We committed the transaction and there's no currently +			 * running transaction, this means everything we care +			 * about made it to disk and we are done. 
+			 */ +			if (ret == -ENOENT) +				ret = 0; +			goto out; +		}  	} + +	ret = btrfs_commit_transaction(trans);  out:  	ASSERT(list_empty(&ctx.list));  	err = file_check_and_advance_wb_err(file); @@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,  	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,  				      min_size, false); -	BUG_ON(ret); +	if (WARN_ON(ret)) +		goto out_trans;  	trans->block_rsv = rsv;  	cur_offset = start; @@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,  			extent_info->file_offset += replace_len;  		} +		/* +		 * We are releasing our handle on the transaction, balance the +		 * dirty pages of the btree inode and flush delayed items, and +		 * then get a new transaction handle, which may now point to a +		 * new transaction in case someone else may have committed the +		 * transaction we used to replace/drop file extent items. So +		 * bump the inode's iversion and update mtime and ctime except +		 * if we are called from a dedupe context. This is because a +		 * power failure/crash may happen after the transaction is +		 * committed and before we finish replacing/dropping all the +		 * file extent items we need. +		 */ +		inode_inc_iversion(&inode->vfs_inode); + +		if (!extent_info || extent_info->update_times) { +			inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); +			inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; +		} +  		ret = btrfs_update_inode(trans, root, inode);  		if (ret)  			break; @@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,  		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,  					      rsv, min_size, false); -		BUG_ON(ret);	/* shouldn't happen */ +		if (WARN_ON(ret)) +			break;  		trans->block_rsv = rsv;  		cur_offset = drop_args.drop_end; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81737eff92f3..05e0c4a5affd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  						ordered_extent->file_offset,  						ordered_extent->file_offset +  						logical_len); +		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, +						  ordered_extent->disk_num_bytes);  	} else {  		BUG_ON(root == fs_info->tree_root);  		ret = insert_ordered_extent_file_extent(trans, ordered_extent); @@ -9897,6 +9899,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(  	extent_info.file_offset = file_offset;  	extent_info.extent_buf = (char *)&stack_fi;  	extent_info.is_new_extent = true; +	extent_info.update_times = true;  	extent_info.qgroup_reserved = qgroup_released;  	extent_info.insertions = 0; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 313d9d685adb..33461b4f9c8b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne  		start_ns = ktime_get_ns();  	down_read_nested(&eb->lock, nest); -	eb->lock_owner = current->pid;  	trace_btrfs_tree_read_lock(eb, start_ns);  } @@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)  int btrfs_try_tree_read_lock(struct extent_buffer *eb)  {  	if (down_read_trylock(&eb->lock)) { -		eb->lock_owner = current->pid;  		trace_btrfs_try_tree_read_lock(eb);  		return 1;  	} @@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)  void btrfs_tree_read_unlock(struct extent_buffer *eb)  {  	trace_btrfs_tree_read_unlock(eb); -	eb->lock_owner 
= 0;  	up_read(&eb->lock);  } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c39f8b3a5a4a..a3549d587464 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,  	int ret;  	const u64 len = olen_aligned;  	u64 last_dest_end = destoff; +	u64 prev_extent_end = off;  	ret = -ENOMEM;  	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); @@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,  	key.offset = off;  	while (1) { -		u64 next_key_min_offset = key.offset + 1;  		struct btrfs_file_extent_item *extent;  		u64 extent_gen;  		int type; @@ -431,14 +431,21 @@ process_slot:  		 * The first search might have left us at an extent item that  		 * ends before our target range's start, can happen if we have  		 * holes and NO_HOLES feature enabled. +		 * +		 * Subsequent searches may leave us on a file range we have +		 * processed before - this happens due to a race with ordered +		 * extent completion for a file range that is outside our source +		 * range, but that range was part of a file extent item that +		 * also covered a leading part of our source range.  		 */ -		if (key.offset + datal <= off) { +		if (key.offset + datal <= prev_extent_end) {  			path->slots[0]++;  			goto process_slot;  		} else if (key.offset >= off + len) {  			break;  		} -		next_key_min_offset = key.offset + datal; + +		prev_extent_end = key.offset + datal;  		size = btrfs_item_size(leaf, slot);  		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),  				   size); @@ -489,6 +496,7 @@ process_slot:  			clone_info.file_offset = new_key.offset;  			clone_info.extent_buf = buf;  			clone_info.is_new_extent = false; +			clone_info.update_times = !no_time_update;  			ret = btrfs_replace_file_extents(BTRFS_I(inode), path,  					drop_start, new_key.offset + datal - 1,  					&clone_info, &trans); @@ -550,7 +558,7 @@ process_slot:  			break;  		btrfs_release_path(path); -		key.offset = next_key_min_offset; +		key.offset = prev_extent_end;  		if (fatal_signal_pending(current)) {  			ret = -EINTR; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b1fdc6a26c76..6627dd7875ee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -763,6 +763,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  				compress_force = false;  				no_compress++;  			} else { +				btrfs_err(info, "unrecognized compression value %s", +					  args[0].from);  				ret = -EINVAL;  				goto out;  			} @@ -821,8 +823,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  		case Opt_thread_pool:  			ret = match_int(&args[0], &intarg);  			if (ret) { +				btrfs_err(info, "unrecognized thread_pool value %s", +					  args[0].from);  				goto out;  			} else if (intarg == 0) { +				btrfs_err(info, "invalid value 0 for thread_pool");  				ret = -EINVAL;  				goto out;  			} @@ -883,8 +888,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  			break;  		case Opt_ratio:  			ret = match_int(&args[0], &intarg); -			if (ret) +			if (ret) { +				btrfs_err(info, "unrecognized metadata_ratio value %s", +					  args[0].from);  				goto out; +			}  			info->metadata_ratio = intarg;  			btrfs_info(info, "metadata ratio %u",  				   info->metadata_ratio); @@ -901,6 +909,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  				btrfs_set_and_info(info, DISCARD_ASYNC,  						   "turning on async discard");  			} else { +				btrfs_err(info, "unrecognized discard mode 
value %s", +					  args[0].from);  				ret = -EINVAL;  				goto out;  			} @@ -933,6 +943,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  				btrfs_set_and_info(info, FREE_SPACE_TREE,  						   "enabling free space tree");  			} else { +				btrfs_err(info, "unrecognized space_cache value %s", +					  args[0].from);  				ret = -EINVAL;  				goto out;  			} @@ -1014,8 +1026,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  			break;  		case Opt_check_integrity_print_mask:  			ret = match_int(&args[0], &intarg); -			if (ret) +			if (ret) { +				btrfs_err(info, +				"unrecognized check_integrity_print_mask value %s", +					args[0].from);  				goto out; +			}  			info->check_integrity_print_mask = intarg;  			btrfs_info(info, "check_integrity_print_mask 0x%x",  				   info->check_integrity_print_mask); @@ -1030,13 +1046,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  			goto out;  #endif  		case Opt_fatal_errors: -			if (strcmp(args[0].from, "panic") == 0) +			if (strcmp(args[0].from, "panic") == 0) {  				btrfs_set_opt(info->mount_opt,  					      PANIC_ON_FATAL_ERROR); -			else if (strcmp(args[0].from, "bug") == 0) +			} else if (strcmp(args[0].from, "bug") == 0) {  				btrfs_clear_opt(info->mount_opt,  					      PANIC_ON_FATAL_ERROR); -			else { +			} else { +				btrfs_err(info, "unrecognized fatal_errors value %s", +					  args[0].from);  				ret = -EINVAL;  				goto out;  			} @@ -1044,8 +1062,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  		case Opt_commit_interval:  			intarg = 0;  			ret = match_int(&args[0], &intarg); -			if (ret) +			if (ret) { +				btrfs_err(info, "unrecognized commit_interval value %s", +					  args[0].from); +				ret = -EINVAL;  				goto out; +			}  			if (intarg == 0) {  				btrfs_info(info,  					   "using default commit interval %us", @@ -1059,8 +1081,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  			break;  		case Opt_rescue:  			ret = parse_rescue_options(info, args[0].from); -			if (ret < 0) +			if (ret < 0) { +				btrfs_err(info, "unrecognized rescue value %s", +					  args[0].from);  				goto out; +			}  			break;  #ifdef CONFIG_BTRFS_DEBUG  		case Opt_fragment_all: @@ -1985,6 +2010,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)  	if (ret)  		goto restore; +	/* V1 cache is not supported for subpage mount. */ +	if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { +		btrfs_warn(fs_info, +	"v1 space cache is not supported for page size %lu with sectorsize %u", +			   PAGE_SIZE, fs_info->sectorsize); +		ret = -EINVAL; +		goto restore; +	}  	btrfs_remount_begin(fs_info, old_opts, *flags);  	btrfs_resize_thread_pool(fs_info,  		fs_info->thread_pool_size, old_thread_pool_size); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11237a913bee..79e8c8cd75ed 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2139,3 +2139,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)  	factor = div64_u64(used * 100, total);  	return factor >= fs_info->bg_reclaim_threshold;  } + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, +				       u64 length) +{ +	struct btrfs_block_group *block_group; + +	if (!btrfs_is_zoned(fs_info)) +		return; + +	block_group = btrfs_lookup_block_group(fs_info, logical); +	/* It should be called on a previous data relocation block group. 
*/ +	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + +	spin_lock(&block_group->lock); +	if (!block_group->zoned_data_reloc_ongoing) +		goto out; + +	/* All relocation extents are written. */ +	if (block_group->start + block_group->alloc_offset == logical + length) { +		/* Now, release this block group for further allocations. */ +		block_group->zoned_data_reloc_ongoing = 0; +	} + +out: +	spin_unlock(&block_group->lock); +	btrfs_put_block_group(block_group); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index bb1a189e11f9..6b2eec99162b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,  void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);  void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);  bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, +				       u64 length);  #else /* CONFIG_BLK_DEV_ZONED */  static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,  				     struct blk_zone *zone) @@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)  {  	return false;  } + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, +						     u64 logical, u64 length) { }  #endif  static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e5221be6eb55..6dee88815491 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -394,11 +394,10 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)  	return 0;  } -static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) +static void ceph_netfs_free_request(struct netfs_io_request *rreq)  { -	struct inode *inode = mapping->host; -	struct ceph_inode_info *ci = ceph_inode(inode); -	int got = (uintptr_t)priv; +	struct ceph_inode_info *ci = ceph_inode(rreq->inode); +	int got = (uintptr_t)rreq->netfs_priv;  	if (got)  		ceph_put_cap_refs(ci, got); @@ -406,12 +405,12 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)  const struct netfs_request_ops ceph_netfs_ops = {  	.init_request		= ceph_init_request, +	.free_request		= ceph_netfs_free_request,  	.begin_cache_operation	= ceph_begin_cache_operation,  	.issue_read		= ceph_netfs_issue_read,  	.expand_readahead	= ceph_netfs_expand_readahead,  	.clamp_length		= ceph_netfs_clamp_length,  	.check_write_begin	= ceph_netfs_check_write_begin, -	.cleanup		= ceph_readahead_cleanup,  };  #ifdef CONFIG_CEPH_FSCACHE @@ -1322,10 +1321,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,  			    struct page **pagep, void **fsdata)  {  	struct inode *inode = file_inode(file); +	struct ceph_inode_info *ci = ceph_inode(inode);  	struct folio *folio = NULL;  	int r; -	r = netfs_write_begin(file, inode->i_mapping, pos, len, &folio, NULL); +	r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);  	if (r == 0)  		folio_wait_fscache(folio);  	if (r < 0) { @@ -1798,7 +1798,7 @@ enum {  static int __ceph_pool_perm_get(struct ceph_inode_info *ci,  				s64 pool, struct ceph_string *pool_ns)  { -	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); +	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);  	struct ceph_mds_client *mdsc = fsc->mdsc;  	struct ceph_osd_request *rd_req = NULL, *wr_req = 
NULL;  	struct rb_node **p, *parent; @@ -1913,7 +1913,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,  				     0, false, true);  	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); -	wr_req->r_mtime = ci->vfs_inode.i_mtime; +	wr_req->r_mtime = ci->netfs.inode.i_mtime;  	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);  	if (!err) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index ddea99922073..177d8e8d73fe 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -29,9 +29,9 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)  	if (!(inode->i_state & I_NEW))  		return; -	WARN_ON_ONCE(ci->netfs_ctx.cache); +	WARN_ON_ONCE(ci->netfs.cache); -	ci->netfs_ctx.cache = +	ci->netfs.cache =  		fscache_acquire_cookie(fsc->fscache, 0,  				       &ci->i_vino, sizeof(ci->i_vino),  				       &ci->i_version, sizeof(ci->i_version), diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 7255b790a4c1..dc502daac49a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -28,7 +28,7 @@ void ceph_fscache_invalidate(struct inode *inode, bool dio_write);  static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)  { -	return netfs_i_cookie(&ci->vfs_inode); +	return netfs_i_cookie(&ci->netfs);  }  static inline void ceph_fscache_resize(struct inode *inode, loff_t to) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bf2e94005598..38c930384d41 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,  	struct ceph_mount_options *opt = mdsc->fsc->mount_options;  	ci->i_hold_caps_max = round_jiffies(jiffies +  					    opt->caps_wanted_delay_max * HZ); -	dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, +	dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,  	     ci->i_hold_caps_max - jiffies);  } @@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,  static void __cap_delay_requeue(struct ceph_mds_client *mdsc,  				struct ceph_inode_info *ci)  { -	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, +	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,  	     ci->i_ceph_flags, ci->i_hold_caps_max);  	if (!mdsc->stopping) {  		spin_lock(&mdsc->cap_delay_lock); @@ -531,7 +531,7 @@ no_change:  static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,  				      struct ceph_inode_info *ci)  { -	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); +	dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);  	spin_lock(&mdsc->cap_delay_lock);  	ci->i_ceph_flags |= CEPH_I_FLUSH;  	if (!list_empty(&ci->i_cap_delay_list)) @@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,  static void __cap_delay_cancel(struct ceph_mds_client *mdsc,  			       struct ceph_inode_info *ci)  { -	dout("__cap_delay_cancel %p\n", &ci->vfs_inode); +	dout("__cap_delay_cancel %p\n", &ci->netfs.inode);  	if (list_empty(&ci->i_cap_delay_list))  		return;  	spin_lock(&mdsc->cap_delay_lock); @@ -568,7 +568,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,  	 * Each time we receive FILE_CACHE anew, we increment  	 * i_rdcache_gen.  	 
*/ -	if (S_ISREG(ci->vfs_inode.i_mode) && +	if (S_ISREG(ci->netfs.inode.i_mode) &&  	    (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&  	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {  		ci->i_rdcache_gen++; @@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,  	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {  		if (issued & CEPH_CAP_FILE_SHARED)  			atomic_inc(&ci->i_shared_gen); -		if (S_ISDIR(ci->vfs_inode.i_mode)) { -			dout(" marking %p NOT complete\n", &ci->vfs_inode); +		if (S_ISDIR(ci->netfs.inode.i_mode)) { +			dout(" marking %p NOT complete\n", &ci->netfs.inode);  			__ceph_dir_clear_complete(ci);  		}  	}  	/* Wipe saved layout if we're losing DIR_CREATE caps */ -	if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && +	if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&  		!(issued & CEPH_CAP_DIR_CREATE)) {  	     ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));  	     memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); @@ -771,7 +771,7 @@ static int __cap_is_valid(struct ceph_cap *cap)  	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {  		dout("__cap_is_valid %p cap %p issued %s " -		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, +		     "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,  		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);  		return 0;  	} @@ -797,7 +797,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)  		if (!__cap_is_valid(cap))  			continue;  		dout("__ceph_caps_issued %p cap %p issued %s\n", -		     &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); +		     &ci->netfs.inode, cap, ceph_cap_string(cap->issued));  		have |= cap->issued;  		if (implemented)  			*implemented |= cap->implemented; @@ -844,12 +844,12 @@ static void __touch_cap(struct ceph_cap *cap)  	spin_lock(&s->s_cap_lock);  	if (!s->s_cap_iterator) { -		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, +		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,  		     s->s_mds);  		list_move_tail(&cap->session_caps, &s->s_caps);  	} else {  		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", -		     &cap->ci->vfs_inode, cap, s->s_mds); +		     &cap->ci->netfs.inode, cap, s->s_mds);  	}  	spin_unlock(&s->s_cap_lock);  } @@ -867,7 +867,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)  	if ((have & mask) == mask) {  		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" -		     " (mask %s)\n", ceph_ino(&ci->vfs_inode), +		     " (mask %s)\n", ceph_ino(&ci->netfs.inode),  		     ceph_cap_string(have),  		     ceph_cap_string(mask));  		return 1; @@ -879,7 +879,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)  			continue;  		if ((cap->issued & mask) == mask) {  			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" -			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, +			     " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,  			     ceph_cap_string(cap->issued),  			     ceph_cap_string(mask));  			if (touch) @@ -891,7 +891,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)  		have |= cap->issued;  		if ((have & mask) == mask) {  			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" -			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), +			     " (mask %s)\n", ceph_ino(&ci->netfs.inode),  			     ceph_cap_string(cap->issued),  			     
ceph_cap_string(mask));  			if (touch) { @@ -919,7 +919,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)  int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,  				   int touch)  { -	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);  	int r;  	r = __ceph_caps_issued_mask(ci, mask, touch); @@ -950,7 +950,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,  int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	int ret;  	spin_lock(&ci->i_ceph_lock); @@ -969,8 +969,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci)  	if (ci->i_rd_ref)  		used |= CEPH_CAP_FILE_RD;  	if (ci->i_rdcache_ref || -	    (S_ISREG(ci->vfs_inode.i_mode) && -	     ci->vfs_inode.i_data.nrpages)) +	    (S_ISREG(ci->netfs.inode.i_mode) && +	     ci->netfs.inode.i_data.nrpages))  		used |= CEPH_CAP_FILE_CACHE;  	if (ci->i_wr_ref)  		used |= CEPH_CAP_FILE_WR; @@ -993,11 +993,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)  	const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);  	const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);  	struct ceph_mount_options *opt = -		ceph_inode_to_client(&ci->vfs_inode)->mount_options; +		ceph_inode_to_client(&ci->netfs.inode)->mount_options;  	unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;  	unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; -	if (S_ISDIR(ci->vfs_inode.i_mode)) { +	if (S_ISDIR(ci->netfs.inode.i_mode)) {  		int want = 0;  		/* use used_cutoff here, to keep dir's wanted caps longer */ @@ -1050,7 +1050,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)  int __ceph_caps_wanted(struct ceph_inode_info *ci)  {  	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); -	if (S_ISDIR(ci->vfs_inode.i_mode)) { +	if (S_ISDIR(ci->netfs.inode.i_mode)) {  		/* we want EXCL if holding caps of dir ops */  		if (w & CEPH_CAP_ANY_DIR_OPS)  			w |= CEPH_CAP_FILE_EXCL; @@ -1116,9 +1116,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)  	lockdep_assert_held(&ci->i_ceph_lock); -	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); +	dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode); -	mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; +	mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;  	/* remove from inode's cap rbtree, and clear auth cap */  	rb_erase(&cap->ci_node, &ci->i_caps); @@ -1169,7 +1169,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)  		 * keep i_snap_realm.  		 
*/  		if (ci->i_wr_ref == 0 && ci->i_snap_realm) -			ceph_change_snap_realm(&ci->vfs_inode, NULL); +			ceph_change_snap_realm(&ci->netfs.inode, NULL);  		__cap_delay_cancel(mdsc, ci);  	} @@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)  	lockdep_assert_held(&ci->i_ceph_lock); -	fsc = ceph_inode_to_client(&ci->vfs_inode); +	fsc = ceph_inode_to_client(&ci->netfs.inode);  	WARN_ON_ONCE(ci->i_auth_cap == cap &&  		     !list_empty(&ci->i_dirty_item) &&  		     !fsc->blocklisted && -		     !ceph_inode_is_shutdown(&ci->vfs_inode)); +		     !ceph_inode_is_shutdown(&ci->netfs.inode));  	__ceph_remove_cap(cap, queue_release);  } @@ -1343,7 +1343,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,  		       int flushing, u64 flush_tid, u64 oldest_flush_tid)  {  	struct ceph_inode_info *ci = cap->ci; -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	int held, revoking;  	lockdep_assert_held(&ci->i_ceph_lock); @@ -1440,7 +1440,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,  static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)  {  	struct ceph_msg *msg; -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);  	if (!msg) { @@ -1528,7 +1528,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,  		__releases(ci->i_ceph_lock)  		__acquires(ci->i_ceph_lock)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	struct ceph_mds_client *mdsc = session->s_mdsc;  	struct ceph_cap_snap *capsnap;  	u64 oldest_flush_tid = 0; @@ -1622,7 +1622,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,  void ceph_flush_snaps(struct ceph_inode_info *ci,  		      struct ceph_mds_session **psession)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;  	struct ceph_mds_session *session = NULL;  	int mds; @@ -1682,8 +1682,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,  			   struct ceph_cap_flush **pcf)  {  	struct ceph_mds_client *mdsc = -		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; -	struct inode *inode = &ci->vfs_inode; +		ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc; +	struct inode *inode = &ci->netfs.inode;  	int was = ci->i_dirty_caps;  	int dirty = 0; @@ -1696,7 +1696,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,  		return 0;  	} -	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, +	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,  	     ceph_cap_string(mask), ceph_cap_string(was),  	     ceph_cap_string(was | mask));  	ci->i_dirty_caps |= mask; @@ -1712,7 +1712,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,  				ci->i_snap_realm->cached_context);  		}  		dout(" inode %p now dirty snapc %p auth cap %p\n", -		     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); +		     &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);  		BUG_ON(!list_empty(&ci->i_dirty_item));  		spin_lock(&mdsc->cap_dirty_lock);  		list_add(&ci->i_dirty_item, &session->s_cap_dirty); @@ -1875,7 +1875,7 @@ static int try_nonblocking_invalidate(struct inode *inode)  bool __ceph_should_report_size(struct ceph_inode_info *ci)  { -	loff_t size = i_size_read(&ci->vfs_inode); +	loff_t size = i_size_read(&ci->netfs.inode);  	/* mds will adjust max size 
according to the reported size */  	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)  		return false; @@ -1900,7 +1900,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)  void ceph_check_caps(struct ceph_inode_info *ci, int flags,  		     struct ceph_mds_session *session)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);  	struct ceph_cap *cap;  	u64 flush_tid, oldest_flush_tid; @@ -2467,7 +2467,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,  	__releases(ci->i_ceph_lock)  	__acquires(ci->i_ceph_lock)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	struct ceph_cap *cap;  	struct ceph_cap_flush *cf;  	int ret; @@ -2560,7 +2560,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,  		cap = ci->i_auth_cap;  		if (!(cap && cap->session == session)) {  			pr_err("%p auth cap %p not mds%d ???\n", -				&ci->vfs_inode, cap, session->s_mds); +				&ci->netfs.inode, cap, session->s_mds);  			spin_unlock(&ci->i_ceph_lock);  			continue;  		} @@ -2610,7 +2610,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,  		cap = ci->i_auth_cap;  		if (!(cap && cap->session == session)) {  			pr_err("%p auth cap %p not mds%d ???\n", -				&ci->vfs_inode, cap, session->s_mds); +				&ci->netfs.inode, cap, session->s_mds);  			spin_unlock(&ci->i_ceph_lock);  			continue;  		} @@ -2630,7 +2630,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,  	lockdep_assert_held(&ci->i_ceph_lock); -	dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, +	dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,  	     ceph_cap_string(ci->i_flushing_caps));  	if (!list_empty(&ci->i_cap_flush_list)) { @@ -2673,10 +2673,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,  	}  	if (got & CEPH_CAP_FILE_BUFFER) {  		if (ci->i_wb_ref == 0) -			ihold(&ci->vfs_inode); +			ihold(&ci->netfs.inode);  		ci->i_wb_ref++;  		dout("%s %p wb %d -> %d (?)\n", __func__, -		     &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); +		     &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);  	}  } @@ -3004,7 +3004,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got  			return ret;  		} -		if (S_ISREG(ci->vfs_inode.i_mode) && +		if (S_ISREG(ci->netfs.inode.i_mode) &&  		    ci->i_inline_version != CEPH_INLINE_NONE &&  		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&  		    i_size_read(inode) > 0) { @@ -3094,7 +3094,7 @@ enum put_cap_refs_mode {  static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,  				enum put_cap_refs_mode mode)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	int last = 0, put = 0, flushsnaps = 0, wake = 0;  	bool check_flushsnaps = false; @@ -3202,7 +3202,7 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)  void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,  				struct ceph_snap_context *snapc)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	struct ceph_cap_snap *capsnap = NULL, *iter;  	int put = 0;  	bool last = false; @@ -3698,7 +3698,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,  				     session->s_mds,  				     &list_first_entry(&session->s_cap_flushing,  						struct ceph_inode_info, -						i_flushing_item)->vfs_inode); +						i_flushing_item)->netfs.inode);  			}  		}  		mdsc->num_cap_flushing--; @@ 
-4345,7 +4345,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)  			break;  		list_del_init(&ci->i_cap_delay_list); -		inode = igrab(&ci->vfs_inode); +		inode = igrab(&ci->netfs.inode);  		if (inode) {  			spin_unlock(&mdsc->cap_delay_lock);  			dout("check_delayed_caps on %p\n", inode); @@ -4373,7 +4373,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)  	while (!list_empty(&s->s_cap_dirty)) {  		ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,  				      i_dirty_item); -		inode = &ci->vfs_inode; +		inode = &ci->netfs.inode;  		ihold(inode);  		dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));  		spin_unlock(&mdsc->cap_dirty_lock); @@ -4407,7 +4407,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,  void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)  { -	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); +	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);  	int bits = (fmode << 1) | 1;  	bool already_opened = false;  	int i; @@ -4441,7 +4441,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)   */  void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)  { -	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); +	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);  	int bits = (fmode << 1) | 1;  	bool is_closed = true;  	int i; @@ -4656,7 +4656,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali  	lockdep_assert_held(&ci->i_ceph_lock);  	dout("removing cap %p, ci is %p, inode is %p\n", -	     cap, ci, &ci->vfs_inode); +	     cap, ci, &ci->netfs.inode);  	is_auth = (cap == ci->i_auth_cap);  	__ceph_remove_cap(cap, false); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c8226c0feac..da59e836a06e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -205,7 +205,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,  {  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_mount_options *opt = -		ceph_inode_to_client(&ci->vfs_inode)->mount_options; +		ceph_inode_to_client(&ci->netfs.inode)->mount_options;  	struct ceph_file_info *fi;  	int ret; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b7e9cac3aeef..56c53ab3618e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -176,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,  	rb_insert_color(&frag->node, &ci->i_fragtree);  	dout("get_or_create_frag added %llx.%llx frag %x\n", -	     ceph_vinop(&ci->vfs_inode), f); +	     ceph_vinop(&ci->netfs.inode), f);  	return frag;  } @@ -457,10 +457,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)  	if (!ci)  		return NULL; -	dout("alloc_inode %p\n", &ci->vfs_inode); +	dout("alloc_inode %p\n", &ci->netfs.inode);  	/* Set parameters for the netfs library */ -	netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops); +	netfs_inode_init(&ci->netfs, &ceph_netfs_ops);  	spin_lock_init(&ci->i_ceph_lock); @@ -547,7 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)  	INIT_WORK(&ci->i_work, ceph_inode_work);  	ci->i_work_mask = 0;  	memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); -	return &ci->vfs_inode; +	return &ci->netfs.inode;  }  void ceph_free_inode(struct inode *inode) @@ -1978,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work)  {  	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,  						 i_work); -	struct inode *inode = &ci->vfs_inode; +	
struct inode *inode = &ci->netfs.inode;  	if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {  		dout("writeback %p\n", inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f5d110d90b77..33f517d549ce 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1564,7 +1564,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,  	p = session->s_caps.next;  	while (p != &session->s_caps) {  		cap = list_entry(p, struct ceph_cap, session_caps); -		inode = igrab(&cap->ci->vfs_inode); +		inode = igrab(&cap->ci->netfs.inode);  		if (!inode) {  			p = p->next;  			continue; @@ -1622,7 +1622,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,  	int iputs;  	dout("removing cap %p, ci is %p, inode is %p\n", -	     cap, ci, &ci->vfs_inode); +	     cap, ci, &ci->netfs.inode);  	spin_lock(&ci->i_ceph_lock);  	iputs = ceph_purge_inode_cap(inode, cap, &invalidate);  	spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 322ee5add942..864cdaa0d2bd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -521,7 +521,7 @@ static bool has_new_snaps(struct ceph_snap_context *o,  static void ceph_queue_cap_snap(struct ceph_inode_info *ci,  				struct ceph_cap_snap **pcapsnap)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	struct ceph_snap_context *old_snapc, *new_snapc;  	struct ceph_cap_snap *capsnap = *pcapsnap;  	struct ceph_buffer *old_blob = NULL; @@ -652,7 +652,7 @@ update_snapc:  int __ceph_finish_cap_snap(struct ceph_inode_info *ci,  			    struct ceph_cap_snap *capsnap)  { -	struct inode *inode = &ci->vfs_inode; +	struct inode *inode = &ci->netfs.inode;  	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);  	BUG_ON(capsnap->writing); @@ -712,7 +712,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)  	spin_lock(&realm->inodes_with_caps_lock);  	list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { -		struct inode *inode = igrab(&ci->vfs_inode); +		struct inode *inode = igrab(&ci->netfs.inode);  		if (!inode)  			continue;  		spin_unlock(&realm->inodes_with_caps_lock); @@ -904,7 +904,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)  	while (!list_empty(&mdsc->snap_flush_list)) {  		ci = list_first_entry(&mdsc->snap_flush_list,  				struct ceph_inode_info, i_snap_flush_item); -		inode = &ci->vfs_inode; +		inode = &ci->netfs.inode;  		ihold(inode);  		spin_unlock(&mdsc->snap_flush_lock);  		ceph_flush_snaps(ci, &session); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b73b4f75462c..40140805bdcf 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -876,7 +876,7 @@ mempool_t *ceph_wb_pagevec_pool;  static void ceph_inode_init_once(void *foo)  {  	struct ceph_inode_info *ci = foo; -	inode_init_once(&ci->vfs_inode); +	inode_init_once(&ci->netfs.inode);  }  static int __init init_caches(void) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dd7dac0f984a..f59dac66955b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -316,11 +316,7 @@ struct ceph_inode_xattrs_info {   * Ceph inode.   
*/  struct ceph_inode_info { -	struct { -		/* These must be contiguous */ -		struct inode vfs_inode; -		struct netfs_i_context netfs_ctx; /* Netfslib context */ -	}; +	struct netfs_inode netfs; /* Netfslib context and vfs inode */  	struct ceph_vino i_vino;   /* ceph ino + snap */  	spinlock_t i_ceph_lock; @@ -436,7 +432,7 @@ struct ceph_inode_info {  static inline struct ceph_inode_info *  ceph_inode(const struct inode *inode)  { -	return container_of(inode, struct ceph_inode_info, vfs_inode); +	return container_of(inode, struct ceph_inode_info, netfs.inode);  }  static inline struct ceph_fs_client * @@ -1316,7 +1312,7 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,  	has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);  	if (had_quota != has_quota) -		ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); +		ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);  }  extern void ceph_handle_quota(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 8c2dc2c762a4..f141f5246163 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)  static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,  				    size_t size)  { -	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);  	struct ceph_osd_client *osdc = &fsc->client->osdc;  	struct ceph_string *pool_ns;  	s64 pool = ci->i_layout.pool_id; @@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,  	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); -	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); +	dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);  	down_read(&osdc->lock);  	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);  	if (pool_name) { @@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,  					 char *val, size_t size)  {  	ssize_t ret; -	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);  	struct ceph_osd_client *osdc = &fsc->client->osdc;  	s64 pool = ci->i_layout.pool_id;  	const char *pool_name; @@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,  static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,  					  char *val, size_t size)  { -	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);  	return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);  } @@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,  static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,  				       char *val, size_t size)  { -	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);  	return ceph_fmt_xattr(val, size, "client%lld",  			      ceph_client_gid(fsc->client)); @@ -629,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,  	}  	dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", -	     ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val); +	     ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val);  	return 0;  } @@ -871,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)  	struct 
ceph_buffer *old_blob = NULL;  	void *dest; -	dout("__build_xattrs_blob %p\n", &ci->vfs_inode); +	dout("__build_xattrs_blob %p\n", &ci->netfs.inode);  	if (ci->i_xattrs.dirty) {  		int need = __get_required_blob_size(ci, 0, 0); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 1dd995efd5b8..2cfbac8bb965 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -162,6 +162,8 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)  		seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr);  	else if (iface->sockaddr.ss_family == AF_INET6)  		seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); +	if (!iface->is_active) +		seq_puts(m, "\t\t[for-cleanup]\n");  }  static int cifs_debug_files_proc_show(struct seq_file *m, void *v) @@ -221,6 +223,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)  	struct TCP_Server_Info *server;  	struct cifs_ses *ses;  	struct cifs_tcon *tcon; +	struct cifs_server_iface *iface;  	int c, i, j;  	seq_puts(m, @@ -456,11 +459,10 @@ skip_rdma:  			if (ses->iface_count)  				seq_printf(m, "\n\n\tServer interfaces: %zu",  					   ses->iface_count); -			for (j = 0; j < ses->iface_count; j++) { -				struct cifs_server_iface *iface; - -				iface = &ses->iface_list[j]; -				seq_printf(m, "\n\t%d)", j+1); +			j = 0; +			list_for_each_entry(iface, &ses->iface_list, +						 iface_head) { +				seq_printf(m, "\n\t%d)", ++j);  				cifs_dump_iface(m, iface);  				if (is_ses_using_iface(ses, iface))  					seq_puts(m, "\t\t[CONNECTED]\n"); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 12c872800326..8f2e003e0590 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -377,7 +377,7 @@ cifs_alloc_inode(struct super_block *sb)  	cifs_inode->flags = 0;  	spin_lock_init(&cifs_inode->writers_lock);  	cifs_inode->writers = 0; -	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */ +	cifs_inode->netfs.inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */  	cifs_inode->server_eof = 0;  	cifs_inode->uniqueid = 0;  	cifs_inode->createtime = 0; @@ -389,12 +389,12 @@ cifs_alloc_inode(struct super_block *sb)  	 * Can not set i_flags here - they get immediately overwritten to zero  	 * by the VFS.  	 
*/ -	/* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ +	/* cifs_inode->netfs.inode.i_flags = S_NOATIME | S_NOCMTIME; */  	INIT_LIST_HEAD(&cifs_inode->openFileList);  	INIT_LIST_HEAD(&cifs_inode->llist);  	INIT_LIST_HEAD(&cifs_inode->deferred_closes);  	spin_lock_init(&cifs_inode->deferred_lock); -	return &cifs_inode->vfs_inode; +	return &cifs_inode->netfs.inode;  }  static void @@ -1086,7 +1086,7 @@ struct file_system_type cifs_fs_type = {  };  MODULE_ALIAS_FS("cifs"); -static struct file_system_type smb3_fs_type = { +struct file_system_type smb3_fs_type = {  	.owner = THIS_MODULE,  	.name = "smb3",  	.init_fs_context = smb3_init_fs_context, @@ -1418,7 +1418,7 @@ cifs_init_once(void *inode)  {  	struct cifsInodeInfo *cifsi = inode; -	inode_init_once(&cifsi->vfs_inode); +	inode_init_once(&cifsi->netfs.inode);  	init_rwsem(&cifsi->lock_sem);  } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index dd7e070ca243..b17be47a8e59 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -38,7 +38,7 @@ static inline unsigned long cifs_get_time(struct dentry *dentry)  	return (unsigned long) dentry->d_fsdata;  } -extern struct file_system_type cifs_fs_type; +extern struct file_system_type cifs_fs_type, smb3_fs_type;  extern const struct address_space_operations cifs_addr_ops;  extern const struct address_space_operations cifs_addr_ops_smallbuf; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f873379066c7..a643c84ff1e9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -80,6 +80,9 @@  #define SMB_DNS_RESOLVE_INTERVAL_MIN     120  #define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600 +/* smb multichannel query server interfaces interval in seconds */ +#define SMB_INTERFACE_POLL_INTERVAL	600 +  /* maximum number of PDUs in one compound */  #define MAX_COMPOUND 5 @@ -933,15 +936,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)  #endif  struct cifs_server_iface { +	struct list_head iface_head; +	struct kref refcount;  	size_t speed;  	unsigned int rdma_capable : 1;  	unsigned int rss_capable : 1; +	unsigned int is_active : 1; /* unset if non existent */  	struct sockaddr_storage sockaddr;  }; +/* release iface when last ref is dropped */ +static inline void +release_iface(struct kref *ref) +{ +	struct cifs_server_iface *iface = container_of(ref, +						       struct cifs_server_iface, +						       refcount); +	list_del_init(&iface->iface_head); +	kfree(iface); +} + +/* + * compare two interfaces a and b + * return 0 if everything matches. + * return 1 if a has higher link speed, or rdma capable, or rss capable + * return -1 otherwise. 
+ */ +static inline int +iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) +{ +	int cmp_ret = 0; + +	WARN_ON(!a || !b); +	if (a->speed == b->speed) { +		if (a->rdma_capable == b->rdma_capable) { +			if (a->rss_capable == b->rss_capable) { +				cmp_ret = memcmp(&a->sockaddr, &b->sockaddr, +						 sizeof(a->sockaddr)); +				if (!cmp_ret) +					return 0; +				else if (cmp_ret > 0) +					return 1; +				else +					return -1; +			} else if (a->rss_capable > b->rss_capable) +				return 1; +			else +				return -1; +		} else if (a->rdma_capable > b->rdma_capable) +			return 1; +		else +			return -1; +	} else if (a->speed > b->speed) +		return 1; +	else +		return -1; +} +  struct cifs_chan {  	unsigned int in_reconnect : 1; /* if session setup in progress for this channel */  	struct TCP_Server_Info *server; +	struct cifs_server_iface *iface; /* interface in use */  	__u8 signkey[SMB3_SIGN_KEY_SIZE];  }; @@ -993,7 +1048,7 @@ struct cifs_ses {  	 */  	spinlock_t iface_lock;  	/* ========= begin: protected by iface_lock ======== */ -	struct cifs_server_iface *iface_list; +	struct list_head iface_list;  	size_t iface_count;  	unsigned long iface_last_update; /* jiffies */  	/* ========= end: protected by iface_lock ======== */ @@ -1203,6 +1258,7 @@ struct cifs_tcon {  #ifdef CONFIG_CIFS_DFS_UPCALL  	struct list_head ulist; /* cache update list */  #endif +	struct delayed_work	query_interfaces; /* query interfaces workqueue job */  };  /* @@ -1479,20 +1535,16 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);  #define CIFS_CACHE_RW_FLG	(CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)  #define CIFS_CACHE_RHW_FLG	(CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)) +#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE))  #define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)) +#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE))  /*   * One of these for each file inode   */  struct cifsInodeInfo { -	struct { -		/* These must be contiguous */ -		struct inode	vfs_inode;	/* the VFS's inode record */ -		struct netfs_i_context netfs_ctx; /* Netfslib context */ -	}; +	struct netfs_inode netfs; /* Netfslib context and vfs inode */  	bool can_cache_brlcks;  	struct list_head llist;	/* locks helb by this inode */  	/* @@ -1531,7 +1583,7 @@ struct cifsInodeInfo {  static inline struct cifsInodeInfo *  CIFS_I(struct inode *inode)  { -	return container_of(inode, struct cifsInodeInfo, vfs_inode); +	return container_of(inode, struct cifsInodeInfo, netfs.inode);  }  static inline struct cifs_sb_info * diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3b7366ec03c7..d59aebefa71c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -636,6 +636,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses,  bool  cifs_chan_needs_reconnect(struct cifs_ses *ses,  			  struct TCP_Server_Info *server); +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, +			  struct TCP_Server_Info *server); +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); +int 
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon);  void extract_unc_hostname(const char *unc, const char **h, size_t *len);  int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d46702f5a663..fa29c9aae24b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,6 +97,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)  	if (!server->hostname)  		return -EINVAL; +	/* if server hostname isn't populated, there's nothing to do here */ +	if (server->hostname[0] == '\0') +		return 0; +  	len = strlen(server->hostname) + 3;  	unc = kmalloc(len, GFP_KERNEL); @@ -141,6 +145,25 @@ requeue_resolve:  	return rc;  } +static void smb2_query_server_interfaces(struct work_struct *work) +{ +	int rc; +	struct cifs_tcon *tcon = container_of(work, +					struct cifs_tcon, +					query_interfaces.work); + +	/* +	 * query server network interfaces, in case they change +	 */ +	rc = SMB3_request_interfaces(0, tcon); +	if (rc) { +		cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", +				__func__, rc); +	} + +	queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, +			   (SMB_INTERFACE_POLL_INTERVAL * HZ)); +}  static void cifs_resolve_server(struct work_struct *work)  { @@ -213,7 +236,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,  				      bool mark_smb_session)  {  	struct TCP_Server_Info *pserver; -	struct cifs_ses *ses; +	struct cifs_ses *ses, *nses;  	struct cifs_tcon *tcon;  	/* @@ -227,7 +250,20 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,  	spin_lock(&cifs_tcp_ses_lock); -	list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { +	list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { +		/* check if iface is still active */ +		if (!cifs_chan_is_iface_active(ses, server)) { +			/* +			 * HACK: drop the lock before calling +			 * cifs_chan_update_iface to avoid deadlock +			 */ +			ses->ses_count++; +			spin_unlock(&cifs_tcp_ses_lock); +			cifs_chan_update_iface(ses, server); +			spin_lock(&cifs_tcp_ses_lock); +			ses->ses_count--; +		} +  		spin_lock(&ses->chan_lock);  		if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server))  			goto next_session; @@ -1890,9 +1926,11 @@ void cifs_put_smb_ses(struct cifs_ses *ses)  		int i;  		for (i = 1; i < chan_count; i++) { -			spin_unlock(&ses->chan_lock); +			if (ses->chans[i].iface) { +				kref_put(&ses->chans[i].iface->refcount, release_iface); +				ses->chans[i].iface = NULL; +			}  			cifs_put_tcp_session(ses->chans[i].server, 0); -			spin_lock(&ses->chan_lock);  			ses->chans[i].server = NULL;  		}  	} @@ -2266,6 +2304,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)  	list_del_init(&tcon->tcon_list);  	spin_unlock(&cifs_tcp_ses_lock); +	/* cancel polling of interfaces */ +	cancel_delayed_work_sync(&tcon->query_interfaces); +  	if (tcon->use_witness) {  		int rc; @@ -2503,6 +2544,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)  	tcon->local_lease = ctx->local_lease;  	INIT_LIST_HEAD(&tcon->pending_opens); +	/* schedule query interfaces poll */ +	INIT_DELAYED_WORK(&tcon->query_interfaces, +			  smb2_query_server_interfaces); +	queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, +			   (SMB_INTERFACE_POLL_INTERVAL * HZ)); +  	spin_lock(&cifs_tcp_ses_lock);  	list_add(&tcon->tcon_list, &ses->tcon_list);  	spin_unlock(&cifs_tcp_ses_lock); @@ -3978,10 +4025,16 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,  		   
struct nls_table *nls_info)  {  	int rc = -ENOSYS; +	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; +	struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;  	bool is_binding = false; -  	spin_lock(&cifs_tcp_ses_lock); +	if (server->dstaddr.ss_family == AF_INET6) +		scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); +	else +		scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); +  	if (ses->ses_status != SES_GOOD &&  	    ses->ses_status != SES_NEW &&  	    ses->ses_status != SES_NEED_RECON) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1618e0537d58..e64cda7a7610 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2004,7 +2004,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,  					bool fsuid_only)  {  	struct cifsFileInfo *open_file = NULL; -	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); +	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb);  	/* only filter by fsuid on multiuser mounts */  	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) @@ -2060,7 +2060,7 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags,  		return rc;  	} -	cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); +	cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb);  	/* only filter by fsuid on multiuser mounts */  	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) @@ -4669,14 +4669,14 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)  		/* This inode is open for write at least once */  		struct cifs_sb_info *cifs_sb; -		cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb); +		cifs_sb = CIFS_SB(cifsInode->netfs.inode.i_sb);  		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {  			/* since no page cache to corrupt on directio  			we can change size safely */  			return true;  		} -		if (i_size_read(&cifsInode->vfs_inode) < end_of_file) +		if (i_size_read(&cifsInode->netfs.inode) < end_of_file)  			return true;  		return false; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a638b29e9062..23ef56f55ce5 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -101,13 +101,13 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)  	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);  	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); -	cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); +	cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd); -	cifsi->netfs_ctx.cache = +	cifsi->netfs.cache =  		fscache_acquire_cookie(tcon->fscache, 0,  				       &cifsi->uniqueid, sizeof(cifsi->uniqueid),  				       &cd, sizeof(cd), -				       i_size_read(&cifsi->vfs_inode)); +				       i_size_read(&cifsi->netfs.inode));  }  void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) @@ -131,7 +131,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)  	if (cookie) {  		cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie);  		fscache_relinquish_cookie(cookie, false); -		cifsi->netfs_ctx.cache = NULL; +		cifsi->netfs.cache = NULL;  	}  } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 52355c0912ae..aa3b941a5555 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -52,16 +52,16 @@ void cifs_fscache_fill_coherency(struct inode *inode,  	struct cifsInodeInfo *cifsi = CIFS_I(inode);  	memset(cd, 0, sizeof(*cd)); -	cd->last_write_time_sec   = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec); -	cd->last_write_time_nsec  = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec); -	cd->last_change_time_sec  = 
cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec); -	cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec); +	cd->last_write_time_sec   = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec); +	cd->last_write_time_nsec  = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec); +	cd->last_change_time_sec  = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec); +	cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec);  }  static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode)  { -	return netfs_i_cookie(inode); +	return netfs_i_cookie(&CIFS_I(inode)->netfs);  }  static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2f9e7d2f81b6..81da81e18553 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -115,7 +115,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)  		 __func__, cifs_i->uniqueid);  	set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);  	/* Invalidate fscache cookie */ -	cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd); +	cifs_fscache_fill_coherency(&cifs_i->netfs.inode, &cd);  	fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0);  } @@ -2499,7 +2499,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,  		u64 len)  {  	struct cifsInodeInfo *cifs_i = CIFS_I(inode); -	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->vfs_inode.i_sb); +	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->netfs.inode.i_sb);  	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);  	struct TCP_Server_Info *server = tcon->ses->server;  	struct cifsFileInfo *cfile; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 35962a1a23b9..0e84e6fcf8ab 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -75,6 +75,7 @@ sesInfoAlloc(void)  		INIT_LIST_HEAD(&ret_buf->tcon_list);  		mutex_init(&ret_buf->session_mutex);  		spin_lock_init(&ret_buf->iface_lock); +		INIT_LIST_HEAD(&ret_buf->iface_list);  		spin_lock_init(&ret_buf->chan_lock);  	}  	return ret_buf; @@ -83,6 +84,8 @@ sesInfoAlloc(void)  void  sesInfoFree(struct cifs_ses *buf_to_free)  { +	struct cifs_server_iface *iface = NULL, *niface = NULL; +  	if (buf_to_free == NULL) {  		cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");  		return; @@ -96,7 +99,11 @@ sesInfoFree(struct cifs_ses *buf_to_free)  	kfree(buf_to_free->user_name);  	kfree(buf_to_free->domainName);  	kfree_sensitive(buf_to_free->auth_key.response); -	kfree(buf_to_free->iface_list); +	spin_lock(&buf_to_free->iface_lock); +	list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list, +				 iface_head) +		kref_put(&iface->refcount, release_iface); +	spin_unlock(&buf_to_free->iface_lock);  	kfree_sensitive(buf_to_free);  } @@ -537,11 +544,11 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)  	if (oplock == OPLOCK_EXCLUSIVE) {  		cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;  		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", -			 &cinode->vfs_inode); +			 &cinode->netfs.inode);  	} else if (oplock == OPLOCK_READ) {  		cinode->oplock = CIFS_CACHE_READ_FLG;  		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", -			 &cinode->vfs_inode); +			 &cinode->netfs.inode);  	} else  		cinode->oplock = 0;  } @@ -1211,18 +1218,23 @@ static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void  		.data = data,  		.sb = NULL,  	}; +	struct file_system_type **fs_type = (struct file_system_type *[]) { +		&cifs_fs_type, &smb3_fs_type, NULL, +	}; -	
iterate_supers_type(&cifs_fs_type, f, &sd); - -	if (!sd.sb) -		return ERR_PTR(-EINVAL); -	/* -	 * Grab an active reference in order to prevent automounts (DFS links) -	 * of expiring and then freeing up our cifs superblock pointer while -	 * we're doing failover. -	 */ -	cifs_sb_active(sd.sb); -	return sd.sb; +	for (; *fs_type; fs_type++) { +		iterate_supers_type(*fs_type, f, &sd); +		if (sd.sb) { +			/* +			 * Grab an active reference in order to prevent automounts (DFS links) +			 * of expiring and then freeing up our cifs superblock pointer while +			 * we're doing failover. +			 */ +			cifs_sb_active(sd.sb); +			return sd.sb; +		} +	} +	return ERR_PTR(-EINVAL);  }  static void __cifs_put_super(struct super_block *sb) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3b7915af1f62..b85718f32b53 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)  	spin_lock(&ses->chan_lock);  	for (i = 0; i < ses->chan_count; i++) { -		if (is_server_using_iface(ses->chans[i].server, iface)) { +		if (ses->chans[i].iface == iface) {  			spin_unlock(&ses->chan_lock);  			return true;  		} @@ -81,6 +81,9 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,  	}  	/* If we didn't find the channel, it is likely a bug */ +	if (server) +		cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", +			 server->conn_id);  	WARN_ON(1);  	return 0;  } @@ -143,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,  	return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);  } +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, +			  struct TCP_Server_Info *server) +{ +	unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + +	return ses->chans[chan_index].iface && +		ses->chans[chan_index].iface->is_active; +} +  /* returns number of channels added */  int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)  {  	int old_chan_count, new_chan_count;  	int left; -	int i = 0;  	int rc = 0;  	int tries = 0; -	struct cifs_server_iface *ifaces = NULL; -	size_t iface_count; +	struct cifs_server_iface *iface = NULL, *niface = NULL;  	spin_lock(&ses->chan_lock); @@ -182,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)  	spin_unlock(&ses->chan_lock);  	/* -	 * Make a copy of the iface list at the time and use that -	 * instead so as to not hold the iface spinlock for opening -	 * channels -	 */ -	spin_lock(&ses->iface_lock); -	iface_count = ses->iface_count; -	if (iface_count <= 0) { -		spin_unlock(&ses->iface_lock); -		cifs_dbg(VFS, "no iface list available to open channels\n"); -		return 0; -	} -	ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces), -			 GFP_ATOMIC); -	if (!ifaces) { -		spin_unlock(&ses->iface_lock); -		return 0; -	} -	spin_unlock(&ses->iface_lock); - -	/*  	 * Keep connecting to same, fastest, iface for all channels as  	 * long as its RSS. Try next fastest one if not RSS or channel  	 * creation fails.  	 
*/ +	spin_lock(&ses->iface_lock); +	iface = list_first_entry(&ses->iface_list, struct cifs_server_iface, +				 iface_head); +	spin_unlock(&ses->iface_lock); +  	while (left > 0) { -		struct cifs_server_iface *iface;  		tries++;  		if (tries > 3*ses->chan_max) { @@ -216,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)  			break;  		} -		iface = &ifaces[i]; -		if (is_ses_using_iface(ses, iface) && !iface->rss_capable) { -			i = (i+1) % iface_count; -			continue; +		spin_lock(&ses->iface_lock); +		if (!ses->iface_count) { +			spin_unlock(&ses->iface_lock); +			break;  		} -		rc = cifs_ses_add_channel(cifs_sb, ses, iface); -		if (rc) { -			cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n", -				 i, rc); -			i = (i+1) % iface_count; -			continue; +		list_for_each_entry_safe_from(iface, niface, &ses->iface_list, +				    iface_head) { +			/* skip ifaces that are unusable */ +			if (!iface->is_active || +			    (is_ses_using_iface(ses, iface) && +			     !iface->rss_capable)) { +				continue; +			} + +			/* take ref before unlock */ +			kref_get(&iface->refcount); + +			spin_unlock(&ses->iface_lock); +			rc = cifs_ses_add_channel(cifs_sb, ses, iface); +			spin_lock(&ses->iface_lock); + +			if (rc) { +				cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n", +					 &iface->sockaddr, +					 rc); +				kref_put(&iface->refcount, release_iface); +				continue; +			} + +			cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n", +				 &iface->sockaddr); +			break;  		} +		spin_unlock(&ses->iface_lock); -		cifs_dbg(FYI, "successfully opened new channel on iface#%d\n", -			 i);  		left--;  		new_chan_count++;  	} -	kfree(ifaces);  	return new_chan_count - old_chan_count;  }  /* + * update the iface for the channel if necessary. + * will return 0 when iface is updated, 1 if removed, 2 otherwise + * Must be called with chan_lock held. 
+ */ +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) +{ +	unsigned int chan_index; +	struct cifs_server_iface *iface = NULL; +	struct cifs_server_iface *old_iface = NULL; +	int rc = 0; + +	spin_lock(&ses->chan_lock); +	chan_index = cifs_ses_get_chan_index(ses, server); +	if (!chan_index) { +		spin_unlock(&ses->chan_lock); +		return 0; +	} + +	if (ses->chans[chan_index].iface) { +		old_iface = ses->chans[chan_index].iface; +		if (old_iface->is_active) { +			spin_unlock(&ses->chan_lock); +			return 1; +		} +	} +	spin_unlock(&ses->chan_lock); + +	spin_lock(&ses->iface_lock); +	/* then look for a new one */ +	list_for_each_entry(iface, &ses->iface_list, iface_head) { +		if (!iface->is_active || +		    (is_ses_using_iface(ses, iface) && +		     !iface->rss_capable)) { +			continue; +		} +		kref_get(&iface->refcount); +	} + +	if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { +		rc = 1; +		iface = NULL; +		cifs_dbg(FYI, "unable to find a suitable iface\n"); +	} + +	/* now drop the ref to the current iface */ +	if (old_iface && iface) { +		kref_put(&old_iface->refcount, release_iface); +		cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", +			 &old_iface->sockaddr, +			 &iface->sockaddr); +	} else if (old_iface) { +		kref_put(&old_iface->refcount, release_iface); +		cifs_dbg(FYI, "releasing ref to iface: %pIS\n", +			 &old_iface->sockaddr); +	} else { +		WARN_ON(!iface); +		cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); +	} +	spin_unlock(&ses->iface_lock); + +	spin_lock(&ses->chan_lock); +	chan_index = cifs_ses_get_chan_index(ses, server); +	ses->chans[chan_index].iface = iface; + +	/* No iface is found. if secondary chan, drop connection */ +	if (!iface && CIFS_SERVER_IS_CHAN(server)) +		ses->chans[chan_index].server = NULL; + +	spin_unlock(&ses->chan_lock); + +	if (!iface && CIFS_SERVER_IS_CHAN(server)) +		cifs_put_tcp_session(server, false); + +	return rc; +} + +/*   * If server is a channel of ses, return the corresponding enclosing   * cifs_chan otherwise return NULL.   
*/ @@ -301,7 +393,10 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,  	/* Auth */  	ctx.domainauto = ses->domainAuto;  	ctx.domainname = ses->domainName; -	ctx.server_hostname = ses->server->hostname; + +	/* no hostname for extra channels */ +	ctx.server_hostname = ""; +  	ctx.username = ses->user_name;  	ctx.password = ses->password;  	ctx.sectype = ses->sectype; @@ -349,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,  		spin_unlock(&ses->chan_lock);  		goto out;  	} +	chan->iface = iface;  	ses->chan_count++;  	atomic_set(&ses->chan_seq, 0); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 98a76fa791c0..8802995b2d3d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -512,73 +512,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)  static int  parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,  			size_t buf_len, -			struct cifs_server_iface **iface_list, -			size_t *iface_count) +			struct cifs_ses *ses)  {  	struct network_interface_info_ioctl_rsp *p;  	struct sockaddr_in *addr4;  	struct sockaddr_in6 *addr6;  	struct iface_info_ipv4 *p4;  	struct iface_info_ipv6 *p6; -	struct cifs_server_iface *info; +	struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL; +	struct cifs_server_iface tmp_iface;  	ssize_t bytes_left;  	size_t next = 0;  	int nb_iface = 0; -	int rc = 0; - -	*iface_list = NULL; -	*iface_count = 0; - -	/* -	 * Fist pass: count and sanity check -	 */ +	int rc = 0, ret = 0;  	bytes_left = buf_len;  	p = buf; -	while (bytes_left >= sizeof(*p)) { -		nb_iface++; -		next = le32_to_cpu(p->Next); -		if (!next) { -			bytes_left -= sizeof(*p); -			break; -		} -		p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); -		bytes_left -= next; -	} - -	if (!nb_iface) { -		cifs_dbg(VFS, "%s: malformed interface info\n", __func__); -		rc = -EINVAL; -		goto out; -	} - -	/* Azure rounds the buffer size up 8, to a 16 byte boundary */ -	if ((bytes_left > 8) || p->Next) -		cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - +	spin_lock(&ses->iface_lock);  	/* -	 * Second pass: extract info to internal structure +	 * Go through iface_list and do kref_put to remove +	 * any unused ifaces. ifaces in use will be removed +	 * when the last user calls a kref_put on it  	 */ - -	*iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL); -	if (!*iface_list) { -		rc = -ENOMEM; -		goto out; +	list_for_each_entry_safe(iface, niface, &ses->iface_list, +				 iface_head) { +		iface->is_active = 0; +		kref_put(&iface->refcount, release_iface);  	} +	spin_unlock(&ses->iface_lock); -	info = *iface_list; -	bytes_left = buf_len; -	p = buf;  	while (bytes_left >= sizeof(*p)) { -		info->speed = le64_to_cpu(p->LinkSpeed); -		info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; -		info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; - -		cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); -		cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); -		cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, -			 le32_to_cpu(p->Capability)); +		memset(&tmp_iface, 0, sizeof(tmp_iface)); +		tmp_iface.speed = le64_to_cpu(p->LinkSpeed); +		tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; +		tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 
1 : 0;  		switch (p->Family) {  		/* @@ -587,7 +555,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,  		 * conversion explicit in case either one changes.  		 */  		case INTERNETWORK: -			addr4 = (struct sockaddr_in *)&info->sockaddr; +			addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr;  			p4 = (struct iface_info_ipv4 *)p->Buffer;  			addr4->sin_family = AF_INET;  			memcpy(&addr4->sin_addr, &p4->IPv4Address, 4); @@ -599,7 +567,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,  				 &addr4->sin_addr);  			break;  		case INTERNETWORKV6: -			addr6 =	(struct sockaddr_in6 *)&info->sockaddr; +			addr6 =	(struct sockaddr_in6 *)&tmp_iface.sockaddr;  			p6 = (struct iface_info_ipv6 *)p->Buffer;  			addr6->sin6_family = AF_INET6;  			memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16); @@ -619,46 +587,96 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,  			goto next_iface;  		} -		(*iface_count)++; -		info++; +		/* +		 * The iface_list is assumed to be sorted by speed. +		 * Check if the new interface exists in that list. +		 * NEVER change iface. it could be in use. +		 * Add a new one instead +		 */ +		spin_lock(&ses->iface_lock); +		iface = niface = NULL; +		list_for_each_entry_safe(iface, niface, &ses->iface_list, +					 iface_head) { +			ret = iface_cmp(iface, &tmp_iface); +			if (!ret) { +				/* just get a ref so that it doesn't get picked/freed */ +				iface->is_active = 1; +				kref_get(&iface->refcount); +				spin_unlock(&ses->iface_lock); +				goto next_iface; +			} else if (ret < 0) { +				/* all remaining ifaces are slower */ +				kref_get(&iface->refcount); +				break; +			} +		} +		spin_unlock(&ses->iface_lock); + +		/* no match. insert the entry in the list */ +		info = kmalloc(sizeof(struct cifs_server_iface), +			       GFP_KERNEL); +		if (!info) { +			rc = -ENOMEM; +			goto out; +		} +		memcpy(info, &tmp_iface, sizeof(tmp_iface)); + +		/* add this new entry to the list */ +		kref_init(&info->refcount); +		info->is_active = 1; + +		cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count); +		cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); +		cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, +			 le32_to_cpu(p->Capability)); + +		spin_lock(&ses->iface_lock); +		if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { +			list_add_tail(&info->iface_head, &iface->iface_head); +			kref_put(&iface->refcount, release_iface); +		} else +			list_add_tail(&info->iface_head, &ses->iface_list); +		spin_unlock(&ses->iface_lock); + +		ses->iface_count++; +		ses->iface_last_update = jiffies;  next_iface: +		nb_iface++;  		next = le32_to_cpu(p->Next); -		if (!next) +		if (!next) { +			bytes_left -= sizeof(*p);  			break; +		}  		p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);  		bytes_left -= next;  	} -	if (!*iface_count) { +	if (!nb_iface) { +		cifs_dbg(VFS, "%s: malformed interface info\n", __func__);  		rc = -EINVAL;  		goto out;  	} -out: -	if (rc) { -		kfree(*iface_list); -		*iface_count = 0; -		*iface_list = NULL; -	} -	return rc; -} +	/* Azure rounds the buffer size up 8, to a 16 byte boundary */ +	if ((bytes_left > 8) || p->Next) +		cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); -static int compare_iface(const void *ia, const void *ib) -{ -	const struct cifs_server_iface *a = (struct cifs_server_iface *)ia; -	const struct cifs_server_iface *b = (struct cifs_server_iface *)ib; -	return a->speed == b->speed ? 0 : (a->speed > b->speed ? 
-1 : 1); +	if (!ses->iface_count) { +		rc = -EINVAL; +		goto out; +	} + +out: +	return rc;  } -static int +int  SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)  {  	int rc;  	unsigned int ret_data_len = 0;  	struct network_interface_info_ioctl_rsp *out_buf = NULL; -	struct cifs_server_iface *iface_list; -	size_t iface_count;  	struct cifs_ses *ses = tcon->ses;  	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, @@ -674,21 +692,10 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)  		goto out;  	} -	rc = parse_server_interfaces(out_buf, ret_data_len, -				     &iface_list, &iface_count); +	rc = parse_server_interfaces(out_buf, ret_data_len, ses);  	if (rc)  		goto out; -	/* sort interfaces from fastest to slowest */ -	sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL); - -	spin_lock(&ses->iface_lock); -	kfree(ses->iface_list); -	ses->iface_list = iface_list; -	ses->iface_count = iface_count; -	ses->iface_last_update = jiffies; -	spin_unlock(&ses->iface_lock); -  out:  	kfree(out_buf);  	return rc; @@ -4260,15 +4267,15 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,  	if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {  		cinode->oplock = CIFS_CACHE_RHW_FLG;  		cifs_dbg(FYI, "Batch Oplock granted on inode %p\n", -			 &cinode->vfs_inode); +			 &cinode->netfs.inode);  	} else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {  		cinode->oplock = CIFS_CACHE_RW_FLG;  		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", -			 &cinode->vfs_inode); +			 &cinode->netfs.inode);  	} else if (oplock == SMB2_OPLOCK_LEVEL_II) {  		cinode->oplock = CIFS_CACHE_READ_FLG;  		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", -			 &cinode->vfs_inode); +			 &cinode->netfs.inode);  	} else  		cinode->oplock = 0;  } @@ -4307,7 +4314,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,  	cinode->oplock = new_oplock;  	cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, -		 &cinode->vfs_inode); +		 &cinode->netfs.inode);  }  static void diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0e8c85249579..12b4dddaedb0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -288,6 +288,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,  			mutex_unlock(&ses->session_mutex);  			rc = -EHOSTDOWN;  			goto failed; +		} else if (rc) { +			mutex_unlock(&ses->session_mutex); +			goto out;  		}  	} else {  		mutex_unlock(&ses->session_mutex); @@ -540,6 +543,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,  		      struct TCP_Server_Info *server, unsigned int *total_len)  {  	char *pneg_ctxt; +	char *hostname = NULL;  	unsigned int ctxt_len, neg_context_count;  	if (*total_len > 200) { @@ -567,16 +571,24 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,  	*total_len += ctxt_len;  	pneg_ctxt += ctxt_len; -	ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, -					server->hostname); -	*total_len += ctxt_len; -	pneg_ctxt += ctxt_len; -  	build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);  	*total_len += sizeof(struct smb2_posix_neg_context);  	pneg_ctxt += sizeof(struct smb2_posix_neg_context); -	neg_context_count = 4; +	/* +	 * secondary channels don't have the hostname field populated +	 * use the hostname field in the primary channel instead +	 */ +	hostname = CIFS_SERVER_IS_CHAN(server) ? 
+		server->primary_server->hostname : server->hostname; +	if (hostname && (hostname[0] != 0)) { +		ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, +					      hostname); +		*total_len += ctxt_len; +		pneg_ctxt += ctxt_len; +		neg_context_count = 4; +	} else /* second channels do not have a hostname */ +		neg_context_count = 3;  	if (server->compress_algorithm) {  		build_compression_ctxt((struct smb2_compression_capabilities_context *) @@ -5151,6 +5163,8 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,  	data = &info;  	size = sizeof(struct smb2_file_eof_info); +	trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof)); +  	return send_set_info(xid, tcon, persistent_fid, volatile_fid,  			pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,  			0, 1, &data, &size); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 2be5e0c8564d..6b88dc2e364f 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -121,6 +121,44 @@ DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);  DEFINE_SMB3_RW_DONE_EVENT(zero_done);  DEFINE_SMB3_RW_DONE_EVENT(falloc_done); +/* For logging successful set EOF (truncate) */ +DECLARE_EVENT_CLASS(smb3_eof_class, +	TP_PROTO(unsigned int xid, +		__u64	fid, +		__u32	tid, +		__u64	sesid, +		__u64	offset), +	TP_ARGS(xid, fid, tid, sesid, offset), +	TP_STRUCT__entry( +		__field(unsigned int, xid) +		__field(__u64, fid) +		__field(__u32, tid) +		__field(__u64, sesid) +		__field(__u64, offset) +	), +	TP_fast_assign( +		__entry->xid = xid; +		__entry->fid = fid; +		__entry->tid = tid; +		__entry->sesid = sesid; +		__entry->offset = offset; +	), +	TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx", +		__entry->xid, __entry->sesid, __entry->tid, __entry->fid, +		__entry->offset) +) + +#define DEFINE_SMB3_EOF_EVENT(name)         \ +DEFINE_EVENT(smb3_eof_class, smb3_##name,   \ +	TP_PROTO(unsigned int xid,		\ +		__u64	fid,			\ +		__u32	tid,			\ +		__u64	sesid,			\ +		__u64	offset),		\ +	TP_ARGS(xid, fid, tid, sesid, offset)) + +DEFINE_SMB3_EOF_EVENT(set_eof); +  /*   * For handle based calls other than read and write, and get/set info   */ diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 76acc3721951..c6eaf7e9ea74 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -1198,7 +1198,9 @@ static int __exfat_rename(struct inode *old_parent_inode,  		return -ENOENT;  	} -	exfat_chain_dup(&olddir, &ei->dir); +	exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu, +		EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi), +		EXFAT_I(old_parent_inode)->flags);  	dentry = ei->entry;  	ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 2c2f179b6977..43de293cef56 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -672,17 +672,14 @@ int ext2_empty_dir (struct inode * inode)  	void *page_addr = NULL;  	struct page *page = NULL;  	unsigned long i, npages = dir_pages(inode); -	int dir_has_error = 0;  	for (i = 0; i < npages; i++) {  		char *kaddr;  		ext2_dirent * de; -		page = ext2_get_page(inode, i, dir_has_error, &page_addr); +		page = ext2_get_page(inode, i, 0, &page_addr); -		if (IS_ERR(page)) { -			dir_has_error = 1; -			continue; -		} +		if (IS_ERR(page)) +			goto not_empty;  		kaddr = page_addr;  		de = (ext2_dirent *)kaddr; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 360ce3604a2d..e6b932219803 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1549,7 +1549,7 @@ static int __ext2_write_inode(struct inode *inode, 
int do_sync)  	if (IS_ERR(raw_inode))   		return -EIO; -	/* For fields not not tracking in the in-memory inode, +	/* For fields not tracking in the in-memory inode,  	 * initialise them to zero for new inodes. */  	if (ei->i_state & EXT2_STATE_NEW)  		memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3dce7d058985..84c0eb55071d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -829,7 +829,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,  	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",  		   inode->i_ino, create);  	return _ext4_get_block(inode, iblock, bh_result, -			       EXT4_GET_BLOCKS_IO_CREATE_EXT); +			       EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);  }  /* Maximum number of blocks we map for direct IO at once. */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9f12f29bc346..9e06334771a3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4104,6 +4104,15 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,  	size = size >> bsbits;  	start = start_off >> bsbits; +	/* +	 * For tiny groups (smaller than 8MB) the chosen allocation +	 * alignment may be larger than group size. Make sure the +	 * alignment does not move allocation to a different group which +	 * makes mballoc fail assertions later. +	 */ +	start = max(start, rounddown(ac->ac_o_ex.fe_logical, +			(ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); +  	/* don't cover already allocated blocks in selected range */  	if (ar->pleft && start <= ar->lleft) {  		size -= ar->lleft + 1 - start; @@ -4176,7 +4185,22 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,  	}  	rcu_read_unlock(); -	if (start + size <= ac->ac_o_ex.fe_logical && +	/* +	 * In this function "start" and "size" are normalized for better +	 * alignment and length such that we could preallocate more blocks. +	 * This normalization is done such that original request of +	 * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and +	 * "size" boundaries. +	 * (Note fe_len can be relaxed since FS block allocation API does not +	 * provide any guarantee on the number of contiguous blocks allocated since that +	 * depends upon free space left, etc). +	 * In case of inode pa, later we use the allocated blocks +	 * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated +	 * range of goal/best blocks [start, size] to put it at the +	 * ac_o_ex.fe_logical extent of this inode. +	 * (See ext4_mb_use_inode_pa() for more details) +	 */ +	if (start + size <= ac->ac_o_ex.fe_logical ||  			start > ac->ac_o_ex.fe_logical) {  		ext4_msg(ac->ac_sb, KERN_ERR,  			 "start %lu, size %lu, fe_logical %lu", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 7a5353a8cfd7..42f590518b4c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -438,7 +438,7 @@ int ext4_ext_migrate(struct inode *inode)  	/*  	 * Worst case we can touch the allocation bitmaps and a block -	 * group descriptor block.  We do need need to worry about +	 * group descriptor block.  We do need to worry about  	 * credits for modifying the quota inode.  	 
*/  	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 47d0ca4c795b..db4ba99d1ceb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1929,7 +1929,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,  			struct dx_hash_info *hinfo)  {  	unsigned blocksize = dir->i_sb->s_blocksize; -	unsigned count, continued; +	unsigned continued; +	int count;  	struct buffer_head *bh2;  	ext4_lblk_t newblock;  	u32 hash2; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 14695e2b5042..97fa7b4c645f 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -465,7 +465,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,  	/*  	 * In the first loop we prepare and mark buffers to submit. We have to  	 * mark all buffers in the page before submitting so that -	 * end_page_writeback() cannot be called from ext4_bio_end_io() when IO +	 * end_page_writeback() cannot be called from ext4_end_bio() when IO  	 * on the first buffer finishes and we are still working on submitting  	 * the second buffer.  	 */ diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 90a941d20dff..8b70a4701293 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -54,6 +54,16 @@ int ext4_resize_begin(struct super_block *sb)  		return -EPERM;  	/* +	 * If the reserved GDT blocks is non-zero, the resize_inode feature +	 * should always be set. +	 */ +	if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks && +	    !ext4_has_feature_resize_inode(sb)) { +		ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); +		return -EFSCORRUPTED; +	} + +	/*  	 * If we are not using the primary superblock/GDT copy don't resize,           * because the user tools have no way of handling this.  Probably a           * bad time to do it anyways. 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 450c918d68fc..845f2f8aee5f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -87,7 +87,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,  static int ext4_validate_options(struct fs_context *fc);  static int ext4_check_opt_consistency(struct fs_context *fc,  				      struct super_block *sb); -static int ext4_apply_options(struct fs_context *fc, struct super_block *sb); +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);  static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);  static int ext4_get_tree(struct fs_context *fc);  static int ext4_reconfigure(struct fs_context *fc); @@ -1870,31 +1870,12 @@ ext4_sb_read_encoding(const struct ext4_super_block *es)  }  #endif -static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) -{ -#ifdef CONFIG_FS_ENCRYPTION -	struct ext4_sb_info *sbi = EXT4_SB(sb); -	int err; - -	err = fscrypt_set_test_dummy_encryption(sb, arg, -						&sbi->s_dummy_enc_policy); -	if (err) { -		ext4_msg(sb, KERN_WARNING, -			 "Error while setting test dummy encryption [%d]", err); -		return err; -	} -	ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); -#endif -	return 0; -} -  #define EXT4_SPEC_JQUOTA			(1 <<  0)  #define EXT4_SPEC_JQFMT				(1 <<  1)  #define EXT4_SPEC_DATAJ				(1 <<  2)  #define EXT4_SPEC_SB_BLOCK			(1 <<  3)  #define EXT4_SPEC_JOURNAL_DEV			(1 <<  4)  #define EXT4_SPEC_JOURNAL_IOPRIO		(1 <<  5) -#define EXT4_SPEC_DUMMY_ENCRYPTION		(1 <<  6)  #define EXT4_SPEC_s_want_extra_isize		(1 <<  7)  #define EXT4_SPEC_s_max_batch_time		(1 <<  8)  #define EXT4_SPEC_s_min_batch_time		(1 <<  9) @@ -1911,7 +1892,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)  struct ext4_fs_context {  	char		*s_qf_names[EXT4_MAXQUOTAS]; -	char		*test_dummy_enc_arg; +	struct fscrypt_dummy_policy dummy_enc_policy;  	int		s_jquota_fmt;	/* Format of quota to use */  #ifdef CONFIG_EXT4_DEBUG  	int s_fc_debug_max_replay; @@ -1953,7 +1934,7 @@ static void ext4_fc_free(struct fs_context *fc)  	for (i = 0; i < EXT4_MAXQUOTAS; i++)  		kfree(ctx->s_qf_names[i]); -	kfree(ctx->test_dummy_enc_arg); +	fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);  	kfree(ctx);  } @@ -2029,6 +2010,29 @@ static int unnote_qf_name(struct fs_context *fc, int qtype)  }  #endif +static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, +					    struct ext4_fs_context *ctx) +{ +	int err; + +	if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { +		ext4_msg(NULL, KERN_WARNING, +			 "test_dummy_encryption option not supported"); +		return -EINVAL; +	} +	err = fscrypt_parse_test_dummy_encryption(param, +						  &ctx->dummy_enc_policy); +	if (err == -EINVAL) { +		ext4_msg(NULL, KERN_WARNING, +			 "Value of option \"%s\" is unrecognized", param->key); +	} else if (err == -EEXIST) { +		ext4_msg(NULL, KERN_WARNING, +			 "Conflicting test_dummy_encryption options"); +		return -EINVAL; +	} +	return err; +} +  #define EXT4_SET_CTX(name)						\  static inline void ctx_set_##name(struct ext4_fs_context *ctx,		\  				  unsigned long flag)			\ @@ -2291,29 +2295,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)  		ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;  		return 0;  	case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION -		if (param->type == fs_value_is_flag) { -			ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; -			ctx->test_dummy_enc_arg = NULL; -			return 0; -		} -		if (*param->string && -		    
!(!strcmp(param->string, "v1") || -		      !strcmp(param->string, "v2"))) { -			ext4_msg(NULL, KERN_WARNING, -				 "Value of option \"%s\" is unrecognized", -				 param->key); -			return -EINVAL; -		} -		ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; -		ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size, -						      GFP_KERNEL); -		return 0; -#else -		ext4_msg(NULL, KERN_WARNING, -			 "test_dummy_encryption option not supported"); -		return -EINVAL; -#endif +		return ext4_parse_test_dummy_encryption(param, ctx);  	case Opt_dax:  	case Opt_dax_type:  #ifdef CONFIG_FS_DAX @@ -2504,7 +2486,8 @@ parse_failed:  	if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)  		m_ctx->journal_ioprio = s_ctx->journal_ioprio; -	ret = ext4_apply_options(fc, sb); +	ext4_apply_options(fc, sb); +	ret = 0;  out_free:  	if (fc) { @@ -2673,11 +2656,11 @@ err_jquota_specified:  static int ext4_check_test_dummy_encryption(const struct fs_context *fc,  					    struct super_block *sb)  { -#ifdef CONFIG_FS_ENCRYPTION  	const struct ext4_fs_context *ctx = fc->fs_private;  	const struct ext4_sb_info *sbi = EXT4_SB(sb); +	int err; -	if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)) +	if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))  		return 0;  	if (!ext4_has_feature_encrypt(sb)) { @@ -2691,14 +2674,46 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc,  	 * needed to allow it to be set or changed during remount.  We do allow  	 * it to be specified during remount, but only if there is no change.  	 */ -	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && -	    !sbi->s_dummy_enc_policy.policy) { +	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { +		if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, +						 &ctx->dummy_enc_policy)) +			return 0;  		ext4_msg(NULL, KERN_WARNING, -			 "Can't set test_dummy_encryption on remount"); +			 "Can't set or change test_dummy_encryption on remount");  		return -EINVAL;  	} -#endif /* CONFIG_FS_ENCRYPTION */ -	return 0; +	/* Also make sure s_mount_opts didn't contain a conflicting value. */ +	if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { +		if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, +						 &ctx->dummy_enc_policy)) +			return 0; +		ext4_msg(NULL, KERN_WARNING, +			 "Conflicting test_dummy_encryption options"); +		return -EINVAL; +	} +	/* +	 * fscrypt_add_test_dummy_key() technically changes the super_block, so +	 * technically it should be delayed until ext4_apply_options() like the +	 * other changes.  But since we never get here for remounts (see above), +	 * and this is the last chance to report errors, we do it here. 
+	 */ +	err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy); +	if (err) +		ext4_msg(NULL, KERN_WARNING, +			 "Error adding test dummy encryption key [%d]", err); +	return err; +} + +static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, +					     struct super_block *sb) +{ +	if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || +	    /* if already set, it was already verified to be the same */ +	    fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) +		return; +	EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; +	memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); +	ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");  }  static int ext4_check_opt_consistency(struct fs_context *fc, @@ -2785,11 +2800,10 @@ fail_dax_change_remount:  	return ext4_check_quota_consistency(fc, sb);  } -static int ext4_apply_options(struct fs_context *fc, struct super_block *sb) +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)  {  	struct ext4_fs_context *ctx = fc->fs_private;  	struct ext4_sb_info *sbi = fc->s_fs_info; -	int ret = 0;  	sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;  	sbi->s_mount_opt |= ctx->vals_s_mount_opt; @@ -2825,11 +2839,7 @@ static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)  #endif  	ext4_apply_quota_options(fc, sb); - -	if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) -		ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg); - -	return ret; +	ext4_apply_test_dummy_encryption(ctx, sb);  } @@ -4552,9 +4562,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)  	if (err < 0)  		goto failed_mount; -	err = ext4_apply_options(fc, sb); -	if (err < 0) -		goto failed_mount; +	ext4_apply_options(fc, sb);  #if IS_ENABLED(CONFIG_UNICODE)  	if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { @@ -5302,14 +5310,6 @@ no_journal:  		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,  					  GFP_KERNEL);  	} -	/* -	 * Update the checksum after updating free space/inode -	 * counters.  Otherwise the superblock can have an incorrect -	 * checksum in the buffer cache until it is written out and -	 * e2fsprogs programs trying to open a file system immediately -	 * after it is mounted can fail. -	 */ -	ext4_superblock_csum_set(sb);  	if (!err)  		err = percpu_counter_init(&sbi->s_dirs_counter,  					  ext4_count_dirs(sb), GFP_KERNEL); @@ -5367,6 +5367,14 @@ no_journal:  	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;  	ext4_orphan_cleanup(sb, es);  	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; +	/* +	 * Update the checksum after updating free space/inode counters and +	 * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect +	 * checksum in the buffer cache until it is written out and +	 * e2fsprogs programs trying to open a file system immediately +	 * after it is mounted can fail. 
+	 */ +	ext4_superblock_csum_set(sb);  	if (needs_recovery) {  		ext4_msg(sb, KERN_INFO, "recovery complete");  		err = ext4_mark_recovery_complete(sb, es); @@ -5898,7 +5906,6 @@ static void ext4_update_super(struct super_block *sb)  static int ext4_commit_super(struct super_block *sb)  {  	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; -	int error = 0;  	if (!sbh)  		return -EINVAL; @@ -5907,6 +5914,13 @@ static int ext4_commit_super(struct super_block *sb)  	ext4_update_super(sb); +	lock_buffer(sbh); +	/* Buffer got discarded which means block device got invalidated */ +	if (!buffer_mapped(sbh)) { +		unlock_buffer(sbh); +		return -EIO; +	} +  	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {  		/*  		 * Oh, dear.  A previous attempt to write the @@ -5921,17 +5935,21 @@ static int ext4_commit_super(struct super_block *sb)  		clear_buffer_write_io_error(sbh);  		set_buffer_uptodate(sbh);  	} -	BUFFER_TRACE(sbh, "marking dirty"); -	mark_buffer_dirty(sbh); -	error = __sync_dirty_buffer(sbh, -		REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); +	get_bh(sbh); +	/* Clear potential dirty bit if it was journalled update */ +	clear_buffer_dirty(sbh); +	sbh->b_end_io = end_buffer_write_sync; +	submit_bh(REQ_OP_WRITE, +		  REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); +	wait_on_buffer(sbh);  	if (buffer_write_io_error(sbh)) {  		ext4_msg(sb, KERN_ERR, "I/O error while writing "  		       "superblock");  		clear_buffer_write_io_error(sbh);  		set_buffer_uptodate(sbh); +		return -EIO;  	} -	return error; +	return 0;  }  /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 042325349098..564e28a1aa94 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1895,11 +1895,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,  			unlock_buffer(bs->bh);  			ea_bdebug(bs->bh, "cloning"); -			s->base = kmalloc(bs->bh->b_size, GFP_NOFS); +			s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);  			error = -ENOMEM;  			if (s->base == NULL)  				goto cleanup; -			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);  			s->first = ENTRY(header(s->base)+1);  			header(s->base)->h_refcount = cpu_to_le32(1);  			s->here = ENTRY(s->base + offset); diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index be599f31d3c4..d84c5f6cc09d 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -91,8 +91,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)  	unsigned int cnt;  	struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];  	struct iostat_lat_info *io_lat = sbi->iostat_io_lat; +	unsigned long flags; -	spin_lock_bh(&sbi->iostat_lat_lock); +	spin_lock_irqsave(&sbi->iostat_lat_lock, flags);  	for (idx = 0; idx < MAX_IO_TYPE; idx++) {  		for (io = 0; io < NR_PAGE_TYPE; io++) {  			cnt = io_lat->bio_cnt[idx][io]; @@ -106,7 +107,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)  			io_lat->bio_cnt[idx][io] = 0;  		}  	} -	spin_unlock_bh(&sbi->iostat_lat_lock); +	spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);  	trace_f2fs_iostat_latency(sbi, iostat_lat);  } @@ -115,14 +116,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)  {  	unsigned long long iostat_diff[NR_IO_TYPE];  	int i; +	unsigned long flags;  	if (time_is_after_jiffies(sbi->iostat_next_period))  		return;  	/* Need double check under the lock */ -	spin_lock_bh(&sbi->iostat_lock); +	spin_lock_irqsave(&sbi->iostat_lock, flags);  	if (time_is_after_jiffies(sbi->iostat_next_period)) { -		spin_unlock_bh(&sbi->iostat_lock); +		
spin_unlock_irqrestore(&sbi->iostat_lock, flags);  		return;  	}  	sbi->iostat_next_period = jiffies + @@ -133,7 +135,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)  				sbi->prev_rw_iostat[i];  		sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];  	} -	spin_unlock_bh(&sbi->iostat_lock); +	spin_unlock_irqrestore(&sbi->iostat_lock, flags);  	trace_f2fs_iostat(sbi, iostat_diff); @@ -145,25 +147,27 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)  	struct iostat_lat_info *io_lat = sbi->iostat_io_lat;  	int i; -	spin_lock_bh(&sbi->iostat_lock); +	spin_lock_irq(&sbi->iostat_lock);  	for (i = 0; i < NR_IO_TYPE; i++) {  		sbi->rw_iostat[i] = 0;  		sbi->prev_rw_iostat[i] = 0;  	} -	spin_unlock_bh(&sbi->iostat_lock); +	spin_unlock_irq(&sbi->iostat_lock); -	spin_lock_bh(&sbi->iostat_lat_lock); +	spin_lock_irq(&sbi->iostat_lat_lock);  	memset(io_lat, 0, sizeof(struct iostat_lat_info)); -	spin_unlock_bh(&sbi->iostat_lat_lock); +	spin_unlock_irq(&sbi->iostat_lat_lock);  }  void f2fs_update_iostat(struct f2fs_sb_info *sbi,  			enum iostat_type type, unsigned long long io_bytes)  { +	unsigned long flags; +  	if (!sbi->iostat_enable)  		return; -	spin_lock_bh(&sbi->iostat_lock); +	spin_lock_irqsave(&sbi->iostat_lock, flags);  	sbi->rw_iostat[type] += io_bytes;  	if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) @@ -172,7 +176,7 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi,  	if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)  		sbi->rw_iostat[APP_READ_IO] += io_bytes; -	spin_unlock_bh(&sbi->iostat_lock); +	spin_unlock_irqrestore(&sbi->iostat_lock, flags);  	f2fs_record_iostat(sbi);  } @@ -185,6 +189,7 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,  	struct f2fs_sb_info *sbi = iostat_ctx->sbi;  	struct iostat_lat_info *io_lat = sbi->iostat_io_lat;  	int idx; +	unsigned long flags;  	if (!sbi->iostat_enable)  		return; @@ -202,12 +207,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,  			idx = WRITE_ASYNC_IO;  	} -	spin_lock_bh(&sbi->iostat_lat_lock); +	spin_lock_irqsave(&sbi->iostat_lat_lock, flags);  	io_lat->sum_lat[idx][iotype] += ts_diff;  	io_lat->bio_cnt[idx][iotype]++;  	if (ts_diff > io_lat->peak_lat[idx][iotype])  		io_lat->peak_lat[idx][iotype] = ts_diff; -	spin_unlock_bh(&sbi->iostat_lat_lock); +	spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);  }  void iostat_update_and_unbind_ctx(struct bio *bio, int rw) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index c549acb52ac4..bf00d5057abb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -89,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,  	if (test_opt(sbi, INLINE_XATTR))  		set_inode_flag(inode, FI_INLINE_XATTR); -	if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) -		set_inode_flag(inode, FI_INLINE_DATA);  	if (f2fs_may_inline_dentry(inode))  		set_inode_flag(inode, FI_INLINE_DENTRY); @@ -107,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,  	f2fs_init_extent_tree(inode, NULL); -	stat_inc_inline_xattr(inode); -	stat_inc_inline_inode(inode); -	stat_inc_inline_dir(inode); -  	F2FS_I(inode)->i_flags =  		f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -127,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,  			set_compress_context(inode);  	} +	/* Should enable inline_data after compression set */ +	if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) +		set_inode_flag(inode, 
FI_INLINE_DATA); + +	stat_inc_inline_xattr(inode); +	stat_inc_inline_inode(inode); +	stat_inc_inline_dir(inode); +  	f2fs_set_inode_flags(inode);  	trace_f2fs_new_inode(inode, 0); @@ -325,6 +327,9 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,  		if (!is_extension_exist(name, ext[i], false))  			continue; +		/* Do not use inline_data with compression */ +		stat_dec_inline_inode(inode); +		clear_inode_flag(inode, FI_INLINE_DATA);  		set_compress_context(inode);  		return;  	} diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 836c79a20afc..cf6f7fc83c08 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1450,7 +1450,9 @@ page_hit:  out_err:  	ClearPageUptodate(page);  out_put_err: -	f2fs_handle_page_eio(sbi, page->index, NODE); +	/* ENOENT comes from read_node_page which is not an error. */ +	if (err != -ENOENT) +		f2fs_handle_page_eio(sbi, page->index, NODE);  	f2fs_put_page(page, 1);  	return ERR_PTR(err);  } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a21d8f1a56d1..05221366a16d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -120,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode,  				      struct list_head *head)  {  	assert_spin_locked(&wb->list_lock); +	assert_spin_locked(&inode->i_lock);  	list_move(&inode->i_io_list, head); @@ -1365,9 +1366,9 @@ static int move_expired_inodes(struct list_head *delaying_queue,  		inode = wb_inode(delaying_queue->prev);  		if (inode_dirtied_after(inode, dirtied_before))  			break; +		spin_lock(&inode->i_lock);  		list_move(&inode->i_io_list, &tmp);  		moved++; -		spin_lock(&inode->i_lock);  		inode->i_state |= I_SYNC_QUEUED;  		spin_unlock(&inode->i_lock);  		if (sb_is_blkdev_sb(inode->i_sb)) @@ -1383,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue,  		goto out;  	} -	/* Move inodes from one superblock together */ +	/* +	 * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', +	 * we don't take inode->i_lock here because it is just a pointless overhead. +	 * Inode is already marked as I_SYNC_QUEUED so writeback list handling is +	 * fully under our control. +	 */  	while (!list_empty(&tmp)) {  		sb = wb_inode(tmp.prev)->i_sb;  		list_for_each_prev_safe(pos, node, &tmp) { @@ -1826,8 +1832,8 @@ static long writeback_sb_inodes(struct super_block *sb,  			 * We'll have another go at writing back this inode  			 * when we completed a full scan of b_io.  			 */ -			spin_unlock(&inode->i_lock);  			requeue_io(inode, wb); +			spin_unlock(&inode->i_lock);  			trace_writeback_sb_inodes_requeue(inode);  			continue;  		} @@ -2358,6 +2364,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)  {  	struct super_block *sb = inode->i_sb;  	int dirtytime = 0; +	struct bdi_writeback *wb = NULL;  	trace_writeback_mark_inode_dirty(inode, flags); @@ -2410,13 +2417,24 @@ void __mark_inode_dirty(struct inode *inode, int flags)  		inode->i_state |= flags;  		/* +		 * Grab inode's wb early because it requires dropping i_lock and we +		 * need to make sure following checks happen atomically with dirty +		 * list handling so that we don't move inodes under flush worker's +		 * hands. +		 */ +		if (!was_dirty) { +			wb = locked_inode_to_wb_and_lock_list(inode); +			spin_lock(&inode->i_lock); +		} + +		/*  		 * If the inode is queued for writeback by flush worker, just  		 * update its dirty state. Once the flush worker is done with  		 * the inode it will place it on the appropriate superblock  		 * list, based upon its state.  		 
*/  		if (inode->i_state & I_SYNC_QUEUED) -			goto out_unlock_inode; +			goto out_unlock;  		/*  		 * Only add valid (hashed) inodes to the superblock's @@ -2424,22 +2442,19 @@ void __mark_inode_dirty(struct inode *inode, int flags)  		 */  		if (!S_ISBLK(inode->i_mode)) {  			if (inode_unhashed(inode)) -				goto out_unlock_inode; +				goto out_unlock;  		}  		if (inode->i_state & I_FREEING) -			goto out_unlock_inode; +			goto out_unlock;  		/*  		 * If the inode was already on b_dirty/b_io/b_more_io, don't  		 * reposition it (that would break b_dirty time-ordering).  		 */  		if (!was_dirty) { -			struct bdi_writeback *wb;  			struct list_head *dirty_list;  			bool wakeup_bdi = false; -			wb = locked_inode_to_wb_and_lock_list(inode); -  			inode->dirtied_when = jiffies;  			if (dirtytime)  				inode->dirtied_time_when = jiffies; @@ -2453,6 +2468,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)  							       dirty_list);  			spin_unlock(&wb->list_lock); +			spin_unlock(&inode->i_lock);  			trace_writeback_dirty_inode_enqueue(inode);  			/* @@ -2467,6 +2483,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)  			return;  		}  	} +out_unlock: +	if (wb) +		spin_unlock(&wb->list_lock);  out_unlock_inode:  	spin_unlock(&inode->i_lock);  } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 62408047e8d7..02eb72351b15 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -600,41 +600,79 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)  	remove_inode_hugepages(inode, offset, LLONG_MAX);  } +static void hugetlbfs_zero_partial_page(struct hstate *h, +					struct address_space *mapping, +					loff_t start, +					loff_t end) +{ +	pgoff_t idx = start >> huge_page_shift(h); +	struct folio *folio; + +	folio = filemap_lock_folio(mapping, idx); +	if (!folio) +		return; + +	start = start & ~huge_page_mask(h); +	end = end & ~huge_page_mask(h); +	if (!end) +		end = huge_page_size(h); + +	folio_zero_segment(folio, (size_t)start, (size_t)end); + +	folio_unlock(folio); +	folio_put(folio); +} +  static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  { +	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); +	struct address_space *mapping = inode->i_mapping;  	struct hstate *h = hstate_inode(inode);  	loff_t hpage_size = huge_page_size(h);  	loff_t hole_start, hole_end;  	/* -	 * For hole punch round up the beginning offset of the hole and -	 * round down the end. +	 * hole_start and hole_end indicate the full pages within the hole.  	 */  	hole_start = round_up(offset, hpage_size);  	hole_end = round_down(offset + len, hpage_size); -	if (hole_end > hole_start) { -		struct address_space *mapping = inode->i_mapping; -		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); +	inode_lock(inode); -		inode_lock(inode); +	/* protected by i_rwsem */ +	if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { +		inode_unlock(inode); +		return -EPERM; +	} -		/* protected by i_rwsem */ -		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { -			inode_unlock(inode); -			return -EPERM; -		} +	i_mmap_lock_write(mapping); + +	/* If range starts before first full page, zero partial page. */ +	if (offset < hole_start) +		hugetlbfs_zero_partial_page(h, mapping, +				offset, min(offset + len, hole_start)); -		i_mmap_lock_write(mapping); +	/* Unmap users of full pages in the hole. 
*/ +	if (hole_end > hole_start) {  		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))  			hugetlb_vmdelete_list(&mapping->i_mmap,  					      hole_start >> PAGE_SHIFT,  					      hole_end >> PAGE_SHIFT, 0); -		i_mmap_unlock_write(mapping); -		remove_inode_hugepages(inode, hole_start, hole_end); -		inode_unlock(inode);  	} +	/* If range extends beyond last full page, zero partial page. */ +	if ((offset + len) > hole_end && (offset + len) > hole_start) +		hugetlbfs_zero_partial_page(h, mapping, +				hole_end, offset + len); + +	i_mmap_unlock_write(mapping); + +	/* Remove full pages from the file. */ +	if (hole_end > hole_start) +		remove_inode_hugepages(inode, hole_start, hole_end); + +	inode_unlock(inode); +  	return 0;  } diff --git a/fs/inode.c b/fs/inode.c index 9d9b422504d1..bd4da9c5207e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -27,7 +27,7 @@   * Inode locking rules:   *   * inode->i_lock protects: - *   inode->i_state, inode->i_hash, __iget() + *   inode->i_state, inode->i_hash, __iget(), inode->i_io_list   * Inode LRU list locks protect:   *   inode->i_sb->s_inode_lru, inode->i_lru   * inode->i_sb->s_inode_list_lock protects: diff --git a/fs/io_uring.c b/fs/io_uring.c index 3aab4182fd89..5ff2cdb425bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -298,8 +298,8 @@ struct io_buffer_list {  	/* below is for ring provided buffers */  	__u16 buf_nr_pages;  	__u16 nr_entries; -	__u32 head; -	__u32 mask; +	__u16 head; +	__u16 mask;  };  struct io_buffer { @@ -576,7 +576,6 @@ struct io_close {  	struct file			*file;  	int				fd;  	u32				file_slot; -	u32				flags;  };  struct io_timeout_data { @@ -784,12 +783,6 @@ struct io_msg {  	u32 len;  }; -struct io_nop { -	struct file			*file; -	u64				extra1; -	u64				extra2; -}; -  struct io_async_connect {  	struct sockaddr_storage		address;  }; @@ -851,6 +844,7 @@ enum {  	REQ_F_SINGLE_POLL_BIT,  	REQ_F_DOUBLE_POLL_BIT,  	REQ_F_PARTIAL_IO_BIT, +	REQ_F_CQE32_INIT_BIT,  	REQ_F_APOLL_MULTISHOT_BIT,  	/* keep async read/write and isreg together and in order */  	REQ_F_SUPPORT_NOWAIT_BIT, @@ -920,6 +914,8 @@ enum {  	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),  	/* fast poll multishot mode */  	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT), +	/* ->extra1 and ->extra2 are initialised */ +	REQ_F_CQE32_INIT	= BIT(REQ_F_CQE32_INIT_BIT),  };  struct async_poll { @@ -994,7 +990,6 @@ struct io_kiocb {  		struct io_msg		msg;  		struct io_xattr		xattr;  		struct io_socket	sock; -		struct io_nop		nop;  		struct io_uring_cmd	uring_cmd;  	}; @@ -1121,7 +1116,6 @@ static const struct io_op_def io_op_defs[] = {  	[IORING_OP_NOP] = {  		.audit_skip		= 1,  		.iopoll			= 1, -		.buffer_select		= 1,  	},  	[IORING_OP_READV] = {  		.needs_file		= 1, @@ -1729,9 +1723,16 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)  	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))  		return; -	/* don't recycle if we already did IO to this buffer */ -	if (req->flags & REQ_F_PARTIAL_IO) +	/* +	 * For legacy provided buffer mode, don't recycle if we already did +	 * IO to this buffer. For ring-mapped provided buffer mode, we should +	 * increment ring->head to explicitly monopolize the buffer to avoid +	 * multiple use. +	 */ +	if ((req->flags & REQ_F_BUFFER_SELECTED) && +	    (req->flags & REQ_F_PARTIAL_IO))  		return; +  	/*  	 * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear  	 * the flag and hence ensure that bl->head doesn't get incremented. 
@@ -1739,8 +1740,13 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)  	 */  	if (req->flags & REQ_F_BUFFER_RING) {  		if (req->buf_list) { -			req->buf_index = req->buf_list->bgid; -			req->flags &= ~REQ_F_BUFFER_RING; +			if (req->flags & REQ_F_PARTIAL_IO) { +				req->buf_list->head++; +				req->buf_list = NULL; +			} else { +				req->buf_index = req->buf_list->bgid; +				req->flags &= ~REQ_F_BUFFER_RING; +			}  		}  		return;  	} @@ -1969,7 +1975,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req)  {  	if (!(req->flags & REQ_F_INFLIGHT)) {  		req->flags |= REQ_F_INFLIGHT; -		atomic_inc(&current->io_uring->inflight_tracked); +		atomic_inc(&req->task->io_uring->inflight_tracked);  	}  } @@ -2441,94 +2447,66 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,  	return true;  } -static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, -				 s32 res, u32 cflags) +static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, +				     struct io_kiocb *req)  {  	struct io_uring_cqe *cqe; -	/* -	 * If we can't get a cq entry, userspace overflowed the -	 * submission (by quite a lot). Increment the overflow count in -	 * the ring. -	 */ -	cqe = io_get_cqe(ctx); -	if (likely(cqe)) { -		WRITE_ONCE(cqe->user_data, user_data); -		WRITE_ONCE(cqe->res, res); -		WRITE_ONCE(cqe->flags, cflags); -		return true; -	} -	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); -} +	if (!(ctx->flags & IORING_SETUP_CQE32)) { +		trace_io_uring_complete(req->ctx, req, req->cqe.user_data, +					req->cqe.res, req->cqe.flags, 0, 0); -static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, -					    struct io_kiocb *req) -{ -	struct io_uring_cqe *cqe; +		/* +		 * If we can't get a cq entry, userspace overflowed the +		 * submission (by quite a lot). Increment the overflow count in +		 * the ring. +		 */ +		cqe = io_get_cqe(ctx); +		if (likely(cqe)) { +			memcpy(cqe, &req->cqe, sizeof(*cqe)); +			return true; +		} -	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, -				req->cqe.res, req->cqe.flags, 0, 0); +		return io_cqring_event_overflow(ctx, req->cqe.user_data, +						req->cqe.res, req->cqe.flags, +						0, 0); +	} else { +		u64 extra1 = 0, extra2 = 0; -	/* -	 * If we can't get a cq entry, userspace overflowed the -	 * submission (by quite a lot). Increment the overflow count in -	 * the ring. -	 */ -	cqe = io_get_cqe(ctx); -	if (likely(cqe)) { -		memcpy(cqe, &req->cqe, sizeof(*cqe)); -		return true; -	} -	return io_cqring_event_overflow(ctx, req->cqe.user_data, -					req->cqe.res, req->cqe.flags, 0, 0); -} +		if (req->flags & REQ_F_CQE32_INIT) { +			extra1 = req->extra1; +			extra2 = req->extra2; +		} -static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx, -					      struct io_kiocb *req) -{ -	struct io_uring_cqe *cqe; -	u64 extra1 = req->extra1; -	u64 extra2 = req->extra2; +		trace_io_uring_complete(req->ctx, req, req->cqe.user_data, +					req->cqe.res, req->cqe.flags, extra1, extra2); -	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, -				req->cqe.res, req->cqe.flags, extra1, extra2); +		/* +		 * If we can't get a cq entry, userspace overflowed the +		 * submission (by quite a lot). Increment the overflow count in +		 * the ring. 
+		 */ +		cqe = io_get_cqe(ctx); +		if (likely(cqe)) { +			memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); +			WRITE_ONCE(cqe->big_cqe[0], extra1); +			WRITE_ONCE(cqe->big_cqe[1], extra2); +			return true; +		} -	/* -	 * If we can't get a cq entry, userspace overflowed the -	 * submission (by quite a lot). Increment the overflow count in -	 * the ring. -	 */ -	cqe = io_get_cqe(ctx); -	if (likely(cqe)) { -		memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); -		cqe->big_cqe[0] = extra1; -		cqe->big_cqe[1] = extra2; -		return true; +		return io_cqring_event_overflow(ctx, req->cqe.user_data, +				req->cqe.res, req->cqe.flags, +				extra1, extra2);  	} - -	return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res, -					req->cqe.flags, extra1, extra2);  } -static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) -{ -	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0); -	return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); -} - -static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags, -				u64 extra1, u64 extra2) +static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, +				     s32 res, u32 cflags)  { -	struct io_ring_ctx *ctx = req->ctx;  	struct io_uring_cqe *cqe; -	if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) -		return; -	if (req->flags & REQ_F_CQE_SKIP) -		return; - -	trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags, -				extra1, extra2); +	ctx->cq_extra++; +	trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);  	/*  	 * If we can't get a cq entry, userspace overflowed the @@ -2537,23 +2515,17 @@ static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags  	 */  	cqe = io_get_cqe(ctx);  	if (likely(cqe)) { -		WRITE_ONCE(cqe->user_data, req->cqe.user_data); +		WRITE_ONCE(cqe->user_data, user_data);  		WRITE_ONCE(cqe->res, res);  		WRITE_ONCE(cqe->flags, cflags); -		WRITE_ONCE(cqe->big_cqe[0], extra1); -		WRITE_ONCE(cqe->big_cqe[1], extra2); -		return; -	} -	io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2); -} - -static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, -				     s32 res, u32 cflags) -{ -	ctx->cq_extra++; -	trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); -	return __io_fill_cqe(ctx, user_data, res, cflags); +		if (ctx->flags & IORING_SETUP_CQE32) { +			WRITE_ONCE(cqe->big_cqe[0], 0); +			WRITE_ONCE(cqe->big_cqe[1], 0); +		} +		return true; +	} +	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);  }  static void __io_req_complete_put(struct io_kiocb *req) @@ -2590,16 +2562,11 @@ static void __io_req_complete_put(struct io_kiocb *req)  static void __io_req_complete_post(struct io_kiocb *req, s32 res,  				   u32 cflags)  { -	if (!(req->flags & REQ_F_CQE_SKIP)) -		__io_fill_cqe_req(req, res, cflags); -	__io_req_complete_put(req); -} - -static void __io_req_complete_post32(struct io_kiocb *req, s32 res, -				   u32 cflags, u64 extra1, u64 extra2) -{ -	if (!(req->flags & REQ_F_CQE_SKIP)) -		__io_fill_cqe32_req(req, res, cflags, extra1, extra2); +	if (!(req->flags & REQ_F_CQE_SKIP)) { +		req->cqe.res = res; +		req->cqe.flags = cflags; +		__io_fill_cqe_req(req->ctx, req); +	}  	__io_req_complete_put(req);  } @@ -2614,18 +2581,6 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)  	io_cqring_ev_posted(ctx);  } -static void io_req_complete_post32(struct io_kiocb *req, s32 res, -				   u32 
cflags, u64 extra1, u64 extra2) -{ -	struct io_ring_ctx *ctx = req->ctx; - -	spin_lock(&ctx->completion_lock); -	__io_req_complete_post32(req, res, cflags, extra1, extra2); -	io_commit_cqring(ctx); -	spin_unlock(&ctx->completion_lock); -	io_cqring_ev_posted(ctx); -} -  static inline void io_req_complete_state(struct io_kiocb *req, s32 res,  					 u32 cflags)  { @@ -2643,19 +2598,6 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,  		io_req_complete_post(req, res, cflags);  } -static inline void __io_req_complete32(struct io_kiocb *req, -				       unsigned int issue_flags, s32 res, -				       u32 cflags, u64 extra1, u64 extra2) -{ -	if (issue_flags & IO_URING_F_COMPLETE_DEFER) { -		io_req_complete_state(req, res, cflags); -		req->extra1 = extra1; -		req->extra2 = extra2; -	} else { -		io_req_complete_post32(req, res, cflags, extra1, extra2); -	} -} -  static inline void io_req_complete(struct io_kiocb *req, s32 res)  {  	if (res < 0) @@ -3202,12 +3144,8 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)  			struct io_kiocb *req = container_of(node, struct io_kiocb,  						    comp_list); -			if (!(req->flags & REQ_F_CQE_SKIP)) { -				if (!(ctx->flags & IORING_SETUP_CQE32)) -					__io_fill_cqe_req_filled(ctx, req); -				else -					__io_fill_cqe32_req_filled(ctx, req); -			} +			if (!(req->flags & REQ_F_CQE_SKIP)) +				__io_fill_cqe_req(ctx, req);  		}  		io_commit_cqring(ctx); @@ -3326,7 +3264,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)  		nr_events++;  		if (unlikely(req->flags & REQ_F_CQE_SKIP))  			continue; -		__io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); + +		req->cqe.flags = io_put_kbuf(req, 0); +		__io_fill_cqe_req(req->ctx, req);  	}  	if (unlikely(!nr_events)) @@ -3497,7 +3437,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)  	if (unlikely(res != req->cqe.res)) {  		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&  		    io_rw_should_reissue(req)) { -			req->flags |= REQ_F_REISSUE; +			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;  			return true;  		}  		req_set_fail(req); @@ -3547,7 +3487,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)  		kiocb_end_write(req);  	if (unlikely(res != req->cqe.res)) {  		if (res == -EAGAIN && io_rw_should_reissue(req)) { -			req->flags |= REQ_F_REISSUE; +			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;  			return;  		}  		req->cqe.res = res; @@ -3677,6 +3617,20 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	int ret;  	kiocb->ki_pos = READ_ONCE(sqe->off); +	/* used for fixed read/write too - just read unconditionally */ +	req->buf_index = READ_ONCE(sqe->buf_index); + +	if (req->opcode == IORING_OP_READ_FIXED || +	    req->opcode == IORING_OP_WRITE_FIXED) { +		struct io_ring_ctx *ctx = req->ctx; +		u16 index; + +		if (unlikely(req->buf_index >= ctx->nr_user_bufs)) +			return -EFAULT; +		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); +		req->imu = ctx->user_bufs[index]; +		io_req_set_rsrc_node(req, ctx, 0); +	}  	ioprio = READ_ONCE(sqe->ioprio);  	if (ioprio) { @@ -3689,12 +3643,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)  		kiocb->ki_ioprio = get_current_ioprio();  	} -	req->imu = NULL;  	req->rw.addr = READ_ONCE(sqe->addr);  	req->rw.len = READ_ONCE(sqe->len);  	req->rw.flags = READ_ONCE(sqe->rw_flags); -	/* used for fixed read/write too - just read unconditionally */ -	req->buf_index = READ_ONCE(sqe->buf_index);  	return 
0;  } @@ -3826,20 +3777,9 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter  static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,  			   unsigned int issue_flags)  { -	struct io_mapped_ubuf *imu = req->imu; -	u16 index, buf_index = req->buf_index; - -	if (likely(!imu)) { -		struct io_ring_ctx *ctx = req->ctx; - -		if (unlikely(buf_index >= ctx->nr_user_bufs)) -			return -EFAULT; -		io_req_set_rsrc_node(req, ctx, issue_flags); -		index = array_index_nospec(buf_index, ctx->nr_user_bufs); -		imu = READ_ONCE(ctx->user_bufs[index]); -		req->imu = imu; -	} -	return __io_import_fixed(req, rw, iter, imu); +	if (WARN_ON_ONCE(!req->imu)) +		return -EFAULT; +	return __io_import_fixed(req, rw, iter, req->imu);  }  static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -3876,19 +3816,17 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,  {  	struct io_uring_buf_ring *br = bl->buf_ring;  	struct io_uring_buf *buf; -	__u32 head = bl->head; +	__u16 head = bl->head; -	if (unlikely(smp_load_acquire(&br->tail) == head)) { -		io_ring_submit_unlock(req->ctx, issue_flags); +	if (unlikely(smp_load_acquire(&br->tail) == head))  		return NULL; -	}  	head &= bl->mask;  	if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {  		buf = &br->bufs[head];  	} else {  		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); -		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; +		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;  		buf = page_address(bl->buf_pages[index]);  		buf += off;  	} @@ -3898,7 +3836,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,  	req->buf_list = bl;  	req->buf_index = buf->bid; -	if (issue_flags & IO_URING_F_UNLOCKED) { +	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {  		/*  		 * If we came in unlocked, we have no choice but to consume the  		 * buffer here. This does mean it'll be pinned until the IO @@ -5079,10 +5017,18 @@ void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,  	req->uring_cmd.task_work_cb = task_work_cb;  	req->io_task_work.func = io_uring_cmd_work; -	io_req_task_prio_work_add(req); +	io_req_task_work_add(req);  }  EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); +static inline void io_req_set_cqe32_extra(struct io_kiocb *req, +					  u64 extra1, u64 extra2) +{ +	req->extra1 = extra1; +	req->extra2 = extra2; +	req->flags |= REQ_F_CQE32_INIT; +} +  /*   * Called by consumers of io_uring_cmd, if they originally returned   * -EIOCBQUEUED upon receiving the command. 
@@ -5093,10 +5039,10 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)  	if (ret < 0)  		req_set_fail(req); +  	if (req->ctx->flags & IORING_SETUP_CQE32) -		__io_req_complete32(req, 0, ret, 0, res2, 0); -	else -		io_req_complete(req, ret); +		io_req_set_cqe32_extra(req, res2, 0); +	io_req_complete(req, ret);  }  EXPORT_SYMBOL_GPL(io_uring_cmd_done); @@ -5258,14 +5204,6 @@ done:  static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { -	/* -	 * If the ring is setup with CQE32, relay back addr/addr -	 */ -	if (req->ctx->flags & IORING_SETUP_CQE32) { -		req->nop.extra1 = READ_ONCE(sqe->addr); -		req->nop.extra2 = READ_ONCE(sqe->addr2); -	} -  	return 0;  } @@ -5274,23 +5212,7 @@ static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)   */  static int io_nop(struct io_kiocb *req, unsigned int issue_flags)  { -	unsigned int cflags; -	void __user *buf; - -	if (req->flags & REQ_F_BUFFER_SELECT) { -		size_t len = 1; - -		buf = io_buffer_select(req, &len, issue_flags); -		if (!buf) -			return -ENOBUFS; -	} - -	cflags = io_put_kbuf(req, issue_flags); -	if (!(req->ctx->flags & IORING_SETUP_CQE32)) -		__io_req_complete(req, issue_flags, 0, cflags); -	else -		__io_req_complete32(req, issue_flags, 0, cflags, -				    req->nop.extra1, req->nop.extra2); +	__io_req_complete(req, issue_flags, 0, 0);  	return 0;  } @@ -5988,18 +5910,14 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)  static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { -	if (sqe->off || sqe->addr || sqe->len || sqe->buf_index) +	if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)  		return -EINVAL;  	if (req->flags & REQ_F_FIXED_FILE)  		return -EBADF;  	req->close.fd = READ_ONCE(sqe->fd);  	req->close.file_slot = READ_ONCE(sqe->file_index); -	req->close.flags = READ_ONCE(sqe->close_flags); -	if (req->close.flags & ~IORING_CLOSE_FD_AND_FILE_SLOT) -		return -EINVAL; -	if (!(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT) && -	    req->close.file_slot && req->close.fd) +	if (req->close.file_slot && req->close.fd)  		return -EINVAL;  	return 0; @@ -6015,8 +5933,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)  	if (req->close.file_slot) {  		ret = io_close_fixed(req, issue_flags); -		if (ret || !(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT)) -			goto err; +		goto err;  	}  	spin_lock(&files->file_lock); @@ -6160,8 +6077,6 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	if (unlikely(sqe->file_index))  		return -EINVAL; -	if (unlikely(sqe->addr2 || sqe->file_index)) -		return -EINVAL;  	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));  	sr->len = READ_ONCE(sqe->len); @@ -6398,8 +6313,6 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	if (unlikely(sqe->file_index))  		return -EINVAL; -	if (unlikely(sqe->addr2 || sqe->file_index)) -		return -EINVAL;  	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));  	sr->len = READ_ONCE(sqe->len); @@ -7037,7 +6950,8 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)  		io_req_complete_failed(req, ret);  } -static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) +static void __io_poll_execute(struct io_kiocb *req, int mask, +			      __poll_t __maybe_unused events)  {  	req->cqe.res = mask;  	/* @@ -7046,7 +6960,6 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)  	 * CPU. 
We want to avoid pulling in req->apoll->events for that  	 * case.  	 */ -	req->apoll_events = events;  	if (req->opcode == IORING_OP_POLL_ADD)  		req->io_task_work.func = io_poll_task_func;  	else @@ -7197,6 +7110,8 @@ static int __io_arm_poll_handler(struct io_kiocb *req,  	io_init_poll_iocb(poll, mask, io_poll_wake);  	poll->file = req->file; +	req->apoll_events = poll->events; +  	ipt->pt._key = mask;  	ipt->req = req;  	ipt->error = 0; @@ -7227,8 +7142,11 @@ static int __io_arm_poll_handler(struct io_kiocb *req,  	if (mask) {  		/* can't multishot if failed, just queue the event we've got */ -		if (unlikely(ipt->error || !ipt->nr_entries)) +		if (unlikely(ipt->error || !ipt->nr_entries)) {  			poll->events |= EPOLLONESHOT; +			req->apoll_events |= EPOLLONESHOT; +			ipt->error = 0; +		}  		__io_poll_execute(req, mask, poll->events);  		return 0;  	} @@ -7290,6 +7208,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)  		mask |= EPOLLEXCLUSIVE;  	if (req->flags & REQ_F_POLLED) {  		apoll = req->apoll; +		kfree(apoll->double_poll);  	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&  		   !list_empty(&ctx->apoll_cache)) {  		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, @@ -7475,7 +7394,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe  		return -EINVAL;  	io_req_set_refcount(req); -	req->apoll_events = poll->events = io_poll_parse_events(sqe, flags); +	poll->events = io_poll_parse_events(sqe, flags);  	return 0;  } @@ -7488,6 +7407,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)  	ipt.pt._qproc = io_poll_queue_proc;  	ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); +	if (!ret && ipt.error) +		req_set_fail(req);  	ret = ret ?: ipt.error;  	if (ret)  		__io_req_complete(req, issue_flags, ret, 0); @@ -8063,8 +7984,8 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,  		if (ret < 0)  			break;  		if (copy_to_user(&fds[done], &ret, sizeof(ret))) { -			ret = -EFAULT;  			__io_close_fixed(req, issue_flags, ret); +			ret = -EFAULT;  			break;  		}  	} @@ -8773,6 +8694,7 @@ static void io_queue_async(struct io_kiocb *req, int ret)  		 * Queued up for async execution, worker will release  		 * submit reference when the iocb is actually submitted.  		 */ +		io_kbuf_recycle(req, 0);  		io_queue_iowq(req, NULL);  		break;  	case IO_APOLL_OK: @@ -9788,11 +9710,19 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)  static int io_sqe_files_unregister(struct io_ring_ctx *ctx)  { +	unsigned nr = ctx->nr_user_files;  	int ret;  	if (!ctx->file_data)  		return -ENXIO; + +	/* +	 * Quiesce may unlock ->uring_lock, and while it's not held +	 * prevent new requests using the table. +	 */ +	ctx->nr_user_files = 0;  	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); +	ctx->nr_user_files = nr;  	if (!ret)  		__io_sqe_files_unregister(ctx);  	return ret; @@ -10690,12 +10620,19 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)  static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)  { +	unsigned nr = ctx->nr_user_bufs;  	int ret;  	if (!ctx->buf_data)  		return -ENXIO; +	/* +	 * Quiesce may unlock ->uring_lock, and while it's not held +	 * prevent new requests using the table. 
+	 */ +	ctx->nr_user_bufs = 0;  	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); +	ctx->nr_user_bufs = nr;  	if (!ret)  		__io_sqe_buffers_unregister(ctx);  	return ret; @@ -13002,6 +12939,10 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)  	if (!is_power_of_2(reg.ring_entries))  		return -EINVAL; +	/* cannot disambiguate full vs empty due to head/tail size */ +	if (reg.ring_entries >= 65536) +		return -EINVAL; +  	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {  		int ret = io_init_bl_list(ctx);  		if (ret) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e49bb0938376..e9c308ae475f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2114,7 +2114,7 @@ out:  /**   * jbd2_journal_try_to_free_buffers() - try to free page buffers.   * @journal: journal for operation - * @page: to try and free + * @folio: Folio to detach data from.   *   * For all the buffers on this page,   * if they are fully written out ordered data, move them onto BUF_CLEAN diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 8742d22dfd2b..42f892c5712e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -155,7 +155,7 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,  void netfs_readahead(struct readahead_control *ractl)  {  	struct netfs_io_request *rreq; -	struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host); +	struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);  	int ret;  	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); @@ -215,7 +215,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)  {  	struct address_space *mapping = folio_file_mapping(folio);  	struct netfs_io_request *rreq; -	struct netfs_i_context *ctx = netfs_i_context(mapping->host); +	struct netfs_inode *ctx = netfs_inode(mapping->host);  	int ret;  	_enter("%lx", folio_index(folio)); @@ -297,6 +297,7 @@ zero_out:  /**   * netfs_write_begin - Helper to prepare for writing + * @ctx: The netfs context   * @file: The file to read from   * @mapping: The mapping to read from   * @pos: File position at which the write will begin @@ -326,12 +327,12 @@ zero_out:   *   * This is usable whether or not caching is enabled.   */ -int netfs_write_begin(struct file *file, struct address_space *mapping, +int netfs_write_begin(struct netfs_inode *ctx, +		      struct file *file, struct address_space *mapping,  		      loff_t pos, unsigned int len, struct folio **_folio,  		      void **_fsdata)  {  	struct netfs_io_request *rreq; -	struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));  	struct folio *folio;  	unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;  	pgoff_t index = pos >> PAGE_SHIFT; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index b7b0e3d18d9e..43fac1b14e40 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -91,7 +91,7 @@ static inline void netfs_stat_d(atomic_t *stat)  /*   * Miscellaneous functions.   */ -static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx) +static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)  {  #if IS_ENABLED(CONFIG_FSCACHE)  	struct fscache_cookie *cookie = ctx->cache; diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e86107b30ba4..e17cdf53f6a7 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -18,7 +18,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,  {  	static atomic_t debug_ids;  	struct inode *inode = file ? 
file_inode(file) : mapping->host; -	struct netfs_i_context *ctx = netfs_i_context(inode); +	struct netfs_inode *ctx = netfs_inode(inode);  	struct netfs_io_request *rreq;  	int ret; @@ -75,10 +75,10 @@ static void netfs_free_request(struct work_struct *work)  	struct netfs_io_request *rreq =  		container_of(work, struct netfs_io_request, work); -	netfs_clear_subrequests(rreq, false); -	if (rreq->netfs_priv) -		rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);  	trace_netfs_rreq(rreq, netfs_rreq_trace_free); +	netfs_clear_subrequests(rreq, false); +	if (rreq->netfs_ops->free_request) +		rreq->netfs_ops->free_request(rreq);  	if (rreq->cache_resources.ops)  		rreq->cache_resources.ops->end_operation(&rreq->cache_resources);  	kfree(rreq); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c8520284dda7..c1eda73254e1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -288,6 +288,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,  		rv = NFS4_OK;  		break;  	case -ENOENT: +		set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);  		/* Embrace your forgetfulness! */  		rv = NFS4ERR_NOMATCHING_LAYOUT; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a8ecdd527662..0c4e8dd6aa96 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2124,6 +2124,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,  		}  		goto out;  	} +	file->f_mode |= FMODE_CAN_ODIRECT;  	err = nfs_finish_open(ctx, ctx->dentry, file, open_flags);  	trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 03d3a270eff4..e88f6b18445e 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -93,6 +93,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)  	nfs_file_set_open_context(filp, ctx);  	nfs_fscache_open_file(inode, filp);  	err = 0; +	filp->f_mode |= FMODE_CAN_ODIRECT;  out_put_ctx:  	put_nfs_open_context(ctx); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 68a87be3e6f9..41a9b6b58fb9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -469,6 +469,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,  		pnfs_clear_lseg_state(lseg, lseg_list);  	pnfs_clear_layoutreturn_info(lo);  	pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); +	set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);  	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&  	    !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))  		pnfs_clear_layoutreturn_waitbit(lo); @@ -1917,8 +1918,9 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)  static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)  { -	if (atomic_dec_and_test(&lo->plh_outstanding)) -		wake_up_var(&lo->plh_outstanding); +	if (atomic_dec_and_test(&lo->plh_outstanding) && +	    test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) +		wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);  }  static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) @@ -2025,11 +2027,11 @@ lookup_again:  	 * If the layout segment list is empty, but there are outstanding  	 * layoutget calls, then they might be subject to a layoutrecall.  	 
*/ -	if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) && +	if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&  	    atomic_read(&lo->plh_outstanding) != 0) {  		spin_unlock(&ino->i_lock); -		lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, -					!atomic_read(&lo->plh_outstanding))); +		lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN, +					   TASK_KILLABLE));  		if (IS_ERR(lseg))  			goto out_put_layout_hdr;  		pnfs_put_layout_hdr(lo); @@ -2152,6 +2154,12 @@ lookup_again:  		case -ERECALLCONFLICT:  		case -EAGAIN:  			break; +		case -ENODATA: +			/* The server returned NFS4ERR_LAYOUTUNAVAILABLE */ +			pnfs_layout_set_fail_bit( +				lo, pnfs_iomode_to_fail_bit(iomode)); +			lseg = NULL; +			goto out_put_layout_hdr;  		default:  			if (!nfs_error_is_fatal(PTR_ERR(lseg))) {  				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); @@ -2407,7 +2415,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)  		goto out_forget;  	} -	if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo)) +	if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && +	    !pnfs_is_first_layoutget(lo))  		goto out_forget;  	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 07f11489e4e9..f331f067691b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -105,6 +105,7 @@ enum {  	NFS_LAYOUT_FIRST_LAYOUTGET,	/* Serialize first layoutget */  	NFS_LAYOUT_INODE_FREEING,	/* The inode is being freed */  	NFS_LAYOUT_HASHED,		/* The layout visible */ +	NFS_LAYOUT_DRAIN,  };  enum layoutdriver_policy_flags { diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index f172412447f5..9cb2d590c036 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -309,11 +309,12 @@ nfsd_file_put(struct nfsd_file *nf)  	if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {  		nfsd_file_flush(nf);  		nfsd_file_put_noref(nf); -	} else { +	} else if (nf->nf_file) {  		nfsd_file_put_noref(nf); -		if (nf->nf_file) -			nfsd_file_schedule_laundrette(); -	} +		nfsd_file_schedule_laundrette(); +	} else +		nfsd_file_put_noref(nf); +  	if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)  		nfsd_file_gc();  } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a74aef99bd3d..09d1307959d0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -79,6 +79,7 @@  #include <linux/capability.h>  #include <linux/quotaops.h>  #include <linux/blkdev.h> +#include <linux/sched/mm.h>  #include "../internal.h" /* ugh */  #include <linux/uaccess.h> @@ -425,9 +426,11 @@ EXPORT_SYMBOL(mark_info_dirty);  int dquot_acquire(struct dquot *dquot)  {  	int ret = 0, ret2 = 0; +	unsigned int memalloc;  	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);  	mutex_lock(&dquot->dq_lock); +	memalloc = memalloc_nofs_save();  	if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {  		ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);  		if (ret < 0) @@ -458,6 +461,7 @@ int dquot_acquire(struct dquot *dquot)  	smp_mb__before_atomic();  	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);  out_iolock: +	memalloc_nofs_restore(memalloc);  	mutex_unlock(&dquot->dq_lock);  	return ret;  } @@ -469,9 +473,11 @@ EXPORT_SYMBOL(dquot_acquire);  int dquot_commit(struct dquot *dquot)  {  	int ret = 0; +	unsigned int memalloc;  	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);  	mutex_lock(&dquot->dq_lock); +	memalloc = memalloc_nofs_save();  	if (!clear_dquot_dirty(dquot))  		goto out_lock;  	/* Inactive dquot can be only if there was error during read/init @@ -481,6 +487,7 @@ int 
dquot_commit(struct dquot *dquot)  	else  		ret = -EIO;  out_lock: +	memalloc_nofs_restore(memalloc);  	mutex_unlock(&dquot->dq_lock);  	return ret;  } @@ -492,9 +499,11 @@ EXPORT_SYMBOL(dquot_commit);  int dquot_release(struct dquot *dquot)  {  	int ret = 0, ret2 = 0; +	unsigned int memalloc;  	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);  	mutex_lock(&dquot->dq_lock); +	memalloc = memalloc_nofs_save();  	/* Check whether we are not racing with some other dqget() */  	if (dquot_is_busy(dquot))  		goto out_dqlock; @@ -510,6 +519,7 @@ int dquot_release(struct dquot *dquot)  	}  	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);  out_dqlock: +	memalloc_nofs_restore(memalloc);  	mutex_unlock(&dquot->dq_lock);  	return ret;  } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index de7252715b12..81d26abf486f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -553,7 +553,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)   *   * Only one instances directory is allowed.   * - * The instances directory is special as it allows for mkdir and rmdir to + * The instances directory is special as it allows for mkdir and rmdir   * to be done by userspace. When a mkdir or rmdir is performed, the inode   * locks are released and the methods passed in (@mkdir and @rmdir) are   * called without locks and with the name of the directory being created diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 836ab1b8ed7b..1824f61621a2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -997,9 +997,11 @@ xfs_attr_set(  	/*  	 * We have no control over the attribute names that userspace passes us  	 * to remove, so we have to allow the name lookup prior to attribute -	 * removal to fail as well. +	 * removal to fail as well.  Preserve the logged flag, since we need +	 * to pass that through to the logging code.  	 */ -	args->op_flags = XFS_DA_OP_OKNOENT; +	args->op_flags = XFS_DA_OP_OKNOENT | +					(args->op_flags & XFS_DA_OP_LOGGED);  	if (args->value) {  		XFS_STATS_INC(mp, xs_attr_set); @@ -1439,12 +1441,11 @@ static int  xfs_attr_node_try_addname(  	struct xfs_attr_intent		*attr)  { -	struct xfs_da_args		*args = attr->xattri_da_args;  	struct xfs_da_state		*state = attr->xattri_da_state;  	struct xfs_da_state_blk		*blk;  	int				error; -	trace_xfs_attr_node_addname(args); +	trace_xfs_attr_node_addname(state->args);  	blk = &state->path.blk[state->path.active-1];  	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index e329da3e7afa..b4a2fc77017e 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -28,16 +28,6 @@ struct xfs_attr_list_context;   */  #define	ATTR_MAX_VALUELEN	(64*1024)	/* max length of a value */ -static inline bool xfs_has_larp(struct xfs_mount *mp) -{ -#ifdef DEBUG -	/* Logged xattrs require a V5 super for log_incompat */ -	return xfs_has_crc(mp) && xfs_globals.larp; -#else -	return false; -#endif -} -  /*   * Kernel-internal version of the attrlist cursor.   
*/ @@ -624,7 +614,7 @@ static inline enum xfs_delattr_state  xfs_attr_init_replace_state(struct xfs_da_args *args)  {  	args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; -	if (xfs_has_larp(args->dp->i_mount)) +	if (args->op_flags & XFS_DA_OP_LOGGED)  		return xfs_attr_init_remove_state(args);  	return xfs_attr_init_add_state(args);  } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 15a990409463..37e7c33f6283 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1530,7 +1530,7 @@ xfs_attr3_leaf_add_work(  	if (tmp)  		entry->flags |= XFS_ATTR_LOCAL;  	if (args->op_flags & XFS_DA_OP_REPLACE) { -		if (!xfs_has_larp(mp)) +		if (!(args->op_flags & XFS_DA_OP_LOGGED))  			entry->flags |= XFS_ATTR_INCOMPLETE;  		if ((args->blkno2 == args->blkno) &&  		    (args->index2 <= args->index)) { diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index d33b7686a0b3..ffa3df5b2893 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -92,6 +92,7 @@ typedef struct xfs_da_args {  #define XFS_DA_OP_NOTIME	(1u << 5) /* don't update inode timestamps */  #define XFS_DA_OP_REMOVE	(1u << 6) /* this is a remove operation */  #define XFS_DA_OP_RECOVERY	(1u << 7) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED	(1u << 8) /* Use intent items to track op */  #define XFS_DA_OP_FLAGS \  	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \ @@ -101,7 +102,8 @@ typedef struct xfs_da_args {  	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }, \  	{ XFS_DA_OP_NOTIME,	"NOTIME" }, \  	{ XFS_DA_OP_REMOVE,	"REMOVE" }, \ -	{ XFS_DA_OP_RECOVERY,	"RECOVERY" } +	{ XFS_DA_OP_RECOVERY,	"RECOVERY" }, \ +	{ XFS_DA_OP_LOGGED,	"LOGGED" }  /*   * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 4a28c2d77070..135d44133477 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -413,18 +413,20 @@ xfs_attr_create_intent(  	struct xfs_mount		*mp = tp->t_mountp;  	struct xfs_attri_log_item	*attrip;  	struct xfs_attr_intent		*attr; +	struct xfs_da_args		*args;  	ASSERT(count == 1); -	if (!xfs_sb_version_haslogxattrs(&mp->m_sb)) -		return NULL; -  	/*  	 * Each attr item only performs one attribute operation at a time, so  	 * this is a list of one  	 */  	attr = list_first_entry_or_null(items, struct xfs_attr_intent,  			xattri_list); +	args = attr->xattri_da_args; + +	if (!(args->op_flags & XFS_DA_OP_LOGGED)) +		return NULL;  	/*  	 * Create a buffer to store the attribute name and value.  This buffer @@ -432,8 +434,6 @@ xfs_attr_create_intent(  	 * and the lower level xattr log items.  	 */  	if (!attr->xattri_nameval) { -		struct xfs_da_args	*args = attr->xattri_da_args; -  		/*  		 * Transfer our reference to the name/value buffer to the  		 * deferred work state structure. 
@@ -617,7 +617,10 @@ xfs_attri_item_recover(  	args->namelen = nv->name.i_len;  	args->hashval = xfs_da_hashname(args->name, args->namelen);  	args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; -	args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; +	args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | +			 XFS_DA_OP_LOGGED; + +	ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));  	switch (attr->xattri_op_flags) {  	case XFS_ATTRI_OP_FLAGS_SET: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5a364a7d58fd..0d67ff8a8961 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1096,7 +1096,8 @@ xfs_flags2diflags2(  {  	uint64_t		di_flags2 =  		(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | -				   XFS_DIFLAG2_BIGTIME)); +				   XFS_DIFLAG2_BIGTIME | +				   XFS_DIFLAG2_NREXT64));  	if (xflags & FS_XFLAG_DAX)  		di_flags2 |= XFS_DIFLAG2_DAX; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 35e13e125ec6..c325a28b89a8 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -68,6 +68,18 @@ xfs_attr_rele_log_assist(  	xlog_drop_incompat_feat(mp->m_log);  } +static inline bool +xfs_attr_want_log_assist( +	struct xfs_mount	*mp) +{ +#ifdef DEBUG +	/* Logged xattrs require a V5 super for log_incompat */ +	return xfs_has_crc(mp) && xfs_globals.larp; +#else +	return false; +#endif +} +  /*   * Set or remove an xattr, having grabbed the appropriate logging resources   * prior to calling libxfs. @@ -80,11 +92,14 @@ xfs_attr_change(  	bool			use_logging = false;  	int			error; -	if (xfs_has_larp(mp)) { +	ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED)); + +	if (xfs_attr_want_log_assist(mp)) {  		error = xfs_attr_grab_log_assist(mp);  		if (error)  			return error; +		args->op_flags |= XFS_DA_OP_LOGGED;  		use_logging = true;  	} diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index bcb21aea990a..053299758deb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -110,15 +110,51 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)  	}  } -static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, -			      unsigned int flags, struct iomap *iomap, -			      struct iomap *srcmap) +static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, +				   loff_t length, unsigned int flags, +				   struct iomap *iomap, struct iomap *srcmap)  {  	struct zonefs_inode_info *zi = ZONEFS_I(inode);  	struct super_block *sb = inode->i_sb;  	loff_t isize; -	/* All I/Os should always be within the file maximum size */ +	/* +	 * All blocks are always mapped below EOF. If reading past EOF, +	 * act as if there is a hole up to the file maximum size. 
+	 */ +	mutex_lock(&zi->i_truncate_mutex); +	iomap->bdev = inode->i_sb->s_bdev; +	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); +	isize = i_size_read(inode); +	if (iomap->offset >= isize) { +		iomap->type = IOMAP_HOLE; +		iomap->addr = IOMAP_NULL_ADDR; +		iomap->length = length; +	} else { +		iomap->type = IOMAP_MAPPED; +		iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; +		iomap->length = isize - iomap->offset; +	} +	mutex_unlock(&zi->i_truncate_mutex); + +	trace_zonefs_iomap_begin(inode, iomap); + +	return 0; +} + +static const struct iomap_ops zonefs_read_iomap_ops = { +	.iomap_begin	= zonefs_read_iomap_begin, +}; + +static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, +				    loff_t length, unsigned int flags, +				    struct iomap *iomap, struct iomap *srcmap) +{ +	struct zonefs_inode_info *zi = ZONEFS_I(inode); +	struct super_block *sb = inode->i_sb; +	loff_t isize; + +	/* All write I/Os should always be within the file maximum size */  	if (WARN_ON_ONCE(offset + length > zi->i_max_size))  		return -EIO; @@ -128,7 +164,7 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,  	 * operation.  	 */  	if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && -			 (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT))) +			 !(flags & IOMAP_DIRECT)))  		return -EIO;  	/* @@ -137,47 +173,44 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,  	 * write pointer) and unwriten beyond.  	 */  	mutex_lock(&zi->i_truncate_mutex); +	iomap->bdev = inode->i_sb->s_bdev; +	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); +	iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;  	isize = i_size_read(inode); -	if (offset >= isize) +	if (iomap->offset >= isize) {  		iomap->type = IOMAP_UNWRITTEN; -	else +		iomap->length = zi->i_max_size - iomap->offset; +	} else {  		iomap->type = IOMAP_MAPPED; -	if (flags & IOMAP_WRITE) -		length = zi->i_max_size - offset; -	else -		length = min(length, isize - offset); +		iomap->length = isize - iomap->offset; +	}  	mutex_unlock(&zi->i_truncate_mutex); -	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); -	iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset; -	iomap->bdev = inode->i_sb->s_bdev; -	iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; -  	trace_zonefs_iomap_begin(inode, iomap);  	return 0;  } -static const struct iomap_ops zonefs_iomap_ops = { -	.iomap_begin	= zonefs_iomap_begin, +static const struct iomap_ops zonefs_write_iomap_ops = { +	.iomap_begin	= zonefs_write_iomap_begin,  };  static int zonefs_read_folio(struct file *unused, struct folio *folio)  { -	return iomap_read_folio(folio, &zonefs_iomap_ops); +	return iomap_read_folio(folio, &zonefs_read_iomap_ops);  }  static void zonefs_readahead(struct readahead_control *rac)  { -	iomap_readahead(rac, &zonefs_iomap_ops); +	iomap_readahead(rac, &zonefs_read_iomap_ops);  }  /*   * Map blocks for page writeback. This is used only on conventional zone files,   * which implies that the page range can only be within the fixed inode size.   
*/ -static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc, -			     struct inode *inode, loff_t offset) +static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, +				   struct inode *inode, loff_t offset)  {  	struct zonefs_inode_info *zi = ZONEFS_I(inode); @@ -191,12 +224,12 @@ static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,  	    offset < wpc->iomap.offset + wpc->iomap.length)  		return 0; -	return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset, -				  IOMAP_WRITE, &wpc->iomap, NULL); +	return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, +					IOMAP_WRITE, &wpc->iomap, NULL);  }  static const struct iomap_writeback_ops zonefs_writeback_ops = { -	.map_blocks		= zonefs_map_blocks, +	.map_blocks		= zonefs_write_map_blocks,  };  static int zonefs_writepage(struct page *page, struct writeback_control *wbc) @@ -226,7 +259,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis,  		return -EINVAL;  	} -	return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops); +	return iomap_swapfile_activate(sis, swap_file, span, +				       &zonefs_read_iomap_ops);  }  static const struct address_space_operations zonefs_file_aops = { @@ -647,7 +681,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)  	/* Serialize against truncates */  	filemap_invalidate_lock_shared(inode->i_mapping); -	ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); +	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);  	filemap_invalidate_unlock_shared(inode->i_mapping);  	sb_end_pagefault(inode->i_sb); @@ -899,7 +933,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)  	if (append)  		ret = zonefs_file_dio_append(iocb, from);  	else -		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, +		ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,  				   &zonefs_write_dio_ops, 0, NULL, 0);  	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&  	    (ret > 0 || ret == -EIOCBQUEUED)) { @@ -948,7 +982,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,  	if (ret <= 0)  		goto inode_unlock; -	ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops); +	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);  	if (ret > 0)  		iocb->ki_pos += ret;  	else if (ret == -EIO) @@ -1041,7 +1075,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)  			goto inode_unlock;  		}  		file_accessed(iocb->ki_filp); -		ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, +		ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,  				   &zonefs_read_dio_ops, 0, NULL, 0);  	} else {  		ret = generic_file_read_iter(iocb, to); @@ -1085,7 +1119,8 @@ static int zonefs_seq_file_write_open(struct inode *inode)  		if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { -			if (wro > sbi->s_max_wro_seq_files) { +			if (sbi->s_max_wro_seq_files +			    && wro > sbi->s_max_wro_seq_files) {  				atomic_dec(&sbi->s_wro_seq_files);  				ret = -EBUSY;  				goto unlock; @@ -1760,12 +1795,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)  	atomic_set(&sbi->s_wro_seq_files, 0);  	sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); -	if (!sbi->s_max_wro_seq_files && -	    sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { -		zonefs_info(sb, "No open zones limit. 
Ignoring explicit_open mount option\n"); -		sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; -	} -  	atomic_set(&sbi->s_active_seq_files, 0);  	sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev); @@ -1790,6 +1819,14 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)  	zonefs_info(sb, "Mounting %u zones",  		    blkdev_nr_zones(sb->s_bdev->bd_disk)); +	if (!sbi->s_max_wro_seq_files && +	    !sbi->s_max_active_seq_files && +	    sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { +		zonefs_info(sb, +			"No open and active zone limits. Ignoring explicit_open mount option\n"); +		sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; +	} +  	/* Create root directory inode */  	ret = -ENOMEM;  	inode = new_inode(sb);
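
The fs/netfs hunks above (buffered_read.c, internal.h, objects.c) rename struct netfs_i_context to struct netfs_inode and fetch it with netfs_inode(mapping->host): a filesystem embeds this structure, which itself carries the VFS inode, inside its own inode record and recovers the container with container_of(). What follows is a minimal userspace sketch of that embedding pattern; the structure and field names (my_netfs_inode, my_fs_inode, cache_cookie, MY_FS_I) are invented for illustration and are not the kernel definitions.

    #include <stddef.h>
    #include <stdio.h>

    /* Minimal container_of(): recover the enclosing struct from a member pointer. */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct vfs_inode {                      /* stand-in for struct inode */
            long long i_size;
    };

    struct my_netfs_inode {                 /* stand-in for struct netfs_inode */
            struct vfs_inode inode;         /* the VFS inode, embedded first */
            void *cache_cookie;             /* stand-in for the fscache cookie */
    };

    struct my_fs_inode {                    /* stand-in for a filesystem's own inode record */
            struct my_netfs_inode netfs;    /* netfs context and VFS inode */
            unsigned int cache_validity;
    };

    /* What an accessor in the netfs_inode() style boils down to. */
    static struct my_fs_inode *MY_FS_I(struct vfs_inode *inode)
    {
            return container_of(inode, struct my_fs_inode, netfs.inode);
    }

    int main(void)
    {
            struct my_fs_inode fi = {
                    .netfs.inode.i_size = 4096,
                    .cache_validity = 1,
            };
            struct vfs_inode *inode = &fi.netfs.inode;  /* what the VFS passes around */

            printf("size=%lld validity=%u\n",
                   inode->i_size, MY_FS_I(inode)->cache_validity);
            return 0;
    }

Compiled as plain C, it reads i_size through the VFS-style pointer and recovers the per-filesystem state from the container, which is all the converted accessors in the hunks above do.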

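The io_sqe_files_unregister() and io_sqe_buffers_unregister() hunks add the same trick in both places: io_rsrc_ref_quiesce() can drop ->uring_lock while it waits, so the registered-table size is published as zero for that window and restored afterwards, keeping new requests away from a table that is about to be torn down. Below is a toy, single-file C sketch of that shape; toy_ctx, toy_quiesce() and the other names are invented, not io_uring APIs, and in the real code the table is freed by __io_sqe_files_unregister() once the quiesce succeeds.

    #include <pthread.h>
    #include <stdio.h>

    /* Toy stand-in for the ring context: a lock, a table, and its published size. */
    struct toy_ctx {
            pthread_mutex_t lock;           /* stand-in for ->uring_lock */
            unsigned int nr_user_files;     /* lookups only touch files[] if this is non-zero */
            int files[4];
    };

    /* Stand-in for io_rsrc_ref_quiesce(): may drop and retake the lock while waiting. */
    static int toy_quiesce(struct toy_ctx *ctx)
    {
            pthread_mutex_unlock(&ctx->lock);
            /* ... the real code can sleep here waiting for outstanding references ... */
            pthread_mutex_lock(&ctx->lock);
            return 0;
    }

    /* Mirrors the shape of the patched unregister path. */
    static int toy_files_unregister(struct toy_ctx *ctx)
    {
            unsigned int nr = ctx->nr_user_files;
            int ret;

            /*
             * The quiesce may unlock ctx->lock; publish a zero table size first
             * so anything that runs in that window sees "no registered files"
             * rather than a table that is about to be freed.
             */
            ctx->nr_user_files = 0;
            ret = toy_quiesce(ctx);
            ctx->nr_user_files = nr;
            if (!ret)
                    ctx->nr_user_files = 0; /* the real code frees the table here */
            return ret;
    }

    int main(void)
    {
            struct toy_ctx ctx = {
                    .lock = PTHREAD_MUTEX_INITIALIZER,
                    .nr_user_files = 4,
            };

            pthread_mutex_lock(&ctx.lock);
            printf("unregister: %d, nr_user_files now %u\n",
                   toy_files_unregister(&ctx), ctx.nr_user_files);
            pthread_mutex_unlock(&ctx.lock);
            return 0;
    }

Built with cc -pthread, the example only exercises the lock/unlock sequence; the point is the window marked in the comment, where the table size must already read as zero.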