diff options
Diffstat (limited to 'fs')
113 files changed, 3276 insertions, 2603 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 3a9c4517265f..61a51b90600d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -468,7 +468,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache == CACHE_FSCACHE) { rc = v9fs_cache_session_get_cookie(v9ses, dev_name); if (rc < 0) goto err_clnt; diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 97599edbc300..6f46d7e4c750 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -279,8 +279,6 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - BUG_ON(!v9inode->writeback_fid); - /* Prefetch area to be written into the cache if we're caching this * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 59b0e8948f78..3d74b04fe0de 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -197,7 +197,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) /** - * v9fs_dir_release - close a directory + * v9fs_dir_release - called on a close of a file or directory * @inode: inode of the directory * @filp: file pointer to a directory * @@ -209,6 +209,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) struct p9_fid *fid; __le32 version; loff_t i_size; + int retval = 0; fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", @@ -217,7 +218,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) spin_lock(&inode->i_lock); hlist_del(&fid->ilist); spin_unlock(&inode->i_lock); - p9_fid_put(fid); + retval = p9_fid_put(fid); } if ((filp->f_mode & FMODE_WRITE)) { @@ -228,7 +229,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) } else { fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL); } - return 0; + return retval; } const struct file_operations v9fs_dir_operations = { diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index b6ba22975781..44c15eb2b908 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -74,8 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) } mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && - !v9inode->writeback_fid && + if ((v9ses->cache) && !v9inode->writeback_fid && ((file->f_flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -93,9 +92,11 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode->writeback_fid = (void *) writeback_fid; } mutex_unlock(&v9inode->v_mutex); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache == CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); +#endif v9fs_open_fid_add(inode, &fid); return 0; out_error: diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4344e7a7865f..1d523bec0a94 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -843,8 +843,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, inode = d_inode(dentry); v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && - !v9inode->writeback_fid && + if ((v9ses->cache) && !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 3bed3eb3a0e2..331ed60d8fcb 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -316,8 +316,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && - !v9inode->writeback_fid && + if ((v9ses->cache) && !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -340,9 +339,11 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) goto out; file->private_data = ofid; - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache == CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); +#endif v9fs_open_fid_add(inode, &ofid); file->f_mode |= FMODE_CREATED; out: diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5dcc62e678c4..f4d8bf7dec88 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2098,6 +2098,9 @@ static long ceph_fallocate(struct file *file, int mode, loff_t endoff = 0; loff_t size; + dout("%s %p %llx.%llx mode %x, offset %llu length %llu\n", __func__, + inode, ceph_vinop(inode), mode, offset, length); + if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; @@ -2132,6 +2135,10 @@ static long ceph_fallocate(struct file *file, int mode, if (ret < 0) goto unlock; + ret = file_modified(file); + if (ret) + goto put_caps; + filemap_invalidate_lock(inode->i_mapping); ceph_fscache_invalidate(inode, false); ceph_zero_pagecache_range(inode, offset, length); @@ -2147,6 +2154,7 @@ static long ceph_fallocate(struct file *file, int mode, } filemap_invalidate_unlock(inode->i_mapping); +put_caps: ceph_put_cap_refs(ci, got); unlock: inode_unlock(inode); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b7a36ebd0f2f..e2eff66eefab 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -667,11 +667,21 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, int match_target_ip(struct TCP_Server_Info *server, const char *share, size_t share_len, bool *result); - -int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, - struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *dfs_link_path); +int cifs_inval_name_dfs_link_error(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + bool *islink); +#else +static inline int cifs_inval_name_dfs_link_error(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + bool *islink) +{ + *islink = false; + return 0; +} #endif static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) @@ -684,5 +694,6 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon); void cifs_put_tcon_super(struct super_block *sb); +int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a24e4ddf8043..a43c78396dd8 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -72,7 +72,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) struct cifs_ses *ses; struct TCP_Server_Info *server; struct nls_table *nls_codepage; - int retries; /* * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for @@ -102,45 +101,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) } spin_unlock(&tcon->tc_lock); - retries = server->nr_targets; - - /* - * Give demultiplex thread up to 10 seconds to each target available for - * reconnect -- should be greater than cifs socket timeout which is 7 - * seconds. - */ - while (server->tcpStatus == CifsNeedReconnect) { - rc = wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), - 10 * HZ); - if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", - __func__); - return -ERESTARTSYS; - } - - /* are we still trying to reconnect? */ - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - break; - } - spin_unlock(&server->srv_lock); - - if (retries && --retries) - continue; - - /* - * on "soft" mounts we wait once. Hard mounts keep - * retrying until process is killed or server comes - * back on-line - */ - if (!tcon->retry) { - cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); - return -EHOSTDOWN; - } - retries = server->nr_targets; - } + rc = cifs_wait_for_server_reconnect(server, tcon->retry); + if (rc) + return rc; spin_lock(&ses->chan_lock); if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ec020d860be3..5233f14f0636 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1294,7 +1294,8 @@ cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs) case AF_INET6: { struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; - return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); + return (ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr) + && saddr6->sin6_scope_id == vaddr6->sin6_scope_id); } default: WARN_ON(1); @@ -1343,32 +1344,8 @@ match_port(struct TCP_Server_Info *server, struct sockaddr *addr) static bool match_server_address(struct TCP_Server_Info *server, struct sockaddr *addr) { - switch (addr->sa_family) { - case AF_INET: { - struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; - struct sockaddr_in *srv_addr4 = - (struct sockaddr_in *)&server->dstaddr; - - if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr) - return false; - break; - } - case AF_INET6: { - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; - struct sockaddr_in6 *srv_addr6 = - (struct sockaddr_in6 *)&server->dstaddr; - - if (!ipv6_addr_equal(&addr6->sin6_addr, - &srv_addr6->sin6_addr)) - return false; - if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id) - return false; - break; - } - default: - WARN_ON(1); - return false; /* don't expect to be here */ - } + if (!cifs_match_ipaddr(addr, (struct sockaddr *)&server->dstaddr)) + return false; return true; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ebfcaae8c437..4d4a2d82636d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -52,6 +52,8 @@ static void cifs_undirty_folios(struct inode *inode, loff_t start, unsigned int end = (start + len - 1) / PAGE_SIZE; xas_for_each_marked(&xas, folio, end, PAGECACHE_TAG_DIRTY) { + if (xas_retry(&xas, folio)) + continue; xas_pause(&xas); rcu_read_unlock(); folio_lock(folio); @@ -81,6 +83,8 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len end = (start + len - 1) / PAGE_SIZE; xas_for_each(&xas, folio, end) { + if (xas_retry(&xas, folio)) + continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", len, start, folio_index(folio), end); @@ -112,6 +116,8 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len end = (start + len - 1) / PAGE_SIZE; xas_for_each(&xas, folio, end) { + if (xas_retry(&xas, folio)) + continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", len, start, folio_index(folio), end); @@ -2839,6 +2845,7 @@ err_xid: free_xid(xid); if (rc == 0) { wbc->nr_to_write = count; + rc = len; } else if (is_retryable_error(rc)) { cifs_pages_write_redirty(inode, start, len); } else { @@ -3605,7 +3612,7 @@ static ssize_t __cifs_writev( ctx->nr_pinned_pages = rc; ctx->bv = (void *)ctx->iter.bvec; - ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter); + ctx->bv_need_unpin = iov_iter_extract_will_pin(from); } else if ((iov_iter_is_bvec(from) || iov_iter_is_kvec(from)) && !is_sync_kiocb(iocb)) { /* @@ -4141,7 +4148,7 @@ static ssize_t __cifs_readv( ctx->nr_pinned_pages = rc; ctx->bv = (void *)ctx->iter.bvec; - ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter); + ctx->bv_need_unpin = iov_iter_extract_will_pin(to); ctx->should_dirty = true; } else if ((iov_iter_is_bvec(to) || iov_iter_is_kvec(to)) && !is_sync_kiocb(iocb)) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2905734eb289..a0d286ee723d 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -21,6 +21,7 @@ #include "cifsfs.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dns_resolve.h" +#include "dfs_cache.h" #endif #include "fs_context.h" #include "cached_dir.h" @@ -1198,4 +1199,114 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; return 0; } + +/* + * Handle weird Windows SMB server behaviour. It responds with + * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request for + * "\<server>\<dfsname>\<linkpath>" DFS reference, where <dfsname> contains + * non-ASCII unicode symbols. + */ +int cifs_inval_name_dfs_link_error(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + bool *islink) +{ + struct cifs_ses *ses = tcon->ses; + size_t len; + char *path; + char *ref_path; + + *islink = false; + + /* + * Fast path - skip check when @full_path doesn't have a prefix path to + * look up or tcon is not DFS. + */ + if (strlen(full_path) < 2 || !cifs_sb || + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) || + !is_tcon_dfs(tcon) || !ses->server->origin_fullpath) + return 0; + + /* + * Slow path - tcon is DFS and @full_path has prefix path, so attempt + * to get a referral to figure out whether it is an DFS link. + */ + len = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1) + strlen(full_path) + 1; + path = kmalloc(len, GFP_KERNEL); + if (!path) + return -ENOMEM; + + scnprintf(path, len, "%s%s", tcon->tree_name, full_path); + ref_path = dfs_cache_canonical_path(path + 1, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + kfree(path); + + if (IS_ERR(ref_path)) { + if (PTR_ERR(ref_path) != -EINVAL) + return PTR_ERR(ref_path); + } else { + struct dfs_info3_param *refs = NULL; + int num_refs = 0; + + /* + * XXX: we are not using dfs_cache_find() here because we might + * end filling all the DFS cache and thus potentially + * removing cached DFS targets that the client would eventually + * need during failover. + */ + if (ses->server->ops->get_dfs_refer && + !ses->server->ops->get_dfs_refer(xid, ses, ref_path, &refs, + &num_refs, cifs_sb->local_nls, + cifs_remap(cifs_sb))) + *islink = refs[0].server_type == DFS_TYPE_LINK; + free_dfs_info_array(refs, num_refs); + kfree(ref_path); + } + return 0; +} #endif + +int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry) +{ + int timeout = 10; + int rc; + + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + timeout *= server->nr_targets; + spin_unlock(&server->srv_lock); + + /* + * Give demultiplex thread up to 10 seconds to each target available for + * reconnect -- should be greater than cifs socket timeout which is 7 + * seconds. + * + * On "soft" mounts we wait once. Hard mounts keep retrying until + * process is killed or server comes back on-line. + */ + do { + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + timeout * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", + __func__); + return -ERESTARTSYS; + } + + /* are we still trying to reconnect? */ + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + spin_unlock(&server->srv_lock); + } while (retry); + + cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); + return -EHOSTDOWN; +} diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 37b4cd59245d..9b956294e864 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -527,12 +527,13 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) { - int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; struct kvec err_iov[3] = {}; int err_buftype[3] = {}; + bool islink; + int rc, rc2; *adjust_tz = false; *reparse = false; @@ -580,15 +581,15 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, SMB2_OP_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); goto out; - } else if (rc != -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && - hdr->Status == STATUS_OBJECT_NAME_INVALID) { - /* - * Handle weird Windows SMB server behaviour. It responds with - * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request - * for "\<server>\<dfsname>\<linkpath>" DFS reference, - * where <dfsname> contains non-ASCII unicode symbols. - */ - rc = -EREMOTE; + } else if (rc != -EREMOTE && hdr->Status == STATUS_OBJECT_NAME_INVALID) { + rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb, + full_path, &islink); + if (rc2) { + rc = rc2; + goto out; + } + if (islink) + rc = -EREMOTE; } if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f79b075f2992..6dfb865ee9d7 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -796,7 +796,6 @@ static int smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) { - int rc; __le16 *utf16_path; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; int err_buftype = CIFS_NO_BUFFER; @@ -804,6 +803,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct kvec err_iov = {}; struct cifs_fid fid; struct cached_fid *cfid; + bool islink; + int rc, rc2; rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid); if (!rc) { @@ -833,15 +834,17 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, if (unlikely(!hdr || err_buftype == CIFS_NO_BUFFER)) goto out; - /* - * Handle weird Windows SMB server behaviour. It responds with - * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request - * for "\<server>\<dfsname>\<linkpath>" DFS reference, - * where <dfsname> contains non-ASCII unicode symbols. - */ - if (rc != -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && - hdr->Status == STATUS_OBJECT_NAME_INVALID) - rc = -EREMOTE; + + if (rc != -EREMOTE && hdr->Status == STATUS_OBJECT_NAME_INVALID) { + rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb, + full_path, &islink); + if (rc2) { + rc = rc2; + goto out; + } + if (islink) + rc = -EREMOTE; + } if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)) rc = -EOPNOTSUPP; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ca9d7110ddcb..0e53265e1462 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -139,66 +139,6 @@ out: return; } -static int wait_for_server_reconnect(struct TCP_Server_Info *server, - __le16 smb2_command, bool retry) -{ - int timeout = 10; - int rc; - - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - return 0; - } - timeout *= server->nr_targets; - spin_unlock(&server->srv_lock); - - /* - * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE - * here since they are implicitly done when session drops. - */ - switch (smb2_command) { - /* - * BB Should we keep oplock break and add flush to exceptions? - */ - case SMB2_TREE_DISCONNECT: - case SMB2_CANCEL: - case SMB2_CLOSE: - case SMB2_OPLOCK_BREAK: - return -EAGAIN; - } - - /* - * Give demultiplex thread up to 10 seconds to each target available for - * reconnect -- should be greater than cifs socket timeout which is 7 - * seconds. - * - * On "soft" mounts we wait once. Hard mounts keep retrying until - * process is killed or server comes back on-line. - */ - do { - rc = wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), - timeout * HZ); - if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", - __func__); - return -ERESTARTSYS; - } - - /* are we still trying to reconnect? */ - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - return 0; - } - spin_unlock(&server->srv_lock); - } while (retry); - - cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); - return -EHOSTDOWN; -} - static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server) @@ -243,7 +183,27 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, (!tcon->ses->server) || !server) return -EIO; - rc = wait_for_server_reconnect(server, smb2_command, tcon->retry); + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsNeedReconnect) { + /* + * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE + * here since they are implicitly done when session drops. + */ + switch (smb2_command) { + /* + * BB Should we keep oplock break and add flush to exceptions? + */ + case SMB2_TREE_DISCONNECT: + case SMB2_CANCEL: + case SMB2_CLOSE: + case SMB2_OPLOCK_BREAK: + spin_unlock(&server->srv_lock); + return -EAGAIN; + } + } + spin_unlock(&server->srv_lock); + + rc = cifs_wait_for_server_reconnect(server, tcon->retry); if (rc) return rc; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 55b6e319a61d..0362ebd4fa0f 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -837,7 +837,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, int data_length; struct smbd_request *request; struct smbd_data_transfer *packet; - int new_credits; + int new_credits = 0; wait_credit: /* Wait for send credits. A SMBD packet needs one credit */ diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e3d168911dbe..006ef68d7ff6 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -183,7 +183,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - struct file_ra_state ra; + struct file_ra_state ra = {}; struct page *pages[BLKS_PER_BUF]; unsigned i, blocknr, buffer; unsigned long devsize; diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 1dfa67f307f1..957574180a5e 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -29,14 +29,15 @@ static int exfat_extract_uni_name(struct exfat_dentry *ep, } -static void exfat_get_uniname_from_ext_entry(struct super_block *sb, +static int exfat_get_uniname_from_ext_entry(struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned short *uniname) { - int i; + int i, err; struct exfat_entry_set_cache es; - if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES)) - return; + err = exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES); + if (err) + return err; /* * First entry : file entry @@ -56,12 +57,13 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, } exfat_put_dentry_set(&es, false); + return 0; } /* read a directory entry from the opened directory */ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { - int i, dentries_per_clu, num_ext; + int i, dentries_per_clu, num_ext, err; unsigned int type, clu_offset, max_dentries; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; @@ -100,7 +102,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent clu.dir = ei->hint_bmap.clu; } - while (clu_offset > 0) { + while (clu_offset > 0 && clu.dir != EXFAT_EOF_CLUSTER) { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; @@ -146,8 +148,12 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent 0); *uni_name.name = 0x0; - exfat_get_uniname_from_ext_entry(sb, &clu, i, + err = exfat_get_uniname_from_ext_entry(sb, &clu, i, uni_name.name); + if (err) { + brelse(bh); + continue; + } exfat_utf16_to_nls(sb, &uni_name, dir_entry->namebuf.lfn, dir_entry->namebuf.lfnbuf_len); @@ -234,10 +240,7 @@ static int exfat_iterate(struct file *file, struct dir_context *ctx) fake_offset = 1; } - if (cpos & (DENTRY_SIZE - 1)) { - err = -ENOENT; - goto unlock; - } + cpos = round_up(cpos, DENTRY_SIZE); /* name buffer should be allocated before use */ err = exfat_alloc_namebuf(nb); @@ -378,6 +381,12 @@ unsigned int exfat_get_entry_type(struct exfat_dentry *ep) return TYPE_ACL; return TYPE_CRITICAL_SEC; } + + if (ep->type == EXFAT_VENDOR_EXT) + return TYPE_VENDOR_EXT; + if (ep->type == EXFAT_VENDOR_ALLOC) + return TYPE_VENDOR_ALLOC; + return TYPE_BENIGN_SEC; } @@ -521,6 +530,25 @@ release_fbh: return ret; } +static void exfat_free_benign_secondary_clusters(struct inode *inode, + struct exfat_dentry *ep) +{ + struct super_block *sb = inode->i_sb; + struct exfat_chain dir; + unsigned int start_clu = + le32_to_cpu(ep->dentry.generic_secondary.start_clu); + u64 size = le64_to_cpu(ep->dentry.generic_secondary.size); + unsigned char flags = ep->dentry.generic_secondary.flags; + + if (!(flags & ALLOC_POSSIBLE) || !start_clu || !size) + return; + + exfat_chain_set(&dir, start_clu, + EXFAT_B_TO_CLU_ROUND_UP(size, EXFAT_SB(sb)), + flags); + exfat_free_cluster(inode, &dir); +} + int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, int entry, int num_entries, struct exfat_uni_name *p_uniname) { @@ -553,6 +581,9 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, if (!ep) return -EIO; + if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC) + exfat_free_benign_secondary_clusters(inode, ep); + exfat_init_name_entry(ep, uniname); exfat_update_bh(bh, sync); brelse(bh); @@ -576,6 +607,9 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, if (!ep) return -EIO; + if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC) + exfat_free_benign_secondary_clusters(inode, ep); + exfat_set_entry_type(ep, TYPE_DELETED); exfat_update_bh(bh, IS_DIRSYNC(inode)); brelse(bh); @@ -744,6 +778,7 @@ enum exfat_validate_dentry_mode { ES_MODE_GET_STRM_ENTRY, ES_MODE_GET_NAME_ENTRY, ES_MODE_GET_CRITICAL_SEC_ENTRY, + ES_MODE_GET_BENIGN_SEC_ENTRY, }; static bool exfat_validate_entry(unsigned int type, @@ -757,36 +792,33 @@ static bool exfat_validate_entry(unsigned int type, if (type != TYPE_FILE && type != TYPE_DIR) return false; *mode = ES_MODE_GET_FILE_ENTRY; - return true; + break; case ES_MODE_GET_FILE_ENTRY: if (type != TYPE_STREAM) return false; *mode = ES_MODE_GET_STRM_ENTRY; - return true; + break; case ES_MODE_GET_STRM_ENTRY: if (type != TYPE_EXTEND) return false; *mode = ES_MODE_GET_NAME_ENTRY; - return true; + break; case ES_MODE_GET_NAME_ENTRY: - if (type == TYPE_STREAM) - return false; - if (type != TYPE_EXTEND) { - if (!(type & TYPE_CRITICAL_SEC)) - return false; - *mode = ES_MODE_GET_CRITICAL_SEC_ENTRY; - } - return true; - case ES_MODE_GET_CRITICAL_SEC_ENTRY: - if (type == TYPE_EXTEND || type == TYPE_STREAM) + if (type & TYPE_BENIGN_SEC) + *mode = ES_MODE_GET_BENIGN_SEC_ENTRY; + else if (type != TYPE_EXTEND) return false; - if ((type & TYPE_CRITICAL_SEC) != TYPE_CRITICAL_SEC) + break; + case ES_MODE_GET_BENIGN_SEC_ENTRY: + /* Assume unreconized benign secondary entry */ + if (!(type & TYPE_BENIGN_SEC)) return false; - return true; + break; default: - WARN_ON_ONCE(1); return false; } + + return true; } struct exfat_dentry *exfat_get_dentry_cached( @@ -1167,10 +1199,8 @@ int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir, type = exfat_get_entry_type(ext_ep); brelse(bh); - if (type == TYPE_EXTEND || type == TYPE_STREAM) + if (type & TYPE_CRITICAL_SEC || type & TYPE_BENIGN_SEC) count++; - else - break; } return count; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 1bf16abe3c84..729ada9e26e8 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -50,7 +50,7 @@ enum { #define ES_IDX_LAST_FILENAME(name_len) \ (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1) -#define DIR_DELETED 0xFFFF0321 +#define DIR_DELETED 0xFFFFFFF7 /* type values */ #define TYPE_UNUSED 0x0000 @@ -71,6 +71,8 @@ enum { #define TYPE_PADDING 0x0402 #define TYPE_ACLTAB 0x0403 #define TYPE_BENIGN_SEC 0x0800 +#define TYPE_VENDOR_EXT 0x0801 +#define TYPE_VENDOR_ALLOC 0x0802 #define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */ #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 7f39b1c6469c..0ece2e43cf49 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -27,6 +27,7 @@ ((sbi)->num_clusters - EXFAT_RESERVED_CLUSTERS) /* AllocationPossible and NoFatChain field in GeneralSecondaryFlags Field */ +#define ALLOC_POSSIBLE 0x01 #define ALLOC_FAT_CHAIN 0x01 #define ALLOC_NO_FAT_CHAIN 0x03 @@ -50,6 +51,8 @@ #define EXFAT_STREAM 0xC0 /* stream entry */ #define EXFAT_NAME 0xC1 /* file name entry */ #define EXFAT_ACL 0xC2 /* stream entry */ +#define EXFAT_VENDOR_EXT 0xE0 /* vendor extension entry */ +#define EXFAT_VENDOR_ALLOC 0xE1 /* vendor allocation entry */ #define IS_EXFAT_CRITICAL_PRI(x) (x < 0xA0) #define IS_EXFAT_BENIGN_PRI(x) (x < 0xC0) @@ -155,6 +158,24 @@ struct exfat_dentry { __le32 start_clu; __le64 size; } __packed upcase; /* up-case table directory entry */ + struct { + __u8 flags; + __u8 vendor_guid[16]; + __u8 vendor_defined[14]; + } __packed vendor_ext; /* vendor extension directory entry */ + struct { + __u8 flags; + __u8 vendor_guid[16]; + __u8 vendor_defined[2]; + __le32 start_clu; + __le64 size; + } __packed vendor_alloc; /* vendor allocation directory entry */ + struct { + __u8 flags; + __u8 custom_defined[18]; + __le32 start_clu; + __le64 size; + } __packed generic_secondary; /* generic secondary directory entry */ } __packed dentry; } __packed; diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 41ae4cce1f42..56b870d9cc0d 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -307,7 +307,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, struct exfat_chain *p_chain, bool sync_bmap) { int ret = -ENOSPC; - unsigned int num_clusters = 0, total_cnt; + unsigned int total_cnt; unsigned int hint_clu, new_clu, last_clu = EXFAT_EOF_CLUSTER; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -344,17 +344,11 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, /* check cluster validation */ if (!is_valid_cluster(sbi, hint_clu)) { - exfat_err(sb, "hint_cluster is invalid (%u)", - hint_clu); + if (hint_clu != sbi->num_clusters) + exfat_err(sb, "hint_cluster is invalid (%u), rewind to the first cluster", + hint_clu); hint_clu = EXFAT_FIRST_CLUSTER; - if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { - if (exfat_chain_cont_cluster(sb, p_chain->dir, - num_clusters)) { - ret = -EIO; - goto unlock; - } - p_chain->flags = ALLOC_FAT_CHAIN; - } + p_chain->flags = ALLOC_FAT_CHAIN; } p_chain->dir = EXFAT_EOF_CLUSTER; @@ -364,7 +358,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, if (new_clu != hint_clu && p_chain->flags == ALLOC_NO_FAT_CHAIN) { if (exfat_chain_cont_cluster(sb, p_chain->dir, - num_clusters)) { + p_chain->size)) { ret = -EIO; goto free_cluster; } @@ -377,8 +371,6 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, goto free_cluster; } - num_clusters++; - /* update FAT table */ if (p_chain->flags == ALLOC_FAT_CHAIN) { if (exfat_ent_set(sb, new_clu, EXFAT_EOF_CLUSTER)) { @@ -395,13 +387,14 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, goto free_cluster; } } + p_chain->size++; + last_clu = new_clu; - if (--num_alloc == 0) { + if (p_chain->size == num_alloc) { sbi->clu_srch_ptr = hint_clu; - sbi->used_clusters += num_clusters; + sbi->used_clusters += num_alloc; - p_chain->size += num_clusters; mutex_unlock(&sbi->bitmap_lock); return 0; } @@ -412,7 +405,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { if (exfat_chain_cont_cluster(sb, p_chain->dir, - num_clusters)) { + p_chain->size)) { ret = -EIO; goto free_cluster; } @@ -421,8 +414,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, } } free_cluster: - if (num_clusters) - __exfat_free_cluster(inode, p_chain); + __exfat_free_cluster(inode, p_chain); unlock: mutex_unlock(&sbi->bitmap_lock); return ret; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 1fdb0a64b91d..e99183a74611 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -209,8 +209,7 @@ void exfat_truncate(struct inode *inode) if (err) goto write_size; - inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> - inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; write_size: aligned_size = i_size_read(inode); if (aligned_size & (blocksize - 1)) { diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 5b644cb057fa..481dd338f2b8 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -220,8 +220,7 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, num_clusters += num_to_be_allocated; *clu = new_clu.dir; - inode->i_blocks += - num_to_be_allocated << sbi->sect_per_clus_bits; + inode->i_blocks += EXFAT_CLU_TO_B(num_to_be_allocated, sbi) >> 9; /* * Move *clu pointer along FAT chains (hole care) because the @@ -576,8 +575,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); - inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> - inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; inode->i_mtime = info->mtime; inode->i_ctime = info->mtime; ei->i_crtime = info->crtime; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 02aab4c3a5f7..e0ff9d156f6f 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -396,7 +396,7 @@ static int exfat_find_empty_entry(struct inode *inode, ei->i_size_ondisk += sbi->cluster_size; ei->i_size_aligned += sbi->cluster_size; ei->flags = p_dir->flags; - inode->i_blocks += 1 << sbi->sect_per_clus_bits; + inode->i_blocks += sbi->cluster_size >> 9; } return dentry; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 35f0305cd493..8c32460e031e 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -373,8 +373,7 @@ static int exfat_read_root(struct inode *inode) inode->i_op = &exfat_dir_inode_operations; inode->i_fop = &exfat_dir_operations; - inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> - inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; ei->i_size_aligned = i_size_read(inode); ei->i_size_ondisk = i_size_read(inode); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 43e26e6f6e42..4eeb02d456a9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1529,6 +1529,7 @@ struct ext4_sb_info { unsigned int s_mount_opt2; unsigned long s_mount_flags; unsigned int s_def_mount_opt; + unsigned int s_def_mount_opt2; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; kuid_t s_resuid; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9de1c9d1a13d..3559ea6b0781 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3251,7 +3251,7 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); - if (err != -ENOSPC && err != -EDQUOT) + if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) goto out; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 4594b62f147b..b06de728b3b6 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1332,8 +1332,14 @@ struct dentry_info_args { char *dname; }; +/* Same as struct ext4_fc_tl, but uses native endianness fields */ +struct ext4_fc_tl_mem { + u16 fc_tag; + u16 fc_len; +}; + static inline void tl_to_darg(struct dentry_info_args *darg, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; @@ -1345,16 +1351,18 @@ static inline void tl_to_darg(struct dentry_info_args *darg, darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } -static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val) +static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { - memcpy(tl, val, EXT4_FC_TAG_BASE_LEN); - tl->fc_len = le16_to_cpu(tl->fc_len); - tl->fc_tag = le16_to_cpu(tl->fc_tag); + struct ext4_fc_tl tl_disk; + + memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); + tl->fc_len = le16_to_cpu(tl_disk.fc_len); + tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ -static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_unlink(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; @@ -1451,8 +1459,8 @@ out: } /* Link replay function */ -static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_link(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; @@ -1506,8 +1514,8 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) /* * Inode replay function */ -static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_inode(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; @@ -1609,8 +1617,8 @@ out: * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ -static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_create(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; @@ -1708,7 +1716,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; @@ -1828,8 +1836,8 @@ out: /* Replay DEL_RANGE tag */ static int -ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +ext4_fc_replay_del_range(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; @@ -2025,7 +2033,7 @@ static int ext4_fc_replay_scan(journal_t *journal, struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; - struct ext4_fc_tl tl; + struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; @@ -2144,7 +2152,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_fc_tl tl; + struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6bdf61a62c79..0b8b4499e5ca 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len) return false; } -/* Is IO overwriting allocated and initialized blocks? */ -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) +/* Is IO overwriting allocated or initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, + loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); + if (err != blklen) + return false; /* * 'err==len' means that all of the blocks have been preallocated, - * regardless of whether they have been initialized or not. To exclude - * unwritten extents, we need to check m_flags. + * regardless of whether they have been initialized or not. We need to + * check m_flags to distinguish the unwritten extents. */ - return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); + return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * - * - shared locking will only be true mostly with overwrites. Otherwise we will - * switch to exclusive i_rwsem lock. + * - shared locking will only be true mostly with overwrites, including + * initialized blocks and unwritten blocks. For overwrite unwritten blocks + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can + * also release exclusive i_rwsem lock. + * + * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, - bool *ilock_shared, bool *extend) + bool *ilock_shared, bool *extend, + bool *unwritten) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -459,7 +468,7 @@ restart: * in file_modified(). */ if (*ilock_shared && (!IS_NOSEC(inode) || *extend || - !ext4_overwrite_io(inode, offset, count))) { + !ext4_overwrite_io(inode, offset, count, unwritten))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; - bool extend = false, unaligned_io = false; + bool extend = false, unaligned_io = false, unwritten = false; bool ilock_shared = true; /* @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_buffered_write_iter(iocb, from); } - ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + ret = ext4_dio_write_checks(iocb, from, + &ilock_shared, &extend, &unwritten); if (ret <= 0) return ret; @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_journal_stop(handle); } - if (ilock_shared) + if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 40579ef513b7..d251d705c276 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4872,13 +4872,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, goto bad_inode; raw_inode = ext4_raw_inode(&iloc); - if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { - ext4_error_inode(inode, function, line, 0, - "iget: root inode unallocated"); - ret = -EFSCORRUPTED; - goto bad_inode; - } - if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; @@ -4951,11 +4944,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if ((inode->i_mode == 0 || + if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { - /* this inode is deleted */ - ret = -ESTALE; + /* this inode is deleted or unallocated */ + if (flags & EXT4_IGET_SPECIAL) { + ext4_error_inode(inode, function, line, 0, + "iget: special inode unallocated"); + ret = -EFSCORRUPTED; + } else + ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have @@ -5788,7 +5786,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; - int ret = 0; + int ret; /* * How many index blocks need to touch to map @lblocks logical blocks diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b0dc7212694e..12435d61f09e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -155,9 +155,6 @@ static int ext4_update_backup_sb(struct super_block *sb, set_buffer_uptodate(bh); unlock_buffer(bh); - if (err) - goto out_bh; - if (handle) { err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d10a508d95cd..94608b7df7e8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3872,9 +3872,16 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } + /* + * We need to protect against old.inode directory getting + * converted from inline directory format into a normal one. + */ + inode_lock_nested(old.inode, I_MUTEX_NONDIR2); retval = ext4_rename_dir_prepare(handle, &old); - if (retval) + if (retval) { + inode_unlock(old.inode); goto end_rename; + } } /* * If we're renaming a file within an inline_data dir and adding or @@ -4006,6 +4013,8 @@ end_rename: } else { ext4_journal_stop(handle); } + if (old.dir_bh) + inode_unlock(old.inode); release_bh: brelse(old.dir_bh); brelse(old.bh); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index faae05493471..88f7b8a88c76 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2146,7 +2146,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; case Opt_commit: if (result.uint_32 == 0) - ctx->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE; + result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " @@ -2883,7 +2883,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - int def_errors, def_mount_opt = sbi->s_def_mount_opt; + int def_errors; const struct mount_opts *m; char sep = nodefs ? '\n' : ','; @@ -2895,15 +2895,28 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; + int opt_2 = m->flags & MOPT_2; + unsigned int mount_opt, def_mount_opt; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; - if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) - continue; /* skip if same as the default */ + + if (opt_2) { + mount_opt = sbi->s_mount_opt2; + def_mount_opt = sbi->s_def_mount_opt2; + } else { + mount_opt = sbi->s_mount_opt; + def_mount_opt = sbi->s_def_mount_opt; + } + /* skip if same as the default */ + if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) + continue; + /* select Opt_noFoo vs Opt_Foo */ if ((want_set && - (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || - (!want_set && (sbi->s_mount_opt & m->mount_opt))) - continue; /* select Opt_noFoo vs Opt_Foo */ + (mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (mount_opt & m->mount_opt))) + continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } @@ -2931,7 +2944,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & - (sbi->s_mount_opt ^ def_mount_opt)) { + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) @@ -4727,7 +4740,6 @@ static int ext4_group_desc_init(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; - int ret; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / @@ -4767,8 +4779,7 @@ static int ext4_group_desc_init(struct super_block *sb, ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; - ret = PTR_ERR(bh); - goto out; + return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; @@ -4777,13 +4788,10 @@ static int ext4_group_desc_init(struct super_block *sb, sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - ret = -EFSCORRUPTED; - goto out; + return -EFSCORRUPTED; } + return 0; -out: - ext4_group_desc_free(sbi); - return ret; } static int ext4_load_and_init_journal(struct super_block *sb, @@ -5075,6 +5083,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; + sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) @@ -5209,14 +5218,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_geometry_check(sb, es)) goto failed_mount; - err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); - if (err) - goto failed_mount; - timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); + if (err) + goto failed_mount3; + /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) goto failed_mount3; @@ -5937,8 +5946,11 @@ static int ext4_load_journal(struct super_block *sb, if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - - /* Make sure we flush the recovery flag to disk. */ + ext4_commit_super(sb); + } + if (!really_read_only && journal_inum && + journal_inum != le32_to_cpu(es->s_journal_inum)) { + es->s_journal_inum = cpu_to_le32(journal_inum); ext4_commit_super(sb); } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a2f04a3808db..62f2ec599218 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -184,27 +184,73 @@ ext4_xattr_handler(int name_index) } static int -ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, - void *value_start) +check_xattrs(struct inode *inode, struct buffer_head *bh, + struct ext4_xattr_entry *entry, void *end, void *value_start, + const char *function, unsigned int line) { struct ext4_xattr_entry *e = entry; + int err = -EFSCORRUPTED; + char *err_str; + + if (bh) { + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + err_str = "invalid header"; + goto errout; + } + if (buffer_verified(bh)) + return 0; + if (!ext4_xattr_block_csum_verify(inode, bh)) { + err = -EFSBADCRC; + err_str = "invalid checksum"; + goto errout; + } + } else { + struct ext4_xattr_ibody_header *header = value_start; + + header -= 1; + if (end - (void *)header < sizeof(*header) + sizeof(u32)) { + err_str = "in-inode xattr block too small"; + goto errout; + } + if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + err_str = "bad magic number in in-inode xattr"; + goto errout; + } + } /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); - if ((void *)next >= end) - return -EFSCORRUPTED; - if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) - return -EFSCORRUPTED; + if ((void *)next >= end) { + err_str = "e_name out of bounds"; + goto errout; + } + if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { + err_str = "bad e_name length"; + goto errout; + } e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); + unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); - if (size > EXT4_XATTR_SIZE_MAX) - return -EFSCORRUPTED; + if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { + err_str = "ea_inode specified without ea_inode feature enabled"; + goto errout; + } + if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || + !ext4_valid_inum(inode->i_sb, ea_ino))) { + err_str = "invalid ea_ino"; + goto errout; + } + if (size > EXT4_XATTR_SIZE_MAX) { + err_str = "e_value size too large"; + goto errout; + } if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); @@ -216,66 +262,54 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. */ - if (offs > end - value_start) - return -EFSCORRUPTED; + if (offs > end - value_start) { + err_str = "e_value out of bounds"; + goto errout; + } value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || - EXT4_XATTR_SIZE(size) > end - value) - return -EFSCORRUPTED; + EXT4_XATTR_SIZE(size) > end - value) { + err_str = "overlapping e_value "; + goto errout; + } } entry = EXT4_XATTR_NEXT(entry); } - + if (bh) + set_buffer_verified(bh); return 0; + +errout: + if (bh) + __ext4_error_inode(inode, function, line, 0, -err, + "corrupted xattr block %llu: %s", + (unsigned long long) bh->b_blocknr, + err_str); + else + __ext4_error_inode(inode, function, line, 0, -err, + "corrupted in-inode xattr: %s", err_str); + return err; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { - int error = -EFSCORRUPTED; - - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) - goto errout; - if (buffer_verified(bh)) - return 0; - - error = -EFSBADCRC; - if (!ext4_xattr_block_csum_verify(inode, bh)) - goto errout; - error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, - bh->b_data); -errout: - if (error) - __ext4_error_inode(inode, function, line, 0, -error, - "corrupted xattr block %llu", - (unsigned long long) bh->b_blocknr); - else - set_buffer_verified(bh); - return error; + return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data, function, line); } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) -static int +static inline int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { - int error = -EFSCORRUPTED; - - if (end - (void *)header < sizeof(*header) + sizeof(u32) || - (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) - goto errout; - error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); -errout: - if (error) - __ext4_error_inode(inode, function, line, 0, -error, - "corrupted in-inode xattr"); - return error; + return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), + function, line); } #define xattr_check_inode(inode, header, end) \ @@ -388,6 +422,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, struct inode *inode; int err; + /* + * We have to check for this corruption early as otherwise + * iget_locked() could wait indefinitely for the state of our + * parent inode. + */ + if (parent->i_ino == ea_ino) { + ext4_error(parent->i_sb, + "Parent and EA inode have the same ino %lu", ea_ino); + return -EFSCORRUPTED; + } + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -1438,6 +1483,13 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; + if (inode->i_sb->s_root == NULL) { + ext4_warning(inode->i_sb, + "refuse to create EA inode when umounting"); + WARN_ON(1); + return ERR_PTR(-EINVAL); + } + /* * Let the next inode be the goal, so we try and allocate the EA inode * in the same group, or nearby one. @@ -2567,9 +2619,8 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - buffer = kvmalloc(value_size, GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); - if (!is || !bs || !buffer || !b_entry_name) { + if (!is || !bs || !b_entry_name) { error = -ENOMEM; goto out; } @@ -2581,12 +2632,18 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, /* Save the entry name and the entry value */ if (entry->e_value_inum) { + buffer = kvmalloc(value_size, GFP_NOFS); + if (!buffer) { + error = -ENOMEM; + goto out; + } + error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; } else { size_t value_offs = le16_to_cpu(entry->e_value_offs); - memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + buffer = (void *)IFIRST(header) + value_offs; } memcpy(b_entry_name, entry->e_name, entry->e_name_len); @@ -2601,25 +2658,26 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, if (error) goto out; - /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); - if (error) - goto out; - i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); if (error) goto out; - /* Add entry which was removed from the inode into the block */ + /* Move ea entry from the inode into the block */ error = ext4_xattr_block_set(handle, inode, &i, bs); if (error) goto out; - error = 0; + + /* Remove the chosen entry from the inode */ + i.value = NULL; + i.value_len = 0; + error = ext4_xattr_ibody_set(handle, inode, &i, is); + out: kfree(b_entry_name); - kvfree(buffer); + if (entry->e_value_inum && buffer) + kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5a5515d83a1b..c3e058e0a018 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -70,7 +70,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, - .is_por = !is_meta, + .is_por = !is_meta ? 1 : 0, }; int err; @@ -171,10 +171,8 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { - if (time_to_inject(sbi, FAULT_BLKADDR)) { - f2fs_show_injection_info(sbi, FAULT_BLKADDR); + if (time_to_inject(sbi, FAULT_BLKADDR)) return false; - } switch (type) { case META_NAT: @@ -239,8 +237,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op = REQ_OP_READ, .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, - .in_list = false, - .is_por = (type == META_POR), + .in_list = 0, + .is_por = (type == META_POR) ? 1 : 0, }; struct blk_plug plug; int err; @@ -625,7 +623,6 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); - f2fs_show_injection_info(sbi, FAULT_ORPHAN); return -ENOSPC; } @@ -798,7 +795,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) */ head = &im->ino_list; - /* loop for each orphan inode entry and write them in Jornal block */ + /* loop for each orphan inode entry and write them in journal block */ list_for_each_entry(orphan, head, list) { if (!page) { page = f2fs_grab_meta_page(sbi, start_blk++); @@ -1128,7 +1125,7 @@ retry: } else { /* * We should submit bio, since it exists several - * wribacking dentry pages in the freeing inode. + * writebacking dentry pages in the freeing inode. */ f2fs_submit_merged_write(sbi, DATA); cond_resched(); @@ -1476,20 +1473,18 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { - ckpt->cur_node_segno[i] = - cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); - ckpt->cur_node_blkoff[i] = - cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE)); - ckpt->alloc_type[i + CURSEG_HOT_NODE] = - curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); + struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_NODE); + + ckpt->cur_node_segno[i] = cpu_to_le32(curseg->segno); + ckpt->cur_node_blkoff[i] = cpu_to_le16(curseg->next_blkoff); + ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg->alloc_type; } for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { - ckpt->cur_data_segno[i] = - cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); - ckpt->cur_data_blkoff[i] = - cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA)); - ckpt->alloc_type[i + CURSEG_HOT_DATA] = - curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); + struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_DATA); + + ckpt->cur_data_segno[i] = cpu_to_le32(curseg->segno); + ckpt->cur_data_blkoff[i] = cpu_to_le16(curseg->next_blkoff); + ckpt->alloc_type[i + CURSEG_HOT_DATA] = curseg->alloc_type; } /* 2 cp + n data seg summary + orphan inode blocks */ diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 2532f369cb10..b40dec3d7f79 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -241,7 +241,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc) unsigned int size = LZ4_MEM_COMPRESS; #ifdef CONFIG_F2FS_FS_LZ4HC - if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + if (F2FS_I(cc->inode)->i_compress_level) size = LZ4HC_MEM_COMPRESS; #endif @@ -267,8 +267,7 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) #ifdef CONFIG_F2FS_FS_LZ4HC static int lz4hc_compress_pages(struct compress_ctx *cc) { - unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> - COMPRESS_LEVEL_OFFSET; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; int len; if (level) @@ -340,8 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) zstd_cstream *stream; void *workspace; unsigned int workspace_size; - unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> - COMPRESS_LEVEL_OFFSET; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; @@ -564,7 +562,7 @@ module_param(num_compress_pages, uint, 0444); MODULE_PARM_DESC(num_compress_pages, "Number of intermediate compress pages to preallocate"); -int f2fs_init_compress_mempool(void) +int __init f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); return compress_page_pool ? 0 : -ENOMEM; @@ -690,9 +688,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) vm_unmap_ram(cc->cbuf, cc->nr_cpages); vm_unmap_ram(cc->rbuf, cc->cluster_size); - for (i = 0; i < cc->nr_cpages; i++) { - if (i < new_nr_cpages) - continue; + for (i = new_nr_cpages; i < cc->nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -1070,7 +1066,7 @@ retry: if (ret) goto out; if (bio) - f2fs_submit_bio(sbi, bio, DATA); + f2fs_submit_read_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) @@ -1215,10 +1211,11 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, .page = NULL, .encrypted_page = NULL, .compressed_page = NULL, - .submitted = false, + .submitted = 0, .io_type = io_type, .io_wbc = wbc, - .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode), + .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ? + 1 : 0, }; struct dnode_of_data dn; struct node_info ni; @@ -1228,7 +1225,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; - /* we should bypass data pages to proceed the kworkder jobs */ + /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(cc->rpages[0]->mapping, -EIO); goto out_free; @@ -1813,6 +1810,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) const struct address_space_operations f2fs_compress_aops = { .release_folio = f2fs_release_folio, .invalidate_folio = f2fs_invalidate_folio, + .migrate_folio = filemap_migrate_folio, }; struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 41addc605350..06b552a0aba2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -292,13 +292,11 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_post_read_ctx *ctx; bool intask = in_task(); - iostat_update_and_unbind_ctx(bio, 0); + iostat_update_and_unbind_ctx(bio); ctx = bio->bi_private; - if (time_to_inject(sbi, FAULT_READ_IO)) { - f2fs_show_injection_info(sbi, FAULT_READ_IO); + if (time_to_inject(sbi, FAULT_READ_IO)) bio->bi_status = BLK_STS_IOERR; - } if (bio->bi_status) { f2fs_finish_read_bio(bio, intask); @@ -332,13 +330,11 @@ static void f2fs_write_end_io(struct bio *bio) struct bio_vec *bvec; struct bvec_iter_all iter_all; - iostat_update_and_unbind_ctx(bio, 1); + iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; - if (time_to_inject(sbi, FAULT_WRITE_IO)) { - f2fs_show_injection_info(sbi, FAULT_WRITE_IO); + if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; - } bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -507,65 +503,66 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, return fscrypt_mergeable_bio(bio, inode, next_idx); } -static inline void __submit_bio(struct f2fs_sb_info *sbi, - struct bio *bio, enum page_type type) +void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type) { - if (!is_read_io(bio_op(bio))) { - unsigned int start; + WARN_ON_ONCE(!is_read_io(bio_op(bio))); + trace_f2fs_submit_read_bio(sbi->sb, type, bio); - if (type != DATA && type != NODE) - goto submit_io; + iostat_update_submit_ctx(bio, type); + submit_bio(bio); +} - if (f2fs_lfs_mode(sbi) && current->plug) - blk_finish_plug(current->plug); +static void f2fs_align_write_bio(struct f2fs_sb_info *sbi, struct bio *bio) +{ + unsigned int start = + (bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS) % F2FS_IO_SIZE(sbi); - if (!F2FS_IO_ALIGNED(sbi)) - goto submit_io; + if (start == 0) + return; - start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; - start %= F2FS_IO_SIZE(sbi); + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); - if (start == 0) - goto submit_io; + lock_page(page); - /* fill dummy pages */ - for (; start < F2FS_IO_SIZE(sbi); start++) { - struct page *page = - mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_NOFAIL); - f2fs_bug_on(sbi, !page); + zero_user_segment(page, 0, PAGE_SIZE); + set_page_private_dummy(page); - lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } +} - zero_user_segment(page, 0, PAGE_SIZE); - set_page_private_dummy(page); +static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type) +{ + WARN_ON_ONCE(is_read_io(bio_op(bio))); + + if (type == DATA || type == NODE) { + if (f2fs_lfs_mode(sbi) && current->plug) + blk_finish_plug(current->plug); - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) - f2fs_bug_on(sbi, 1); + if (F2FS_IO_ALIGNED(sbi)) { + f2fs_align_write_bio(sbi, bio); + /* + * In the NODE case, we lose next block address chain. + * So, we need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } - /* - * In the NODE case, we lose next block address chain. So, we - * need to do checkpoint in f2fs_sync_file. - */ - if (type == NODE) - set_sbi_flag(sbi, SBI_NEED_CP); } -submit_io: - if (is_read_io(bio_op(bio))) - trace_f2fs_submit_read_bio(sbi->sb, type, bio); - else - trace_f2fs_submit_write_bio(sbi->sb, type, bio); + trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); } -void f2fs_submit_bio(struct f2fs_sb_info *sbi, - struct bio *bio, enum page_type type) -{ - __submit_bio(sbi, bio, type); -} - static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -573,12 +570,13 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->op)) + if (is_read_io(fio->op)) { trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); - else + f2fs_submit_read_bio(io->sbi, io->bio, fio->type); + } else { trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); - - __submit_bio(io->sbi, io->bio, fio->type); + f2fs_submit_write_bio(io->sbi, io->bio, fio->type); + } io->bio = NULL; } @@ -655,6 +653,9 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, f2fs_down_write(&io->io_rwsem); + if (!io->bio) + goto unlock_out; + /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -663,6 +664,7 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); +unlock_out: f2fs_up_write(&io->io_rwsem); } @@ -741,12 +743,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page)); - __submit_bio(fio->sbi, bio, fio->type); + if (is_read_io(bio_op(bio))) + f2fs_submit_read_bio(fio->sbi, bio, fio->type); + else + f2fs_submit_write_bio(fio->sbi, bio, fio->type); return 0; } @@ -848,7 +853,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, /* page can't be merged into bio; submit the bio */ del_bio_entry(be); - __submit_bio(sbi, *bio, DATA); + f2fs_submit_write_bio(sbi, *bio, DATA); break; } f2fs_up_write(&io->bio_list_lock); @@ -911,7 +916,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, } if (found) - __submit_bio(sbi, target, DATA); + f2fs_submit_write_bio(sbi, target, DATA); if (bio && *bio) { bio_put(*bio); *bio = NULL; @@ -948,7 +953,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page)); @@ -991,7 +996,7 @@ next: bio_page = fio->page; /* set submitted = true as a return value */ - fio->submitted = true; + fio->submitted = 1; inc_page_count(sbi, WB_DATA_TYPE(bio_page)); @@ -1007,7 +1012,7 @@ alloc_new: (fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { dec_page_count(sbi, WB_DATA_TYPE(bio_page)); - fio->retry = true; + fio->retry = 1; goto skip; } io->bio = __bio_alloc(fio, BIO_MAX_VECS); @@ -1022,7 +1027,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; @@ -1107,7 +1112,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, } inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); - __submit_bio(sbi, bio, DATA); + f2fs_submit_read_bio(sbi, bio, DATA); return 0; } @@ -1207,19 +1212,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) -{ - struct extent_info ei = {0, }; - struct inode *inode = dn->inode; - - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn->data_blkaddr = ei.blk + index - ei.fofs; - return 0; - } - - return f2fs_reserve_block(dn, index); -} - struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) @@ -1227,15 +1219,14 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, }; int err; page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; + if (f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; @@ -1432,13 +1423,12 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; dn->data_blkaddr = f2fs_data_blkaddr(dn); - if (dn->data_blkaddr != NULL_ADDR) - goto alloc; - - if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) - return err; + if (dn->data_blkaddr == NULL_ADDR) { + err = inc_valid_block_count(sbi, dn->inode, &count); + if (unlikely(err)) + return err; + } -alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, @@ -1452,19 +1442,91 @@ alloc: return 0; } -void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) { - if (flag == F2FS_GET_BLOCK_PRE_AIO) { - if (lock) - f2fs_down_read(&sbi->node_change); - else - f2fs_up_read(&sbi->node_change); + if (flag == F2FS_GET_BLOCK_PRE_AIO) + f2fs_down_read(&sbi->node_change); + else + f2fs_lock_op(sbi); +} + +static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) + f2fs_up_read(&sbi->node_change); + else + f2fs_unlock_op(sbi); +} + +int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err = 0; + + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + if (!f2fs_lookup_read_extent_cache_block(dn->inode, index, + &dn->data_blkaddr)) + err = f2fs_reserve_block(dn, index); + f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); + + return err; +} + +static int f2fs_map_no_dnode(struct inode *inode, + struct f2fs_map_blocks *map, struct dnode_of_data *dn, + pgoff_t pgoff) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * There is one exceptional case that read_node_page() may return + * -ENOENT due to filesystem has been shutdown or cp_error, return + * -EIO in that case. + */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || f2fs_cp_error(sbi))) + return -EIO; + + if (map->m_next_pgofs) + *map->m_next_pgofs = f2fs_get_next_page_offset(dn, pgoff); + if (map->m_next_extent) + *map->m_next_extent = f2fs_get_next_page_offset(dn, pgoff); + return 0; +} + +static bool f2fs_map_blocks_cached(struct inode *inode, + struct f2fs_map_blocks *map, int flag) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int maxblocks = map->m_len; + pgoff_t pgoff = (pgoff_t)map->m_lblk; + struct extent_info ei = {}; + + if (!f2fs_lookup_read_extent_cache(inode, pgoff, &ei)) + return false; + + map->m_pblk = ei.blk + pgoff - ei.fofs; + map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgoff); + map->m_flags = F2FS_MAP_MAPPED; + if (map->m_next_extent) + *map->m_next_extent = pgoff + map->m_len; + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); + + if (f2fs_allow_multi_device_dio(sbi, flag)) { + int bidx = f2fs_target_device_index(sbi, map->m_pblk); + struct f2fs_dev_info *dev = &sbi->devs[bidx]; + + map->m_bdev = dev->bdev; + map->m_pblk -= dev->start_blk; + map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk); } else { - if (lock) - f2fs_lock_op(sbi); - else - f2fs_unlock_op(sbi); + map->m_bdev = inode->i_sb->s_bdev; } + return true; } /* @@ -1472,8 +1534,7 @@ void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) * maps continuous logical blocks to physical blocks, and return such * info via f2fs_map_blocks structure. */ -int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, - int create, int flag) +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; @@ -1483,14 +1544,17 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; int bidx = 0; + bool is_hole; if (!maxblocks) return 0; + if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) + goto out; + map->m_bdev = inode->i_sb->s_bdev; map->m_multidev_dio = f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); @@ -1502,42 +1566,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { - if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && - map->m_may_create) - goto next_dnode; - - map->m_pblk = ei.blk + pgofs - ei.fofs; - map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); - map->m_flags = F2FS_MAP_MAPPED; - if (map->m_next_extent) - *map->m_next_extent = pgofs + map->m_len; - - /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO) - f2fs_wait_on_block_writeback_range(inode, - map->m_pblk, map->m_len); - - if (map->m_multidev_dio) { - block_t blk_addr = map->m_pblk; - - bidx = f2fs_target_device_index(sbi, map->m_pblk); - - map->m_bdev = FDEV(bidx).bdev; - map->m_pblk -= FDEV(bidx).start_blk; - map->m_len = min(map->m_len, - FDEV(bidx).end_blk + 1 - map->m_pblk); - - if (map->m_may_create) - f2fs_update_device_state(sbi, inode->i_ino, - blk_addr, map->m_len); - } - goto out; - } - next_dnode: if (map->m_may_create) - f2fs_do_map_lock(sbi, flag, true); + f2fs_map_lock(sbi, flag); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -1545,29 +1576,8 @@ next_dnode: if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; - - if (err == -ENOENT) { - /* - * There is one exceptional case that read_node_page() - * may return -ENOENT due to filesystem has been - * shutdown or cp_error, so force to convert error - * number to EIO for such case. - */ - if (map->m_may_create && - (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || - f2fs_cp_error(sbi))) { - err = -EIO; - goto unlock_out; - } - - err = 0; - if (map->m_next_pgofs) - *map->m_next_pgofs = - f2fs_get_next_page_offset(&dn, pgofs); - if (map->m_next_extent) - *map->m_next_extent = - f2fs_get_next_page_offset(&dn, pgofs); - } + if (err == -ENOENT) + err = f2fs_map_no_dnode(inode, map, &dn, pgofs); goto unlock_out; } @@ -1578,78 +1588,76 @@ next_dnode: next_block: blkaddr = f2fs_data_blkaddr(&dn); - - if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { + is_hole = !__is_valid_data_blkaddr(blkaddr); + if (!is_hole && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } - if (__is_valid_data_blkaddr(blkaddr)) { - /* use out-place-update for driect IO under LFS mode */ - if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && - map->m_may_create) { + /* use out-place-update for direct IO under LFS mode */ + if (map->m_may_create && + (is_hole || (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO))) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } + + switch (flag) { + case F2FS_GET_BLOCK_PRE_AIO: + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + break; + case F2FS_GET_BLOCK_PRE_DIO: + case F2FS_GET_BLOCK_DIO: err = __allocate_data_block(&dn, map->m_seg_type); if (err) goto sync_out; - blkaddr = dn.data_blkaddr; + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); + break; + default: + WARN_ON_ONCE(1); + err = -EIO; + goto sync_out; } - } else { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - if (flag == F2FS_GET_BLOCK_PRE_AIO) { - if (blkaddr == NULL_ADDR) { - prealloc++; - last_ofs_in_node = dn.ofs_in_node; - } - } else { - WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO && - flag != F2FS_GET_BLOCK_DIO); - err = __allocate_data_block(&dn, - map->m_seg_type); - if (!err) { - if (flag == F2FS_GET_BLOCK_PRE_DIO) - file_need_truncate(inode); - set_inode_flag(inode, FI_APPEND_WRITE); - } - } - if (err) - goto sync_out; + + blkaddr = dn.data_blkaddr; + if (is_hole) map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - if (f2fs_compressed_file(inode) && - f2fs_sanity_check_cluster(&dn) && - (flag != F2FS_GET_BLOCK_FIEMAP || - IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { - err = -EFSCORRUPTED; - f2fs_handle_error(sbi, - ERROR_CORRUPTED_CLUSTER); - goto sync_out; - } - if (flag == F2FS_GET_BLOCK_BMAP) { - map->m_pblk = 0; - goto sync_out; - } - if (flag == F2FS_GET_BLOCK_PRECACHE) - goto sync_out; - if (flag == F2FS_GET_BLOCK_FIEMAP && - blkaddr == NULL_ADDR) { - if (map->m_next_pgofs) - *map->m_next_pgofs = pgofs + 1; - goto sync_out; - } - if (flag != F2FS_GET_BLOCK_FIEMAP) { - /* for defragment case */ + } else if (is_hole) { + if (f2fs_compressed_file(inode) && + f2fs_sanity_check_cluster(&dn) && + (flag != F2FS_GET_BLOCK_FIEMAP || + IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); + goto sync_out; + } + + switch (flag) { + case F2FS_GET_BLOCK_PRECACHE: + goto sync_out; + case F2FS_GET_BLOCK_BMAP: + map->m_pblk = 0; + goto sync_out; + case F2FS_GET_BLOCK_FIEMAP: + if (blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; goto sync_out; } + break; + default: + /* for defragment case */ + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + goto sync_out; } } @@ -1660,9 +1668,9 @@ next_block: bidx = f2fs_target_device_index(sbi, blkaddr); if (map->m_len == 0) { - /* preallocated unwritten block should be mapped for fiemap. */ + /* reserved delalloc block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) - map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_DELALLOC; map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; @@ -1721,7 +1729,7 @@ skip: f2fs_put_dnode(&dn); if (map->m_may_create) { - f2fs_do_map_lock(sbi, flag, false); + f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -1767,11 +1775,11 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (map->m_may_create) { - f2fs_do_map_lock(sbi, flag, false); + f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } out: - trace_f2fs_map_blocks(inode, map, create, flag, err); + trace_f2fs_map_blocks(inode, map, flag, err); return err; } @@ -1793,7 +1801,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) while (map.m_lblk < last_lblk) { map.m_len = last_lblk - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err || map.m_len == 0) return false; map.m_lblk += map.m_len; @@ -1967,7 +1975,7 @@ next: map.m_len = cluster_size - count_in_cluster; } - ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; @@ -1984,7 +1992,7 @@ next: compr_appended = false; /* In a case of compressed cluster, append this to the last extent */ - if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) || + if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || !(map.m_flags & F2FS_MAP_FLAGS))) { compr_appended = true; goto skip_fill; @@ -2030,7 +2038,7 @@ skip_fill: compr_cluster = false; size += blks_to_bytes(inode, 1); } - } else if (map.m_flags & F2FS_MAP_UNWRITTEN) { + } else if (map.m_flags & F2FS_MAP_DELALLOC) { flags = FIEMAP_EXTENT_UNWRITTEN; } @@ -2099,7 +2107,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, map->m_lblk = block_in_file; map->m_len = last_block - block_in_file; - ret = f2fs_map_blocks(inode, map, 0, F2FS_GET_BLOCK_DEFAULT); + ret = f2fs_map_blocks(inode, map, F2FS_GET_BLOCK_DEFAULT); if (ret) goto out; got_it: @@ -2136,7 +2144,7 @@ zero_out: *last_block_in_bio, block_nr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), bio, DATA); + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { @@ -2283,7 +2291,7 @@ skip_reading_dnode: *last_block_in_bio, blkaddr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: - __submit_bio(sbi, bio, DATA); + f2fs_submit_read_bio(sbi, bio, DATA); bio = NULL; } @@ -2377,7 +2385,7 @@ static int f2fs_mpage_readpages(struct inode *inode, #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { - /* there are remained comressed pages, submit them */ + /* there are remained compressed pages, submit them */ if (!f2fs_cluster_can_merge_page(&cc, page->index)) { ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, @@ -2444,7 +2452,7 @@ next_page: #endif } if (bio) - __submit_bio(F2FS_I_SB(inode), bio, DATA); + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return ret; } @@ -2530,34 +2538,29 @@ static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int policy = SM_I(sbi)->ipu_policy; - if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) && - is_inode_flag_set(inode, FI_OPU_WRITE)) + if (IS_F2FS_IPU_HONOR_OPU_WRITE(sbi) && + is_inode_flag_set(inode, FI_OPU_WRITE)) return false; - if (policy & (0x1 << F2FS_IPU_FORCE)) + if (IS_F2FS_IPU_FORCE(sbi)) return true; - if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) + if (IS_F2FS_IPU_SSR(sbi) && f2fs_need_SSR(sbi)) return true; - if (policy & (0x1 << F2FS_IPU_UTIL) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) + if (IS_F2FS_IPU_UTIL(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; - if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) + if (IS_F2FS_IPU_SSR_UTIL(sbi) && f2fs_need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; /* * IPU for rewrite async pages */ - if (policy & (0x1 << F2FS_IPU_ASYNC) && - fio && fio->op == REQ_OP_WRITE && - !(fio->op_flags & REQ_SYNC) && - !IS_ENCRYPTED(inode)) + if (IS_F2FS_IPU_ASYNC(sbi) && fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && !IS_ENCRYPTED(inode)) return true; /* this is only set during fdatasync */ - if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(inode, FI_NEED_IPU)) + if (IS_F2FS_IPU_FSYNC(sbi) && is_inode_flag_set(inode, FI_NEED_IPU)) return true; if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && @@ -2635,7 +2638,6 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; - struct extent_info ei = {0, }; struct node_info ni; bool ipu_force = false; int err = 0; @@ -2647,9 +2649,8 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { - fio->old_blkaddr = ei.blk + page->index - ei.fofs; - + f2fs_lookup_read_extent_cache_block(inode, page->index, + &fio->old_blkaddr)) { if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_handle_error(fio->sbi, @@ -2699,7 +2700,6 @@ got_it: goto out_writepage; set_page_writeback(page); - ClearPageError(page); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); @@ -2735,7 +2735,6 @@ got_it: goto out_writepage; set_page_writeback(page); - ClearPageError(page); if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); @@ -2780,10 +2779,10 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, - .submitted = false, + .submitted = 0, .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, - .post_read = f2fs_post_read_required(inode), + .post_read = f2fs_post_read_required(inode) ? 1 : 0, .io_type = io_type, .io_wbc = wbc, .bio = bio, @@ -2792,7 +2791,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, trace_f2fs_writepage(page, DATA); - /* we should bypass data pages to proceed the kworkder jobs */ + /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(page->mapping, -EIO); /* @@ -2904,14 +2903,14 @@ out: } if (submitted) - *submitted = fio.submitted ? 1 : 0; + *submitted = fio.submitted; return 0; redirty_out: redirty_page_for_writepage(wbc, page); /* - * pageout() in MM traslates EAGAIN, so calls handle_write_error() + * pageout() in MM translates EAGAIN, so calls handle_write_error() * -> mapping_set_error() -> set_bit(AS_EIO, ...). * file_write_and_wait_range() will see EIO error, which is critical * to return value of fsync() followed by atomic_write failure to user. @@ -2945,7 +2944,7 @@ out: } /* - * This function was copied from write_cche_pages from mm/page-writeback.c. + * This function was copied from write_cache_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ @@ -3354,9 +3353,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei = {0, }; + int flag = F2FS_GET_BLOCK_PRE_AIO; int err = 0; - int flag; /* * If a whole page is being written and we already preallocated all the @@ -3366,14 +3364,13 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ - if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode)) - flag = F2FS_GET_BLOCK_DEFAULT; - else - flag = F2FS_GET_BLOCK_PRE_AIO; - - if (f2fs_has_inline_data(inode) || - (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_do_map_lock(sbi, flag, true); + if (f2fs_has_inline_data(inode)) { + if (pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + f2fs_map_lock(sbi, flag); + locked = true; + } else if ((pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_map_lock(sbi, flag); locked = true; } @@ -3393,40 +3390,40 @@ restart: set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_page_private_inline(ipage); - } else { - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto out; - if (dn.data_blkaddr == NULL_ADDR) - err = f2fs_get_block(&dn, index); - } - } else if (locked) { - err = f2fs_get_block(&dn, index); - } else { - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; - } else { - /* hole case */ - err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err || dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, - true); - WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); - locked = true; - goto restart; - } + goto out; } + err = f2fs_convert_inline_page(&dn, page); + if (err || dn.data_blkaddr != NULL_ADDR) + goto out; } - /* convert_inline_page can make node_changed */ - *blk_addr = dn.data_blkaddr; - *node_changed = dn.node_changed; + if (!f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { + if (locked) { + err = f2fs_reserve_block(&dn, index); + goto out; + } + + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (!err && dn.data_blkaddr != NULL_ADDR) + goto out; + f2fs_put_dnode(&dn); + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); + locked = true; + goto restart; + } out: + if (!err) { + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + } f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_do_map_lock(sbi, flag, false); + f2fs_map_unlock(sbi, flag); return err; } @@ -3435,7 +3432,6 @@ static int __find_data_block(struct inode *inode, pgoff_t index, { struct dnode_of_data dn; struct page *ipage; - struct extent_info ei = {0, }; int err = 0; ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); @@ -3444,9 +3440,8 @@ static int __find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; - } else { + if (!f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { /* hole case */ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { @@ -3467,7 +3462,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, struct page *ipage; int err = 0; - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -3476,14 +3471,16 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, } set_new_dnode(&dn, inode, ipage, ipage, 0); - err = f2fs_get_block(&dn, index); + if (!f2fs_lookup_read_extent_cache_block(dn.inode, index, + &dn.data_blkaddr)) + err = f2fs_reserve_block(&dn, index); *blk_addr = dn.data_blkaddr; *node_changed = dn.node_changed; f2fs_put_dnode(&dn); unlock_out: - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); return err; } @@ -3729,6 +3726,7 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) } } + clear_page_private_reference(&folio->page); clear_page_private_gcing(&folio->page); if (test_opt(sbi, COMPRESS_CACHE) && @@ -3754,6 +3752,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) clear_page_private_data(&folio->page); } + clear_page_private_reference(&folio->page); clear_page_private_gcing(&folio->page); folio_detach_private(folio); @@ -3835,7 +3834,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) map.m_next_pgofs = NULL; map.m_seg_type = NO_CHECK_TYPE; - if (!f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_BMAP)) + if (!f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_BMAP)) blknr = map.m_pblk; } out: @@ -3943,7 +3942,7 @@ retry: map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; @@ -4168,8 +4167,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (flags & IOMAP_WRITE) map.m_may_create = true; - err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE, - F2FS_GET_BLOCK_DIO); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); if (err) return err; @@ -4182,20 +4180,24 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); - if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) { - iomap->length = blks_to_bytes(inode, map.m_len); - if (map.m_flags & F2FS_MAP_MAPPED) { - iomap->type = IOMAP_MAPPED; - iomap->flags |= IOMAP_F_MERGED; - } else { - iomap->type = IOMAP_UNWRITTEN; - } - if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk))) - return -EINVAL; + /* + * We should never see delalloc or compressed extents here based on + * prior flushing and checks. + */ + if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) + return -EINVAL; + if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR)) + return -EINVAL; + if (map.m_pblk != NULL_ADDR) { + iomap->length = blks_to_bytes(inode, map.m_len); + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; iomap->addr = blks_to_bytes(inode, map.m_pblk); } else { + if (flags & IOMAP_WRITE) + return -ENOTBLK; iomap->length = blks_to_bytes(inode, next_pgofs) - iomap->offset; iomap->type = IOMAP_HOLE; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 32af4f0c5735..30a77936e3c5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -354,6 +354,17 @@ static char *s_flag[] = { [SBI_IS_FREEZING] = " freezefs", }; +static const char *ipu_mode_names[F2FS_IPU_MAX] = { + [F2FS_IPU_FORCE] = "FORCE", + [F2FS_IPU_SSR] = "SSR", + [F2FS_IPU_UTIL] = "UTIL", + [F2FS_IPU_SSR_UTIL] = "SSR_UTIL", + [F2FS_IPU_FSYNC] = "FSYNC", + [F2FS_IPU_ASYNC] = "ASYNC", + [F2FS_IPU_NOCACHE] = "NOCACHE", + [F2FS_IPU_HONOR_OPU_WRITE] = "HONOR_OPU_WRITE", +}; + static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; @@ -362,16 +373,18 @@ static int stat_show(struct seq_file *s, void *v) raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - update_general_status(si->sbi); + struct f2fs_sb_info *sbi = si->sbi; + + update_general_status(sbi); seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", - si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO" : "RW", - is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? - "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); - if (si->sbi->s_flag) { + sbi->sb->s_bdev, i++, + f2fs_readonly(sbi->sb) ? "RO" : "RW", + is_set_ckpt_flags(sbi, CP_DISABLED_FLAG) ? + "Disabled" : (f2fs_cp_error(sbi) ? "Error" : "Good")); + if (sbi->s_flag) { seq_puts(s, "[SBI:"); - for_each_set_bit(j, &si->sbi->s_flag, 32) + for_each_set_bit(j, &sbi->s_flag, 32) seq_puts(s, s_flag[j]); seq_puts(s, "]\n"); } @@ -383,8 +396,21 @@ static int stat_show(struct seq_file *s, void *v) si->overp_segs, si->rsvd_segs); seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n", ktime_get_boottime_seconds(), - SIT_I(si->sbi)->mounted_time); - if (test_opt(si->sbi, DISCARD)) + SIT_I(sbi)->mounted_time); + + seq_puts(s, "Policy:\n"); + seq_puts(s, " - IPU: ["); + if (IS_F2FS_IPU_DISABLE(sbi)) { + seq_puts(s, " DISABLE"); + } else { + unsigned long policy = SM_I(sbi)->ipu_policy; + + for_each_set_bit(j, &policy, F2FS_IPU_MAX) + seq_printf(s, " %s", ipu_mode_names[j]); + } + seq_puts(s, " ]\n\n"); + + if (test_opt(sbi, DISCARD)) seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", si->utilization, si->valid_count, si->discard_blks); else @@ -491,15 +517,15 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); seq_puts(s, " - Reclaimed segs :\n"); - seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); - seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Normal : %d\n", sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_CB]); seq_printf(s, " - Idle Greedy : %d\n", - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); - seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_AT]); seq_printf(s, " - Urgent High : %d\n", - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); - seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); - seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -565,7 +591,7 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - fsync mark: %4lld\n", percpu_counter_sum_positive( - &si->sbi->rf_node_block_count)); + &sbi->rf_node_block_count)); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", @@ -592,12 +618,12 @@ static int stat_show(struct seq_file *s, void *v) si->block_count[LFS], si->segment_count[LFS]); /* segment usage info */ - f2fs_update_sit_info(si->sbi); + f2fs_update_sit_info(sbi); seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", si->bimodal, si->avg_vblocks); /* memory footprint */ - update_mem_info(si->sbi); + update_mem_info(sbi); seq_printf(s, "\nMemory: %llu KB\n", (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8e025157f35c..9ccdbe120425 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -732,10 +732,8 @@ int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, } start: - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { - f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH); + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) return -ENOSPC; - } if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 342af24b2f8c..28b12553f2b3 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -19,6 +19,31 @@ #include "node.h" #include <trace/events/f2fs.h> +bool sanity_check_extent_cache(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct extent_info *ei; + + if (!fi->extent_tree[EX_READ]) + return true; + + ei = &fi->extent_tree[EX_READ]->largest; + + if (ei->len && + (!f2fs_is_valid_blkaddr(sbi, ei->blk, + DATA_GENERIC_ENHANCE) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC_ENHANCE))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei->blk, ei->fofs, ei->len); + return false; + } + return true; +} + static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, block_t blk, bool keep_clen, @@ -233,7 +258,7 @@ struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * @prev_ex: extent before ofs * @next_ex: extent after ofs * @insert_p: insert point for new extent at ofs - * in order to simpfy the insertion after. + * in order to simplify the insertion after. * tree must stay unchanged between lookup and insertion. */ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, @@ -718,7 +743,7 @@ static void __update_extent_tree_range(struct inode *inode, if (!en) en = next_en; - /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ + /* 2. invalidate all extent nodes in range [fofs, fofs + len - 1] */ while (en && en->ei.fofs < end) { unsigned int org_end; int parts = 0; /* # of parts current extent split into */ @@ -871,14 +896,23 @@ unlock_out: } #endif -static unsigned long long __calculate_block_age(unsigned long long new, +static unsigned long long __calculate_block_age(struct f2fs_sb_info *sbi, + unsigned long long new, unsigned long long old) { - unsigned long long diff; + unsigned int rem_old, rem_new; + unsigned long long res; + unsigned int weight = sbi->last_age_weight; + + res = div_u64_rem(new, 100, &rem_new) * (100 - weight) + + div_u64_rem(old, 100, &rem_old) * weight; - diff = (new >= old) ? new - (new - old) : new + (old - new); + if (rem_new) + res += rem_new * (100 - weight) / 100; + if (rem_old) + res += rem_old * weight / 100; - return div_u64(diff * LAST_AGE_WEIGHT, 100); + return res; } /* This returns a new age and allocated blocks in ei */ @@ -910,7 +944,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei, cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks; if (tei.age) - ei->age = __calculate_block_age(cur_age, tei.age); + ei->age = __calculate_block_age(sbi, cur_age, tei.age); else ei->age = cur_age; ei->last_blocks = cur_blocks; @@ -1047,6 +1081,17 @@ bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, return __lookup_extent_tree(inode, pgofs, ei, EX_READ); } +bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, + block_t *blkaddr) +{ + struct extent_info ei = {}; + + if (!f2fs_lookup_read_extent_cache(inode, index, &ei)) + return false; + *blkaddr = ei.blk + index - ei.fofs; + return true; +} + void f2fs_update_read_extent_cache(struct dnode_of_data *dn) { return __update_extent_cache(dn, EX_READ); @@ -1226,6 +1271,7 @@ void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) atomic64_set(&sbi->allocated_data_blocks, 0); sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; + sbi->last_age_weight = LAST_AGE_WEIGHT; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a3ffa39ad30..b0ab2062038a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -402,7 +402,6 @@ struct discard_cmd_control { struct list_head wait_list; /* store on-flushing entries */ struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ - unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ @@ -410,6 +409,7 @@ struct discard_cmd_control { unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. interval between discard issue */ + unsigned int discard_io_aware_gran; /* minimum discard granularity not be aware of I/O */ unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ @@ -420,6 +420,7 @@ struct discard_cmd_control { atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ + bool discard_wake; /* to wake up discard thread */ }; /* for the list of fsync inodes, used only during recovery */ @@ -692,15 +693,13 @@ struct extent_tree_info { }; /* - * This structure is taken from ext4_map_blocks. - * - * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks(). + * State of block returned by f2fs_map_blocks. */ -#define F2FS_MAP_NEW (1 << BH_New) -#define F2FS_MAP_MAPPED (1 << BH_Mapped) -#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten) +#define F2FS_MAP_NEW (1U << 0) +#define F2FS_MAP_MAPPED (1U << 1) +#define F2FS_MAP_DELALLOC (1U << 2) #define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ - F2FS_MAP_UNWRITTEN) + F2FS_MAP_DELALLOC) struct f2fs_map_blocks { struct block_device *m_bdev; /* for multi-device dio */ @@ -870,7 +869,7 @@ struct f2fs_inode_info { unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ - unsigned short i_compress_flag; /* compress flag */ + unsigned char i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; @@ -1193,7 +1192,8 @@ enum iostat_type { FS_META_READ_IO, /* meta read IOs */ /* other */ - FS_DISCARD, /* discard */ + FS_DISCARD_IO, /* discard */ + FS_FLUSH_IO, /* flush */ NR_IO_TYPE, }; @@ -1210,19 +1210,19 @@ struct f2fs_io_info { struct page *encrypted_page; /* encrypted page */ struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ - bool submitted; /* indicate IO submission */ - int need_lock; /* indicate we need to lock cp_rwsem */ - bool in_list; /* indicate fio is in io_list */ - bool is_por; /* indicate IO is from recovery or not */ - bool retry; /* need to reallocate block address */ - int compr_blocks; /* # of compressed block addresses */ - bool encrypted; /* indicate file is encrypted */ - bool post_read; /* require post read */ + unsigned int compr_blocks; /* # of compressed block addresses */ + unsigned int need_lock:8; /* indicate we need to lock cp_rwsem */ + unsigned int version:8; /* version of the node */ + unsigned int submitted:1; /* indicate IO submission */ + unsigned int in_list:1; /* indicate fio is in io_list */ + unsigned int is_por:1; /* indicate IO is from recovery or not */ + unsigned int retry:1; /* need to reallocate block address */ + unsigned int encrypted:1; /* indicate file is encrypted */ + unsigned int post_read:1; /* require post read */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ sector_t *last_block; /* last block number in bio */ - unsigned char version; /* version of the node */ }; struct bio_entry { @@ -1384,8 +1384,6 @@ enum { MEMORY_MODE_LOW, /* memory mode for low memry devices */ }; - - static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1396,19 +1394,17 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr); * Layout A: lowest bit should be 1 * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | * bit 0 PAGE_PRIVATE_NOT_POINTER - * bit 1 PAGE_PRIVATE_ATOMIC_WRITE - * bit 2 PAGE_PRIVATE_DUMMY_WRITE - * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION - * bit 4 PAGE_PRIVATE_INLINE_INODE - * bit 5 PAGE_PRIVATE_REF_RESOURCE - * bit 6- f2fs private data + * bit 1 PAGE_PRIVATE_DUMMY_WRITE + * bit 2 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 3 PAGE_PRIVATE_INLINE_INODE + * bit 4 PAGE_PRIVATE_REF_RESOURCE + * bit 5- f2fs private data * * Layout B: lowest bit should be 0 * page.private is a wrapped pointer. */ enum { PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ - PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */ PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ @@ -1450,22 +1446,18 @@ static inline void clear_page_private_##name(struct page *page) \ } PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); -PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); static inline unsigned long get_page_private_data(struct page *page) @@ -1679,6 +1671,7 @@ struct f2fs_sb_info { /* The threshold used for hot and warm data seperation*/ unsigned int hot_data_age_threshold; unsigned int warm_data_age_threshold; + unsigned int last_age_weight; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1864,8 +1857,9 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_IOSTAT /* For app/fs IO statistics */ spinlock_t iostat_lock; - unsigned long long rw_iostat[NR_IO_TYPE]; - unsigned long long prev_rw_iostat[NR_IO_TYPE]; + unsigned long long iostat_count[NR_IO_TYPE]; + unsigned long long iostat_bytes[NR_IO_TYPE]; + unsigned long long prev_iostat_bytes[NR_IO_TYPE]; bool iostat_enable; unsigned long iostat_next_period; unsigned int iostat_period_ms; @@ -1877,12 +1871,10 @@ struct f2fs_sb_info { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define f2fs_show_injection_info(sbi, type) \ - printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ - KERN_INFO, sbi->sb->s_id, \ - f2fs_fault_name[type], \ - __func__, __builtin_return_address(0)) -static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +#define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \ + __builtin_return_address(0)) +static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type, + const char *func, const char *parent_func) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; @@ -1895,12 +1887,14 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); + printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", + KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type], + func, parent_func); return true; } return false; } #else -#define f2fs_show_injection_info(sbi, type) do { } while (0) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { return false; @@ -2233,10 +2227,8 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { - if (time_to_inject(sbi, FAULT_LOCK_OP)) { - f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + if (time_to_inject(sbi, FAULT_LOCK_OP)) return 0; - } return f2fs_down_read_trylock(&sbi->cp_rwsem); } @@ -2324,7 +2316,6 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, return ret; if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(sbi, FAULT_BLOCK); release = *count; goto release_quota; } @@ -2604,10 +2595,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return err; } - if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(sbi, FAULT_BLOCK); + if (time_to_inject(sbi, FAULT_BLOCK)) goto enospc; - } spin_lock(&sbi->stat_lock); @@ -2731,11 +2720,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { - f2fs_show_injection_info(F2FS_M_SB(mapping), - FAULT_PAGE_ALLOC); + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) return NULL; - } } if (!for_write) @@ -2752,10 +2738,8 @@ static inline struct page *f2fs_pagecache_get_page( struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp_mask) { - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { - f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET); + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) return NULL; - } return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } @@ -2805,10 +2789,8 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, if (nofail) return f2fs_kmem_cache_alloc_nofail(cachep, flags); - if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) { - f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC); + if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) return NULL; - } return kmem_cache_alloc(cachep, flags); } @@ -3382,10 +3364,8 @@ static inline bool is_dot_dotdot(const u8 *name, size_t len) static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { - if (time_to_inject(sbi, FAULT_KMALLOC)) { - f2fs_show_injection_info(sbi, FAULT_KMALLOC); + if (time_to_inject(sbi, FAULT_KMALLOC)) return NULL; - } return kmalloc(size, flags); } @@ -3399,10 +3379,8 @@ static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { - if (time_to_inject(sbi, FAULT_KVMALLOC)) { - f2fs_show_injection_info(sbi, FAULT_KVMALLOC); + if (time_to_inject(sbi, FAULT_KVMALLOC)) return NULL; - } return kvmalloc(size, flags); } @@ -3788,8 +3766,8 @@ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); -void f2fs_submit_bio(struct f2fs_sb_info *sbi, - struct bio *bio, enum page_type type); +void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type); int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, @@ -3808,7 +3786,7 @@ void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); -int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); @@ -3819,9 +3797,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); -void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); -int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, - int create, int flag); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int f2fs_encrypt_one_page(struct f2fs_io_info *fio); @@ -4161,6 +4137,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ +bool sanity_check_extent_cache(struct inode *inode); struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, @@ -4190,6 +4167,8 @@ void f2fs_destroy_extent_cache(void); void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); +bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, + block_t *blkaddr); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, block_t blkaddr, unsigned int len); @@ -4259,7 +4238,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); -int f2fs_init_compress_mempool(void); +int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); void f2fs_end_read_compressed_page(struct page *page, bool failed, @@ -4328,7 +4307,7 @@ static inline struct page *f2fs_compress_control_page(struct page *page) WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } -static inline int f2fs_init_compress_mempool(void) { return 0; } +static inline int __init f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { } @@ -4381,9 +4360,8 @@ static inline int set_compress_context(struct inode *inode) if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) - F2FS_I(inode)->i_compress_flag |= - F2FS_OPTION(sbi).compress_level << - COMPRESS_LEVEL_OFFSET; + F2FS_I(inode)->i_compress_level = + F2FS_OPTION(sbi).compress_level; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b90617639743..15dabeac4690 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -113,10 +113,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (need_alloc) { /* block allocation */ - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_block(&dn, page->index); - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + err = f2fs_get_block_locked(&dn, page->index); } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -305,7 +303,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * for OPU case, during fsync(), node can be persisted before * data when lower device doesn't support write barrier, result * in data corruption after SPO. - * So for strict fsync mode, force to use atomic write sematics + * So for strict fsync mode, force to use atomic write semantics * to keep write order in between data/node and last node to * avoid potential data corruption. */ @@ -619,7 +617,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); - f2fs_update_age_extent_cache_range(dn, fofs, nr_free); + f2fs_update_age_extent_cache_range(dn, fofs, len); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -784,10 +782,8 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); - if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { - f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE); + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) return -EIO; - } err = f2fs_dquot_initialize(inode); if (err) @@ -1112,7 +1108,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return 0; } -static int punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; @@ -1498,6 +1494,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, } f2fs_update_read_extent_cache_range(dn, start, 0, index - start); + f2fs_update_age_extent_cache_range(dn, start, index - start); return ret; } @@ -1684,7 +1681,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int expand_inode_data(struct inode *inode, loff_t offset, +static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1697,7 +1694,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, .err_gc_skipped = true, .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; - loff_t new_size = i_size_read(inode); + loff_t new_size; loff_t off_end; block_t expanded = 0; int err; @@ -1745,7 +1742,7 @@ next_alloc: f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; - err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_DIO); file_dont_truncate(inode); f2fs_up_write(&sbi->pin_sem); @@ -1758,7 +1755,7 @@ next_alloc: map.m_len = expanded; } else { - err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_AIO); expanded = map.m_len; } out_err: @@ -1809,7 +1806,7 @@ static long f2fs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; /* - * Pinned file should not support partial trucation since the block + * Pinned file should not support partial truncation since the block * can be used by applications. */ if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && @@ -1832,7 +1829,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (offset >= inode->i_size) goto out; - ret = punch_hole(inode, offset, len); + ret = f2fs_punch_hole(inode, offset, len); } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { ret = f2fs_collapse_range(inode, offset, len); } else if (mode & FALLOC_FL_ZERO_RANGE) { @@ -1840,7 +1837,7 @@ static long f2fs_fallocate(struct file *file, int mode, } else if (mode & FALLOC_FL_INSERT_RANGE) { ret = f2fs_insert_range(inode, offset, len); } else { - ret = expand_inode_data(inode, offset, len, mode); + ret = f2fs_expand_inode_data(inode, offset, len, mode); } if (!ret) { @@ -1859,14 +1856,17 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { /* - * f2fs_relase_file is called at every close calls. So we should + * f2fs_release_file is called at every close calls. So we should * not drop any inmemory pages by close called by other process. */ if (!(filp->f_mode & FMODE_WRITE) || atomic_read(&inode->i_writecount) != 1) return 0; + inode_lock(inode); f2fs_abort_atomic_write(inode, true); + inode_unlock(inode); + return 0; } @@ -1880,8 +1880,13 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * until all the writers close its file. Since this should be done * before dropping file lock, it needs to do in ->flush. */ - if (F2FS_I(inode)->atomic_write_task == current) + if (F2FS_I(inode)->atomic_write_task == current && + (current->flags & PF_EXITING)) { + inode_lock(inode); f2fs_abort_atomic_write(inode, true); + inode_unlock(inode); + } + return 0; } @@ -2087,19 +2092,28 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) goto out; } - /* Create a COW inode for atomic write */ - pinode = f2fs_iget(inode->i_sb, fi->i_pino); - if (IS_ERR(pinode)) { - f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - ret = PTR_ERR(pinode); - goto out; - } + /* Check if the inode already has a COW inode */ + if (fi->cow_inode == NULL) { + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } - ret = f2fs_get_tmpfile(idmap, pinode, &fi->cow_inode); - iput(pinode); - if (ret) { - f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - goto out; + ret = f2fs_get_tmpfile(idmap, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + + set_inode_flag(fi->cow_inode, FI_COW_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + } else { + /* Reuse the already created COW inode */ + f2fs_do_truncate_blocks(fi->cow_inode, 0, true); } f2fs_write_inode(inode, NULL); @@ -2107,8 +2121,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); - set_inode_flag(fi->cow_inode, FI_COW_FILE); - clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); isize = i_size_read(inode); fi->original_i_size = isize; @@ -2338,6 +2350,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u8 encrypt_pw_salt[16]; int err; if (!f2fs_sb_has_encrypt(sbi)) @@ -2362,12 +2375,14 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) goto out_err; } got_it: - if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, - 16)) - err = -EFAULT; + memcpy(encrypt_pw_salt, sbi->raw_super->encrypt_pw_salt, 16); out_err: f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); + + if (!err && copy_to_user((__u8 __user *)arg, encrypt_pw_salt, 16)) + err = -EFAULT; + return err; } @@ -2524,7 +2539,7 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return __f2fs_ioc_gc_range(filp, &range); } -static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +static int f2fs_ioc_write_checkpoint(struct file *filp) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2606,7 +2621,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -2653,7 +2668,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -3227,7 +3242,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_len = end - map.m_lblk; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; @@ -3238,7 +3253,7 @@ int f2fs_precache_extents(struct inode *inode) return 0; } -static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) +static int f2fs_ioc_precache_extents(struct file *filp) { return f2fs_precache_extents(file_inode(filp)); } @@ -3942,7 +3957,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) goto out; } - if (inode->i_size != 0) { + if (F2FS_HAS_BLOCKS(inode)) { ret = -EFBIG; goto out; } @@ -3995,7 +4010,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) return ret; } -static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) +static int f2fs_ioc_decompress_file(struct file *filp) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -4068,7 +4083,7 @@ out: return ret; } -static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) +static int f2fs_ioc_compress_file(struct file *filp) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -4184,7 +4199,7 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT_RANGE: return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: - return f2fs_ioc_write_checkpoint(filp, arg); + return f2fs_ioc_write_checkpoint(filp); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: @@ -4198,7 +4213,7 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_PIN_FILE: return f2fs_ioc_set_pin_file(filp, arg); case F2FS_IOC_PRECACHE_EXTENTS: - return f2fs_ioc_precache_extents(filp, arg); + return f2fs_ioc_precache_extents(filp); case F2FS_IOC_RESIZE_FS: return f2fs_ioc_resize_fs(filp, arg); case FS_IOC_ENABLE_VERITY: @@ -4224,9 +4239,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_COMPRESS_OPTION: return f2fs_ioc_set_compress_option(filp, arg); case F2FS_IOC_DECOMPRESS_FILE: - return f2fs_ioc_decompress_file(filp, arg); + return f2fs_ioc_decompress_file(filp); case F2FS_IOC_COMPRESS_FILE: - return f2fs_ioc_compress_file(filp, arg); + return f2fs_ioc_compress_file(filp); default: return -ENOTTY; } @@ -4341,6 +4356,27 @@ out: return ret; } +static void f2fs_trace_rw_file_path(struct kiocb *iocb, size_t count, int rw) +{ + struct inode *inode = file_inode(iocb->ki_filp); + char *buf, *path; + + buf = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + if (!buf) + return; + path = dentry_path_raw(file_dentry(iocb->ki_filp), buf, PATH_MAX); + if (IS_ERR(path)) + goto free_buf; + if (rw == WRITE) + trace_f2fs_datawrite_start(inode, iocb->ki_pos, count, + current->pid, path, current->comm); + else + trace_f2fs_dataread_start(inode, iocb->ki_pos, count, + current->pid, path, current->comm); +free_buf: + kfree(buf); +} + static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); @@ -4350,24 +4386,9 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - if (trace_f2fs_dataread_start_enabled()) { - char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); - char *path; - - if (!p) - goto skip_read_trace; + if (trace_f2fs_dataread_start_enabled()) + f2fs_trace_rw_file_path(iocb, iov_iter_count(to), READ); - path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); - if (IS_ERR(path)) { - kfree(p); - goto skip_read_trace; - } - - trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), - current->pid, path, current->comm); - kfree(p); - } -skip_read_trace: if (f2fs_should_use_dio(inode, iocb, to)) { ret = f2fs_dio_read_iter(iocb, to); } else { @@ -4466,7 +4487,7 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, flag = F2FS_GET_BLOCK_PRE_AIO; } - ret = f2fs_map_blocks(inode, &map, 1, flag); + ret = f2fs_map_blocks(inode, &map, flag); /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */ if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) return ret; @@ -4673,24 +4694,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (preallocated < 0) { ret = preallocated; } else { - if (trace_f2fs_datawrite_start_enabled()) { - char *p = f2fs_kmalloc(F2FS_I_SB(inode), - PATH_MAX, GFP_KERNEL); - char *path; - - if (!p) - goto skip_write_trace; - path = dentry_path_raw(file_dentry(iocb->ki_filp), - p, PATH_MAX); - if (IS_ERR(path)) { - kfree(p); - goto skip_write_trace; - } - trace_f2fs_datawrite_start(inode, orig_pos, orig_count, - current->pid, path, current->comm); - kfree(p); - } -skip_write_trace: + if (trace_f2fs_datawrite_start_enabled()) + f2fs_trace_rw_file_path(iocb, orig_count, WRITE); + /* Do the actual write. */ ret = dio ? f2fs_dio_write_iter(iocb, from, &may_need_sync) : @@ -4823,6 +4829,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_MOVE_RANGE: return f2fs_compat_ioc_move_range(file, arg); case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_START_ATOMIC_REPLACE: case F2FS_IOC_COMMIT_ATOMIC_WRITE: case F2FS_IOC_START_VOLATILE_WRITE: case F2FS_IOC_RELEASE_VOLATILE_WRITE: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6e2cae3d2e71..0a9dfa459860 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -57,7 +57,7 @@ static int gc_thread_func(void *data) /* give it a try one time */ if (gc_th->gc_wake) - gc_th->gc_wake = 0; + gc_th->gc_wake = false; if (try_to_freeze()) { stat_other_skip_bggc_count(sbi); @@ -72,11 +72,9 @@ static int gc_thread_func(void *data) continue; } - if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); + if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); - } if (!sb_start_write_trylock(sbi->sb)) { stat_other_skip_bggc_count(sbi); @@ -185,7 +183,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_wake = 0; + gc_th->gc_wake = false; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -1150,7 +1148,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1159,8 +1156,8 @@ static int ra_data_block(struct inode *inode, pgoff_t index) .op = REQ_OP_READ, .op_flags = 0, .encrypted_page = NULL, - .in_list = false, - .retry = false, + .in_list = 0, + .retry = 0, }; int err; @@ -1168,8 +1165,8 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; + if (f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; @@ -1248,8 +1245,8 @@ static int move_data_block(struct inode *inode, block_t bidx, .op = REQ_OP_READ, .op_flags = 0, .encrypted_page = NULL, - .in_list = false, - .retry = false, + .in_list = 0, + .retry = 0, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -1365,7 +1362,6 @@ static int move_data_block(struct inode *inode, block_t bidx, dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); - ClearPageError(page); fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 19b956c2d697..15bd1d680f67 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -41,7 +41,7 @@ struct f2fs_gc_kthread { unsigned int no_gc_sleep_time; /* for changing gc mode */ - unsigned int gc_wake; + bool gc_wake; /* for GC_MERGE mount option */ wait_queue_head_t fggc_wq; /* diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 21a495234ffd..72269e7efd26 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -174,7 +174,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); - ClearPageError(page); fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); f2fs_outplace_write_data(dn, &fio); @@ -422,18 +421,17 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, dentry_blk = page_address(page); + /* + * Start by zeroing the full block, to ensure that all unused space is + * zeroed and no uninitialized memory is leaked to disk. + */ + memset(dentry_blk, 0, F2FS_BLKSIZE); + make_dentry_ptr_inline(dir, &src, inline_dentry); make_dentry_ptr_block(dir, &dst, dentry_blk); /* copy data from inline dentry block to new dentry block */ memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); - memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); - /* - * we do not need to zero out remainder part of dentry and filename - * field, since we have used bitmap for marking the usage status of - * them, besides, we can also ignore copying/zeroing reserved space - * of dentry block, because them haven't been used so far. - */ memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ff6cf66ed46b..7d2e2c0dba65 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -262,22 +262,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (fi->extent_tree[EX_READ]) { - struct extent_info *ei = &fi->extent_tree[EX_READ]->largest; - - if (ei->len && - (!f2fs_is_valid_blkaddr(sbi, ei->blk, - DATA_GENERIC_ENHANCE) || - !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, - DATA_GENERIC_ENHANCE))) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", - __func__, inode->i_ino, - ei->blk, ei->fofs, ei->len); - return false; - } - } - if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", @@ -413,12 +397,6 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } - if (!sanity_check_inode(inode, node_page)) { - f2fs_put_page(node_page, 1); - f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); - return -EFSCORRUPTED; - } - /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -466,11 +444,17 @@ static int do_read_inode(struct inode *inode) (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { + unsigned short compress_flag; + atomic_set(&fi->i_compr_blocks, le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; - fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag); + compress_flag = le16_to_cpu(ri->i_compress_flag); + fi->i_compress_level = compress_flag >> + COMPRESS_LEVEL_OFFSET; + fi->i_compress_flag = compress_flag & + (BIT(COMPRESS_LEVEL_OFFSET) - 1); fi->i_cluster_size = 1 << fi->i_log_cluster_size; set_inode_flag(inode, FI_COMPRESSED_FILE); } @@ -482,6 +466,18 @@ static int do_read_inode(struct inode *inode) f2fs_init_read_extent_tree(inode, node_page); f2fs_init_age_extent_tree(inode); + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + + if (!sanity_check_extent_cache(inode)) { + f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -686,13 +682,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_log_cluster_size)) { + unsigned short compress_flag; + ri->i_compr_blocks = cpu_to_le64(atomic_read( &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; - ri->i_compress_flag = - cpu_to_le16(F2FS_I(inode)->i_compress_flag); + compress_flag = F2FS_I(inode)->i_compress_flag | + F2FS_I(inode)->i_compress_level << + COMPRESS_LEVEL_OFFSET; + ri->i_compress_flag = cpu_to_le16(compress_flag); ri->i_log_cluster_size = F2FS_I(inode)->i_log_cluster_size; } @@ -714,18 +714,19 @@ void f2fs_update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int count = 0; retry: node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); - if (err == -ENOMEM) { - cond_resched(); + /* The node block was truncated. */ + if (err == -ENOENT) + return; + + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false, - STOP_CP_REASON_UPDATE_INODE); - } + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); return; } f2fs_update_inode(inode, node_page); @@ -766,11 +767,18 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - nid_t xnid = F2FS_I(inode)->i_xattr_nid; + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t xnid = fi->i_xattr_nid; int err = 0; f2fs_abort_atomic_write(inode, true); + if (fi->cow_inode) { + clear_inode_flag(fi->cow_inode, FI_COW_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + } + trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -809,10 +817,8 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); - if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(sbi, FAULT_EVICT_INODE); + if (time_to_inject(sbi, FAULT_EVICT_INODE)) err = -EIO; - } if (!err) { f2fs_lock_op(sbi); @@ -857,7 +863,7 @@ no_delete: stat_dec_inline_inode(inode); stat_dec_compr_inode(inode); stat_sub_compr_blocks(inode, - atomic_read(&F2FS_I(inode)->i_compr_blocks)); + atomic_read(&fi->i_compr_blocks)); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index 3166a8939ed4..3d5bfb1ad585 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -14,91 +14,79 @@ #include "iostat.h" #include <trace/events/f2fs.h> -#define NUM_PREALLOC_IOSTAT_CTXS 128 static struct kmem_cache *bio_iostat_ctx_cache; static mempool_t *bio_iostat_ctx_pool; +static inline unsigned long long iostat_get_avg_bytes(struct f2fs_sb_info *sbi, + enum iostat_type type) +{ + return sbi->iostat_count[type] ? div64_u64(sbi->iostat_bytes[type], + sbi->iostat_count[type]) : 0; +} + +#define IOSTAT_INFO_SHOW(name, type) \ + seq_printf(seq, "%-23s %-16llu %-16llu %-16llu\n", \ + name":", sbi->iostat_bytes[type], \ + sbi->iostat_count[type], \ + iostat_get_avg_bytes(sbi, type)) + int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); - time64_t now = ktime_get_real_seconds(); if (!sbi->iostat_enable) return 0; - seq_printf(seq, "time: %-16llu\n", now); + seq_printf(seq, "time: %-16llu\n", ktime_get_real_seconds()); + seq_printf(seq, "\t\t\t%-16s %-16s %-16s\n", + "io_bytes", "count", "avg_bytes"); /* print app write IOs */ seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered data: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct data: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped data: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_IO]); - seq_printf(seq, "app buffered cdata: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); - seq_printf(seq, "app mapped cdata: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_CDATA_IO]); + IOSTAT_INFO_SHOW("app buffered data", APP_BUFFERED_IO); + IOSTAT_INFO_SHOW("app direct data", APP_DIRECT_IO); + IOSTAT_INFO_SHOW("app mapped data", APP_MAPPED_IO); + IOSTAT_INFO_SHOW("app buffered cdata", APP_BUFFERED_CDATA_IO); + IOSTAT_INFO_SHOW("app mapped cdata", APP_MAPPED_CDATA_IO); /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs cdata: %-16llu\n", - sbi->rw_iostat[FS_CDATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", - sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", - sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", - sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", - sbi->rw_iostat[FS_CP_META_IO]); + IOSTAT_INFO_SHOW("fs data", FS_DATA_IO); + IOSTAT_INFO_SHOW("fs cdata", FS_CDATA_IO); + IOSTAT_INFO_SHOW("fs node", FS_NODE_IO); + IOSTAT_INFO_SHOW("fs meta", FS_META_IO); + IOSTAT_INFO_SHOW("fs gc data", FS_GC_DATA_IO); + IOSTAT_INFO_SHOW("fs gc node", FS_GC_NODE_IO); + IOSTAT_INFO_SHOW("fs cp data", FS_CP_DATA_IO); + IOSTAT_INFO_SHOW("fs cp node", FS_CP_NODE_IO); + IOSTAT_INFO_SHOW("fs cp meta", FS_CP_META_IO); /* print app read IOs */ seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered data: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct data: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped data: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_READ_IO]); - seq_printf(seq, "app buffered cdata: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); - seq_printf(seq, "app mapped cdata: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); + IOSTAT_INFO_SHOW("app buffered data", APP_BUFFERED_READ_IO); + IOSTAT_INFO_SHOW("app direct data", APP_DIRECT_READ_IO); + IOSTAT_INFO_SHOW("app mapped data", APP_MAPPED_READ_IO); + IOSTAT_INFO_SHOW("app buffered cdata", APP_BUFFERED_CDATA_READ_IO); + IOSTAT_INFO_SHOW("app mapped cdata", APP_MAPPED_CDATA_READ_IO); /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs cdata: %-16llu\n", - sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_READ_IO]); + IOSTAT_INFO_SHOW("fs data", FS_DATA_READ_IO); + IOSTAT_INFO_SHOW("fs gc data", FS_GDATA_READ_IO); + IOSTAT_INFO_SHOW("fs cdata", FS_CDATA_READ_IO); + IOSTAT_INFO_SHOW("fs node", FS_NODE_READ_IO); + IOSTAT_INFO_SHOW("fs meta", FS_META_READ_IO); /* print other IOs */ seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", - sbi->rw_iostat[FS_DISCARD]); + IOSTAT_INFO_SHOW("fs discard", FS_DISCARD_IO); + IOSTAT_INFO_SHOW("fs flush", FS_FLUSH_IO); return 0; } static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) { - int io, idx = 0; - unsigned int cnt; + int io, idx; struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; unsigned long flags; @@ -106,12 +94,11 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) spin_lock_irqsave(&sbi->iostat_lat_lock, flags); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { - cnt = io_lat->bio_cnt[idx][io]; iostat_lat[idx][io].peak_lat = jiffies_to_msecs(io_lat->peak_lat[idx][io]); - iostat_lat[idx][io].cnt = cnt; - iostat_lat[idx][io].avg_lat = cnt ? - jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0; + iostat_lat[idx][io].cnt = io_lat->bio_cnt[idx][io]; + iostat_lat[idx][io].avg_lat = iostat_lat[idx][io].cnt ? + jiffies_to_msecs(io_lat->sum_lat[idx][io]) / iostat_lat[idx][io].cnt : 0; io_lat->sum_lat[idx][io] = 0; io_lat->peak_lat[idx][io] = 0; io_lat->bio_cnt[idx][io] = 0; @@ -141,9 +128,9 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) msecs_to_jiffies(sbi->iostat_period_ms); for (i = 0; i < NR_IO_TYPE; i++) { - iostat_diff[i] = sbi->rw_iostat[i] - - sbi->prev_rw_iostat[i]; - sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; + iostat_diff[i] = sbi->iostat_bytes[i] - + sbi->prev_iostat_bytes[i]; + sbi->prev_iostat_bytes[i] = sbi->iostat_bytes[i]; } spin_unlock_irqrestore(&sbi->iostat_lock, flags); @@ -159,8 +146,9 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) spin_lock_irq(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { - sbi->rw_iostat[i] = 0; - sbi->prev_rw_iostat[i] = 0; + sbi->iostat_count[i] = 0; + sbi->iostat_bytes[i] = 0; + sbi->prev_iostat_bytes[i] = 0; } spin_unlock_irq(&sbi->iostat_lock); @@ -169,6 +157,13 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) spin_unlock_irq(&sbi->iostat_lat_lock); } +static inline void __f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + sbi->iostat_bytes[type] += io_bytes; + sbi->iostat_count[type]++; +} + void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) { @@ -178,33 +173,33 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, return; spin_lock_irqsave(&sbi->iostat_lock, flags); - sbi->rw_iostat[type] += io_bytes; + __f2fs_update_iostat(sbi, type, io_bytes); if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) - sbi->rw_iostat[APP_WRITE_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_WRITE_IO, io_bytes); if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_READ_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_READ_IO, io_bytes); #ifdef CONFIG_F2FS_FS_COMPRESSION if (inode && f2fs_compressed_file(inode)) { if (type == APP_BUFFERED_IO) - sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_BUFFERED_CDATA_IO, io_bytes); if (type == APP_BUFFERED_READ_IO) - sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_BUFFERED_CDATA_READ_IO, io_bytes); if (type == APP_MAPPED_READ_IO) - sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_MAPPED_CDATA_READ_IO, io_bytes); if (type == APP_MAPPED_IO) - sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_MAPPED_CDATA_IO, io_bytes); if (type == FS_DATA_READ_IO) - sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + __f2fs_update_iostat(sbi, FS_CDATA_READ_IO, io_bytes); if (type == FS_DATA_IO) - sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + __f2fs_update_iostat(sbi, FS_CDATA_IO, io_bytes); } #endif @@ -214,49 +209,48 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, } static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, - int rw, bool is_sync) + enum iostat_lat_type lat_type) { unsigned long ts_diff; - unsigned int iotype = iostat_ctx->type; + unsigned int page_type = iostat_ctx->type; struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; - int idx; unsigned long flags; if (!sbi->iostat_enable) return; ts_diff = jiffies - iostat_ctx->submit_ts; - if (iotype >= META_FLUSH) - iotype = META; - - if (rw == 0) { - idx = READ_IO; - } else { - if (is_sync) - idx = WRITE_SYNC_IO; - else - idx = WRITE_ASYNC_IO; + if (page_type == META_FLUSH) { + page_type = META; + } else if (page_type >= NR_PAGE_TYPE) { + f2fs_warn(sbi, "%s: %d over NR_PAGE_TYPE", __func__, page_type); + return; } spin_lock_irqsave(&sbi->iostat_lat_lock, flags); - io_lat->sum_lat[idx][iotype] += ts_diff; - io_lat->bio_cnt[idx][iotype]++; - if (ts_diff > io_lat->peak_lat[idx][iotype]) - io_lat->peak_lat[idx][iotype] = ts_diff; + io_lat->sum_lat[lat_type][page_type] += ts_diff; + io_lat->bio_cnt[lat_type][page_type]++; + if (ts_diff > io_lat->peak_lat[lat_type][page_type]) + io_lat->peak_lat[lat_type][page_type] = ts_diff; spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); } -void iostat_update_and_unbind_ctx(struct bio *bio, int rw) +void iostat_update_and_unbind_ctx(struct bio *bio) { struct bio_iostat_ctx *iostat_ctx = bio->bi_private; - bool is_sync = bio->bi_opf & REQ_SYNC; + enum iostat_lat_type lat_type; - if (rw == 0) - bio->bi_private = iostat_ctx->post_read_ctx; - else + if (op_is_write(bio_op(bio))) { + lat_type = bio->bi_opf & REQ_SYNC ? + WRITE_SYNC_IO : WRITE_ASYNC_IO; bio->bi_private = iostat_ctx->sbi; - __update_iostat_latency(iostat_ctx, rw, is_sync); + } else { + lat_type = READ_IO; + bio->bi_private = iostat_ctx->post_read_ctx; + } + + __update_iostat_latency(iostat_ctx, lat_type); mempool_free(iostat_ctx, bio_iostat_ctx_pool); } diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 2c048307b6e0..eb99d05cf272 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -8,20 +8,21 @@ struct bio_post_read_ctx; +enum iostat_lat_type { + READ_IO = 0, + WRITE_SYNC_IO, + WRITE_ASYNC_IO, + MAX_IO_TYPE, +}; + #ifdef CONFIG_F2FS_IOSTAT +#define NUM_PREALLOC_IOSTAT_CTXS 128 #define DEFAULT_IOSTAT_PERIOD_MS 3000 #define MIN_IOSTAT_PERIOD_MS 100 /* maximum period of iostat tracing is 1 day */ #define MAX_IOSTAT_PERIOD_MS 8640000 -enum { - READ_IO, - WRITE_SYNC_IO, - WRITE_ASYNC_IO, - MAX_IO_TYPE, -}; - struct iostat_lat_info { unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */ unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */ @@ -57,7 +58,7 @@ static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) return iostat_ctx->post_read_ctx; } -extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw); +extern void iostat_update_and_unbind_ctx(struct bio *bio); extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, struct bio *bio, struct bio_post_read_ctx *ctx); extern int f2fs_init_iostat_processing(void); @@ -67,7 +68,7 @@ extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) {} -static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} +static inline void iostat_update_and_unbind_ctx(struct bio *bio) {} static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, struct bio *bio, struct bio_post_read_ctx *ctx) {} static inline void iostat_update_submit_ctx(struct bio *bio, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d8e01bbbf27f..11fc4c8036a9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -926,9 +926,6 @@ static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, static int f2fs_create_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct inode **whiteout) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) - return -EIO; - return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE, true, whiteout); } @@ -966,7 +963,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, /* * If new_inode is null, the below renaming flow will - * add a link in old_dir which can conver inline_dir. + * add a link in old_dir which can convert inline_dir. * After then, if we failed to get the entry due to other * reasons like ENOMEM, we had to remove the new entry. * Instead of adding such the error handling routine, let's diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cf997356d9f9..bd1dad523796 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1587,7 +1587,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, - .submitted = false, + .submitted = 0, .io_type = io_type, .io_wbc = wbc, }; @@ -1651,7 +1651,6 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, } set_page_writeback(page); - ClearPageError(page); fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); @@ -2083,8 +2082,6 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); f2fs_wait_on_page_writeback(page, NODE, true, false); - if (TestClearPageError(page)) - ret = -EIO; put_page(page); @@ -2548,10 +2545,8 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: - if (time_to_inject(sbi, FAULT_ALLOC_NID)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_NID); + if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; - } spin_lock(&nm_i->nid_list_lock); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ae3c4e5474ef..227e25836173 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -192,18 +192,18 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (!f2fs_is_atomic_file(inode)) return; - clear_inode_flag(fi->cow_inode, FI_COW_FILE); - iput(fi->cow_inode); - fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); + F2FS_I(inode)->atomic_write_task = NULL; + if (clean) { truncate_inode_pages_final(inode->i_mapping); f2fs_i_size_write(inode, fi->original_i_size); + fi->original_i_size = 0; } } @@ -255,6 +255,9 @@ retry: } f2fs_put_dnode(&dn); + + trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, + index, *old_addr, new_addr, recover); return 0; } @@ -262,19 +265,24 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; + pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) + if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + } else if (truncate) { + f2fs_truncate_hole(inode, start_index, cur->index); + start_index = cur->index + 1; + } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) - f2fs_do_truncate_blocks(inode, 0, false); + f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) @@ -384,10 +392,8 @@ int f2fs_commit_atomic_write(struct inode *inode) */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { - if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); + if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); - } /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) @@ -508,6 +514,8 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); + if (!ret) + f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0); return ret; } @@ -1059,7 +1067,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->granularity = granularity; dpolicy->max_requests = dcc->max_discard_request; - dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware_gran = dcc->discard_io_aware_gran; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { @@ -1095,9 +1103,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, block_t start, block_t len); /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy, - struct discard_cmd *dc, - unsigned int *issued) + struct discard_policy *dpolicy, + struct discard_cmd *dc, int *issued) { struct block_device *bdev = dc->bdev; unsigned int max_discard_blocks = @@ -1141,7 +1148,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { - f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; } else { err = __blkdev_issue_discard(bdev, @@ -1186,7 +1192,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE); lstart += len; start += len; @@ -1378,8 +1384,8 @@ static void __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } -static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy) +static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; @@ -1387,7 +1393,6 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct blk_plug plug; unsigned int pos = dcc->next_pos; - unsigned int issued = 0; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); @@ -1414,9 +1419,9 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, } dcc->next_pos = dc->lstart + dc->len; - err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + err = __submit_discard_cmd(sbi, dpolicy, dc, issued); - if (issued >= dpolicy->max_requests) + if (*issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); @@ -1432,10 +1437,8 @@ next: mutex_unlock(&dcc->cmd_lock); - if (!issued && io_interrupted) - issued = -1; - - return issued; + if (!(*issued) && io_interrupted) + *issued = -1; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy); @@ -1463,8 +1466,10 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) - return __issue_discard_cmd_orderly(sbi, dpolicy); + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) { + __issue_discard_cmd_orderly(sbi, dpolicy, &issued); + return issued; + } pend_list = &dcc->pend_list[i]; @@ -1609,9 +1614,9 @@ static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ - __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, MIN_DISCARD_GRANULARITY); discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); - __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY); discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); return discard_blks; @@ -1653,7 +1658,14 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super */ +/** + * f2fs_issue_discard_timeout() - Issue all discard cmd within UMOUNT_DISCARD_TIMEOUT + * @sbi: the f2fs_sb_info data for discard cmd to issue + * + * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped + * + * Return true if issued all discard cmd or no discard cmd need issue, otherwise return false. + */ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1661,7 +1673,7 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) bool dropped; if (!atomic_read(&dcc->discard_cmd_cnt)) - return false; + return true; __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); @@ -1672,7 +1684,7 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) __wait_all_discard_cmd(sbi, NULL); f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); - return dropped; + return !dropped; } static int issue_discard_thread(void *data) @@ -1694,13 +1706,14 @@ static int issue_discard_thread(void *data) if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) - __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, + MIN_DISCARD_GRANULARITY); else __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); if (dcc->discard_wake) - dcc->discard_wake = 0; + dcc->discard_wake = false; /* clean up pending candidates before going to sleep */ if (atomic_read(&dcc->queued_discard)) @@ -2065,6 +2078,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; + dcc->discard_io_aware_gran = MAX_PLIST_NUM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) @@ -2327,17 +2341,13 @@ bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) return is_cp; } -/* - * This function should be resided under the curseg_mutex lock - */ -static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, - struct f2fs_summary *sum) +static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - void *addr = curseg->sum_blk; - addr += curseg->next_blkoff * sizeof(struct f2fs_summary); - memcpy(addr, sum, sizeof(struct f2fs_summary)); + if (sbi->ckpt->alloc_type[type] == SSR) + return sbi->blocks_per_seg; + return curseg->next_blkoff; } /* @@ -2349,15 +2359,11 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - if (sbi->ckpt->alloc_type[i] == SSR) - valid_sum_count += sbi->blocks_per_seg; - else { - if (for_ra) - valid_sum_count += le16_to_cpu( - F2FS_CKPT(sbi)->cur_data_blkoff[i]); - else - valid_sum_count += curseg_blkoff(sbi, i); - } + if (sbi->ckpt->alloc_type[i] != SSR && for_ra) + valid_sum_count += + le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]); + else + valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - @@ -2628,30 +2634,10 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi, return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); } -/* - * If a segment is written by LFS manner, next block offset is just obtained - * by increasing the current block offset. However, if a segment is written by - * SSR manner, next block offset obtained by calling __next_free_blkoff - */ -static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg) +static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, + struct curseg_info *seg) { - if (seg->alloc_type == SSR) { - seg->next_blkoff = - __next_free_blkoff(sbi, seg->segno, - seg->next_blkoff + 1); - } else { - seg->next_blkoff++; - if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) { - /* To allocate block chunks in different sizes, use random number */ - if (--seg->fragment_remained_chunk <= 0) { - seg->fragment_remained_chunk = - get_random_u32_inclusive(1, sbi->max_fragment_chunk); - seg->next_blkoff += - get_random_u32_inclusive(1, sbi->max_fragment_hole); - } - } - } + return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) @@ -2909,33 +2895,23 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; - if (!curseg->inited) - goto alloc; - - if (force || curseg->next_blkoff || - get_valid_blocks(sbi, curseg->segno, new_sec)) - goto alloc; - - if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + if (!force && curseg->inited && + !curseg->next_blkoff && + !get_valid_blocks(sbi, curseg->segno, new_sec) && + !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) return; -alloc: + old_segno = curseg->segno; new_curseg(sbi, type, true); stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } -static void __allocate_new_section(struct f2fs_sb_info *sbi, - int type, bool force) -{ - __allocate_new_segment(sbi, type, true, force); -} - void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_section(sbi, type, force); + __allocate_new_segment(sbi, type, true, force); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); } @@ -3113,13 +3089,6 @@ out: return err; } -static bool __has_curseg_space(struct f2fs_sb_info *sbi, - struct curseg_info *curseg) -{ - return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi, - curseg->segno); -} - int f2fs_rw_hint_to_seg_type(enum rw_hint hint) { switch (hint) { @@ -3238,6 +3207,19 @@ static int __get_segment_type(struct f2fs_io_info *fio) return type; } +static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, + struct curseg_info *seg) +{ + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk > 0) + return; + + seg->fragment_remained_chunk = + get_random_u32_inclusive(1, sbi->max_fragment_chunk); + seg->next_blkoff += + get_random_u32_inclusive(1, sbi->max_fragment_hole); +} + void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, @@ -3248,6 +3230,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, unsigned long long old_mtime; bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; + bool segment_full = false; f2fs_down_read(&SM_I(sbi)->curseg_lock); @@ -3266,15 +3249,16 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, f2fs_wait_discard_bio(sbi, *new_blkaddr); - /* - * __add_sum_entry should be resided under the curseg_mutex - * because, this function updates a summary entry in the - * current summary block. - */ - __add_sum_entry(sbi, type, sum); - - __refresh_next_blkoff(sbi, curseg); - + curseg->sum_blk->entries[curseg->next_blkoff] = *sum; + if (curseg->alloc_type == SSR) { + curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); + } else { + curseg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + f2fs_randomize_chunk(sbi, curseg); + } + if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno)) + segment_full = true; stat_inc_block_count(sbi, curseg); if (from_gc) { @@ -3293,10 +3277,11 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); - if (!__has_curseg_space(sbi, curseg)) { - /* - * Flush out current segment and replace it with new segment. - */ + /* + * If the current segment is full, flush it out and replace it with a + * new segment. + */ + if (segment_full) { if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); @@ -3331,10 +3316,10 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_bio_info *io; if (F2FS_IO_ALIGNED(sbi)) - fio->retry = false; + fio->retry = 0; INIT_LIST_HEAD(&fio->list); - fio->in_list = true; + fio->in_list = 1; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); @@ -3415,14 +3400,13 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, - .in_list = false, + .in_list = 0, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); - ClearPageError(page); f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); @@ -3487,7 +3471,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); - if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) + if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi)) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); @@ -3576,7 +3560,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); - __add_sum_entry(sbi, type, sum); + curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (!recover_curseg || recover_newaddr) { if (!from_gc) @@ -3634,7 +3618,7 @@ void f2fs_wait_on_page_writeback(struct page *page, /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); - /* sbumit cached IPU IO */ + /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, page); if (ordered) { wait_on_page_writeback(page); @@ -3885,15 +3869,8 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - unsigned short blkoff; - seg_i = CURSEG_I(sbi, i); - if (sbi->ckpt->alloc_type[i] == SSR) - blkoff = sbi->blocks_per_seg; - else - blkoff = curseg_blkoff(sbi, i); - - for (j = 0; j < blkoff; j++) { + for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) { if (!page) { page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); @@ -5126,7 +5103,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; if (!f2fs_lfs_mode(sbi)) - sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC); sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_seq_blocks = sbi->blocks_per_seg; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3ad1b7b6fa94..efdb7fc3b797 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -670,6 +670,9 @@ static inline int utilization(struct f2fs_sb_info *sbi) #define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ +#define F2FS_IPU_DISABLE 0 + +/* Modification on enum should be synchronized with ipu_mode_names array */ enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, @@ -679,8 +682,29 @@ enum { F2FS_IPU_ASYNC, F2FS_IPU_NOCACHE, F2FS_IPU_HONOR_OPU_WRITE, + F2FS_IPU_MAX, }; +static inline bool IS_F2FS_IPU_DISABLE(struct f2fs_sb_info *sbi) +{ + return SM_I(sbi)->ipu_policy == F2FS_IPU_DISABLE; +} + +#define F2FS_IPU_POLICY(name) \ +static inline bool IS_##name(struct f2fs_sb_info *sbi) \ +{ \ + return SM_I(sbi)->ipu_policy & BIT(name); \ +} + +F2FS_IPU_POLICY(F2FS_IPU_FORCE); +F2FS_IPU_POLICY(F2FS_IPU_SSR); +F2FS_IPU_POLICY(F2FS_IPU_UTIL); +F2FS_IPU_POLICY(F2FS_IPU_SSR_UTIL); +F2FS_IPU_POLICY(F2FS_IPU_FSYNC); +F2FS_IPU_POLICY(F2FS_IPU_ASYNC); +F2FS_IPU_POLICY(F2FS_IPU_NOCACHE); +F2FS_IPU_POLICY(F2FS_IPU_HONOR_OPU_WRITE); + static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { @@ -695,15 +719,10 @@ static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, return curseg->alloc_type; } -static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - return curseg->next_blkoff; -} - -static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +static inline bool valid_main_segno(struct f2fs_sb_info *sbi, + unsigned int segno) { - f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); + return segno <= (MAIN_SEGS(sbi) - 1); } static inline void verify_fio_blkaddr(struct f2fs_io_info *fio) @@ -758,7 +777,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, /* check segment usage, and check boundary of a given segment number */ if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg - || segno > TOTAL_SEGS(sbi) - 1)) { + || !valid_main_segno(sbi, segno))) { f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -775,7 +794,7 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int offset = SIT_BLOCK_OFFSET(start); block_t blk_addr = sit_i->sit_base_addr + offset; - check_seg_range(sbi, start); + f2fs_bug_on(sbi, !valid_main_segno(sbi, start)); #ifdef CONFIG_F2FS_CHECK_FS if (f2fs_test_bit(offset, sit_i->sit_bitmap) != @@ -924,6 +943,6 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: - dcc->discard_wake = 1; + dcc->discard_wake = true; wake_up_interruptible_all(&dcc->discard_wait_queue); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 64d3556d61a5..fbaaabbcd6de 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1288,19 +1288,18 @@ default_check: * zone alignment optimization. This is optional for host-aware * devices, but mandatory for host-managed zoned block devices. */ -#ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device support is not enabled"); - return -EINVAL; - } -#endif if (f2fs_sb_has_blkzoned(sbi)) { +#ifdef CONFIG_BLK_DEV_ZONED if (F2FS_OPTION(sbi).discard_unit != DISCARD_UNIT_SECTION) { f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; } +#else + f2fs_err(sbi, "Zoned block device support is not enabled"); + return -EINVAL; +#endif } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -1341,12 +1340,12 @@ default_check: } if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS not compatible with checkpoint=disable"); + f2fs_err(sbi, "LFS is not compatible with checkpoint=disable"); return -EINVAL; } if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS not compatible with ATGC"); + f2fs_err(sbi, "LFS is not compatible with ATGC"); return -EINVAL; } @@ -1366,10 +1365,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) { - f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC); + if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) return NULL; - } fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) @@ -1424,8 +1421,6 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - f2fs_abort_atomic_write(inode, true); - /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1543,7 +1538,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; - bool dropped; + bool done; /* unregister procfs/sysfs entries in advance to avoid race case */ f2fs_unregister_sysfs(sbi); @@ -1573,9 +1568,8 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - dropped = f2fs_issue_discard_timeout(sbi); - - if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { + done = f2fs_issue_discard_timeout(sbi); + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && done) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -1900,15 +1894,24 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, GC_MERGE)) seq_puts(seq, ",gc_merge"); + else + seq_puts(seq, ",nogc_merge"); if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, NORECOVERY)) seq_puts(seq, ",norecovery"); - if (test_opt(sbi, DISCARD)) + if (test_opt(sbi, DISCARD)) { seq_puts(seq, ",discard"); - else + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) + seq_printf(seq, ",discard_unit=%s", "block"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + seq_printf(seq, ",discard_unit=%s", "segment"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + seq_printf(seq, ",discard_unit=%s", "section"); + } else { seq_puts(seq, ",nodiscard"); + } if (test_opt(sbi, NOHEAP)) seq_puts(seq, ",no_heap"); else @@ -2032,13 +2035,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, ATGC)) seq_puts(seq, ",atgc"); - if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) - seq_printf(seq, ",discard_unit=%s", "block"); - else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) - seq_printf(seq, ",discard_unit=%s", "segment"); - else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) - seq_printf(seq, ",discard_unit=%s", "section"); - if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) seq_printf(seq, ",memory=%s", "normal"); else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) @@ -2300,6 +2296,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif + if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "LFS is not compatible with IPU"); + goto restore_opts; + } + /* disallow enable atgc dynamically */ if (no_atgc == !!test_opt(sbi, ATGC)) { err = -EINVAL; @@ -2589,10 +2591,8 @@ retry: int f2fs_dquot_initialize(struct inode *inode) { - if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) { - f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT); + if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) return -ESRCH; - } return dquot_initialize(inode); } @@ -4083,8 +4083,9 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) if (f2fs_block_unit_discard(sbi)) SM_I(sbi)->dcc_info->discard_granularity = MIN_DISCARD_GRANULARITY; - SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | - 1 << F2FS_IPU_HONOR_OPU_WRITE; + if (!f2fs_lfs_mode(sbi)) + SM_I(sbi)->ipu_policy = BIT(F2FS_IPU_FORCE) | + BIT(F2FS_IPU_HONOR_OPU_WRITE); } sbi->readdir_ra = true; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 83a366f3ee80..0b19163c90d4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -473,6 +473,17 @@ out: return count; } + if (!strcmp(a->attr.name, "discard_io_aware_gran")) { + if (t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + if (t == *ui) + return count; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "discard_granularity")) { if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; @@ -511,7 +522,7 @@ out: } else if (t == 1) { sbi->gc_mode = GC_URGENT_HIGH; if (sbi->gc_thread) { - sbi->gc_thread->gc_wake = 1; + sbi->gc_thread->gc_wake = true; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); wake_up_discard_thread(sbi, true); @@ -521,7 +532,7 @@ out: } else if (t == 3) { sbi->gc_mode = GC_URGENT_MID; if (sbi->gc_thread) { - sbi->gc_thread->gc_wake = 1; + sbi->gc_thread->gc_wake = true; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); } @@ -678,7 +689,16 @@ out: } if (!strcmp(a->attr.name, "warm_data_age_threshold")) { - if (t == 0 || t <= sbi->hot_data_age_threshold) + if (t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "last_age_weight")) { + if (t > 100) return -EINVAL; if (t == *ui) return count; @@ -686,6 +706,15 @@ out: return count; } + if (!strcmp(a->attr.name, "ipu_policy")) { + if (t >= BIT(F2FS_IPU_MAX)) + return -EINVAL; + if (t && f2fs_lfs_mode(sbi)) + return -EINVAL; + SM_I(sbi)->ipu_policy = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -825,6 +854,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_io_aware_gran, discard_io_aware_gran); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); @@ -944,6 +974,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block) /* For block age extent cache */ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, last_age_weight, last_age_weight); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -960,6 +991,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_io_aware_gran), ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), @@ -1042,6 +1074,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(revoked_atomic_block), ATTR_LIST(hot_data_age_threshold), ATTR_LIST(warm_data_age_threshold), + ATTR_LIST(last_age_weight), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -1129,13 +1162,13 @@ static const struct sysfs_ops f2fs_attr_ops = { .store = f2fs_attr_store, }; -static struct kobj_type f2fs_sb_ktype = { +static const struct kobj_type f2fs_sb_ktype = { .default_groups = f2fs_groups, .sysfs_ops = &f2fs_attr_ops, .release = f2fs_sb_release, }; -static struct kobj_type f2fs_ktype = { +static const struct kobj_type f2fs_ktype = { .sysfs_ops = &f2fs_attr_ops, }; @@ -1143,7 +1176,7 @@ static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; -static struct kobj_type f2fs_feat_ktype = { +static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, .sysfs_ops = &f2fs_attr_ops, }; @@ -1184,7 +1217,7 @@ static const struct sysfs_ops f2fs_stat_attr_ops = { .store = f2fs_stat_attr_store, }; -static struct kobj_type f2fs_stat_ktype = { +static const struct kobj_type f2fs_stat_ktype = { .default_groups = f2fs_stat_groups, .sysfs_ops = &f2fs_stat_attr_ops, .release = f2fs_stat_kobj_release, @@ -1211,7 +1244,7 @@ static const struct sysfs_ops f2fs_feature_list_attr_ops = { .show = f2fs_sb_feat_attr_show, }; -static struct kobj_type f2fs_feature_list_ktype = { +static const struct kobj_type f2fs_feature_list_ktype = { .default_groups = f2fs_sb_feat_groups, .sysfs_ops = &f2fs_feature_list_attr_ops, .release = f2fs_feature_list_kobj_release, diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index f320ed8172ec..4fc95f353a7a 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -81,7 +81,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *fsdata; + void *fsdata = NULL; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 122ed89ebf9f..1986b4f18a90 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -295,11 +295,11 @@ static void hfsplus_put_super(struct super_block *sb) hfsplus_sync_fs(sb, 1); } + iput(sbi->alloc_file); + iput(sbi->hidden_dir); hfs_btree_close(sbi->attr_tree); hfs_btree_close(sbi->cat_tree); hfs_btree_close(sbi->ext_tree); - iput(sbi->alloc_file); - iput(sbi->hidden_dir); kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); unload_nls(sbi->nls); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index c18bb50c31b6..28b4f15c19eb 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -412,7 +412,7 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) if (page->index >= end_index) count = inode->i_size & (PAGE_SIZE-1); - buffer = kmap(page); + buffer = kmap_local_page(page); err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); if (err != count) { @@ -428,9 +428,9 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) err = 0; out: - kunmap(page); - + kunmap_local(buffer); unlock_page(page); + return err; } @@ -441,7 +441,7 @@ static int hostfs_read_folio(struct file *file, struct folio *folio) loff_t start = page_offset(page); int bytes_read, ret = 0; - buffer = kmap(page); + buffer = kmap_local_page(page); bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, PAGE_SIZE); if (bytes_read < 0) { @@ -458,8 +458,9 @@ static int hostfs_read_folio(struct file *file, struct folio *folio) out: flush_dcache_page(page); - kunmap(page); + kunmap_local(buffer); unlock_page(page); + return ret; } @@ -484,9 +485,9 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, unsigned from = pos & (PAGE_SIZE - 1); int err; - buffer = kmap(page); + buffer = kmap_local_page(page); err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied); - kunmap(page); + kunmap_local(buffer); if (!PageUptodate(page) && err == PAGE_SIZE) SetPageUptodate(page); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6a404ac1c178..15de1385012e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1010,36 +1010,28 @@ repeat: * ie. locked but not dirty) or tune2fs (which may actually have * the buffer dirtied, ugh.) */ - if (buffer_dirty(bh)) { + if (buffer_dirty(bh) && jh->b_transaction) { + warn_dirty_buffer(bh); /* - * First question: is this buffer already part of the current - * transaction or the existing committing transaction? - */ - if (jh->b_transaction) { - J_ASSERT_JH(jh, - jh->b_transaction == transaction || - jh->b_transaction == - journal->j_committing_transaction); - if (jh->b_next_transaction) - J_ASSERT_JH(jh, jh->b_next_transaction == - transaction); - warn_dirty_buffer(bh); - } - /* - * In any case we need to clean the dirty flag and we must - * do it under the buffer lock to be sure we don't race - * with running write-out. + * We need to clean the dirty flag and we must do it under the + * buffer lock to be sure we don't race with running write-out. */ JBUFFER_TRACE(jh, "Journalling dirty buffer"); clear_buffer_dirty(bh); + /* + * The buffer is going to be added to BJ_Reserved list now and + * nothing guarantees jbd2_journal_dirty_metadata() will be + * ever called for it. So we need to set jbddirty bit here to + * make sure the buffer is dirtied and written out when the + * journaling machinery is done with it. + */ set_buffer_jbddirty(bh); } - unlock_buffer(bh); - error = -EROFS; if (is_handle_aborted(handle)) { spin_unlock(&jh->b_state_lock); + unlock_buffer(bh); goto out; } error = 0; @@ -1049,8 +1041,10 @@ repeat: * b_next_transaction points to it */ if (jh->b_transaction == transaction || - jh->b_next_transaction == transaction) + jh->b_next_transaction == transaction) { + unlock_buffer(bh); goto done; + } /* * this is the first time this transaction is touching this buffer, @@ -1074,10 +1068,24 @@ repeat: */ smp_wmb(); spin_lock(&journal->j_list_lock); + if (test_clear_buffer_dirty(bh)) { + /* + * Execute buffer dirty clearing and jh->b_transaction + * assignment under journal->j_list_lock locked to + * prevent bh being removed from checkpoint list if + * the buffer is in an intermediate state (not dirty + * and jh->b_transaction is NULL). + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + set_buffer_jbddirty(bh); + } __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); spin_unlock(&journal->j_list_lock); + unlock_buffer(bh); goto done; } + unlock_buffer(bh); + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 4849a4c9a0e2..764f19dec3f0 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -364,20 +364,25 @@ void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig) int __init jffs2_compressors_init(void) { + int ret = 0; /* Registering compressors */ -#ifdef CONFIG_JFFS2_ZLIB - jffs2_zlib_init(); -#endif -#ifdef CONFIG_JFFS2_RTIME - jffs2_rtime_init(); -#endif -#ifdef CONFIG_JFFS2_RUBIN - jffs2_rubinmips_init(); - jffs2_dynrubin_init(); -#endif -#ifdef CONFIG_JFFS2_LZO - jffs2_lzo_init(); -#endif + ret = jffs2_zlib_init(); + if (ret) + goto exit; + ret = jffs2_rtime_init(); + if (ret) + goto exit_zlib; + ret = jffs2_rubinmips_init(); + if (ret) + goto exit_rtime; + ret = jffs2_dynrubin_init(); + if (ret) + goto exit_runinmips; + ret = jffs2_lzo_init(); + if (ret) + goto exit_dynrubin; + + /* Setting default compression mode */ #ifdef CONFIG_JFFS2_CMODE_NONE jffs2_compression_mode = JFFS2_COMPR_MODE_NONE; @@ -396,23 +401,26 @@ int __init jffs2_compressors_init(void) #endif #endif return 0; + +exit_dynrubin: + jffs2_dynrubin_exit(); +exit_runinmips: + jffs2_rubinmips_exit(); +exit_rtime: + jffs2_rtime_exit(); +exit_zlib: + jffs2_zlib_exit(); +exit: + return ret; } int jffs2_compressors_exit(void) { /* Unregistering compressors */ -#ifdef CONFIG_JFFS2_LZO jffs2_lzo_exit(); -#endif -#ifdef CONFIG_JFFS2_RUBIN jffs2_dynrubin_exit(); jffs2_rubinmips_exit(); -#endif -#ifdef CONFIG_JFFS2_RTIME jffs2_rtime_exit(); -#endif -#ifdef CONFIG_JFFS2_ZLIB jffs2_zlib_exit(); -#endif return 0; } diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index 5e91d578f4ed..3716b6b7924c 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h @@ -88,18 +88,32 @@ int jffs2_rubinmips_init(void); void jffs2_rubinmips_exit(void); int jffs2_dynrubin_init(void); void jffs2_dynrubin_exit(void); +#else +static inline int jffs2_rubinmips_init(void) { return 0; } +static inline void jffs2_rubinmips_exit(void) {} +static inline int jffs2_dynrubin_init(void) { return 0; } +static inline void jffs2_dynrubin_exit(void) {} #endif #ifdef CONFIG_JFFS2_RTIME -int jffs2_rtime_init(void); -void jffs2_rtime_exit(void); +extern int jffs2_rtime_init(void); +extern void jffs2_rtime_exit(void); +#else +static inline int jffs2_rtime_init(void) { return 0; } +static inline void jffs2_rtime_exit(void) {} #endif #ifdef CONFIG_JFFS2_ZLIB -int jffs2_zlib_init(void); -void jffs2_zlib_exit(void); +extern int jffs2_zlib_init(void); +extern void jffs2_zlib_exit(void); +#else +static inline int jffs2_zlib_init(void) { return 0; } +static inline void jffs2_zlib_exit(void) {} #endif #ifdef CONFIG_JFFS2_LZO -int jffs2_lzo_init(void); -void jffs2_lzo_exit(void); +extern int jffs2_lzo_init(void); +extern void jffs2_lzo_exit(void); +#else +static inline int jffs2_lzo_init(void) { return 0; } +static inline void jffs2_lzo_exit(void) {} #endif #endif /* __JFFS2_COMPR_H__ */ diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 3cf71befa475..96b0275ce957 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -137,19 +137,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); pgoff_t index = pos >> PAGE_SHIFT; - uint32_t pageofs = index << PAGE_SHIFT; int ret = 0; jffs2_dbg(1, "%s()\n", __func__); - if (pageofs > inode->i_size) { - /* Make new hole frag from old EOF to new page */ + if (pos > inode->i_size) { + /* Make new hole frag from old EOF to new position */ struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; uint32_t alloc_len; - jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", - (unsigned int)inode->i_size, pageofs); + jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new position\n", + (unsigned int)inode->i_size, (uint32_t)pos); ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); @@ -169,10 +168,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ri.mode = cpu_to_jemode(inode->i_mode); ri.uid = cpu_to_je16(i_uid_read(inode)); ri.gid = cpu_to_je16(i_gid_read(inode)); - ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs)); + ri.isize = cpu_to_je32((uint32_t)pos); ri.atime = ri.ctime = ri.mtime = cpu_to_je32(JFFS2_NOW()); ri.offset = cpu_to_je32(inode->i_size); - ri.dsize = cpu_to_je32(pageofs - inode->i_size); + ri.dsize = cpu_to_je32((uint32_t)pos - inode->i_size); ri.csize = cpu_to_je32(0); ri.compr = JFFS2_COMPR_ZERO; ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); @@ -202,7 +201,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, goto out_err; } jffs2_complete_reservation(c); - inode->i_size = pageofs; + inode->i_size = pos; mutex_unlock(&f->sem); } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 09174898efd0..038516bee1ab 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -403,7 +403,7 @@ int jffs2_do_remount_fs(struct super_block *sb, struct fs_context *fc) /* We stop if it was running, then restart if it needs to. This also catches the case where it was stopped and this is just a remount to restart it. - Flush the writebuffer, if neccecary, else we loose it */ + Flush the writebuffer, if necessary, else we loose it */ if (!sb_rdonly(sb)) { jffs2_stop_garbage_collect_thread(c); mutex_lock(&c->alloc_sem); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 765838578a72..a3eb1e826947 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -193,7 +193,8 @@ int dbMount(struct inode *ipbmap) bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); - if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) { + if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG || + bmp->db_agl2size < 0) { err = -EINVAL; goto err_release_metapage; } diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index f00d43b8ac0a..e9a45dea748a 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -134,7 +134,7 @@ static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter, npages = DIV_ROUND_UP(off + len, PAGE_SIZE); sg_max -= npages; - for (; npages < 0; npages--) { + for (; npages > 0; npages--) { struct page *page = *pages; size_t seg = min_t(size_t, PAGE_SIZE - off, len); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 192cad0662d8..b1e32ec4a9d4 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -105,14 +105,6 @@ static int __ocfs2_move_extent(handle_t *handle, */ replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), - context->et.et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_split_extent(handle, &context->et, path, index, &replace_rec, context->meta_ac, &context->dealloc); @@ -121,8 +113,6 @@ static int __ocfs2_move_extent(handle_t *handle, goto out; } - ocfs2_journal_dirty(handle, context->et.et_root_bh); - context->new_phys_cpos = new_p_cpos; /* @@ -444,7 +434,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + - le16_to_cpu(bg->bg_bits))) { + (le16_to_cpu(bg->bg_bits) << bits_per_unit))) { *ret_bh = gd_bh; *vict_bit = (vict_blkno - blkno) >> @@ -559,6 +549,7 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, last_free_bits++; if (last_free_bits == move_len) { + i -= move_len; *goal_bit = i; *phys_cpos = base_cpos + i; break; @@ -1030,18 +1021,19 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) context->range = ⦥ + /* + * ok, the default theshold for the defragmentation + * is 1M, since our maximum clustersize was 1M also. + * any thought? + */ + if (!range.me_threshold) + range.me_threshold = 1024 * 1024; + + if (range.me_threshold > i_size_read(inode)) + range.me_threshold = i_size_read(inode); + if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { context->auto_defrag = 1; - /* - * ok, the default theshold for the defragmentation - * is 1M, since our maximum clustersize was 1M also. - * any thought? - */ - if (!range.me_threshold) - range.me_threshold = 1024 * 1024; - - if (range.me_threshold > i_size_read(inode)) - range.me_threshold = i_size_read(inode); if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) context->partial = 1; diff --git a/fs/open.c b/fs/open.c index 8038cf652583..4401a73d4032 100644 --- a/fs/open.c +++ b/fs/open.c @@ -368,7 +368,37 @@ COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. + * + * Creating new credentials is expensive, so we try to skip doing it, + * which we can if the result would match what we already got. */ +static bool access_need_override_creds(int flags) +{ + const struct cred *cred; + + if (flags & AT_EACCESS) + return false; + + cred = current_cred(); + if (!uid_eq(cred->fsuid, cred->uid) || + !gid_eq(cred->fsgid, cred->gid)) + return true; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + kuid_t root_uid = make_kuid(cred->user_ns, 0); + if (!uid_eq(cred->uid, root_uid)) { + if (!cap_isclear(cred->cap_effective)) + return true; + } else { + if (!cap_isidentical(cred->cap_effective, + cred->cap_permitted)) + return true; + } + } + + return false; +} + static const struct cred *access_override_creds(void) { const struct cred *old_cred; @@ -378,6 +408,12 @@ static const struct cred *access_override_creds(void) if (!override_cred) return NULL; + /* + * XXX access_need_override_creds performs checks in hopes of skipping + * this work. Make sure it stays in sync if making any changes in this + * routine. + */ + override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -437,7 +473,7 @@ static long do_faccessat(int dfd, const char __user *filename, int mode, int fla if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; - if (!(flags & AT_EACCESS)) { + if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; diff --git a/fs/proc/array.c b/fs/proc/array.c index 49283b8103c7..9b0315d34c58 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -300,13 +300,8 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) static void render_cap_t(struct seq_file *m, const char *header, kernel_cap_t *a) { - unsigned __capi; - seq_puts(m, header); - CAP_FOR_EACH_U32(__capi) { - seq_put_hex_ll(m, NULL, - a->cap[CAP_LAST_U32 - __capi], 8); - } + seq_put_hex_ll(m, NULL, a->val, 16); seq_putc(m, '\n'); } diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index e8b9b756f0ac..d76eb7b39f56 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -209,11 +209,10 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) subtract_lebs += 1; /* - * The GC journal head LEB is not really accessible. And since - * different write types go to different heads, we may count only on - * one head's space. + * Since different write types go to different heads, we should + * reserve one leb for each head. */ - subtract_lebs += c->jhead_cnt - 1; + subtract_lebs += c->jhead_cnt; /* We also reserve one LEB for deletions, which bypass budgeting */ subtract_lebs += 1; @@ -400,7 +399,7 @@ static int calc_dd_growth(const struct ubifs_info *c, dd_growth = req->dirtied_page ? c->bi.page_budget : 0; if (req->dirtied_ino) - dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); + dd_growth += c->bi.inode_budget * req->dirtied_ino; if (req->mod_dent) dd_growth += c->bi.dent_budget; dd_growth += req->dirtied_ino_d; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 1e92c1730c16..1505539f6fe9 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1151,7 +1151,6 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, int err, sz_change, len = strlen(symname); struct fscrypt_str disk_link; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = ALIGN(len, 8), .dirtied_ino = 1 }; struct fscrypt_name nm; @@ -1167,6 +1166,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, * Budget request settings: new inode, new direntry and changing parent * directory inode. */ + req.new_ino_d = ALIGN(disk_link.len - 1, 8); err = ubifs_budget_space(c, &req); if (err) return err; @@ -1324,6 +1324,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink) { ubifs_assert(c, inode_is_locked(new_inode)); + /* Budget for old inode's data when its nlink > 1. */ + req.dirtied_ino_d = ALIGN(ubifs_inode(new_inode)->data_len, 8); err = ubifs_purge_xattrs(new_inode); if (err) return err; @@ -1566,6 +1568,15 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, ubifs_assert(c, fst_inode && snd_inode); + /* + * Budget request settings: changing two direntries, changing the two + * parent directory inodes. + */ + + dbg_gen("dent '%pd' ino %lu in dir ino %lu exchange dent '%pd' ino %lu in dir ino %lu", + old_dentry, fst_inode->i_ino, old_dir->i_ino, + new_dentry, snd_inode->i_ino, new_dir->i_ino); + err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm); if (err) return err; @@ -1576,6 +1587,10 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, return err; } + err = ubifs_budget_space(c, &req); + if (err) + goto out; + lock_4_inodes(old_dir, new_dir, NULL, NULL); time = current_time(old_dir); @@ -1601,6 +1616,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, NULL, NULL); ubifs_release_budget(c, &req); +out: fscrypt_free_filename(&fst_nm); fscrypt_free_filename(&snd_nm); return err; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8cb5d76b301c..979ab1d9d0c3 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1032,7 +1032,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) if (page->index >= synced_i_size >> PAGE_SHIFT) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) - goto out_unlock; + goto out_redirty; /* * The inode has been written, but the write-buffer has * not been synchronized, so in case of an unclean @@ -1060,11 +1060,17 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) if (i_size > synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) - goto out_unlock; + goto out_redirty; } return do_writepage(page, len); - +out_redirty: + /* + * redirty_page_for_writepage() won't call ubifs_dirty_inode() because + * it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so + * there is no need to do space budget for dirty inode. + */ + redirty_page_for_writepage(wbc, page); out_unlock: unlock_page(page); return err; @@ -1466,14 +1472,23 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; - /* - * An attempt to release a dirty page without budgeting for it - should - * not happen. - */ if (folio_test_writeback(folio)) return false; + + /* + * Page is private but not dirty, weird? There is one condition + * making it happened. ubifs_writepage skipped the page because + * page index beyonds isize (for example. truncated by other + * process named A), then the page is invalidated by fadvise64 + * syscall before being truncated by process A. + */ ubifs_assert(c, folio_test_private(folio)); - ubifs_assert(c, 0); + if (folio_test_checked(folio)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); folio_detach_private(folio); folio_clear_checked(folio); return true; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 1607a3c76681..01d8eb170382 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -488,7 +488,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) } /** - * wbuf_timer_callback - write-buffer timer callback function. + * wbuf_timer_callback_nolock - write-buffer timer callback function. * @timer: timer data (write-buffer descriptor) * * This function is called when the write-buffer timer expires. @@ -505,7 +505,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) } /** - * new_wbuf_timer - start new write-buffer timer. + * new_wbuf_timer_nolock - start new write-buffer timer. * @c: UBIFS file-system description object * @wbuf: write-buffer descriptor */ @@ -531,7 +531,7 @@ static void new_wbuf_timer_nolock(struct ubifs_info *c, struct ubifs_wbuf *wbuf) } /** - * cancel_wbuf_timer - cancel write-buffer timer. + * cancel_wbuf_timer_nolock - cancel write-buffer timer. * @wbuf: write-buffer descriptor */ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index d02509920baf..dc52ac0f4a34 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1201,9 +1201,13 @@ out_free: * ubifs_jnl_rename - rename a directory entry. * @c: UBIFS file-system description object * @old_dir: parent inode of directory entry to rename - * @old_dentry: directory entry to rename + * @old_inode: directory entry's inode to rename + * @old_nm: name of the old directory entry to rename * @new_dir: parent inode of directory entry to rename - * @new_dentry: new directory entry (or directory entry to replace) + * @new_inode: new directory entry's inode (or directory entry's inode to + * replace) + * @new_nm: new name of the new directory entry + * @whiteout: whiteout inode * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index d0c9a09988bc..32cb14759796 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -833,7 +833,7 @@ static int alloc_wbufs(struct ubifs_info *c) INIT_LIST_HEAD(&c->jheads[i].buds_list); err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); if (err) - return err; + goto out_wbuf; c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; c->jheads[i].wbuf.jhead = i; @@ -841,7 +841,7 @@ static int alloc_wbufs(struct ubifs_info *c) c->jheads[i].log_hash = ubifs_hash_get_desc(c); if (IS_ERR(c->jheads[i].log_hash)) { err = PTR_ERR(c->jheads[i].log_hash); - goto out; + goto out_log_hash; } } @@ -854,9 +854,18 @@ static int alloc_wbufs(struct ubifs_info *c) return 0; -out: - while (i--) +out_log_hash: + kfree(c->jheads[i].wbuf.buf); + kfree(c->jheads[i].wbuf.inodes); + +out_wbuf: + while (i--) { + kfree(c->jheads[i].wbuf.buf); + kfree(c->jheads[i].wbuf.inodes); kfree(c->jheads[i].log_hash); + } + kfree(c->jheads); + c->jheads = NULL; return err; } diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c index 06ad8fa1fcfb..1c958148bb87 100644 --- a/fs/ubifs/sysfs.c +++ b/fs/ubifs/sysfs.c @@ -74,13 +74,13 @@ static const struct sysfs_ops ubifs_attr_ops = { .show = ubifs_attr_show, }; -static struct kobj_type ubifs_sb_ktype = { +static const struct kobj_type ubifs_sb_ktype = { .default_groups = ubifs_groups, .sysfs_ops = &ubifs_attr_ops, .release = ubifs_sb_release, }; -static struct kobj_type ubifs_ktype = { +static const struct kobj_type ubifs_ktype = { .sysfs_ops = &ubifs_attr_ops, }; @@ -144,6 +144,8 @@ int __init ubifs_sysfs_init(void) kobject_set_name(&ubifs_kset.kobj, "ubifs"); ubifs_kset.kobj.parent = fs_kobj; ret = kset_register(&ubifs_kset); + if (ret) + kset_put(&ubifs_kset); return ret; } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 488f3da7a6c6..2469f72eeaab 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -267,11 +267,18 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, if (zbr->len) { err = insert_old_idx(c, zbr->lnum, zbr->offs); if (unlikely(err)) - return ERR_PTR(err); + /* + * Obsolete znodes will be freed by tnc_destroy_cnext() + * or free_obsolete_znodes(), copied up znodes should + * be added back to tnc and freed by + * ubifs_destroy_tnc_subtree(). + */ + goto out; err = add_idx_dirt(c, zbr->lnum, zbr->len); } else err = 0; +out: zbr->znode = zn; zbr->lnum = 0; zbr->offs = 0; @@ -3053,6 +3060,21 @@ static void tnc_destroy_cnext(struct ubifs_info *c) cnext = cnext->cnext; if (ubifs_zn_obsolete(znode)) kfree(znode); + else if (!ubifs_zn_cow(znode)) { + /* + * Don't forget to update clean znode count after + * committing failed, because ubifs will check this + * count while closing tnc. Non-obsolete znode could + * be re-dirtied during committing process, so dirty + * flag is untrustable. The flag 'COW_ZNODE' is set + * for each dirty znode before committing, and it is + * cleared as long as the znode become clean, so we + * can statistic clean znode count according to this + * flag. + */ + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } } while (cnext && cnext != c->cnext); } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 9063b73536f8..4c36044140e7 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1623,8 +1623,13 @@ static inline int ubifs_check_hmac(const struct ubifs_info *c, return crypto_memneq(expected, got, c->hmac_desc_len); } +#ifdef CONFIG_UBIFS_FS_AUTHENTICATION void ubifs_bad_hash(const struct ubifs_info *c, const void *node, const u8 *hash, int lnum, int offs); +#else +static inline void ubifs_bad_hash(const struct ubifs_info *c, const void *node, + const u8 *hash, int lnum, int offs) {}; +#endif int __ubifs_node_check_hash(const struct ubifs_info *c, const void *buf, const u8 *expected); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f7a9607c2b95..2210e5eb1ea0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -193,7 +193,7 @@ static int udf_adinicb_writepage(struct folio *folio, struct udf_inode_info *iinfo = UDF_I(inode); BUG_ON(!PageLocked(page)); - memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, + memcpy_from_page(iinfo->i_data + iinfo->i_lenEAttr, page, 0, i_size_read(inode)); unlock_page(page); mark_inode_dirty(inode); @@ -241,6 +241,15 @@ static int udf_read_folio(struct file *file, struct folio *folio) static void udf_readahead(struct readahead_control *rac) { + struct udf_inode_info *iinfo = UDF_I(rac->mapping->host); + + /* + * No readahead needed for in-ICB files and udf_get_block() would get + * confused for such file anyway. + */ + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return; + mpage_readahead(rac, udf_get_block); } @@ -407,6 +416,9 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map) int err; struct udf_inode_info *iinfo = UDF_I(inode); + if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)) + return -EFSCORRUPTED; + map->oflags = 0; if (!(map->iflags & UDF_MAP_CREATE)) { struct kernel_lb_addr eloc; diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index bb0c700afe3c..86696a1c6891 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -44,16 +44,15 @@ xfs_perag_get( xfs_agnumber_t agno) { struct xfs_perag *pag; - int ref = 0; rcu_read_lock(); pag = radix_tree_lookup(&mp->m_perag_tree, agno); if (pag) { + trace_xfs_perag_get(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) >= 0); - ref = atomic_inc_return(&pag->pag_ref); + atomic_inc(&pag->pag_ref); } rcu_read_unlock(); - trace_xfs_perag_get(mp, agno, ref, _RET_IP_); return pag; } @@ -68,7 +67,6 @@ xfs_perag_get_tag( { struct xfs_perag *pag; int found; - int ref; rcu_read_lock(); found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, @@ -77,9 +75,9 @@ xfs_perag_get_tag( rcu_read_unlock(); return NULL; } - ref = atomic_inc_return(&pag->pag_ref); + trace_xfs_perag_get_tag(pag, _RET_IP_); + atomic_inc(&pag->pag_ref); rcu_read_unlock(); - trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); return pag; } @@ -87,11 +85,68 @@ void xfs_perag_put( struct xfs_perag *pag) { - int ref; - + trace_xfs_perag_put(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) > 0); - ref = atomic_dec_return(&pag->pag_ref); - trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); + atomic_dec(&pag->pag_ref); +} + +/* + * Active references for perag structures. This is for short term access to the + * per ag structures for walking trees or accessing state. If an AG is being + * shrunk or is offline, then this will fail to find that AG and return NULL + * instead. + */ +struct xfs_perag * +xfs_perag_grab( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + trace_xfs_perag_grab(pag, _RET_IP_); + if (!atomic_inc_not_zero(&pag->pag_active_ref)) + pag = NULL; + } + rcu_read_unlock(); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_grab_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + trace_xfs_perag_grab_tag(pag, _RET_IP_); + if (!atomic_inc_not_zero(&pag->pag_active_ref)) + pag = NULL; + rcu_read_unlock(); + return pag; +} + +void +xfs_perag_rele( + struct xfs_perag *pag) +{ + trace_xfs_perag_rele(pag, _RET_IP_); + if (atomic_dec_and_test(&pag->pag_active_ref)) + wake_up(&pag->pag_active_wq); } /* @@ -196,6 +251,10 @@ xfs_free_perag( cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_buf_hash_destroy(pag); + /* drop the mount's active reference */ + xfs_perag_rele(pag); + XFS_IS_CORRUPT(pag->pag_mount, + atomic_read(&pag->pag_active_ref) != 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -314,6 +373,7 @@ xfs_initialize_perag( INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); init_waitqueue_head(&pag->pagb_wait); + init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; #endif /* __KERNEL__ */ @@ -322,6 +382,9 @@ xfs_initialize_perag( if (error) goto out_remove_pag; + /* Active ref owned by mount indicates AG is online. */ + atomic_set(&pag->pag_active_ref, 1); + /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; @@ -824,7 +887,7 @@ xfs_ag_shrink_space( struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, - .type = XFS_ALLOCTYPE_THIS_BNO, + .pag = pag, .minlen = delta, .maxlen = delta, .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, @@ -856,14 +919,11 @@ xfs_ag_shrink_space( if (delta >= aglen) return -EINVAL; - args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta); - /* * Make sure that the last inode cluster cannot overlap with the new * end of the AG, even if it's sparse. */ - error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp, - aglen - delta); + error = xfs_ialloc_check_shrink(pag, *tpp, agibp, aglen - delta); if (error) return error; @@ -876,7 +936,8 @@ xfs_ag_shrink_space( return error; /* internal log shouldn't also show up in the free space btrees */ - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_exact_bno(&args, + XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta)); if (!error && args.agbno == NULLAGBLOCK) error = -ENOSPC; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 191b22b9a35b..5e18536dfdce 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -32,14 +32,12 @@ struct xfs_ag_resv { struct xfs_perag { struct xfs_mount *pag_mount; /* owner filesystem */ xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* perag reference count */ - char pagf_init; /* this agf's entry is initialized */ - char pagi_init; /* this agi's entry is initialized */ - char pagf_metadata; /* the agf is preferred to be metadata */ - char pagi_inodeok; /* The agi is ok for inodes */ + atomic_t pag_ref; /* passive reference count */ + atomic_t pag_active_ref; /* active reference count */ + wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */ + unsigned long pag_opstate; uint8_t pagf_levels[XFS_BTNUM_AGF]; /* # of levels in bno & cnt btree */ - bool pagf_agflreset; /* agfl requires reset before use */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ @@ -106,16 +104,44 @@ struct xfs_perag { #endif /* __KERNEL__ */ }; +/* + * Per-AG operational state. These are atomic flag bits. + */ +#define XFS_AGSTATE_AGF_INIT 0 +#define XFS_AGSTATE_AGI_INIT 1 +#define XFS_AGSTATE_PREFERS_METADATA 2 +#define XFS_AGSTATE_ALLOWS_INODES 3 +#define XFS_AGSTATE_AGFL_NEEDS_RESET 4 + +#define __XFS_AG_OPSTATE(name, NAME) \ +static inline bool xfs_perag_ ## name (struct xfs_perag *pag) \ +{ \ + return test_bit(XFS_AGSTATE_ ## NAME, &pag->pag_opstate); \ +} + +__XFS_AG_OPSTATE(initialised_agf, AGF_INIT) +__XFS_AG_OPSTATE(initialised_agi, AGI_INIT) +__XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) +__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) +__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) + int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); void xfs_free_perag(struct xfs_mount *mp); +/* Passive AG references */ struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int tag); void xfs_perag_put(struct xfs_perag *pag); +/* Active AG references */ +struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t); +struct xfs_perag *xfs_perag_grab_tag(struct xfs_mount *, xfs_agnumber_t, + int tag); +void xfs_perag_rele(struct xfs_perag *pag); + /* * Per-ag geometry infomation and validation */ @@ -193,31 +219,86 @@ xfs_perag_next( struct xfs_mount *mp = pag->pag_mount; *agno = pag->pag_agno + 1; - xfs_perag_put(pag); - if (*agno > end_agno) - return NULL; - return xfs_perag_get(mp, *agno); + xfs_perag_rele(pag); + while (*agno <= end_agno) { + pag = xfs_perag_grab(mp, *agno); + if (pag) + return pag; + (*agno)++; + } + return NULL; } #define for_each_perag_range(mp, agno, end_agno, pag) \ - for ((pag) = xfs_perag_get((mp), (agno)); \ + for ((pag) = xfs_perag_grab((mp), (agno)); \ (pag) != NULL; \ (pag) = xfs_perag_next((pag), &(agno), (end_agno))) #define for_each_perag_from(mp, agno, pag) \ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) - #define for_each_perag(mp, agno, pag) \ (agno) = 0; \ for_each_perag_from((mp), (agno), (pag)) #define for_each_perag_tag(mp, agno, pag, tag) \ - for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + for ((agno) = 0, (pag) = xfs_perag_grab_tag((mp), 0, (tag)); \ (pag) != NULL; \ (agno) = (pag)->pag_agno + 1, \ - xfs_perag_put(pag), \ - (pag) = xfs_perag_get_tag((mp), (agno), (tag))) + xfs_perag_rele(pag), \ + (pag) = xfs_perag_grab_tag((mp), (agno), (tag))) + +static inline struct xfs_perag * +xfs_perag_next_wrap( + struct xfs_perag *pag, + xfs_agnumber_t *agno, + xfs_agnumber_t stop_agno, + xfs_agnumber_t restart_agno, + xfs_agnumber_t wrap_agno) +{ + struct xfs_mount *mp = pag->pag_mount; + + *agno = pag->pag_agno + 1; + xfs_perag_rele(pag); + while (*agno != stop_agno) { + if (*agno >= wrap_agno) { + if (restart_agno >= stop_agno) + break; + *agno = restart_agno; + } + + pag = xfs_perag_grab(mp, *agno); + if (pag) + return pag; + (*agno)++; + } + return NULL; +} + +/* + * Iterate all AGs from start_agno through wrap_agno, then restart_agno through + * (start_agno - 1). + */ +#define for_each_perag_wrap_range(mp, start_agno, restart_agno, wrap_agno, agno, pag) \ + for ((agno) = (start_agno), (pag) = xfs_perag_grab((mp), (agno)); \ + (pag) != NULL; \ + (pag) = xfs_perag_next_wrap((pag), &(agno), (start_agno), \ + (restart_agno), (wrap_agno))) +/* + * Iterate all AGs from start_agno through wrap_agno, then 0 through + * (start_agno - 1). + */ +#define for_each_perag_wrap_at(mp, start_agno, wrap_agno, agno, pag) \ + for_each_perag_wrap_range((mp), (start_agno), 0, (wrap_agno), (agno), (pag)) + +/* + * Iterate all AGs from start_agno through to the end of the filesystem, then 0 + * through (start_agno - 1). + */ +#define for_each_perag_wrap(mp, start_agno, agno, pag) \ + for_each_perag_wrap_at((mp), (start_agno), (mp)->m_sb.sb_agcount, \ + (agno), (pag)) + struct aghdr_init_data { /* per ag data */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 5af123d13a63..7fd1fea95552 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -264,7 +264,7 @@ xfs_ag_resv_init( if (error) goto out; - error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used); + error = xfs_finobt_calc_reserves(pag, tp, &ask, &used); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f8ff81c3de76..6a037173d20d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -36,10 +36,6 @@ struct workqueue_struct *xfs_alloc_wq; #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); -STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); -STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); - /* * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in * the beginning of the block for a proper header with the location information @@ -772,8 +768,6 @@ xfs_alloc_cur_setup( int error; int i; - ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO); - acur->cur_len = args->maxlen; acur->rec_bno = 0; acur->rec_len = 0; @@ -887,7 +881,6 @@ xfs_alloc_cur_check( * We have an aligned record that satisfies minlen and beats or matches * the candidate extent size. Compare locality for near allocation mode. */ - ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); diff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, args->datatype, bnoa, lena, &bnew); @@ -1133,78 +1126,6 @@ error: } /* - * Allocate a variable extent in the allocation group agno. - * Type and bno are used to determine where in the allocation group the - * extent will start. - * Extent's length (returned in *len) will be between minlen and maxlen, - * and of the form k * prod + mod unless there's nothing that large. - * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. - */ -STATIC int /* error */ -xfs_alloc_ag_vextent( - xfs_alloc_arg_t *args) /* argument structure for allocation */ -{ - int error=0; - - ASSERT(args->minlen > 0); - ASSERT(args->maxlen > 0); - ASSERT(args->minlen <= args->maxlen); - ASSERT(args->mod < args->prod); - ASSERT(args->alignment > 0); - - /* - * Branch to correct routine based on the type. - */ - args->wasfromfl = 0; - switch (args->type) { - case XFS_ALLOCTYPE_THIS_AG: - error = xfs_alloc_ag_vextent_size(args); - break; - case XFS_ALLOCTYPE_NEAR_BNO: - error = xfs_alloc_ag_vextent_near(args); - break; - case XFS_ALLOCTYPE_THIS_BNO: - error = xfs_alloc_ag_vextent_exact(args); - break; - default: - ASSERT(0); - /* NOTREACHED */ - } - - if (error || args->agbno == NULLAGBLOCK) - return error; - - ASSERT(args->len >= args->minlen); - ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); - ASSERT(args->agbno % args->alignment == 0); - - /* if not file data, insert new block into the reverse map btree */ - if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { - error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, - args->agbno, args->len, &args->oinfo); - if (error) - return error; - } - - if (!args->wasfromfl) { - error = xfs_alloc_update_counters(args->tp, args->agbp, - -((long)(args->len))); - if (error) - return error; - - ASSERT(!xfs_extent_busy_search(args->mp, args->pag, - args->agbno, args->len)); - } - - xfs_ag_resv_alloc_extent(args->pag, args->resv, args); - - XFS_STATS_INC(args->mp, xs_allocx); - XFS_STATS_ADD(args->mp, xs_allocb, args->len); - return error; -} - -/* * Allocate a variable extent at exactly agno/bno. * Extent's length (returned in *len) will be between minlen and maxlen, * and of the form k * prod + mod unless there's nothing that large. @@ -1389,7 +1310,6 @@ xfs_alloc_ag_vextent_locality( bool fbinc; ASSERT(acur->len == 0); - ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); *stat = 0; @@ -2435,7 +2355,7 @@ xfs_agfl_reset( struct xfs_mount *mp = tp->t_mountp; struct xfs_agf *agf = agbp->b_addr; - ASSERT(pag->pagf_agflreset); + ASSERT(xfs_perag_agfl_needs_reset(pag)); trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); xfs_warn(mp, @@ -2450,7 +2370,7 @@ xfs_agfl_reset( XFS_AGF_FLCOUNT); pag->pagf_flcount = 0; - pag->pagf_agflreset = false; + clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); } /* @@ -2605,7 +2525,7 @@ xfs_alloc_fix_freelist( /* deferred ops (AGFL block frees) require permanent transactions */ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ @@ -2620,7 +2540,8 @@ xfs_alloc_fix_freelist( * somewhere else if we are not being asked to try harder at this * point */ - if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) && + if (xfs_perag_prefers_metadata(pag) && + (args->datatype & XFS_ALLOC_USERDATA) && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; @@ -2646,7 +2567,7 @@ xfs_alloc_fix_freelist( } /* reset a padding mismatched agfl before final free space check */ - if (pag->pagf_agflreset) + if (xfs_perag_agfl_needs_reset(pag)) xfs_agfl_reset(tp, agbp, pag); /* If there isn't enough total space or single-extent, reject it. */ @@ -2707,7 +2628,6 @@ xfs_alloc_fix_freelist( targs.agbp = agbp; targs.agno = args->agno; targs.alignment = targs.minlen = targs.prod = 1; - targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) @@ -2720,7 +2640,7 @@ xfs_alloc_fix_freelist( targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ - error = xfs_alloc_ag_vextent(&targs); + error = xfs_alloc_ag_vextent_size(&targs); if (error) goto out_agflbp_relse; @@ -2734,6 +2654,18 @@ xfs_alloc_fix_freelist( break; goto out_agflbp_relse; } + + if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { + error = xfs_rmap_alloc(tp, agbp, pag, + targs.agbno, targs.len, &targs.oinfo); + if (error) + goto out_agflbp_relse; + } + error = xfs_alloc_update_counters(tp, agbp, + -((long)(targs.len))); + if (error) + goto out_agflbp_relse; + /* * Put each allocated block on the list. */ @@ -2803,7 +2735,7 @@ xfs_alloc_get_freelist( if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - ASSERT(!pag->pagf_agflreset); + ASSERT(!xfs_perag_agfl_needs_reset(pag)); be32_add_cpu(&agf->agf_flcount, -1); pag->pagf_flcount--; @@ -2892,7 +2824,7 @@ xfs_alloc_put_freelist( if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - ASSERT(!pag->pagf_agflreset); + ASSERT(!xfs_perag_agfl_needs_reset(pag)); be32_add_cpu(&agf->agf_flcount, 1); pag->pagf_flcount++; @@ -3099,7 +3031,7 @@ xfs_alloc_read_agf( return error; agf = agfbp->b_addr; - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); @@ -3111,8 +3043,8 @@ xfs_alloc_read_agf( pag->pagf_levels[XFS_BTNUM_RMAPi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - pag->pagf_init = 1; - pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf); + if (xfs_agfl_needs_reset(pag->pag_mount, agf)) + set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); /* * Update the in-core allocbt counter. Filter out the rmapbt @@ -3127,6 +3059,8 @@ xfs_alloc_read_agf( if (allocbt_blks > 0) atomic64_add(allocbt_blks, &pag->pag_mount->m_allocbt_blks); + + set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } #ifdef DEBUG else if (!xfs_is_shutdown(pag->pag_mount)) { @@ -3148,26 +3082,25 @@ xfs_alloc_read_agf( } /* - * Allocate an extent (variable-size). - * Depending on the allocation type, we either look in a single allocation - * group or loop over the allocation groups to find the result. + * Pre-proces allocation arguments to set initial state that we don't require + * callers to set up correctly, as well as bounds check the allocation args + * that are set up. */ -int /* error */ -xfs_alloc_vextent( - struct xfs_alloc_arg *args) /* allocation argument structure */ +static int +xfs_alloc_vextent_check_args( + struct xfs_alloc_arg *args, + xfs_fsblock_t target, + xfs_agnumber_t *minimum_agno) { - xfs_agblock_t agsize; /* allocation group size */ - int error; - int flags; /* XFS_ALLOC_FLAG_... locking flags */ - struct xfs_mount *mp; /* mount structure pointer */ - xfs_agnumber_t sagno; /* starting allocation group number */ - xfs_alloctype_t type; /* input allocation type */ - int bump_rotor = 0; - xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ - - mp = args->mp; - type = args->otype = args->type; - args->agbno = NULLAGBLOCK; + struct xfs_mount *mp = args->mp; + xfs_agblock_t agsize; + + args->fsbno = NULLFSBLOCK; + + *minimum_agno = 0; + if (args->tp->t_highest_agno != NULLAGNUMBER) + *minimum_agno = args->tp->t_highest_agno; + /* * Just fix this up, for the case where the last a.g. is shorter * (or there's only one a.g.) and the caller couldn't easily figure @@ -3178,168 +3111,414 @@ xfs_alloc_vextent( args->maxlen = agsize; if (args->alignment == 0) args->alignment = 1; - ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); + + ASSERT(args->minlen > 0); + ASSERT(args->maxlen > 0); + ASSERT(args->alignment > 0); + ASSERT(args->resv != XFS_AG_RESV_AGFL); + + ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize); ASSERT(args->minlen <= args->maxlen); ASSERT(args->minlen <= agsize); ASSERT(args->mod < args->prod); - if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || - XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || + + if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount || + XFS_FSB_TO_AGBNO(mp, target) >= agsize || args->minlen > args->maxlen || args->minlen > agsize || args->mod >= args->prod) { - args->fsbno = NULLFSBLOCK; trace_xfs_alloc_vextent_badargs(args); + return -ENOSPC; + } + + if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) { + trace_xfs_alloc_vextent_skip_deadlock(args); + return -ENOSPC; + } + return 0; + +} + +/* + * Prepare an AG for allocation. If the AG is not prepared to accept the + * allocation, return failure. + * + * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are + * modified to hold their own perag references. + */ +static int +xfs_alloc_vextent_prepare_ag( + struct xfs_alloc_arg *args) +{ + bool need_pag = !args->pag; + int error; + + if (need_pag) + args->pag = xfs_perag_get(args->mp, args->agno); + + args->agbp = NULL; + error = xfs_alloc_fix_freelist(args, 0); + if (error) { + trace_xfs_alloc_vextent_nofix(args); + if (need_pag) + xfs_perag_put(args->pag); + args->agbno = NULLAGBLOCK; + return error; + } + if (!args->agbp) { + /* cannot allocate in this AG at all */ + trace_xfs_alloc_vextent_noagbp(args); + args->agbno = NULLAGBLOCK; return 0; } + args->wasfromfl = 0; + return 0; +} - switch (type) { - case XFS_ALLOCTYPE_THIS_AG: - case XFS_ALLOCTYPE_NEAR_BNO: - case XFS_ALLOCTYPE_THIS_BNO: - /* - * These three force us into a single a.g. - */ - args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - args->pag = xfs_perag_get(mp, args->agno); - error = xfs_alloc_fix_freelist(args, 0); - if (error) { - trace_xfs_alloc_vextent_nofix(args); - goto error0; - } - if (!args->agbp) { - trace_xfs_alloc_vextent_noagbp(args); +/* + * Post-process allocation results to account for the allocation if it succeed + * and set the allocated block number correctly for the caller. + * + * XXX: we should really be returning ENOSPC for ENOSPC, not + * hiding it behind a "successful" NULLFSBLOCK allocation. + */ +static int +xfs_alloc_vextent_finish( + struct xfs_alloc_arg *args, + xfs_agnumber_t minimum_agno, + int alloc_error, + bool drop_perag) +{ + struct xfs_mount *mp = args->mp; + int error = 0; + + /* + * We can end up here with a locked AGF. If we failed, the caller is + * likely going to try to allocate again with different parameters, and + * that can widen the AGs that are searched for free space. If we have + * to do BMBT block allocation, we have to do a new allocation. + * + * Hence leaving this function with the AGF locked opens up potential + * ABBA AGF deadlocks because a future allocation attempt in this + * transaction may attempt to lock a lower number AGF. + * + * We can't release the AGF until the transaction is commited, so at + * this point we must update the "first allocation" tracker to point at + * this AG if the tracker is empty or points to a lower AG. This allows + * the next allocation attempt to be modified appropriately to avoid + * deadlocks. + */ + if (args->agbp && + (args->tp->t_highest_agno == NULLAGNUMBER || + args->agno > minimum_agno)) + args->tp->t_highest_agno = args->agno; + + /* + * If the allocation failed with an error or we had an ENOSPC result, + * preserve the returned error whilst also marking the allocation result + * as "no extent allocated". This ensures that callers that fail to + * capture the error will still treat it as a failed allocation. + */ + if (alloc_error || args->agbno == NULLAGBLOCK) { + args->fsbno = NULLFSBLOCK; + error = alloc_error; + goto out_drop_perag; + } + + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); + + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(args->agbno % args->alignment == 0); + XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); + + /* if not file data, insert new block into the reverse map btree */ + if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { + error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, + args->agbno, args->len, &args->oinfo); + if (error) + goto out_drop_perag; + } + + if (!args->wasfromfl) { + error = xfs_alloc_update_counters(args->tp, args->agbp, + -((long)(args->len))); + if (error) + goto out_drop_perag; + + ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, + args->len)); + } + + xfs_ag_resv_alloc_extent(args->pag, args->resv, args); + + XFS_STATS_INC(mp, xs_allocx); + XFS_STATS_ADD(mp, xs_allocb, args->len); + +out_drop_perag: + if (drop_perag && args->pag) { + xfs_perag_rele(args->pag); + args->pag = NULL; + } + return error; +} + +/* + * Allocate within a single AG only. This uses a best-fit length algorithm so if + * you need an exact sized allocation without locality constraints, this is the + * fastest way to do it. + * + * Caller is expected to hold a perag reference in args->pag. + */ +int +xfs_alloc_vextent_this_ag( + struct xfs_alloc_arg *args, + xfs_agnumber_t agno) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + int error; + + args->agno = agno; + args->agbno = 0; + error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), + &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + error = xfs_alloc_vextent_prepare_ag(args); + if (!error && args->agbp) + error = xfs_alloc_ag_vextent_size(args); + + return xfs_alloc_vextent_finish(args, minimum_agno, error, false); +} + +/* + * Iterate all AGs trying to allocate an extent starting from @start_ag. + * + * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the + * allocation attempts in @start_agno have locality information. If we fail to + * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs + * we attempt to allocation in as there is no locality optimisation possible for + * those allocations. + * + * On return, args->pag may be left referenced if we finish before the "all + * failed" return point. The allocation finish still needs the perag, and + * so the caller will release it once they've finished the allocation. + * + * When we wrap the AG iteration at the end of the filesystem, we have to be + * careful not to wrap into AGs below ones we already have locked in the + * transaction if we are doing a blocking iteration. This will result in an + * out-of-order locking of AGFs and hence can cause deadlocks. + */ +static int +xfs_alloc_vextent_iterate_ags( + struct xfs_alloc_arg *args, + xfs_agnumber_t minimum_agno, + xfs_agnumber_t start_agno, + xfs_agblock_t target_agbno, + uint32_t flags) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t agno; + int error = 0; + +restart: + for_each_perag_wrap_range(mp, start_agno, minimum_agno, + mp->m_sb.sb_agcount, agno, args->pag) { + args->agno = agno; + error = xfs_alloc_vextent_prepare_ag(args); + if (error) break; + if (!args->agbp) { + trace_xfs_alloc_vextent_loopfailed(args); + continue; } - args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); - if ((error = xfs_alloc_ag_vextent(args))) - goto error0; - break; - case XFS_ALLOCTYPE_START_BNO: - /* - * Try near allocation first, then anywhere-in-ag after - * the first a.g. fails. - */ - if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && - xfs_is_inode32(mp)) { - args->fsbno = XFS_AGB_TO_FSB(mp, - ((mp->m_agfrotor / rotorstep) % - mp->m_sb.sb_agcount), 0); - bump_rotor = 1; - } - args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); - args->type = XFS_ALLOCTYPE_NEAR_BNO; - fallthrough; - case XFS_ALLOCTYPE_FIRST_AG: + /* - * Rotate through the allocation groups looking for a winner. + * Allocation is supposed to succeed now, so break out of the + * loop regardless of whether we succeed or not. */ - if (type == XFS_ALLOCTYPE_FIRST_AG) { - /* - * Start with allocation group given by bno. - */ - args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - args->type = XFS_ALLOCTYPE_THIS_AG; - sagno = 0; - flags = 0; + if (args->agno == start_agno && target_agbno) { + args->agbno = target_agbno; + error = xfs_alloc_ag_vextent_near(args); } else { - /* - * Start with the given allocation group. - */ - args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); - flags = XFS_ALLOC_FLAG_TRYLOCK; - } - /* - * Loop over allocation groups twice; first time with - * trylock set, second time without. - */ - for (;;) { - args->pag = xfs_perag_get(mp, args->agno); - error = xfs_alloc_fix_freelist(args, flags); - if (error) { - trace_xfs_alloc_vextent_nofix(args); - goto error0; - } - /* - * If we get a buffer back then the allocation will fly. - */ - if (args->agbp) { - if ((error = xfs_alloc_ag_vextent(args))) - goto error0; - break; - } - - trace_xfs_alloc_vextent_loopfailed(args); - - /* - * Didn't work, figure out the next iteration. - */ - if (args->agno == sagno && - type == XFS_ALLOCTYPE_START_BNO) - args->type = XFS_ALLOCTYPE_THIS_AG; - /* - * For the first allocation, we can try any AG to get - * space. However, if we already have allocated a - * block, we don't want to try AGs whose number is below - * sagno. Otherwise, we may end up with out-of-order - * locking of AGF, which might cause deadlock. - */ - if (++(args->agno) == mp->m_sb.sb_agcount) { - if (args->tp->t_firstblock != NULLFSBLOCK) - args->agno = sagno; - else - args->agno = 0; - } - /* - * Reached the starting a.g., must either be done - * or switch to non-trylock mode. - */ - if (args->agno == sagno) { - if (flags == 0) { - args->agbno = NULLAGBLOCK; - trace_xfs_alloc_vextent_allfailed(args); - break; - } - - flags = 0; - if (type == XFS_ALLOCTYPE_START_BNO) { - args->agbno = XFS_FSB_TO_AGBNO(mp, - args->fsbno); - args->type = XFS_ALLOCTYPE_NEAR_BNO; - } - } - xfs_perag_put(args->pag); - } - if (bump_rotor) { - if (args->agno == sagno) - mp->m_agfrotor = (mp->m_agfrotor + 1) % - (mp->m_sb.sb_agcount * rotorstep); - else - mp->m_agfrotor = (args->agno * rotorstep + 1) % - (mp->m_sb.sb_agcount * rotorstep); + args->agbno = 0; + error = xfs_alloc_ag_vextent_size(args); } break; - default: - ASSERT(0); - /* NOTREACHED */ } - if (args->agbno == NULLAGBLOCK) - args->fsbno = NULLFSBLOCK; - else { - args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); -#ifdef DEBUG - ASSERT(args->len >= args->minlen); - ASSERT(args->len <= args->maxlen); - ASSERT(args->agbno % args->alignment == 0); - XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), - args->len); -#endif + if (error) { + xfs_perag_rele(args->pag); + args->pag = NULL; + return error; + } + if (args->agbp) + return 0; + /* + * We didn't find an AG we can alloation from. If we were given + * constraining flags by the caller, drop them and retry the allocation + * without any constraints being set. + */ + if (flags) { + flags = 0; + goto restart; } - xfs_perag_put(args->pag); + + ASSERT(args->pag == NULL); + trace_xfs_alloc_vextent_allfailed(args); return 0; -error0: - xfs_perag_put(args->pag); - return error; +} + +/* + * Iterate from the AGs from the start AG to the end of the filesystem, trying + * to allocate blocks. It starts with a near allocation attempt in the initial + * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap + * back to zero if allowed by previous allocations in this transaction, + * otherwise will wrap back to the start AG and run a second blocking pass to + * the end of the filesystem. + */ +int +xfs_alloc_vextent_start_ag( + struct xfs_alloc_arg *args, + xfs_fsblock_t target) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + xfs_agnumber_t start_agno; + xfs_agnumber_t rotorstep = xfs_rotorstep; + bool bump_rotor = false; + int error; + + args->agno = NULLAGNUMBER; + args->agbno = NULLAGBLOCK; + error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && + xfs_is_inode32(mp)) { + target = XFS_AGB_TO_FSB(mp, + ((mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount), 0); + bump_rotor = 1; + } + + start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); + error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, + XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK); + + if (bump_rotor) { + if (args->agno == start_agno) + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + else + mp->m_agfrotor = (args->agno * rotorstep + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } + + return xfs_alloc_vextent_finish(args, minimum_agno, error, true); +} + +/* + * Iterate from the agno indicated via @target through to the end of the + * filesystem attempting blocking allocation. This does not wrap or try a second + * pass, so will not recurse into AGs lower than indicated by the target. + */ +int +xfs_alloc_vextent_first_ag( + struct xfs_alloc_arg *args, + xfs_fsblock_t target) + { + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + xfs_agnumber_t start_agno; + int error; + + args->agno = NULLAGNUMBER; + args->agbno = NULLAGBLOCK; + error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); + error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, + XFS_FSB_TO_AGBNO(mp, target), 0); + return xfs_alloc_vextent_finish(args, minimum_agno, error, true); +} + +/* + * Allocate at the exact block target or fail. Caller is expected to hold a + * perag reference in args->pag. + */ +int +xfs_alloc_vextent_exact_bno( + struct xfs_alloc_arg *args, + xfs_fsblock_t target) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + int error; + + args->agno = XFS_FSB_TO_AGNO(mp, target); + args->agbno = XFS_FSB_TO_AGBNO(mp, target); + error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + error = xfs_alloc_vextent_prepare_ag(args); + if (!error && args->agbp) + error = xfs_alloc_ag_vextent_exact(args); + + return xfs_alloc_vextent_finish(args, minimum_agno, error, false); +} + +/* + * Allocate an extent as close to the target as possible. If there are not + * viable candidates in the AG, then fail the allocation. + * + * Caller may or may not have a per-ag reference in args->pag. + */ +int +xfs_alloc_vextent_near_bno( + struct xfs_alloc_arg *args, + xfs_fsblock_t target) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + bool needs_perag = args->pag == NULL; + int error; + + args->agno = XFS_FSB_TO_AGNO(mp, target); + args->agbno = XFS_FSB_TO_AGBNO(mp, target); + error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + if (needs_perag) + args->pag = xfs_perag_grab(mp, args->agno); + + error = xfs_alloc_vextent_prepare_ag(args); + if (!error && args->agbp) + error = xfs_alloc_ag_vextent_near(args); + + return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); } /* Ensure that the freelist is at full capacity. */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2c3f762dfb58..2b246d74c189 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -17,25 +17,6 @@ extern struct workqueue_struct *xfs_alloc_wq; unsigned int xfs_agfl_size(struct xfs_mount *mp); /* - * Freespace allocation types. Argument to xfs_alloc_[v]extent. - */ -#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ -#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ -#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ -#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ -#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ - -/* this should become an enum again when the tracing code is fixed */ -typedef unsigned int xfs_alloctype_t; - -#define XFS_ALLOC_TYPES \ - { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ - { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ - { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ - { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ - { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" } - -/* * Flags for xfs_alloc_fix_freelist. */ #define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ @@ -68,8 +49,6 @@ typedef struct xfs_alloc_arg { xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */ xfs_agblock_t max_agbno; /* ... */ xfs_extlen_t len; /* output: actual size of extent */ - xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ - xfs_alloctype_t otype; /* original allocation type */ int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ @@ -118,11 +97,43 @@ xfs_alloc_log_agf( uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ /* - * Allocate an extent (variable-size). + * Allocate an extent anywhere in the specific AG given. If there is no + * space matching the requirements in that AG, then the allocation will fail. */ -int /* error */ -xfs_alloc_vextent( - xfs_alloc_arg_t *args); /* allocation argument structure */ +int xfs_alloc_vextent_this_ag(struct xfs_alloc_arg *args, xfs_agnumber_t agno); + +/* + * Allocate an extent as close to the target as possible. If there are not + * viable candidates in the AG, then fail the allocation. + */ +int xfs_alloc_vextent_near_bno(struct xfs_alloc_arg *args, + xfs_fsblock_t target); + +/* + * Allocate an extent exactly at the target given. If this is not possible + * then the allocation fails. + */ +int xfs_alloc_vextent_exact_bno(struct xfs_alloc_arg *args, + xfs_fsblock_t target); + +/* + * Best effort full filesystem allocation scan. + * + * Locality aware allocation will be attempted in the initial AG, but on failure + * non-localised attempts will be made. The AGs are constrained by previous + * allocations in the current transaction. Two passes will be made - the first + * non-blocking, the second blocking. + */ +int xfs_alloc_vextent_start_ag(struct xfs_alloc_arg *args, + xfs_fsblock_t target); + +/* + * Iterate from the AG indicated from args->fsbno through to the end of the + * filesystem attempting blocking allocation. This is for use in last + * resort allocation attempts when everything else has failed. + */ +int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg *args, + xfs_fsblock_t target); /* * Free an extent. diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 549a3cba0234..0f29c7b1b39f 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -315,7 +315,7 @@ xfs_allocbt_verify( level = be16_to_cpu(block->bb_level); if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) btnum = XFS_BTNUM_CNTi; - if (pag && pag->pagf_init) { + if (pag && xfs_perag_initialised_agf(pag)) { if (level >= pag->pagf_levels[btnum]) return __this_address; } else if (level >= mp->m_alloc_maxlevels) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c8c65387136c..34de6e6898c4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -645,34 +645,23 @@ xfs_bmap_extents_to_btree( args.tp = tp; args.mp = mp; xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork); - if (tp->t_firstblock == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (tp->t_flags & XFS_TRANS_LOWMODE) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = tp->t_firstblock; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = tp->t_firstblock; - } + args.minlen = args.maxlen = args.prod = 1; args.wasdel = wasdel; *logflagsp = 0; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(mp, ip->i_ino)); if (error) goto out_root_realloc; + /* + * Allocation can't fail, the space was reserved. + */ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { error = -ENOSPC; goto out_root_realloc; } - /* - * Allocation can't fail, the space was reserved. - */ - ASSERT(tp->t_firstblock == NULLFSBLOCK || - args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock)); - tp->t_firstblock = args.fsbno; cur->bc_ino.allocated++; ip->i_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); @@ -799,28 +788,24 @@ xfs_bmap_local_to_extents( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; + args.total = total; + args.minlen = args.maxlen = args.prod = 1; xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0); + /* * Allocate a block. We know we need only one, since the * file currently fits in an inode. */ - if (tp->t_firstblock == NULLFSBLOCK) { - args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.fsbno = tp->t_firstblock; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } args.total = total; args.minlen = args.maxlen = args.prod = 1; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(args.mp, ip->i_ino)); if (error) goto done; /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); - tp->t_firstblock = args.fsbno; error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, XFS_FSB_TO_DADDR(args.mp, args.fsbno), args.mp->m_bsize, 0, &bp); @@ -854,8 +839,7 @@ xfs_bmap_local_to_extents( ifp->if_nextents = 1; ip->i_nblocks = 1; - xfs_trans_mod_dquot_byino(tp, ip, - XFS_TRANS_DQ_BCOUNT, 1L); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); flags |= xfs_ilog_fext(whichfork); done: @@ -3025,9 +3009,7 @@ xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_fsblock_t adjust; /* adjustment to block numbers */ - xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ xfs_mount_t *mp; /* mount point structure */ - int nullfb; /* true if ap->firstblock isn't set */ int rt; /* true if inode is realtime */ #define ISVALID(x,y) \ @@ -3038,11 +3020,8 @@ xfs_bmap_adjacent( XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) mp = ap->ip->i_mount; - nullfb = ap->tp->t_firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && (ap->datatype & XFS_ALLOC_USERDATA); - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, - ap->tp->t_firstblock); /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. @@ -3101,13 +3080,6 @@ xfs_bmap_adjacent( prevbno += adjust; else prevdiff += adjust; - /* - * If the firstblock forbids it, can't use it, - * must use default. - */ - if (!rt && !nullfb && - XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno) - prevbno = NULLFSBLOCK; } /* * No previous block or can't follow it, just default. @@ -3143,13 +3115,6 @@ xfs_bmap_adjacent( gotdiff += adjust - ap->length; } else gotdiff += adjust; - /* - * If the firstblock forbids it, can't use it, - * must use default. - */ - if (!rt && !nullfb && - XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno) - gotbno = NULLFSBLOCK; } /* * No next block, just default. @@ -3170,147 +3135,91 @@ xfs_bmap_adjacent( #undef ISVALID } -static int +int xfs_bmap_longest_free_extent( + struct xfs_perag *pag, struct xfs_trans *tp, - xfs_agnumber_t ag, - xfs_extlen_t *blen, - int *notinit) + xfs_extlen_t *blen) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; xfs_extlen_t longest; int error = 0; - pag = xfs_perag_get(mp, ag); - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK, NULL); - if (error) { - /* Couldn't lock the AGF, so skip this AG. */ - if (error == -EAGAIN) { - *notinit = 1; - error = 0; - } - goto out; - } + if (error) + return error; } longest = xfs_alloc_longest_free_extent(pag, - xfs_alloc_min_freelist(mp, pag), + xfs_alloc_min_freelist(pag->pag_mount, pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; -out: - xfs_perag_put(pag); - return error; + return 0; } -static void +static xfs_extlen_t xfs_bmap_select_minlen( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, - xfs_extlen_t *blen, - int notinit) + xfs_extlen_t blen) { - if (notinit || *blen < ap->minlen) { - /* - * Since we did a BUF_TRYLOCK above, it is possible that - * there is space for this request. - */ - args->minlen = ap->minlen; - } else if (*blen < args->maxlen) { - /* - * If the best seen length is less than the request length, - * use the best as the minimum. - */ - args->minlen = *blen; - } else { - /* - * Otherwise we've seen an extent as big as maxlen, use that - * as the minimum. - */ - args->minlen = args->maxlen; - } -} - -STATIC int -xfs_bmap_btalloc_nullfb( - struct xfs_bmalloca *ap, - struct xfs_alloc_arg *args, - xfs_extlen_t *blen) -{ - struct xfs_mount *mp = ap->ip->i_mount; - xfs_agnumber_t ag, startag; - int notinit = 0; - int error; - args->type = XFS_ALLOCTYPE_START_BNO; - args->total = ap->total; - - startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); - if (startag == NULLAGNUMBER) - startag = ag = 0; - - while (*blen < args->maxlen) { - error = xfs_bmap_longest_free_extent(args->tp, ag, blen, - ¬init); - if (error) - return error; - - if (++ag == mp->m_sb.sb_agcount) - ag = 0; - if (ag == startag) - break; - } + /* + * Since we used XFS_ALLOC_FLAG_TRYLOCK in _longest_free_extent(), it is + * possible that there is enough contiguous free space for this request. + */ + if (blen < ap->minlen) + return ap->minlen; - xfs_bmap_select_minlen(ap, args, blen, notinit); - return 0; + /* + * If the best seen length is less than the request length, + * use the best as the minimum, otherwise we've got the maxlen we + * were asked for. + */ + if (blen < args->maxlen) + return blen; + return args->maxlen; } -STATIC int -xfs_bmap_btalloc_filestreams( +static int +xfs_bmap_btalloc_select_lengths( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, xfs_extlen_t *blen) { - struct xfs_mount *mp = ap->ip->i_mount; - xfs_agnumber_t ag; - int notinit = 0; - int error; - - args->type = XFS_ALLOCTYPE_NEAR_BNO; - args->total = ap->total; - - ag = XFS_FSB_TO_AGNO(mp, args->fsbno); - if (ag == NULLAGNUMBER) - ag = 0; - - error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); - if (error) - return error; + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag; + xfs_agnumber_t agno, startag; + int error = 0; - if (*blen < args->maxlen) { - error = xfs_filestream_new_ag(ap, &ag); - if (error) - return error; + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { + args->total = ap->minlen; + args->minlen = ap->minlen; + return 0; + } - error = xfs_bmap_longest_free_extent(args->tp, ag, blen, - ¬init); - if (error) - return error; + args->total = ap->total; + startag = XFS_FSB_TO_AGNO(mp, ap->blkno); + if (startag == NULLAGNUMBER) + startag = 0; + *blen = 0; + for_each_perag_wrap(mp, startag, agno, pag) { + error = xfs_bmap_longest_free_extent(pag, args->tp, blen); + if (error && error != -EAGAIN) + break; + error = 0; + if (*blen >= args->maxlen) + break; } + if (pag) + xfs_perag_rele(pag); - xfs_bmap_select_minlen(ap, args, blen, notinit); - - /* - * Set the failure fallback case to look in the selected AG as stream - * may have moved. - */ - ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); - return 0; + args->minlen = xfs_bmap_select_minlen(ap, args, *blen); + return error; } /* Update all inode and quota accounting for the allocation we just did. */ @@ -3413,21 +3322,7 @@ xfs_bmap_process_allocated_extent( xfs_fileoff_t orig_offset, xfs_extlen_t orig_length) { - int nullfb; - - nullfb = ap->tp->t_firstblock == NULLFSBLOCK; - - /* - * check the allocation happened at the same or higher AG than - * the first block that was allocated. - */ - ASSERT(nullfb || - XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <= - XFS_FSB_TO_AGNO(args->mp, args->fsbno)); - ap->blkno = args->fsbno; - if (nullfb) - ap->tp->t_firstblock = args->fsbno; ap->length = args->len; /* * If the extent size hint is active, we tried to round the @@ -3474,23 +3369,17 @@ xfs_bmap_exact_minlen_extent_alloc( xfs_bmap_compute_alignments(ap, &args); - if (ap->tp->t_firstblock == NULLFSBLOCK) { - /* - * Unlike the longest extent available in an AG, we don't track - * the length of an AG's shortest extent. - * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and - * hence we can afford to start traversing from the 0th AG since - * we need not be concerned about a drop in performance in - * "debug only" code paths. - */ - ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); - } else { - ap->blkno = ap->tp->t_firstblock; - } + /* + * Unlike the longest extent available in an AG, we don't track + * the length of an AG's shortest extent. + * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and + * hence we can afford to start traversing from the 0th AG since + * we need not be concerned about a drop in performance in + * "debug only" code paths. + */ + ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); - args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; - args.type = XFS_ALLOCTYPE_FIRST_AG; args.minlen = args.maxlen = ap->minlen; args.total = ap->total; @@ -3502,7 +3391,7 @@ xfs_bmap_exact_minlen_extent_alloc( args.resv = XFS_AG_RESV_NONE; args.datatype = ap->datatype; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_first_ag(&args, ap->blkno); if (error) return error; @@ -3522,193 +3411,270 @@ xfs_bmap_exact_minlen_extent_alloc( #endif -STATIC int -xfs_bmap_btalloc( - struct xfs_bmalloca *ap) +/* + * If we are not low on available data blocks and we are allocating at + * EOF, optimise allocation for contiguous file extension and/or stripe + * alignment of the new extent. + * + * NOTE: ap->aeof is only set if the allocation length is >= the + * stripe unit and the allocation offset is at the end of file. + */ +static int +xfs_bmap_btalloc_at_eof( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t blen, + int stripe_align, + bool ag_only) { - struct xfs_mount *mp = ap->ip->i_mount; - struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; - xfs_alloctype_t atype = 0; - xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t ag; - xfs_fileoff_t orig_offset; - xfs_extlen_t orig_length; - xfs_extlen_t blen; - xfs_extlen_t nextminlen = 0; - int nullfb; /* true if ap->firstblock isn't set */ - int isaligned; - int tryagain; + struct xfs_mount *mp = args->mp; + struct xfs_perag *caller_pag = args->pag; int error; - int stripe_align; - - ASSERT(ap->length); - orig_offset = ap->offset; - orig_length = ap->length; - - stripe_align = xfs_bmap_compute_alignments(ap, &args); - - nullfb = ap->tp->t_firstblock == NULLFSBLOCK; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, - ap->tp->t_firstblock); - if (nullfb) { - if ((ap->datatype & XFS_ALLOC_USERDATA) && - xfs_inode_is_filestream(ap->ip)) { - ag = xfs_filestream_lookup_ag(ap->ip); - ag = (ag != NULLAGNUMBER) ? ag : 0; - ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); - } else { - ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); - } - } else - ap->blkno = ap->tp->t_firstblock; - xfs_bmap_adjacent(ap); - - /* - * If allowed, use ap->blkno; otherwise must use firstblock since - * it's in the right allocation group. - */ - if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) - ; - else - ap->blkno = ap->tp->t_firstblock; /* - * Normal allocation, done through xfs_alloc_vextent. + * If there are already extents in the file, try an exact EOF block + * allocation to extend the file as a contiguous extent. If that fails, + * or it's the first allocation in a file, just try for a stripe aligned + * allocation. */ - tryagain = isaligned = 0; - args.fsbno = ap->blkno; - args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + if (ap->offset) { + xfs_extlen_t nextminlen = 0; - /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = min(ap->length, mp->m_ag_max_usable); - blen = 0; - if (nullfb) { /* - * Search for an allocation group with a single extent large - * enough for the request. If one isn't found, then adjust - * the minimum allocation size to the largest space found. + * Compute the minlen+alignment for the next case. Set slop so + * that the value of minlen+alignment+slop doesn't go up between + * the calls. */ - if ((ap->datatype & XFS_ALLOC_USERDATA) && - xfs_inode_is_filestream(ap->ip)) - error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); + args->alignment = 1; + if (blen > stripe_align && blen <= args->maxlen) + nextminlen = blen - stripe_align; + else + nextminlen = args->minlen; + if (nextminlen + stripe_align > args->minlen + 1) + args->minalignslop = nextminlen + stripe_align - + args->minlen - 1; else - error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + args->minalignslop = 0; + + if (!caller_pag) + args->pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ap->blkno)); + error = xfs_alloc_vextent_exact_bno(args, ap->blkno); + if (!caller_pag) + xfs_perag_put(args->pag); if (error) return error; - } else if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { - if (xfs_inode_is_filestream(ap->ip)) - args.type = XFS_ALLOCTYPE_FIRST_AG; - else - args.type = XFS_ALLOCTYPE_START_BNO; - args.total = args.minlen = ap->minlen; + + if (args->fsbno != NULLFSBLOCK) + return 0; + /* + * Exact allocation failed. Reset to try an aligned allocation + * according to the original allocation specification. + */ + args->pag = NULL; + args->alignment = stripe_align; + args->minlen = nextminlen; + args->minalignslop = 0; } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.total = ap->total; - args.minlen = ap->minlen; + /* + * Adjust minlen to try and preserve alignment if we + * can't guarantee an aligned maxlen extent. + */ + args->alignment = stripe_align; + if (blen > args->alignment && + blen <= args->maxlen + args->alignment) + args->minlen = blen - args->alignment; + args->minalignslop = 0; } - /* - * If we are not low on available data blocks, and the underlying - * logical volume manager is a stripe, and the file offset is zero then - * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof - * is only set if the allocation length is >= the stripe unit and the - * allocation offset is at the end of file. - */ - if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) { - if (!ap->offset) { - args.alignment = stripe_align; - atype = args.type; - isaligned = 1; - /* - * Adjust minlen to try and preserve alignment if we - * can't guarantee an aligned maxlen extent. - */ - if (blen > args.alignment && - blen <= args.maxlen + args.alignment) - args.minlen = blen - args.alignment; - args.minalignslop = 0; - } else { - /* - * First try an exact bno allocation. - * If it fails then do a near or start bno - * allocation with alignment turned on. - */ - atype = args.type; - tryagain = 1; - args.type = XFS_ALLOCTYPE_THIS_BNO; - args.alignment = 1; - /* - * Compute the minlen+alignment for the - * next case. Set slop so that the value - * of minlen+alignment+slop doesn't go up - * between the calls. - */ - if (blen > stripe_align && blen <= args.maxlen) - nextminlen = blen - stripe_align; - else - nextminlen = args.minlen; - if (nextminlen + stripe_align > args.minlen + 1) - args.minalignslop = - nextminlen + stripe_align - - args.minlen - 1; - else - args.minalignslop = 0; - } + if (ag_only) { + error = xfs_alloc_vextent_near_bno(args, ap->blkno); } else { - args.alignment = 1; - args.minalignslop = 0; + args->pag = NULL; + error = xfs_alloc_vextent_start_ag(args, ap->blkno); + ASSERT(args->pag == NULL); + args->pag = caller_pag; } - args.minleft = ap->minleft; - args.wasdel = ap->wasdel; - args.resv = XFS_AG_RESV_NONE; - args.datatype = ap->datatype; - - error = xfs_alloc_vextent(&args); if (error) return error; - if (tryagain && args.fsbno == NULLFSBLOCK) { - /* - * Exact allocation failed. Now try with alignment - * turned on. - */ - args.type = atype; - args.fsbno = ap->blkno; - args.alignment = stripe_align; - args.minlen = nextminlen; - args.minalignslop = 0; - isaligned = 1; - if ((error = xfs_alloc_vextent(&args))) - return error; - } - if (isaligned && args.fsbno == NULLFSBLOCK) { - /* - * allocation failed, so turn off alignment and - * try again. - */ - args.type = atype; - args.fsbno = ap->blkno; - args.alignment = 0; - if ((error = xfs_alloc_vextent(&args))) + if (args->fsbno != NULLFSBLOCK) + return 0; + + /* + * Allocation failed, so turn return the allocation args to their + * original non-aligned state so the caller can proceed on allocation + * failure as if this function was never called. + */ + args->fsbno = ap->blkno; + args->alignment = 1; + return 0; +} + +/* + * We have failed multiple allocation attempts so now are in a low space + * allocation situation. Try a locality first full filesystem minimum length + * allocation whilst still maintaining necessary total block reservation + * requirements. + * + * If that fails, we are now critically low on space, so perform a last resort + * allocation attempt: no reserve, no locality, blocking, minimum length, full + * filesystem free space scan. We also indicate to future allocations in this + * transaction that we are critically low on space so they don't waste time on + * allocation modes that are unlikely to succeed. + */ +int +xfs_bmap_btalloc_low_space( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) +{ + int error; + + if (args->minlen > ap->minlen) { + args->minlen = ap->minlen; + error = xfs_alloc_vextent_start_ag(args, ap->blkno); + if (error || args->fsbno != NULLFSBLOCK) return error; } - if (args.fsbno == NULLFSBLOCK && nullfb && - args.minlen > ap->minlen) { - args.minlen = ap->minlen; - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = ap->blkno; - if ((error = xfs_alloc_vextent(&args))) - return error; + + /* Last ditch attempt before failure is declared. */ + args->total = ap->minlen; + error = xfs_alloc_vextent_first_ag(args, 0); + if (error) + return error; + ap->tp->t_flags |= XFS_TRANS_LOWMODE; + return 0; +} + +static int +xfs_bmap_btalloc_filestreams( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + int stripe_align) +{ + xfs_extlen_t blen = 0; + int error = 0; + + + error = xfs_filestream_select_ag(ap, args, &blen); + if (error) + return error; + ASSERT(args->pag); + + /* + * If we are in low space mode, then optimal allocation will fail so + * prepare for minimal allocation and jump to the low space algorithm + * immediately. + */ + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { + args->minlen = ap->minlen; + ASSERT(args->fsbno == NULLFSBLOCK); + goto out_low_space; } - if (args.fsbno == NULLFSBLOCK && nullfb) { - args.fsbno = 0; - args.type = XFS_ALLOCTYPE_FIRST_AG; - args.total = ap->minlen; - if ((error = xfs_alloc_vextent(&args))) + + args->minlen = xfs_bmap_select_minlen(ap, args, blen); + if (ap->aeof) + error = xfs_bmap_btalloc_at_eof(ap, args, blen, stripe_align, + true); + + if (!error && args->fsbno == NULLFSBLOCK) + error = xfs_alloc_vextent_near_bno(args, ap->blkno); + +out_low_space: + /* + * We are now done with the perag reference for the filestreams + * association provided by xfs_filestream_select_ag(). Release it now as + * we've either succeeded, had a fatal error or we are out of space and + * need to do a full filesystem scan for free space which will take it's + * own references. + */ + xfs_perag_rele(args->pag); + args->pag = NULL; + if (error || args->fsbno != NULLFSBLOCK) + return error; + + return xfs_bmap_btalloc_low_space(ap, args); +} + +static int +xfs_bmap_btalloc_best_length( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + int stripe_align) +{ + xfs_extlen_t blen = 0; + int error; + + ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); + xfs_bmap_adjacent(ap); + + /* + * Search for an allocation group with a single extent large enough for + * the request. If one isn't found, then adjust the minimum allocation + * size to the largest space found. + */ + error = xfs_bmap_btalloc_select_lengths(ap, args, &blen); + if (error) + return error; + + /* + * Don't attempt optimal EOF allocation if previous allocations barely + * succeeded due to being near ENOSPC. It is highly unlikely we'll get + * optimal or even aligned allocations in this case, so don't waste time + * trying. + */ + if (ap->aeof && !(ap->tp->t_flags & XFS_TRANS_LOWMODE)) { + error = xfs_bmap_btalloc_at_eof(ap, args, blen, stripe_align, + false); + if (error || args->fsbno != NULLFSBLOCK) return error; - ap->tp->t_flags |= XFS_TRANS_LOWMODE; } + error = xfs_alloc_vextent_start_ag(args, ap->blkno); + if (error || args->fsbno != NULLFSBLOCK) + return error; + + return xfs_bmap_btalloc_low_space(ap, args); +} + +static int +xfs_bmap_btalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_alloc_arg args = { + .tp = ap->tp, + .mp = mp, + .fsbno = NULLFSBLOCK, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .minleft = ap->minleft, + .wasdel = ap->wasdel, + .resv = XFS_AG_RESV_NONE, + .datatype = ap->datatype, + .alignment = 1, + .minalignslop = 0, + }; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; + int error; + int stripe_align; + + ASSERT(ap->length); + orig_offset = ap->offset; + orig_length = ap->length; + + stripe_align = xfs_bmap_compute_alignments(ap, &args); + + /* Trim the allocation back to the maximum an AG can fit. */ + args.maxlen = min(ap->length, mp->m_ag_max_usable); + + if ((ap->datatype & XFS_ALLOC_USERDATA) && + xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align); + else + error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align); + if (error) + return error; + if (args.fsbno != NULLFSBLOCK) { xfs_bmap_process_allocated_extent(ap, &args, orig_offset, orig_length); @@ -4256,7 +4222,7 @@ xfs_bmapi_convert_unwritten( return 0; } -static inline xfs_extlen_t +xfs_extlen_t xfs_bmapi_minleft( struct xfs_trans *tp, struct xfs_inode *ip, @@ -4264,7 +4230,7 @@ xfs_bmapi_minleft( { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork); - if (tp && tp->t_firstblock != NULLFSBLOCK) + if (tp && tp->t_highest_agno != NULLAGNUMBER) return 0; if (ifp->if_format != XFS_DINODE_FMT_BTREE) return 1; @@ -6151,7 +6117,7 @@ xfs_bmap_finish_one( struct xfs_bmbt_irec *bmap = &bi->bi_bmap; int error = 0; - ASSERT(tp->t_firstblock == NULLFSBLOCK); + ASSERT(tp->t_highest_agno == NULLAGNUMBER); trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 01c2df35c3e3..dd08361ca5a6 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -12,6 +12,7 @@ struct xfs_ifork; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_alloc_arg; /* * Argument structure for xfs_bmap_alloc. @@ -168,6 +169,8 @@ static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) #define xfs_valid_startblock(ip, startblock) \ ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) +int xfs_bmap_longest_free_extent(struct xfs_perag *pag, + struct xfs_trans *tp, xfs_extlen_t *blen); void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); @@ -220,6 +223,10 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp); +xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, + int fork); +int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cfa052d40105..b8ad95050c9b 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -21,6 +21,7 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_rmap.h" +#include "xfs_ag.h" static struct kmem_cache *xfs_bmbt_cur_cache; @@ -184,11 +185,11 @@ xfs_bmbt_update_cursor( struct xfs_btree_cur *src, struct xfs_btree_cur *dst) { - ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) || + ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) || (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME)); dst->bc_ino.allocated += src->bc_ino.allocated; - dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock; + dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno; src->bc_ino.allocated = 0; } @@ -200,46 +201,32 @@ xfs_bmbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - xfs_alloc_arg_t args; /* block allocation args */ - int error; /* error return value */ + struct xfs_alloc_arg args; + int error; memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.fsbno = cur->bc_tp->t_firstblock; xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork); - - if (args.fsbno == NULLFSBLOCK) { - args.fsbno = be64_to_cpu(start->l); - args.type = XFS_ALLOCTYPE_START_BNO; - /* - * Make sure there is sufficient room left in the AG to - * complete a full tree split for an extent insert. If - * we are converting the middle part of an extent then - * we may need space for two tree splits. - * - * We are relying on the caller to make the correct block - * reservation for this operation to succeed. If the - * reservation amount is insufficient then we may fail a - * block allocation here and corrupt the filesystem. - */ - args.minleft = args.tp->t_blk_res; - } else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) { - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } - args.minlen = args.maxlen = args.prod = 1; args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; - if (!args.wasdel && args.tp->t_blk_res == 0) { - error = -ENOSPC; - goto error0; - } - error = xfs_alloc_vextent(&args); + if (!args.wasdel && args.tp->t_blk_res == 0) + return -ENOSPC; + + /* + * If we are coming here from something like unwritten extent + * conversion, there has been no data extent allocation already done, so + * we have to ensure that we attempt to locate the entire set of bmbt + * allocations in the same AG, as xfs_bmapi_write() would have reserved. + */ + if (cur->bc_tp->t_highest_agno == NULLAGNUMBER) + args.minleft = xfs_bmapi_minleft(cur->bc_tp, cur->bc_ino.ip, + cur->bc_ino.whichfork); + + error = xfs_alloc_vextent_start_ag(&args, be64_to_cpu(start->l)); if (error) - goto error0; + return error; if (args.fsbno == NULLFSBLOCK && args.minleft) { /* @@ -247,11 +234,10 @@ xfs_bmbt_alloc_block( * a full btree split. Try again and if * successful activate the lowspace algorithm. */ - args.fsbno = 0; - args.type = XFS_ALLOCTYPE_FIRST_AG; - error = xfs_alloc_vextent(&args); + args.minleft = 0; + error = xfs_alloc_vextent_start_ag(&args, 0); if (error) - goto error0; + return error; cur->bc_tp->t_flags |= XFS_TRANS_LOWMODE; } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { @@ -260,7 +246,6 @@ xfs_bmbt_alloc_block( } ASSERT(args.len == 1); - cur->bc_tp->t_firstblock = args.fsbno; cur->bc_ino.allocated++; cur->bc_ino.ip->i_nblocks++; xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); @@ -271,9 +256,6 @@ xfs_bmbt_alloc_block( *stat = 1; return 0; - - error0: - return error; } STATIC int diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index da8c769887fd..c4649cc624e1 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2943,7 +2943,7 @@ xfs_btree_split( DECLARE_COMPLETION_ONSTACK(done); if (cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_tp->t_firstblock == NULLFSBLOCK) + cur->bc_tp->t_highest_agno == NULLAGNUMBER) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); args.cur = cur; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 5118dedf9267..7ee292aecbeb 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -169,10 +169,9 @@ xfs_inobt_insert_rec( */ STATIC int xfs_inobt_insert( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t newino, xfs_agino_t newlen, xfs_btnum_t btnum) @@ -182,7 +181,7 @@ xfs_inobt_insert( int i; int error; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); for (thisino = newino; thisino < newino + newlen; @@ -514,20 +513,20 @@ __xfs_inobt_rec_merge( */ STATIC int xfs_inobt_insert_sprec( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, int btnum, struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ bool merge) /* merge or replace */ { + struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; int error; int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -609,9 +608,9 @@ error: */ STATIC int xfs_ialloc_ag_alloc( + struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf *agbp, - struct xfs_perag *pag) + struct xfs_buf *agbp) { struct xfs_agi *agi; struct xfs_alloc_arg args; @@ -631,6 +630,7 @@ xfs_ialloc_ag_alloc( args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; args.oinfo = XFS_RMAP_OINFO_INODES; + args.pag = pag; #ifdef DEBUG /* randomly do sparse inode allocations */ @@ -662,8 +662,6 @@ xfs_ialloc_ag_alloc( goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); - args.type = XFS_ALLOCTYPE_THIS_BNO; args.prod = 1; /* @@ -684,7 +682,10 @@ xfs_ialloc_ag_alloc( /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_exact_bno(&args, + XFS_AGB_TO_FSB(args.mp, pag->pag_agno, + args.agbno)); + if (error) return error; /* @@ -717,22 +718,17 @@ xfs_ialloc_ag_alloc( } else args.alignment = igeo->cluster_align; /* - * Need to figure out where to allocate the inode blocks. - * Ideally they should be spaced out through the a.g. - * For now, just allocate blocks up front. - */ - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); - /* * Allocate a fixed-size extent of inodes. */ - args.type = XFS_ALLOCTYPE_NEAR_BNO; args.prod = 1; /* * Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, pag->pag_agno, + be32_to_cpu(agi->agi_root))); + if (error) return error; } @@ -741,11 +737,11 @@ xfs_ialloc_ag_alloc( * alignment. */ if (isaligned && args.fsbno == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = igeo->cluster_align; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, pag->pag_agno, + be32_to_cpu(agi->agi_root))); + if (error) return error; } @@ -757,9 +753,6 @@ xfs_ialloc_ag_alloc( igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; @@ -781,7 +774,9 @@ sparse_alloc: args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, pag->pag_agno, + be32_to_cpu(agi->agi_root))); if (error) return error; @@ -831,7 +826,7 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + error = xfs_inobt_insert_sprec(pag, tp, agbp, XFS_BTNUM_INO, &rec, true); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, @@ -856,20 +851,20 @@ sparse_alloc: * existing record with this one. */ if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + error = xfs_inobt_insert_sprec(pag, tp, agbp, XFS_BTNUM_FINO, &rec, false); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, + error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, XFS_BTNUM_INO); if (error) return error; if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, + error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, XFS_BTNUM_FINO); if (error) return error; @@ -981,9 +976,9 @@ xfs_inobt_first_free_inode( */ STATIC int xfs_dialloc_ag_inobt( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { @@ -999,12 +994,12 @@ xfs_dialloc_ag_inobt( int i, j; int searchdistance = 10; - ASSERT(pag->pagi_init); - ASSERT(pag->pagi_inodeok); + ASSERT(xfs_perag_initialised_agi(pag)); + ASSERT(xfs_perag_allows_inodes(pag)); ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1429,9 +1424,9 @@ xfs_dialloc_ag_update_inobt( */ static int xfs_dialloc_ag( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { @@ -1448,7 +1443,7 @@ xfs_dialloc_ag( int i; if (!xfs_has_finobt(mp)) - return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); + return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop); /* * If pagino is 0 (this is the root inode allocation) use newino. @@ -1457,7 +1452,7 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); error = xfs_check_agi_freecount(cur); if (error) @@ -1500,7 +1495,7 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_check_agi_freecount(icur); if (error) @@ -1577,25 +1572,10 @@ xfs_dialloc_roll( return error; } -static xfs_agnumber_t -xfs_ialloc_next_ag( - xfs_mount_t *mp) -{ - xfs_agnumber_t agno; - - spin_lock(&mp->m_agirotor_lock); - agno = mp->m_agirotor; - if (++mp->m_agirotor >= mp->m_maxagi) - mp->m_agirotor = 0; - spin_unlock(&mp->m_agirotor_lock); - - return agno; -} - static bool xfs_dialloc_good_ag( - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, umode_t mode, int flags, bool ok_alloc) @@ -1606,10 +1586,12 @@ xfs_dialloc_good_ag( int needspace; int error; - if (!pag->pagi_inodeok) + if (!pag) + return false; + if (!xfs_perag_allows_inodes(pag)) return false; - if (!pag->pagi_init) { + if (!xfs_perag_initialised_agi(pag)) { error = xfs_ialloc_read_agi(pag, tp, NULL); if (error) return false; @@ -1620,7 +1602,7 @@ xfs_dialloc_good_ag( if (!ok_alloc) return false; - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, flags, NULL); if (error) return false; @@ -1665,8 +1647,8 @@ xfs_dialloc_good_ag( static int xfs_dialloc_try_ag( - struct xfs_trans **tpp, struct xfs_perag *pag, + struct xfs_trans **tpp, xfs_ino_t parent, xfs_ino_t *new_ino, bool ok_alloc) @@ -1689,7 +1671,7 @@ xfs_dialloc_try_ag( goto out_release; } - error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); + error = xfs_ialloc_ag_alloc(pag, *tpp, agbp); if (error < 0) goto out_release; @@ -1705,7 +1687,7 @@ xfs_dialloc_try_ag( } /* Allocate an inode in the found AG */ - error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); + error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino); if (!error) *new_ino = ino; return error; @@ -1737,8 +1719,9 @@ xfs_dialloc( struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); bool ok_alloc = true; + bool low_space = false; int flags; - xfs_ino_t ino; + xfs_ino_t ino = NULLFSINO; /* * Directories, symlinks, and regular files frequently allocate at least @@ -1746,7 +1729,8 @@ xfs_dialloc( * an AG has enough space for file creation. */ if (S_ISDIR(mode)) - start_agno = xfs_ialloc_next_ag(mp); + start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) % + mp->m_maxagi; else { start_agno = XFS_INO_TO_AGNO(mp, parent); if (start_agno >= mp->m_maxagi) @@ -1768,41 +1752,55 @@ xfs_dialloc( } /* + * If we are near to ENOSPC, we want to prefer allocation from AGs that + * have free inodes in them rather than use up free space allocating new + * inode chunks. Hence we turn off allocation for the first non-blocking + * pass through the AGs if we are near ENOSPC to consume free inodes + * that we can immediately allocate, but then we allow allocation on the + * second pass if we fail to find an AG with free inodes in it. + */ + if (percpu_counter_read_positive(&mp->m_fdblocks) < + mp->m_low_space[XFS_LOWSP_1_PCNT]) { + ok_alloc = false; + low_space = true; + } + + /* * Loop until we find an allocation group that either has free inodes * or in which we can allocate some inodes. Iterate through the * allocation groups upward, wrapping at the end. */ - agno = start_agno; flags = XFS_ALLOC_FLAG_TRYLOCK; - for (;;) { - pag = xfs_perag_get(mp, agno); - if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) { - error = xfs_dialloc_try_ag(tpp, pag, parent, +retry: + for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) { + if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) { + error = xfs_dialloc_try_ag(pag, tpp, parent, &ino, ok_alloc); if (error != -EAGAIN) break; + error = 0; } if (xfs_is_shutdown(mp)) { error = -EFSCORRUPTED; break; } - if (++agno == mp->m_maxagi) - agno = 0; - if (agno == start_agno) { - if (!flags) { - error = -ENOSPC; - break; - } + } + if (pag) + xfs_perag_rele(pag); + if (error) + return error; + if (ino == NULLFSINO) { + if (flags) { flags = 0; + if (low_space) + ok_alloc = true; + goto retry; } - xfs_perag_put(pag); + return -ENOSPC; } - - if (!error) - *new_ino = ino; - xfs_perag_put(pag); - return error; + *new_ino = ino; + return 0; } /* @@ -1885,14 +1883,14 @@ next: STATIC int xfs_difree_inobt( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t agino, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { + struct xfs_mount *mp = pag->pag_mount; struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -1907,7 +1905,7 @@ xfs_difree_inobt( /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_check_agi_freecount(cur); if (error) @@ -2019,20 +2017,20 @@ error0: */ STATIC int xfs_difree_finobt( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { + struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2179,7 +2177,7 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec); + error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec); if (error) goto error0; @@ -2187,7 +2185,7 @@ xfs_difree( * Fix up the free inode btree. */ if (xfs_has_finobt(mp)) { - error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec); + error = xfs_difree_finobt(pag, tp, agbp, agino, &rec); if (error) goto error0; } @@ -2200,15 +2198,15 @@ error0: STATIC int xfs_imap_lookup( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_agino_t agino, xfs_agblock_t agbno, xfs_agblock_t *chunk_agbno, xfs_agblock_t *offset_agbno, int flags) { + struct xfs_mount *mp = pag->pag_mount; struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; @@ -2229,7 +2227,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -2263,12 +2261,13 @@ xfs_imap_lookup( */ int xfs_imap( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ + struct xfs_perag *pag, + struct xfs_trans *tp, xfs_ino_t ino, /* inode to locate */ struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { + struct xfs_mount *mp = pag->pag_mount; xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ @@ -2276,17 +2275,15 @@ xfs_imap( int error; /* error code */ int offset; /* index of inode in its buffer */ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ - struct xfs_perag *pag; ASSERT(ino != NULLFSINO); /* * Split up the inode number into its parts. */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (!pag || agbno >= mp->m_sb.sb_agblocks || + if (agbno >= mp->m_sb.sb_agblocks || ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { error = -EINVAL; #ifdef DEBUG @@ -2295,20 +2292,14 @@ xfs_imap( * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) - goto out_drop; - if (!pag) { - xfs_alert(mp, - "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", - __func__, XFS_INO_TO_AGNO(mp, ino), - mp->m_sb.sb_agcount); - } + return error; if (agbno >= mp->m_sb.sb_agblocks) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, (unsigned long)mp->m_sb.sb_agblocks); } - if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_alert(mp, "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", __func__, ino, @@ -2316,7 +2307,7 @@ xfs_imap( } xfs_stack_trace(); #endif /* DEBUG */ - goto out_drop; + return error; } /* @@ -2327,10 +2318,10 @@ xfs_imap( * in all cases where an untrusted inode number is passed. */ if (flags & XFS_IGET_UNTRUSTED) { - error = xfs_imap_lookup(mp, tp, pag, agino, agbno, + error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - goto out_drop; + return error; goto out_map; } @@ -2346,8 +2337,7 @@ xfs_imap( imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); - error = 0; - goto out_drop; + return 0; } /* @@ -2359,10 +2349,10 @@ xfs_imap( offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - error = xfs_imap_lookup(mp, tp, pag, agino, agbno, + error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - goto out_drop; + return error; } out_map: @@ -2390,14 +2380,9 @@ out_map: __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); - error = -EINVAL; - goto out_drop; + return -EINVAL; } - error = 0; -out_drop: - if (pag) - xfs_perag_put(pag); - return error; + return 0; } /* @@ -2613,10 +2598,10 @@ xfs_ialloc_read_agi( return error; agi = agibp->b_addr; - if (!pag->pagi_init) { + if (!xfs_perag_initialised_agi(pag)) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); - pag->pagi_init = 1; + set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } /* @@ -2924,26 +2909,24 @@ xfs_ialloc_calc_rootino( */ int xfs_ialloc_check_shrink( + struct xfs_perag *pag, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf *agibp, xfs_agblock_t new_length) { struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; - xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length); + xfs_agino_t agino; int has; int error; - if (!xfs_has_sparseinodes(mp)) + if (!xfs_has_sparseinodes(pag->pag_mount)) return 0; - pag = xfs_perag_get(mp, agno); - cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO); /* Look up the inobt record that would correspond to the new EOFS. */ + agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; @@ -2964,6 +2947,5 @@ xfs_ialloc_check_shrink( } out: xfs_btree_del_cursor(cur, error); - xfs_perag_put(pag); return error; } diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 9bbbca6ac4ed..ab8c30b4ec22 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -12,6 +12,7 @@ struct xfs_imap; struct xfs_mount; struct xfs_trans; struct xfs_btree_cur; +struct xfs_perag; /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 @@ -47,7 +48,7 @@ int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag, */ int xfs_imap( - struct xfs_mount *mp, /* file system mount structure */ + struct xfs_perag *pag, struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t ino, /* inode to locate */ struct xfs_imap *imap, /* location map structure */ @@ -106,7 +107,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); -int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno, +int xfs_ialloc_check_shrink(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agibp, xfs_agblock_t new_length); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 8c83e265770c..9b28211d5a4c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -36,8 +36,8 @@ STATIC struct xfs_btree_cur * xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); + return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + cur->bc_ag.agbp, cur->bc_btnum); } STATIC void @@ -103,15 +103,15 @@ __xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; + args.pag = cur->bc_ag.pag; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; - args.type = XFS_ALLOCTYPE_NEAR_BNO; args.resv = resv; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno)); if (error) return error; @@ -291,8 +291,8 @@ xfs_inobt_verify( * Similarly, during log recovery we will have a perag structure * attached, but the agi information will not yet have been initialised * from the on disk AGI. We don't currently use any of this information, - * but beware of the landmine (i.e. need to check pag->pagi_init) if we - * ever do. + * but beware of the landmine (i.e. need to check + * xfs_perag_initialised_agi(pag)) if we ever do. */ if (xfs_has_crc(mp)) { fa = xfs_btree_sblock_v5hdr_verify(bp); @@ -427,11 +427,11 @@ static const struct xfs_btree_ops xfs_finobt_ops = { */ static struct xfs_btree_cur * xfs_inobt_init_common( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ struct xfs_perag *pag, + struct xfs_trans *tp, /* transaction pointer */ xfs_btnum_t btnum) /* ialloc or free ino btree */ { + struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; cur = xfs_btree_alloc_cursor(mp, tp, btnum, @@ -456,16 +456,15 @@ xfs_inobt_init_common( /* Create an inode btree cursor. */ struct xfs_btree_cur * xfs_inobt_init_cursor( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; struct xfs_agi *agi = agbp->b_addr; - cur = xfs_inobt_init_common(mp, tp, pag, btnum); + cur = xfs_inobt_init_common(pag, tp, btnum); if (btnum == XFS_BTNUM_INO) cur->bc_nlevels = be32_to_cpu(agi->agi_level); else @@ -477,14 +476,13 @@ xfs_inobt_init_cursor( /* Create an inode btree cursor with a fake root for staging. */ struct xfs_btree_cur * xfs_inobt_stage_cursor( - struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag, + struct xbtree_afakeroot *afake, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - cur = xfs_inobt_init_common(mp, NULL, pag, btnum); + cur = xfs_inobt_init_common(pag, NULL, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } @@ -708,9 +706,8 @@ xfs_inobt_max_size( /* Read AGI and create inobt cursor. */ int xfs_inobt_cur( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_btnum_t which, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp) @@ -725,16 +722,15 @@ xfs_inobt_cur( if (error) return error; - cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which); + cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which); *curpp = cur; return 0; } static int xfs_inobt_count_blocks( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { @@ -742,7 +738,7 @@ xfs_inobt_count_blocks( struct xfs_btree_cur *cur = NULL; int error; - error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp); + error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp); if (error) return error; @@ -779,22 +775,21 @@ xfs_finobt_read_blocks( */ int xfs_finobt_calc_reserves( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_extlen_t *ask, xfs_extlen_t *used) { xfs_extlen_t tree_len = 0; int error; - if (!xfs_has_finobt(mp)) + if (!xfs_has_finobt(pag->pag_mount)) return 0; - if (xfs_has_inobtcounts(mp)) + if (xfs_has_inobtcounts(pag->pag_mount)) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else - error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, + error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO, &tree_len); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 26451cb76b98..e859a6e05230 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -46,12 +46,10 @@ struct xfs_perag; (maxrecs) * sizeof(xfs_inobt_key_t) + \ ((index) - 1) * sizeof(xfs_inobt_ptr_t))) -extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_btnum_t btnum); -struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag, - xfs_btnum_t btnum); +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum); +struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag, + struct xbtree_afakeroot *afake, xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ @@ -64,13 +62,13 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #define xfs_inobt_rec_check_count(mp, rec) 0 #endif /* DEBUG */ -int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); +int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp, + xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); -int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_perag *pag, xfs_btnum_t btnum, - struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); +int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp, + xfs_btnum_t btnum, struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp); void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index e1f789866683..f3b860970b26 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -67,14 +67,14 @@ xfs_refcountbt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, - xfs_refc_block(args.mp)); + args.pag = cur->bc_ag.pag; args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, + xfs_refc_block(args.mp))); if (error) goto out_error; trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -227,7 +227,7 @@ xfs_refcountbt_verify( return fa; level = be16_to_cpu(block->bb_level); - if (pag && pag->pagf_init) { + if (pag && xfs_perag_initialised_agf(pag)) { if (level >= pag->pagf_refcount_level) return __this_address; } else if (level >= mp->m_refc_maxlevels) diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 7f83f62e51e0..d3285684bb5e 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -313,7 +313,7 @@ xfs_rmapbt_verify( return fa; level = be16_to_cpu(block->bb_level); - if (pag && pag->pagf_init) { + if (pag && xfs_perag_initialised_agf(pag)) { if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) return __this_address; } else if (level >= mp->m_rmap_maxlevels) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 1eeecf2eb2a7..99cc03a298e2 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -909,7 +909,8 @@ xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { - mp->m_agfrotor = mp->m_agirotor = 0; + mp->m_agfrotor = 0; + atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d75d82151eeb..c37e6d72760b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -191,14 +191,15 @@ xrep_agf_init_header( struct xfs_agf *old_agf) { struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; struct xfs_agf *agf = agf_bp->b_addr; memcpy(old_agf, agf, sizeof(*old_agf)); memset(agf, 0, BBTOB(agf_bp->b_length)); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agf->agf_length = cpu_to_be32(sc->sa.pag->block_count); + agf->agf_seqno = cpu_to_be32(pag->pag_agno); + agf->agf_length = cpu_to_be32(pag->block_count); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; @@ -206,8 +207,8 @@ xrep_agf_init_header( uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); /* Mark the incore AGF data stale until we're done fixing things. */ - ASSERT(sc->sa.pag->pagf_init); - sc->sa.pag->pagf_init = 0; + ASSERT(xfs_perag_initialised_agf(pag)); + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } /* Set btree root information in an AGF. */ @@ -333,7 +334,7 @@ xrep_agf_commit_new( pag->pagf_levels[XFS_BTNUM_RMAPi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - pag->pagf_init = 1; + set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); return 0; } @@ -434,7 +435,7 @@ xrep_agf( out_revert: /* Mark the incore AGF state stale and revert the AGF. */ - sc->sa.pag->pagf_init = 0; + clear_bit(XFS_AGSTATE_AGF_INIT, &sc->sa.pag->pag_opstate); memcpy(agf, &old_agf, sizeof(old_agf)); return error; } @@ -618,7 +619,7 @@ xrep_agfl_update_agf( xfs_force_summary_recalc(sc->mp); /* Update the AGF counters. */ - if (sc->sa.pag->pagf_init) + if (xfs_perag_initialised_agf(sc->sa.pag)) sc->sa.pag->pagf_flcount = flcount; agf->agf_flfirst = cpu_to_be32(0); agf->agf_flcount = cpu_to_be32(flcount); @@ -822,14 +823,15 @@ xrep_agi_init_header( struct xfs_agi *old_agi) { struct xfs_agi *agi = agi_bp->b_addr; + struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; memcpy(old_agi, agi, sizeof(*old_agi)); memset(agi, 0, BBTOB(agi_bp->b_length)); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agi->agi_length = cpu_to_be32(sc->sa.pag->block_count); + agi->agi_seqno = cpu_to_be32(pag->pag_agno); + agi->agi_length = cpu_to_be32(pag->block_count); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) @@ -840,8 +842,8 @@ xrep_agi_init_header( sizeof(agi->agi_unlinked)); /* Mark the incore AGF data stale until we're done fixing things. */ - ASSERT(sc->sa.pag->pagi_init); - sc->sa.pag->pagi_init = 0; + ASSERT(xfs_perag_initialised_agi(pag)); + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } /* Set btree root information in an AGI. */ @@ -873,8 +875,7 @@ xrep_agi_calc_from_btrees( xfs_agino_t freecount; int error; - cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, - sc->sa.pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; @@ -894,8 +895,8 @@ xrep_agi_calc_from_btrees( if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; - cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, - sc->sa.pag, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, + XFS_BTNUM_FINO); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -929,7 +930,7 @@ xrep_agi_commit_new( pag = sc->sa.pag; pag->pagi_count = be32_to_cpu(agi->agi_count); pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); - pag->pagi_init = 1; + set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); return 0; } @@ -994,7 +995,7 @@ xrep_agi( out_revert: /* Mark the incore AGI state stale and revert the AGI. */ - sc->sa.pag->pagi_init = 0; + clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate); memcpy(agi, &old_agi, sizeof(old_agi)); return error; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index d50d0eab196a..dbbc7037074c 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -662,7 +662,7 @@ xchk_bmap_check_rmaps( error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { - xfs_perag_put(pag); + xfs_perag_rele(pag); return error; } } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 613260b04a3d..848a8e32e56f 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -478,15 +478,15 @@ xchk_ag_btcur_init( /* Set up a inobt cursor for cross-referencing. */ if (sa->agi_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { - sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - sa->pag, XFS_BTNUM_INO); + sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, + XFS_BTNUM_INO); } /* Set up a finobt cursor for cross-referencing. */ if (sa->agi_bp && xfs_has_finobt(mp) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { - sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - sa->pag, XFS_BTNUM_FINO); + sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, + XFS_BTNUM_FINO); } /* Set up a rmapbt cursor for cross-referencing. */ @@ -636,6 +636,7 @@ xchk_get_inode( { struct xfs_imap imap; struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); struct xfs_inode *ip = NULL; int error; @@ -671,10 +672,14 @@ xchk_get_inode( * Otherwise, we really couldn't find it so tell userspace * that it no longer exists. */ - error = xfs_imap(sc->mp, sc->tp, sc->sm->sm_ino, &imap, - XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); - if (error) - return -ENOENT; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (pag) { + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); + xfs_perag_put(pag); + if (error) + return -ENOENT; + } error = -EFSCORRUPTED; fallthrough; default: diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 4777e7b89fdc..f0c7f41897b9 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -86,7 +86,8 @@ xchk_fscount_warmup( for_each_perag(mp, agno, pag) { if (xchk_should_terminate(sc, &error)) break; - if (pag->pagi_init && pag->pagf_init) + if (xfs_perag_initialised_agi(pag) && + xfs_perag_initialised_agf(pag)) continue; /* Lock both AG headers. */ @@ -101,7 +102,8 @@ xchk_fscount_warmup( * These are supposed to be initialized by the header read * function. */ - if (!pag->pagi_init || !pag->pagf_init) { + if (!xfs_perag_initialised_agi(pag) || + !xfs_perag_initialised_agf(pag)) { error = -EFSCORRUPTED; break; } @@ -117,7 +119,7 @@ xchk_fscount_warmup( if (agi_bp) xfs_buf_relse(agi_bp); if (pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); return error; } @@ -220,7 +222,8 @@ retry: break; /* This somehow got unset since the warmup? */ - if (!pag->pagi_init || !pag->pagf_init) { + if (!xfs_perag_initialised_agi(pag) || + !xfs_perag_initialised_agf(pag)) { error = -EFSCORRUPTED; break; } @@ -249,7 +252,7 @@ retry: } if (pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); if (error) { xchk_set_incomplete(sc); return error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 4b92f9253ccd..1b71174ec0d6 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -206,7 +206,7 @@ xrep_calc_ag_resblks( return 0; pag = xfs_perag_get(mp, sm->sm_agno); - if (pag->pagi_init) { + if (xfs_perag_initialised_agi(pag)) { /* Use in-core icount if possible. */ icount = pag->pagi_count; } else { @@ -326,15 +326,14 @@ xrep_alloc_ag_block( args.tp = sc->tp; args.mp = sc->mp; + args.pag = sc->sa.pag; args.oinfo = *oinfo; - args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0); args.minlen = 1; args.maxlen = 1; args.prod = 1; - args.type = XFS_ALLOCTYPE_THIS_AG; args.resv = resv; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno); if (error) return error; if (args.fsbno == NULLFSBLOCK) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 867645b74d88..a09dd2606479 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1410,7 +1410,7 @@ xfs_swap_extent_rmap( /* Unmap the old blocks in the source file. */ while (tirec.br_blockcount) { - ASSERT(tp->t_firstblock == NULLFSBLOCK); + ASSERT(tp->t_highest_agno == NULLAGNUMBER); trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec); /* Read extent from the source file */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index bfc829c07f03..afc4c78b9eed 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -21,23 +21,20 @@ STATIC int xfs_trim_extents( - struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, uint64_t *blocks_trimmed) { + struct xfs_mount *mp = pag->pag_mount; struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; struct xfs_agf *agf; - struct xfs_perag *pag; int error; int i; - pag = xfs_perag_get(mp, agno); - /* * Force out the log. This means any transactions that might have freed * space before we take the AGF buffer lock are now on disk, and the @@ -47,7 +44,7 @@ xfs_trim_extents( error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) - goto out_put_perag; + return error; agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); @@ -71,10 +68,10 @@ xfs_trim_extents( error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) - goto out_del_cursor; + break; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; - goto out_del_cursor; + break; } ASSERT(flen <= be32_to_cpu(agf->agf_longest)); @@ -83,15 +80,15 @@ xfs_trim_extents( * the format the range/len variables are supplied in by * userspace. */ - dbno = XFS_AGB_TO_DADDR(mp, agno, fbno); + dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno); dlen = XFS_FSB_TO_BB(mp, flen); /* * Too small? Give up. */ if (dlen < minlen) { - trace_xfs_discard_toosmall(mp, agno, fbno, flen); - goto out_del_cursor; + trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + break; } /* @@ -100,7 +97,7 @@ xfs_trim_extents( * down partially overlapping ranges for now. */ if (dbno + dlen < start || dbno > end) { - trace_xfs_discard_exclude(mp, agno, fbno, flen); + trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); goto next_extent; } @@ -109,32 +106,30 @@ xfs_trim_extents( * discard and try again the next time. */ if (xfs_extent_busy_search(mp, pag, fbno, flen)) { - trace_xfs_discard_busy(mp, agno, fbno, flen); + trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen); goto next_extent; } - trace_xfs_discard_extent(mp, agno, fbno, flen); + trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen); error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); if (error) - goto out_del_cursor; + break; *blocks_trimmed += flen; next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) - goto out_del_cursor; + break; if (fatal_signal_pending(current)) { error = -ERESTARTSYS; - goto out_del_cursor; + break; } } out_del_cursor: xfs_btree_del_cursor(cur, error); xfs_buf_relse(agbp); -out_put_perag: - xfs_perag_put(pag); return error; } @@ -152,11 +147,12 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { + struct xfs_perag *pag; unsigned int granularity = bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); struct fstrim_range range; xfs_daddr_t start, end, minlen; - xfs_agnumber_t start_agno, end_agno, agno; + xfs_agnumber_t agno; uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -193,18 +189,18 @@ xfs_ioc_trim( end = start + BTOBBT(range.len) - 1; if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) - end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; - - start_agno = xfs_daddr_to_agno(mp, start); - end_agno = xfs_daddr_to_agno(mp, end); + end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1; - for (agno = start_agno; agno <= end_agno; agno++) { - error = xfs_trim_extents(mp, agno, start, end, minlen, + agno = xfs_daddr_to_agno(mp, start); + for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { + error = xfs_trim_extents(pag, start, end, minlen, &blocks_trimmed); if (error) { last_error = error; - if (error == -ERESTARTSYS) + if (error == -ERESTARTSYS) { + xfs_perag_rele(pag); break; + } } } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 34b21a29c39b..22c13933c8f8 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -12,6 +12,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_alloc.h" #include "xfs_mru_cache.h" #include "xfs_trace.h" @@ -22,7 +23,7 @@ struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; - xfs_agnumber_t ag; /* AG in use for this directory */ + struct xfs_perag *pag; /* AG in use for this directory */ }; enum xfs_fstrm_alloc { @@ -30,117 +31,68 @@ enum xfs_fstrm_alloc { XFS_PICK_LOWSPACE = 2, }; -/* - * Allocation group filestream associations are tracked with per-ag atomic - * counters. These counters allow xfs_filestream_pick_ag() to tell whether a - * particular AG already has active filestreams associated with it. - */ -int -xfs_filestream_peek_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_read(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - -static int -xfs_filestream_get_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_inc_return(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - -static void -xfs_filestream_put_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, agno); - atomic_dec(&pag->pagf_fstrms); - xfs_perag_put(pag); -} - static void xfs_fstrm_free_func( void *data, struct xfs_mru_cache_elem *mru) { - struct xfs_mount *mp = data; struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); + struct xfs_perag *pag = item->pag; - xfs_filestream_put_ag(mp, item->ag); - trace_xfs_filestream_free(mp, mru->key, item->ag); + trace_xfs_filestream_free(pag, mru->key); + atomic_dec(&pag->pagf_fstrms); + xfs_perag_rele(pag); kmem_free(item); } /* - * Scan the AGs starting at startag looking for an AG that isn't in use and has - * at least minlen blocks free. + * Scan the AGs starting at start_agno looking for an AG that isn't in use and + * has at least minlen blocks free. If no AG is found to match the allocation + * requirements, pick the AG with the most free space in it. */ static int xfs_filestream_pick_ag( - struct xfs_inode *ip, - xfs_agnumber_t startag, - xfs_agnumber_t *agp, + struct xfs_alloc_arg *args, + xfs_ino_t pino, + xfs_agnumber_t start_agno, int flags, - xfs_extlen_t minlen) + xfs_extlen_t *longest) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_fstrm_item *item; + struct xfs_mount *mp = args->mp; struct xfs_perag *pag; - xfs_extlen_t longest, free = 0, minfree, maxfree = 0; - xfs_agnumber_t ag, max_ag = NULLAGNUMBER; - int err, trylock, nscan; - - ASSERT(S_ISDIR(VFS_I(ip)->i_mode)); + struct xfs_perag *max_pag = NULL; + xfs_extlen_t minlen = *longest; + xfs_extlen_t free = 0, minfree, maxfree = 0; + xfs_agnumber_t agno; + bool first_pass = true; + int err; /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; - ag = startag; - *agp = NULLAGNUMBER; - - /* For the first pass, don't sleep trying to init the per-AG. */ - trylock = XFS_ALLOC_FLAG_TRYLOCK; - - for (nscan = 0; 1; nscan++) { - trace_xfs_filestream_scan(mp, ip->i_ino, ag); - - pag = xfs_perag_get(mp, ag); - - if (!pag->pagf_init) { - err = xfs_alloc_read_agf(pag, NULL, trylock, NULL); - if (err) { - if (err != -EAGAIN) { - xfs_perag_put(pag); - return err; - } - /* Couldn't lock the AGF, skip this AG. */ - goto next_ag; - } +restart: + for_each_perag_wrap(mp, start_agno, agno, pag) { + trace_xfs_filestream_scan(pag, pino); + *longest = 0; + err = xfs_bmap_longest_free_extent(pag, NULL, longest); + if (err) { + xfs_perag_rele(pag); + if (err != -EAGAIN) + break; + /* Couldn't lock the AGF, skip this AG. */ + err = 0; + continue; } /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; - max_ag = ag; + if (max_pag) + xfs_perag_rele(max_pag); + atomic_inc(&pag->pag_active_ref); + max_pag = pag; } /* @@ -149,93 +101,73 @@ xfs_filestream_pick_ag( * loop, and it guards against two filestreams being established * in the same AG as each other. */ - if (xfs_filestream_get_ag(mp, ag) > 1) { - xfs_filestream_put_ag(mp, ag); - goto next_ag; - } - - longest = xfs_alloc_longest_free_extent(pag, - xfs_alloc_min_freelist(mp, pag), - xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); - if (((minlen && longest >= minlen) || - (!minlen && pag->pagf_freeblks >= minfree)) && - (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || - (flags & XFS_PICK_LOWSPACE))) { - - /* Break out, retaining the reference on the AG. */ - free = pag->pagf_freeblks; - xfs_perag_put(pag); - *agp = ag; - break; + if (atomic_inc_return(&pag->pagf_fstrms) <= 1) { + if (((minlen && *longest >= minlen) || + (!minlen && pag->pagf_freeblks >= minfree)) && + (!xfs_perag_prefers_metadata(pag) || + !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + /* Break out, retaining the reference on the AG. */ + free = pag->pagf_freeblks; + break; + } } /* Drop the reference on this AG, it's not usable. */ - xfs_filestream_put_ag(mp, ag); -next_ag: - xfs_perag_put(pag); - /* Move to the next AG, wrapping to AG 0 if necessary. */ - if (++ag >= mp->m_sb.sb_agcount) - ag = 0; - - /* If a full pass of the AGs hasn't been done yet, continue. */ - if (ag != startag) - continue; + atomic_dec(&pag->pagf_fstrms); + } - /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */ - if (trylock != 0) { - trylock = 0; - continue; + if (err) { + xfs_perag_rele(pag); + if (max_pag) + xfs_perag_rele(max_pag); + return err; + } + + if (!pag) { + /* + * Allow a second pass to give xfs_bmap_longest_free_extent() + * another attempt at locking AGFs that it might have skipped + * over before we fail. + */ + if (first_pass) { + first_pass = false; + goto restart; } - /* Finally, if lowspace wasn't set, set it for the 3rd pass. */ + /* + * We must be low on data space, so run a final lowspace + * optimised selection pass if we haven't already. + */ if (!(flags & XFS_PICK_LOWSPACE)) { flags |= XFS_PICK_LOWSPACE; - continue; + goto restart; } /* - * Take the AG with the most free space, regardless of whether - * it's already in use by another filestream. + * No unassociated AGs are available, so select the AG with the + * most free space, regardless of whether it's already in use by + * another filestream. It none suit, just use whatever AG we can + * grab. */ - if (max_ag != NULLAGNUMBER) { - xfs_filestream_get_ag(mp, max_ag); + if (!max_pag) { + for_each_perag_wrap(args->mp, 0, start_agno, args->pag) + break; + atomic_inc(&args->pag->pagf_fstrms); + *longest = 0; + } else { + pag = max_pag; free = maxfree; - *agp = max_ag; - break; + atomic_inc(&pag->pagf_fstrms); } - - /* take AG 0 if none matched */ - trace_xfs_filestream_pick(ip, *agp, free, nscan); - *agp = 0; - return 0; - } - - trace_xfs_filestream_pick(ip, *agp, free, nscan); - - if (*agp == NULLAGNUMBER) - return 0; - - err = -ENOMEM; - item = kmem_alloc(sizeof(*item), KM_MAYFAIL); - if (!item) - goto out_put_ag; - - item->ag = *agp; - - err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); - if (err) { - if (err == -EEXIST) - err = 0; - goto out_free_item; + } else if (max_pag) { + xfs_perag_rele(max_pag); } + trace_xfs_filestream_pick(pag, pino, free); + args->pag = pag; return 0; -out_free_item: - kmem_free(item); -out_put_ag: - xfs_filestream_put_ag(mp, *agp); - return err; } static struct xfs_inode * @@ -263,104 +195,187 @@ out: } /* - * Find the right allocation group for a file, either by finding an - * existing file stream or creating a new one. + * Lookup the mru cache for an existing association. If one exists and we can + * use it, return with an active perag reference indicating that the allocation + * will proceed with that association. * - * Returns NULLAGNUMBER in case of an error. + * If we have no association, or we cannot use the current one and have to + * destroy it, return with longest = 0 to tell the caller to create a new + * association. */ -xfs_agnumber_t -xfs_filestream_lookup_ag( - struct xfs_inode *ip) +static int +xfs_filestream_lookup_association( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_ino_t pino, + xfs_extlen_t *longest) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_inode *pip = NULL; - xfs_agnumber_t startag, ag = NULLAGNUMBER; + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag; struct xfs_mru_cache_elem *mru; + int error = 0; - ASSERT(S_ISREG(VFS_I(ip)->i_mode)); - - pip = xfs_filestream_get_parent(ip); - if (!pip) - return NULLAGNUMBER; + *longest = 0; + mru = xfs_mru_cache_lookup(mp->m_filestream, pino); + if (!mru) + return 0; + /* + * Grab the pag and take an extra active reference for the caller whilst + * the mru item cannot go away. This means we'll pin the perag with + * the reference we get here even if the filestreams association is torn + * down immediately after we mark the lookup as done. + */ + pag = container_of(mru, struct xfs_fstrm_item, mru)->pag; + atomic_inc(&pag->pag_active_ref); + xfs_mru_cache_done(mp->m_filestream); - mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); - if (mru) { - ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; - xfs_mru_cache_done(mp->m_filestream); + trace_xfs_filestream_lookup(pag, ap->ip->i_ino); - trace_xfs_filestream_lookup(mp, ip->i_ino, ag); - goto out; - } + ap->blkno = XFS_AGB_TO_FSB(args->mp, pag->pag_agno, 0); + xfs_bmap_adjacent(ap); /* - * Set the starting AG using the rotor for inode32, otherwise - * use the directory inode's AG. + * If there is very little free space before we start a filestreams + * allocation, we're almost guaranteed to fail to find a large enough + * free space available so just use the cached AG. */ - if (xfs_is_inode32(mp)) { - xfs_agnumber_t rotorstep = xfs_rotorstep; - startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; - mp->m_agfrotor = (mp->m_agfrotor + 1) % - (mp->m_sb.sb_agcount * rotorstep); - } else - startag = XFS_INO_TO_AGNO(mp, pip->i_ino); + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { + *longest = 1; + goto out_done; + } - if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0)) - ag = NULLAGNUMBER; -out: - xfs_irele(pip); - return ag; + error = xfs_bmap_longest_free_extent(pag, args->tp, longest); + if (error == -EAGAIN) + error = 0; + if (error || *longest < args->maxlen) { + /* We aren't going to use this perag */ + *longest = 0; + xfs_perag_rele(pag); + return error; + } + +out_done: + args->pag = pag; + return 0; } -/* - * Pick a new allocation group for the current file and its file stream. - * - * This is called when the allocator can't find a suitable extent in the - * current AG, and we have to move the stream into a new AG with more space. - */ -int -xfs_filestream_new_ag( +static int +xfs_filestream_create_association( struct xfs_bmalloca *ap, - xfs_agnumber_t *agp) + struct xfs_alloc_arg *args, + xfs_ino_t pino, + xfs_extlen_t *longest) { - struct xfs_inode *ip = ap->ip, *pip; - struct xfs_mount *mp = ip->i_mount; - xfs_extlen_t minlen = ap->length; - xfs_agnumber_t startag = 0; - int flags = 0; - int err = 0; + struct xfs_mount *mp = args->mp; struct xfs_mru_cache_elem *mru; + struct xfs_fstrm_item *item; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, pino); + int flags = 0; + int error; - *agp = NULLAGNUMBER; - - pip = xfs_filestream_get_parent(ip); - if (!pip) - goto exit; - - mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino); + /* Changing parent AG association now, so remove the existing one. */ + mru = xfs_mru_cache_remove(mp->m_filestream, pino); if (mru) { struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); - startag = (item->ag + 1) % mp->m_sb.sb_agcount; + + agno = (item->pag->pag_agno + 1) % mp->m_sb.sb_agcount; + xfs_fstrm_free_func(mp, mru); + } else if (xfs_is_inode32(mp)) { + xfs_agnumber_t rotorstep = xfs_rotorstep; + + agno = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); } + ap->blkno = XFS_AGB_TO_FSB(args->mp, agno, 0); + xfs_bmap_adjacent(ap); + if (ap->datatype & XFS_ALLOC_USERDATA) flags |= XFS_PICK_USERDATA; if (ap->tp->t_flags & XFS_TRANS_LOWMODE) flags |= XFS_PICK_LOWSPACE; - err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); + *longest = ap->length; + error = xfs_filestream_pick_ag(args, pino, agno, flags, longest); + if (error) + return error; /* - * Only free the item here so we skip over the old AG earlier. + * We are going to use this perag now, so create an assoication for it. + * xfs_filestream_pick_ag() has already bumped the perag fstrms counter + * for us, so all we need to do here is take another active reference to + * the perag for the cached association. + * + * If we fail to store the association, we need to drop the fstrms + * counter as well as drop the perag reference we take here for the + * item. We do not need to return an error for this failure - as long as + * we return a referenced AG, the allocation can still go ahead just + * fine. */ - if (mru) - xfs_fstrm_free_func(mp, mru); + item = kmem_alloc(sizeof(*item), KM_MAYFAIL); + if (!item) + goto out_put_fstrms; + + atomic_inc(&args->pag->pag_active_ref); + item->pag = args->pag; + error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); + if (error) + goto out_free_item; + return 0; + +out_free_item: + xfs_perag_rele(item->pag); + kmem_free(item); +out_put_fstrms: + atomic_dec(&args->pag->pagf_fstrms); + return 0; +} + +/* + * Search for an allocation group with a single extent large enough for + * the request. First we look for an existing association and use that if it + * is found. Otherwise, we create a new association by selecting an AG that fits + * the allocation criteria. + * + * We return with a referenced perag in args->pag to indicate which AG we are + * allocating into or an error with no references held. + */ +int +xfs_filestream_select_ag( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *longest) +{ + struct xfs_mount *mp = args->mp; + struct xfs_inode *pip; + xfs_ino_t ino = 0; + int error = 0; + + *longest = 0; + args->total = ap->total; + pip = xfs_filestream_get_parent(ap->ip); + if (pip) { + ino = pip->i_ino; + error = xfs_filestream_lookup_association(ap, args, ino, + longest); + xfs_irele(pip); + if (error) + return error; + if (*longest >= args->maxlen) + goto out_select; + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) + goto out_select; + } + + error = xfs_filestream_create_association(ap, args, ino, longest); + if (error) + return error; - xfs_irele(pip); -exit: - if (*agp == NULLAGNUMBER) - *agp = 0; - return err; +out_select: + ap->blkno = XFS_AGB_TO_FSB(mp, args->pag->pag_agno, 0); + return 0; } void diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 403226ebb80b..84149ed0e340 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -9,13 +9,13 @@ struct xfs_mount; struct xfs_inode; struct xfs_bmalloca; +struct xfs_alloc_arg; int xfs_filestream_mount(struct xfs_mount *mp); void xfs_filestream_unmount(struct xfs_mount *mp); void xfs_filestream_deassociate(struct xfs_inode *ip); -xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); -int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); -int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno); +int xfs_filestream_select_ag(struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, xfs_extlen_t *blen); static inline int xfs_inode_is_filestream( diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 88a88506ffff..59e7d1a14b67 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -688,11 +688,11 @@ __xfs_getfsmap_datadev( info->agf_bp = NULL; } if (info->pag) { - xfs_perag_put(info->pag); + xfs_perag_rele(info->pag); info->pag = NULL; } else if (pag) { /* loop termination case */ - xfs_perag_put(pag); + xfs_perag_rele(pag); } return error; @@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt( { struct xfs_alloc_rec_incore akeys[2]; + memset(akeys, 0, sizeof(akeys)); info->missing_owner = XFS_FMR_OWN_UNKNOWN; return __xfs_getfsmap_datadev(tp, keys, info, xfs_getfsmap_datadev_bnobt_query, &akeys[0]); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index ddeaccc04aec..c9a7e270a428 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -255,7 +255,7 @@ xfs_perag_set_inode_tag( break; } - trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); + trace_xfs_perag_set_inode_tag(pag, _RET_IP_); } /* Clear a tag on both the AG incore inode tree and the AG radix tree. */ @@ -289,7 +289,7 @@ xfs_perag_clear_inode_tag( radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); spin_unlock(&mp->m_perag_lock); - trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); + trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); } /* @@ -586,7 +586,7 @@ xfs_iget_cache_miss( if (!ip) return -ENOMEM; - error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags); + error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags); if (error) goto out_destroy; @@ -1767,7 +1767,7 @@ xfs_icwalk( if (error) { last_error = error; if (error == -EFSCORRUPTED) { - xfs_perag_put(pag); + xfs_perag_rele(pag); break; } } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7f1d715faab5..5808abab786c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1367,7 +1367,7 @@ xfs_itruncate_extents_flags( unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; while (unmap_len > 0) { - ASSERT(tp->t_firstblock == NULLFSBLOCK); + ASSERT(tp->t_highest_agno == NULLAGNUMBER); error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS); if (error) diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 7558486f4937..21be93bf006d 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -275,7 +275,7 @@ xfs_iwalk_ag_start( /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; - error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp); if (error) return error; @@ -390,7 +390,7 @@ xfs_iwalk_run_callbacks( } /* ...and recreate the cursor just past where we left off. */ - error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp, + error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp, agi_bpp); if (error) return error; @@ -591,7 +591,7 @@ xfs_iwalk( } if (iwag.pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); xfs_iwalk_free(&iwag); return error; } @@ -683,7 +683,7 @@ xfs_iwalk_threaded( break; } if (pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); if (polled) xfs_pwork_poll(&pctl); return xfs_pwork_destroy(&pctl); @@ -776,7 +776,7 @@ xfs_inobt_walk( } if (iwag.pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); xfs_iwalk_free(&iwag); return error; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8aca2cc173ac..f3269c0626f0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -210,8 +210,7 @@ typedef struct xfs_mount { struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ xfs_agnumber_t m_agfrotor; /* last ag where space found */ - xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ - spinlock_t m_agirotor_lock;/* .. and lock protecting it */ + atomic_t m_agirotor; /* last ag dir inode alloced */ /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker m_inodegc_shrinker; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5535778a98f9..f5dc46ce9803 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -610,7 +610,7 @@ xfs_reflink_cancel_cow_blocks( if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { - ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); + ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); /* Free the CoW orphan record. */ xfs_refcount_free_cow_extent(*tpp, del.br_startblock, @@ -927,7 +927,7 @@ xfs_reflink_recover_cow( for_each_perag(mp, agno, pag) { error = xfs_refcount_recover_cow_leftovers(mp, pag); if (error) { - xfs_perag_put(pag); + xfs_perag_rele(pag); break; } } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0c4b73e9b29d..2479b5cbd75e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -247,6 +247,32 @@ xfs_fs_show_options( return 0; } +static bool +xfs_set_inode_alloc_perag( + struct xfs_perag *pag, + xfs_ino_t ino, + xfs_agnumber_t max_metadata) +{ + if (!xfs_is_inode32(pag->pag_mount)) { + set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return false; + } + + if (ino > XFS_MAXINUMBER_32) { + clear_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return false; + } + + set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + if (pag->pag_agno < max_metadata) + set_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + else + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return true; +} + /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically @@ -310,24 +336,8 @@ xfs_set_inode_alloc( ino = XFS_AGINO_TO_INO(mp, index, agino); pag = xfs_perag_get(mp, index); - - if (xfs_is_inode32(mp)) { - if (ino > XFS_MAXINUMBER_32) { - pag->pagi_inodeok = 0; - pag->pagf_metadata = 0; - } else { - pag->pagi_inodeok = 1; - maxagi++; - if (index < max_metadata) - pag->pagf_metadata = 1; - else - pag->pagf_metadata = 0; - } - } else { - pag->pagi_inodeok = 1; - pag->pagf_metadata = 0; - } - + if (xfs_set_inode_alloc_perag(pag, ino, max_metadata)) + maxagi++; xfs_perag_put(pag); } @@ -1922,7 +1932,6 @@ static int xfs_init_fs_context( return -ENOMEM; spin_lock_init(&mp->m_sb_lock); - spin_lock_init(&mp->m_agirotor_lock); INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6b0e9ae7c513..7dc0fd6a6504 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -74,6 +74,7 @@ struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; struct xfs_icwalk; +struct xfs_perag; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -159,36 +160,40 @@ TRACE_EVENT(xlog_intent_recovery_failed, ); DECLARE_EVENT_CLASS(xfs_perag_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, - unsigned long caller_ip), - TP_ARGS(mp, agno, refcount, caller_ip), + TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), + TP_ARGS(pag, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, refcount) + __field(int, active_refcount) __field(unsigned long, caller_ip) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->refcount = refcount; + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->refcount = atomic_read(&pag->pag_ref); + __entry->active_refcount = atomic_read(&pag->pag_active_ref); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno 0x%x refcount %d caller %pS", + TP_printk("dev %d:%d agno 0x%x passive refs %d active refs %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, + __entry->active_refcount, (char *)__entry->caller_ip) ); #define DEFINE_PERAG_REF_EVENT(name) \ DEFINE_EVENT(xfs_perag_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ - unsigned long caller_ip), \ - TP_ARGS(mp, agno, refcount, caller_ip)) + TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \ + TP_ARGS(pag, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_grab); +DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_rele); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); @@ -634,8 +639,8 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DECLARE_EVENT_CLASS(xfs_filestream_class, - TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), - TP_ARGS(mp, ino, agno), + TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), + TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -643,10 +648,10 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __field(int, streams) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; + __entry->dev = pag->pag_mount->m_super->s_dev; __entry->ino = ino; - __entry->agno = agno; - __entry->streams = xfs_filestream_peek_ag(mp, agno); + __entry->agno = pag->pag_agno; + __entry->streams = atomic_read(&pag->pagf_fstrms); ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -656,39 +661,40 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, ) #define DEFINE_FILESTREAM_EVENT(name) \ DEFINE_EVENT(xfs_filestream_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), \ - TP_ARGS(mp, ino, agno)) + TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), \ + TP_ARGS(pag, ino)) DEFINE_FILESTREAM_EVENT(xfs_filestream_free); DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); TRACE_EVENT(xfs_filestream_pick, - TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, - xfs_extlen_t free, int nscan), - TP_ARGS(ip, agno, free, nscan), + TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free), + TP_ARGS(pag, ino, free), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_agnumber_t, agno) __field(int, streams) __field(xfs_extlen_t, free) - __field(int, nscan) ), TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->agno = agno; - __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->ino = ino; + if (pag) { + __entry->agno = pag->pag_agno; + __entry->streams = atomic_read(&pag->pagf_fstrms); + } else { + __entry->agno = NULLAGNUMBER; + __entry->streams = 0; + } __entry->free = free; - __entry->nscan = nscan; ), - TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d nscan %d", + TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->agno, __entry->streams, - __entry->free, - __entry->nscan) + __entry->free) ); DECLARE_EVENT_CLASS(xfs_lock_class, @@ -1795,13 +1801,11 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __field(xfs_extlen_t, alignment) __field(xfs_extlen_t, minalignslop) __field(xfs_extlen_t, len) - __field(short, type) - __field(short, otype) __field(char, wasdel) __field(char, wasfromfl) __field(int, resv) __field(int, datatype) - __field(xfs_fsblock_t, firstblock) + __field(xfs_agnumber_t, highest_agno) ), TP_fast_assign( __entry->dev = args->mp->m_super->s_dev; @@ -1816,18 +1820,16 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->alignment = args->alignment; __entry->minalignslop = args->minalignslop; __entry->len = args->len; - __entry->type = args->type; - __entry->otype = args->otype; __entry->wasdel = args->wasdel; __entry->wasfromfl = args->wasfromfl; __entry->resv = args->resv; __entry->datatype = args->datatype; - __entry->firstblock = args->tp->t_firstblock; + __entry->highest_agno = args->tp->t_highest_agno; ), TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u " "prod %u minleft %u total %u alignment %u minalignslop %u " - "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " - "datatype 0x%x firstblock 0x%llx", + "len %u wasdel %d wasfromfl %d resv %d " + "datatype 0x%x highest_agno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1840,13 +1842,11 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->alignment, __entry->minalignslop, __entry->len, - __print_symbolic(__entry->type, XFS_ALLOC_TYPES), - __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), __entry->wasdel, __entry->wasfromfl, __entry->resv, __entry->datatype, - (unsigned long long)__entry->firstblock) + __entry->highest_agno) ) #define DEFINE_ALLOC_EVENT(name) \ @@ -1877,6 +1877,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); DEFINE_ALLOC_EVENT(xfs_alloc_small_done); DEFINE_ALLOC_EVENT(xfs_alloc_small_error); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_skip_deadlock); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7bd16fbff534..8afc0c080861 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -102,7 +102,7 @@ xfs_trans_dup( INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); INIT_LIST_HEAD(&ntp->t_dfops); - ntp->t_firstblock = NULLFSBLOCK; + ntp->t_highest_agno = NULLAGNUMBER; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); @@ -278,7 +278,7 @@ retry: INIT_LIST_HEAD(&tp->t_items); INIT_LIST_HEAD(&tp->t_busy); INIT_LIST_HEAD(&tp->t_dfops); - tp->t_firstblock = NULLFSBLOCK; + tp->t_highest_agno = NULLAGNUMBER; error = xfs_trans_reserve(tp, resp, blocks, rtextents); if (error == -ENOSPC && want_retry) { @@ -1078,10 +1078,10 @@ xfs_trans_cancel( /* * It's never valid to cancel a transaction with deferred ops attached, * because the transaction is effectively dirty. Complain about this - * loudly before freeing the in-memory defer items. + * loudly before freeing the in-memory defer items and shutting down the + * filesystem. */ if (!list_empty(&tp->t_dfops)) { - ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); dirty = true; xfs_defer_cancel(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 55819785941c..6e3646d524ce 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -132,7 +132,7 @@ typedef struct xfs_trans { unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ unsigned int t_flags; /* misc flags */ - xfs_fsblock_t t_firstblock; /* first block allocated */ + xfs_agnumber_t t_highest_agno; /* highest AGF locked */ struct xlog_ticket *t_ticket; /* log mgr ticket */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ |