diff options
Diffstat (limited to 'fs')
50 files changed, 1220 insertions, 883 deletions
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 8bcec8dcabb6..054f97b07754 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -63,7 +63,7 @@ struct autofs_info { struct autofs_sb_info *sbi; unsigned long last_used; - atomic_t count; + int count; kuid_t uid; kgid_t gid; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 91f5787dae7c..a1c7701007e7 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -211,7 +211,7 @@ static int autofs_tree_busy(struct vfsmount *mnt, } } else { struct autofs_info *ino = autofs_dentry_ino(p); - unsigned int ino_count = atomic_read(&ino->count); + unsigned int ino_count = READ_ONCE(ino->count); /* allow for dget above and top is already dgot */ if (p == top) @@ -379,7 +379,7 @@ static struct dentry *should_expire(struct dentry *dentry, /* Not a forced expire? */ if (!(how & AUTOFS_EXP_FORCED)) { /* ref-walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; + ino_count = READ_ONCE(ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; } @@ -396,7 +396,7 @@ static struct dentry *should_expire(struct dentry *dentry, /* Not a forced expire? */ if (!(how & AUTOFS_EXP_FORCED)) { /* ref-walk currently on this dentry? 
*/ - ino_count = atomic_read(&ino->count) + 1; + ino_count = READ_ONCE(ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 29abafc0ce31..5aaa1732bf1e 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -569,10 +569,9 @@ static int autofs_dir_symlink(struct inode *dir, d_add(dentry, inode); dget(dentry); - atomic_inc(&ino->count); + ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_inc(&p_ino->count); + p_ino->count++; dir->i_mtime = current_time(dir); @@ -610,11 +609,9 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_dec(&p_ino->count); - } + ino->count--; + p_ino = autofs_dentry_ino(dentry->d_parent); + p_ino->count--; dput(ino->dentry); d_inode(dentry)->i_size = 0; @@ -660,7 +657,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry) static void autofs_clear_leaf_automount_flags(struct dentry *dentry) { - struct list_head *d_child; struct dentry *parent; /* flags for dentrys in the root are handled elsewhere */ @@ -673,10 +669,7 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) /* only consider parents below dentrys in the root */ if (IS_ROOT(parent->d_parent)) return; - d_child = &dentry->d_child; - /* Set parent managed if it's becoming empty */ - if (d_child->next == &parent->d_subdirs && - d_child->prev == &parent->d_subdirs) + if (autofs_dentry_ino(parent)->count == 2) managed_dentry_set_managed(parent); } @@ -698,11 +691,10 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; - spin_lock(&sbi->lookup_lock); - if (!simple_empty(dentry)) { - spin_unlock(&sbi->lookup_lock); + if (ino->count != 1) return -ENOTEMPTY; - } + 
+ spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -710,11 +702,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (sbi->version < 5) autofs_clear_leaf_automount_flags(dentry); - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) - atomic_dec(&p_ino->count); - } + ino->count--; + p_ino = autofs_dentry_ino(dentry->d_parent); + p_ino->count--; dput(ino->dentry); d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); @@ -760,10 +750,9 @@ static int autofs_dir_mkdir(struct inode *dir, autofs_set_leaf_automount_flags(dentry); dget(dentry); - atomic_inc(&ino->count); + ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_inc(&p_ino->count); + p_ino->count++; inc_nlink(dir); dir->i_mtime = current_time(dir); diff --git a/fs/block_dev.c b/fs/block_dev.c index ee63c2732fa2..69bf2fb6f7cd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1531,7 +1531,7 @@ rescan: ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) goto rescan; - } else { + } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. 
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index b2ec29eeb4c4..73f24f307a4a 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -8,6 +8,7 @@ #include <linux/ceph/ceph_debug.h> +#include <linux/fs_context.h> #include "super.h" #include "cache.h" @@ -49,7 +50,7 @@ void ceph_fscache_unregister(void) fscache_unregister_netfs(&ceph_cache_netfs); } -int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { const struct ceph_fsid *fsid = &fsc->client->fsid; const char *fscache_uniq = fsc->mount_options->fscache_uniq; @@ -66,8 +67,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len)) continue; - pr_err("fscache cookie already registered for fsid %pU\n", fsid); - pr_err(" use fsc=%%s mount option to specify a uniquifier\n"); + errorf(fc, "ceph: fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option", + fsid); err = -EBUSY; goto out_unlock; } @@ -95,7 +96,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) list_add_tail(&ent->list, &ceph_fscache_list); } else { kfree(ent); - pr_err("unable to register fscache cookie for fsid %pU\n", + errorf(fc, "ceph: unable to register fscache cookie for fsid %pU", fsid); /* all other fs ignore this error */ } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index e486fac3434d..89dbdd1eb14a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -16,7 +16,7 @@ extern struct fscache_netfs ceph_cache_netfs; int ceph_fscache_register(void); void ceph_fscache_unregister(void); -int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); void ceph_fscache_register_inode_cookie(struct inode *inode); @@ -88,7 +88,8 @@ static inline void ceph_fscache_unregister(void) { } -static inline int ceph_fscache_register_fs(struct 
ceph_fs_client* fsc) +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, + struct fs_context *fc) { return 0; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a5163296d9d9..068b029cf073 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2182,13 +2182,17 @@ retry: } base = ceph_ino(d_inode(temp)); rcu_read_unlock(); - if (pos < 0 || read_seqretry(&rename_lock, seq)) { - pr_err("build_path did not end path lookup where " - "expected, pos is %d\n", pos); - /* presumably this is only possible if racing with a - rename of one of the parent directories (we can not - lock the dentries above us to prevent this, but - retrying should be harmless) */ + + if (read_seqretry(&rename_lock, seq)) + goto retry; + + if (pos < 0) { + /* + * A rename didn't occur, but somehow we didn't end up where + * we thought we would. Throw a warning and try again. + */ + pr_warn("build_path did not end path lookup where " + "expected, pos is %d\n", pos); goto retry; } @@ -2345,6 +2349,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->op = cpu_to_le32(req->r_op); head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); + head->ino = 0; head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index ce2d00da5096..aeec1d6e3769 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -20,7 +20,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) { int n = 0; - int i; + int i, j; /* special case for one mds */ if (1 == m->m_num_mds && m->m_info[0].state > 0) @@ -35,9 +35,12 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) /* pick */ n = prandom_u32() % n; - for (i = 0; n > 0; i++, n--) - while (m->m_info[i].state <= 0) - i++; + for (j = 0, i = 0; i < m->m_num_mds; i++) { + if (m->m_info[i].state > 0) + j++; + if (j > n) + break; + } return i; } diff --git 
a/fs/ceph/super.c b/fs/ceph/super.c index b47f43fc2d68..9c9a7c68eea3 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -9,7 +9,8 @@ #include <linux/in6.h> #include <linux/module.h> #include <linux/mount.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -138,280 +139,308 @@ enum { Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, - Opt_last_int, /* int args above */ Opt_snapdirname, Opt_mds_namespace, - Opt_fscache_uniq, Opt_recover_session, - Opt_last_string, + Opt_source, /* string args above */ Opt_dirstat, - Opt_nodirstat, Opt_rbytes, - Opt_norbytes, Opt_asyncreaddir, - Opt_noasyncreaddir, Opt_dcache, - Opt_nodcache, Opt_ino32, - Opt_noino32, Opt_fscache, - Opt_nofscache, Opt_poolperm, - Opt_nopoolperm, Opt_require_active_mds, - Opt_norequire_active_mds, -#ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, -#endif - Opt_noacl, Opt_quotadf, - Opt_noquotadf, Opt_copyfrom, - Opt_nocopyfrom, }; -static match_table_t fsopt_tokens = { - {Opt_wsize, "wsize=%d"}, - {Opt_rsize, "rsize=%d"}, - {Opt_rasize, "rasize=%d"}, - {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, - {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_caps_max, "caps_max=%d"}, - {Opt_readdir_max_entries, "readdir_max_entries=%d"}, - {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, - {Opt_congestion_kb, "write_congestion_kb=%d"}, - /* int args above */ - {Opt_snapdirname, "snapdirname=%s"}, - {Opt_mds_namespace, "mds_namespace=%s"}, - {Opt_recover_session, "recover_session=%s"}, - {Opt_fscache_uniq, "fsc=%s"}, - /* string args above */ - {Opt_dirstat, "dirstat"}, - {Opt_nodirstat, "nodirstat"}, - {Opt_rbytes, "rbytes"}, - {Opt_norbytes, "norbytes"}, - {Opt_asyncreaddir, "asyncreaddir"}, - {Opt_noasyncreaddir, "noasyncreaddir"}, - {Opt_dcache, "dcache"}, - {Opt_nodcache, "nodcache"}, - {Opt_ino32, "ino32"}, - {Opt_noino32, "noino32"}, - {Opt_fscache, 
"fsc"}, - {Opt_nofscache, "nofsc"}, - {Opt_poolperm, "poolperm"}, - {Opt_nopoolperm, "nopoolperm"}, - {Opt_require_active_mds, "require_active_mds"}, - {Opt_norequire_active_mds, "norequire_active_mds"}, -#ifdef CONFIG_CEPH_FS_POSIX_ACL - {Opt_acl, "acl"}, -#endif - {Opt_noacl, "noacl"}, - {Opt_quotadf, "quotadf"}, - {Opt_noquotadf, "noquotadf"}, - {Opt_copyfrom, "copyfrom"}, - {Opt_nocopyfrom, "nocopyfrom"}, - {-1, NULL} +enum ceph_recover_session_mode { + ceph_recover_session_no, + ceph_recover_session_clean +}; + +static const struct fs_parameter_enum ceph_mount_param_enums[] = { + { Opt_recover_session, "no", ceph_recover_session_no }, + { Opt_recover_session, "clean", ceph_recover_session_clean }, + {} +}; + +static const struct fs_parameter_spec ceph_mount_param_specs[] = { + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), + fsparam_u32 ("caps_max", Opt_caps_max), + fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), + fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), + fsparam_s32 ("write_congestion_kb", Opt_congestion_kb), + fsparam_flag_no ("copyfrom", Opt_copyfrom), + fsparam_flag_no ("dcache", Opt_dcache), + fsparam_flag_no ("dirstat", Opt_dirstat), + __fsparam (fs_param_is_string, "fsc", Opt_fscache, + fs_param_neg_with_no | fs_param_v_optional), + fsparam_flag_no ("ino32", Opt_ino32), + fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_flag_no ("poolperm", Opt_poolperm), + fsparam_flag_no ("quotadf", Opt_quotadf), + fsparam_u32 ("rasize", Opt_rasize), + fsparam_flag_no ("rbytes", Opt_rbytes), + fsparam_s32 ("readdir_max_bytes", Opt_readdir_max_bytes), + fsparam_s32 ("readdir_max_entries", Opt_readdir_max_entries), + fsparam_enum ("recover_session", Opt_recover_session), + fsparam_flag_no ("require_active_mds", Opt_require_active_mds), + fsparam_u32 ("rsize", Opt_rsize), + fsparam_string ("snapdirname", Opt_snapdirname), + fsparam_string ("source", Opt_source), + 
fsparam_u32 ("wsize", Opt_wsize), + {} +}; + +static const struct fs_parameter_description ceph_mount_parameters = { + .name = "ceph", + .specs = ceph_mount_param_specs, + .enums = ceph_mount_param_enums, }; -static int parse_fsopt_token(char *c, void *private) +struct ceph_parse_opts_ctx { + struct ceph_options *copts; + struct ceph_mount_options *opts; +}; + +/* + * Parse the source parameter. Distinguish the server list from the path. + * Internally we do not include the leading '/' in the path. + * + * The source will look like: + * <server_spec>[,<server_spec>...]:[<path>] + * where + * <server_spec> is <ip>[:<port>] + * <path> is optional, but if present must begin with '/' + */ +static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) { - struct ceph_mount_options *fsopt = private; - substring_t argstr[MAX_OPT_ARGS]; - int token, intval, ret; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *dev_name = param->string, *dev_name_end; + int ret; - token = match_token((char *)c, fsopt_tokens, argstr); - if (token < 0) - return -EINVAL; + dout("%s '%s'\n", __func__, dev_name); + if (!dev_name || !*dev_name) + return invalf(fc, "ceph: Empty source"); - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad option arg (not int) at '%s'\n", c); - return ret; + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + if (strlen(dev_name_end) > 1) { + kfree(fsopt->server_path); + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) + return -ENOMEM; } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); } else { - dout("got token %d\n", token); + dev_name_end = dev_name + strlen(dev_name); } + dev_name_end--; /* back up to ':' separator */ + if (dev_name_end < dev_name || 
*dev_name_end != ':') + return invalf(fc, "ceph: No path or : separator in source"); + + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); + + ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, + pctx->copts, fc); + if (ret) + return ret; + + fc->source = param->string; + param->string = NULL; + return 0; +} + +static int ceph_parse_mount_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct fs_parse_result result; + unsigned int mode; + int token, ret; + + ret = ceph_parse_param(param, pctx->copts, fc); + if (ret != -ENOPARAM) + return ret; + + token = fs_parse(fc, &ceph_mount_parameters, param, &result); + dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); + if (token < 0) + return token; + switch (token) { case Opt_snapdirname: kfree(fsopt->snapdir_name); - fsopt->snapdir_name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->snapdir_name) - return -ENOMEM; + fsopt->snapdir_name = param->string; + param->string = NULL; break; case Opt_mds_namespace: kfree(fsopt->mds_namespace); - fsopt->mds_namespace = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->mds_namespace) - return -ENOMEM; + fsopt->mds_namespace = param->string; + param->string = NULL; break; case Opt_recover_session: - if (!strncmp(argstr[0].from, "no", - argstr[0].to - argstr[0].from)) { + mode = result.uint_32; + if (mode == ceph_recover_session_no) fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; - } else if (!strncmp(argstr[0].from, "clean", - argstr[0].to - argstr[0].from)) { + else if (mode == ceph_recover_session_clean) fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; - } else { - return -EINVAL; - } - break; - case Opt_fscache_uniq: -#ifdef CONFIG_CEPH_FSCACHE - 
kfree(fsopt->fscache_uniq); - fsopt->fscache_uniq = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->fscache_uniq) - return -ENOMEM; - fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + else + BUG(); break; -#else - pr_err("fscache support is disabled\n"); - return -EINVAL; -#endif + case Opt_source: + if (fc->source) + return invalf(fc, "ceph: Multiple sources specified"); + return ceph_parse_source(param, fc); case Opt_wsize: - if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) - return -EINVAL; - fsopt->wsize = ALIGN(intval, PAGE_SIZE); + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_WRITE_SIZE) + goto out_of_range; + fsopt->wsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rsize: - if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) - return -EINVAL; - fsopt->rsize = ALIGN(intval, PAGE_SIZE); + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_READ_SIZE) + goto out_of_range; + fsopt->rsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rasize: - if (intval < 0) - return -EINVAL; - fsopt->rasize = ALIGN(intval, PAGE_SIZE); + fsopt->rasize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: - if (intval < 1) - return -EINVAL; - fsopt->caps_wanted_delay_min = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_min = result.uint_32; break; case Opt_caps_wanted_delay_max: - if (intval < 1) - return -EINVAL; - fsopt->caps_wanted_delay_max = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_max = result.uint_32; break; case Opt_caps_max: - if (intval < 0) - return -EINVAL; - fsopt->caps_max = intval; + fsopt->caps_max = result.uint_32; break; case Opt_readdir_max_entries: - if (intval < 1) - return -EINVAL; - fsopt->max_readdir = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->max_readdir = result.uint_32; break; case Opt_readdir_max_bytes: - if (intval < (int)PAGE_SIZE && intval != 
0) - return -EINVAL; - fsopt->max_readdir_bytes = intval; + if (result.uint_32 < PAGE_SIZE && result.uint_32 != 0) + goto out_of_range; + fsopt->max_readdir_bytes = result.uint_32; break; case Opt_congestion_kb: - if (intval < 1024) /* at least 1M */ - return -EINVAL; - fsopt->congestion_kb = intval; + if (result.uint_32 < 1024) /* at least 1M */ + goto out_of_range; + fsopt->congestion_kb = result.uint_32; break; case Opt_dirstat: - fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; - break; - case Opt_nodirstat: - fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; break; case Opt_rbytes: - fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; - break; - case Opt_norbytes: - fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; case Opt_asyncreaddir: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; - break; - case Opt_noasyncreaddir: - fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; case Opt_dcache: - fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; - break; - case Opt_nodcache: - fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; break; case Opt_ino32: - fsopt->flags |= CEPH_MOUNT_OPT_INO32; - break; - case Opt_noino32: - fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; break; + case Opt_fscache: #ifdef CONFIG_CEPH_FSCACHE - fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = NULL; + if (result.negated) { + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + } else { + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + fsopt->fscache_uniq 
= param->string; + param->string = NULL; + } break; #else - pr_err("fscache support is disabled\n"); - return -EINVAL; + return invalf(fc, "ceph: fscache support is disabled"); #endif - case Opt_nofscache: - fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; - kfree(fsopt->fscache_uniq); - fsopt->fscache_uniq = NULL; - break; case Opt_poolperm: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; - break; - case Opt_nopoolperm: - fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; break; case Opt_require_active_mds: - fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; - break; - case Opt_norequire_active_mds: - fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; + else + fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; break; case Opt_quotadf: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; - break; - case Opt_noquotadf: - fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; break; case Opt_copyfrom: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; - break; - case Opt_nocopyfrom: - fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; break; -#ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: - fsopt->sb_flags |= SB_POSIXACL; - break; + if (!result.negated) { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#else + return invalf(fc, "ceph: POSIX ACL support is disabled"); #endif - case Opt_noacl: - fsopt->sb_flags &= ~SB_POSIXACL; + } else { + fc->sb_flags &= ~SB_POSIXACL; + } break; default: - BUG_ON(token); + BUG(); } return 0; + +out_of_range: + return invalf(fc, "ceph: %s out of range", param->key); } static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", 
args); + if (!args) + return; + kfree(args->snapdir_name); kfree(args->mds_namespace); kfree(args->server_path); @@ -459,91 +488,6 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, return ceph_compare_options(new_opt, fsc->client); } -static int parse_mount_options(struct ceph_mount_options **pfsopt, - struct ceph_options **popt, - int flags, char *options, - const char *dev_name) -{ - struct ceph_mount_options *fsopt; - const char *dev_name_end; - int err; - - if (!dev_name || !*dev_name) - return -EINVAL; - - fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); - if (!fsopt) - return -ENOMEM; - - dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - - fsopt->wsize = CEPH_MAX_WRITE_SIZE; - fsopt->rsize = CEPH_MAX_READ_SIZE; - fsopt->rasize = CEPH_RASIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - if (!fsopt->snapdir_name) { - err = -ENOMEM; - goto out; - } - - fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; - fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); - - /* - * Distinguish the server list from the path in "dev_name". - * Internally we do not include the leading '/' in the path. 
- * - * "dev_name" will look like: - * <server_spec>[,<server_spec>...]:[<path>] - * where - * <server_spec> is <ip>[:<port>] - * <path> is optional, but if present must begin with '/' - */ - dev_name_end = strchr(dev_name, '/'); - if (dev_name_end) { - if (strlen(dev_name_end) > 1) { - fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); - if (!fsopt->server_path) { - err = -ENOMEM; - goto out; - } - } - } else { - dev_name_end = dev_name + strlen(dev_name); - } - err = -EINVAL; - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') { - pr_err("device name is missing path (no : separator in %s)\n", - dev_name); - goto out; - } - dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - if (fsopt->server_path) - dout("server path '%s'\n", fsopt->server_path); - - *popt = ceph_parse_options(options, dev_name, dev_name_end, - parse_fsopt_token, (void *)fsopt); - if (IS_ERR(*popt)) { - err = PTR_ERR(*popt); - goto out; - } - - /* success */ - *pfsopt = fsopt; - return 0; - -out: - destroy_mount_options(fsopt); - return err; -} - /** * ceph_show_options - Show mount options in /proc/mounts * @m: seq_file to write to @@ -587,7 +531,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noquotadf"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - if (fsopt->sb_flags & SB_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); else seq_puts(m, ",noacl"); @@ -860,12 +804,6 @@ static void ceph_umount_begin(struct super_block *sb) fsc->filp_gen++; // invalidate open files } -static int ceph_remount(struct super_block *sb, int *flags, char *data) -{ - sync_filesystem(sb); - return 0; -} - static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, @@ -874,7 +812,6 @@ static const struct super_operations ceph_super_ops = { .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, - 
.remount_fs = ceph_remount, .show_options = ceph_show_options, .statfs = ceph_statfs, .umount_begin = ceph_umount_begin, @@ -935,7 +872,8 @@ out: /* * mount: join the ceph cluster, and open root directory. */ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, + struct fs_context *fc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -952,7 +890,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) /* setup fscache */ if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) { - err = ceph_fscache_register_fs(fsc); + err = ceph_fscache_register_fs(fsc, fc); if (err < 0) goto out; } @@ -987,18 +925,16 @@ out: return ERR_PTR(err); } -static int ceph_set_super(struct super_block *s, void *data) +static int ceph_set_super(struct super_block *s, struct fs_context *fc) { - struct ceph_fs_client *fsc = data; + struct ceph_fs_client *fsc = s->s_fs_info; int ret; - dout("set_super %p data %p\n", s, data); + dout("set_super %p\n", s); - s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_xattr = ceph_xattr_handlers; - s->s_fs_info = fsc; fsc->sb = s; fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */ @@ -1010,24 +946,18 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_time_min = 0; s->s_time_max = U32_MAX; - ret = set_anon_super(s, NULL); /* what is that second arg for? 
*/ + ret = set_anon_super_fc(s, fc); if (ret != 0) - goto fail; - - return ret; - -fail: - s->s_fs_info = NULL; - fsc->sb = NULL; + fsc->sb = NULL; return ret; } /* * share superblock if same fs AND options */ -static int ceph_compare_super(struct super_block *sb, void *data) +static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) { - struct ceph_fs_client *new = data; + struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; struct ceph_fs_client *other = ceph_sb_to_client(sb); @@ -1043,7 +973,7 @@ static int ceph_compare_super(struct super_block *sb, void *data) dout("fsid doesn't match\n"); return 0; } - if (fsopt->sb_flags != other->mount_options->sb_flags) { + if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) { dout("flags differ\n"); return 0; } @@ -1073,46 +1003,46 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) return 0; } -static struct dentry *ceph_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ceph_get_tree(struct fs_context *fc) { + struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; + int (*compare_super)(struct super_block *, struct fs_context *) = + ceph_compare_super; int err; - int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - struct ceph_mount_options *fsopt = NULL; - struct ceph_options *opt = NULL; - dout("ceph_mount\n"); + dout("ceph_get_tree\n"); + + if (!fc->source) + return invalf(fc, "ceph: No source"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - flags |= SB_POSIXACL; + fc->sb_flags |= SB_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); - if (err < 0) { - res = ERR_PTR(err); - goto out_final; - } /* create client (which we may/may not use) */ - fsc = create_fs_client(fsopt, opt); + fsc = create_fs_client(pctx->opts, pctx->copts); + 
pctx->opts = NULL; + pctx->copts = NULL; if (IS_ERR(fsc)) { - res = ERR_CAST(fsc); + err = PTR_ERR(fsc); goto out_final; } err = ceph_mdsc_init(fsc); - if (err < 0) { - res = ERR_PTR(err); + if (err < 0) goto out; - } if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); + + fc->s_fs_info = fsc; + sb = sget_fc(fc, compare_super, ceph_set_super); + fc->s_fs_info = NULL; if (IS_ERR(sb)) { - res = ERR_CAST(sb); + err = PTR_ERR(sb); goto out; } @@ -1123,18 +1053,19 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } else { dout("get_sb using new client %p\n", fsc); err = ceph_setup_bdi(sb, fsc); - if (err < 0) { - res = ERR_PTR(err); + if (err < 0) goto out_splat; - } } - res = ceph_real_mount(fsc); - if (IS_ERR(res)) + res = ceph_real_mount(fsc, fc); + if (IS_ERR(res)) { + err = PTR_ERR(res); goto out_splat; + } dout("root %p inode %p ino %llx.%llx\n", res, d_inode(res), ceph_vinop(d_inode(res))); - return res; + fc->root = fsc->sb->s_root; + return 0; out_splat: ceph_mdsc_close_sessions(fsc->mdsc); @@ -1144,8 +1075,79 @@ out_splat: out: destroy_fs_client(fsc); out_final: - dout("ceph_mount fail %ld\n", PTR_ERR(res)); - return res; + dout("ceph_get_tree fail %d\n", err); + return err; +} + +static void ceph_free_fc(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + + if (pctx) { + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + } +} + +static int ceph_reconfigure_fc(struct fs_context *fc) +{ + sync_filesystem(fc->root->d_sb); + return 0; +} + +static const struct fs_context_operations ceph_context_ops = { + .free = ceph_free_fc, + .parse_param = ceph_parse_mount_param, + .get_tree = ceph_get_tree, + .reconfigure = ceph_reconfigure_fc, +}; + +/* + * Set up the filesystem mount context. 
+ */ +static int ceph_init_fs_context(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx; + struct ceph_mount_options *fsopt; + + pctx = kzalloc(sizeof(*pctx), GFP_KERNEL); + if (!pctx) + return -ENOMEM; + + pctx->copts = ceph_alloc_options(); + if (!pctx->copts) + goto nomem; + + pctx->opts = kzalloc(sizeof(*pctx->opts), GFP_KERNEL); + if (!pctx->opts) + goto nomem; + + fsopt = pctx->opts; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->wsize = CEPH_MAX_WRITE_SIZE; + fsopt->rsize = CEPH_MAX_READ_SIZE; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) + goto nomem; + + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + fc->fs_private = pctx; + fc->ops = &ceph_context_ops; + return 0; + +nomem: + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + return -ENOMEM; } static void ceph_kill_sb(struct super_block *s) @@ -1172,7 +1174,7 @@ static void ceph_kill_sb(struct super_block *s) static struct file_system_type ceph_fs_type = { .owner = THIS_MODULE, .name = "ceph", - .mount = ceph_mount, + .init_fs_context = ceph_init_fs_context, .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f98d9247f9cb..f0f9cb7447ac 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -74,7 +74,6 @@ struct ceph_mount_options { int flags; - int sb_flags; int wsize; /* max write size */ int rsize; /* max read size */ @@ -407,22 +406,26 @@ struct ceph_inode_info { struct inode vfs_inode; /* at end */ }; -static inline struct ceph_inode_info *ceph_inode(struct inode *inode) +static inline struct ceph_inode_info * +ceph_inode(const struct inode *inode) 
{ return container_of(inode, struct ceph_inode_info, vfs_inode); } -static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +static inline struct ceph_fs_client * +ceph_inode_to_client(const struct inode *inode) { return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +static inline struct ceph_fs_client * +ceph_sb_to_client(const struct super_block *sb) { return (struct ceph_fs_client *)sb->s_fs_info; } -static inline struct ceph_vino ceph_vino(struct inode *inode) +static inline struct ceph_vino +ceph_vino(const struct inode *inode) { return ceph_inode(inode)->i_vino; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1d1051d31513..5492b9860baa 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -730,11 +730,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) struct inode *dir = d_inode(dentry); struct dentry *child; - if (!dir) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); - break; - } if (!S_ISDIR(dir->i_mode)) { dput(dentry); dentry = ERR_PTR(-ENOTDIR); @@ -751,7 +746,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - child = lookup_one_len_unlocked(p, dentry, s - p); + child = lookup_positive_unlocked(p, dentry, s - p); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); diff --git a/fs/dcache.c b/fs/dcache.c index f7931b682a0d..a2749a700230 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -319,7 +319,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; - WRITE_ONCE(dentry->d_flags, flags); + smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) @@ -903,17 +903,19 @@ struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; + unsigned seq; /* * Do optimistic parent lookup without any 
* locking. */ rcu_read_lock(); + seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { - if (likely(ret == READ_ONCE(dentry->d_parent))) + if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7b975dbb2bb4..f4d8df5e4714 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -299,13 +299,9 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - dentry = lookup_one_len_unlocked(name, parent, strlen(name)); + dentry = lookup_positive_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) return NULL; - if (!d_really_is_positive(dentry)) { - dput(dentry); - return NULL; - } return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0635cba19971..eb2a585572dc 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -34,7 +34,7 @@ config VIRTIO_FS select VIRTIO help The Virtio Filesystem allows guests to mount file systems from the - host. + host. If you want to share files between guests or with the host, answer Y - or M. + or M. 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d4e6691d2d92..8e02d76fe104 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1965,7 +1965,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, nbuf = 0; rem = 0; - for (idx = tail; idx < head && rem < len; idx++) + for (idx = tail; idx != head && rem < len; idx++) rem += pipe->bufs[idx & mask].len; ret = -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 54d638f9ba1c..ee190119f45c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -248,7 +248,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) kfree(forget); if (ret == -ENOMEM) goto out; - if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + if (ret || fuse_invalid_attr(&outarg.attr) || + (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; forget_all_cached_acls(inode); @@ -319,6 +320,12 @@ int fuse_valid_type(int m) S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } +bool fuse_invalid_attr(struct fuse_attr *attr) +{ + return !fuse_valid_type(attr->mode) || + attr->size > LLONG_MAX; +} + int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { @@ -350,7 +357,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name err = -EIO; if (!outarg->nodeid) goto out_put_forget; - if (!fuse_valid_type(outarg->attr.mode)) + if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, @@ -475,7 +482,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_free_ff; err = -EIO; - if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) || + fuse_invalid_attr(&outentry.attr)) goto out_free_ff; ff->fh = outopen.fh; @@ -583,7 +591,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, goto out_put_forget_req; err = -EIO; - if 
(invalid_nodeid(outarg.nodeid)) + if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr)) goto out_put_forget_req; if ((outarg.attr.mode ^ mode) & S_IFMT) @@ -862,7 +870,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); - inc_nlink(inode); + if (likely(inode->i_nlink < UINT_MAX)) + inc_nlink(inode); spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_update_ctime(inode); @@ -942,7 +951,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) { - if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + if (fuse_invalid_attr(&outarg.attr) || + (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); err = -EIO; } else { @@ -1563,7 +1573,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, goto error; } - if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + if (fuse_invalid_attr(&outarg.attr) || + (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); err = -EIO; goto error; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index db48a5cf8620..a63d779eac10 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -713,8 +713,10 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, ia->ap.args.end = fuse_aio_complete_req; err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); + if (err) + fuse_aio_complete_req(fc, &ia->ap.args, err); - return err ?: num_bytes; + return num_bytes; } static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, @@ -1096,6 +1098,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, ia->write.in.flags = fuse_write_flags(iocb); err = fuse_simple_request(fc, &ap->args); + if (!err && ia->write.out.size > count) + err = -EIO; offset = ap->descs[0].offset; count = ia->write.out.size; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 
d148188cfca4..aa75e2305b75 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -989,6 +989,8 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc); */ int fuse_valid_type(int m); +bool fuse_invalid_attr(struct fuse_attr *attr); + /** * Is current process allowed to perform filesystem operation? */ diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 5c38b9d84c6e..6a40f75a0d25 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -184,7 +184,7 @@ static int fuse_direntplus_link(struct file *file, if (invalid_nodeid(o->nodeid)) return -EIO; - if (!fuse_valid_type(o->attr.mode)) + if (fuse_invalid_attr(&o->attr)) return -EIO; fc = get_fuse_conn(dir); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index a5c86048b96e..bade74768903 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -35,6 +35,7 @@ struct virtio_fs_vq { struct fuse_dev *fud; bool connected; long in_flight; + struct completion in_flight_zero; /* No inflight requests */ char name[24]; } ____cacheline_aligned_in_smp; @@ -48,11 +49,15 @@ struct virtio_fs { unsigned int num_request_queues; /* number of request queues */ }; -struct virtio_fs_forget { +struct virtio_fs_forget_req { struct fuse_in_header ih; struct fuse_forget_in arg; +}; + +struct virtio_fs_forget { /* This request can be temporarily queued on virt queue */ struct list_head list; + struct virtio_fs_forget_req req; }; static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, @@ -81,6 +86,8 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) { WARN_ON(fsvq->in_flight <= 0); fsvq->in_flight--; + if (!fsvq->in_flight) + complete(&fsvq->in_flight_zero); } static void release_virtio_fs_obj(struct kref *ref) @@ -111,22 +118,23 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) WARN_ON(fsvq->in_flight < 0); /* Wait for in flight requests to finish.*/ - while (1) { - spin_lock(&fsvq->lock); - if (!fsvq->in_flight) { - spin_unlock(&fsvq->lock); - break; - } + spin_lock(&fsvq->lock); + if 
(fsvq->in_flight) { + /* We are holding virtio_fs_mutex. There should not be any + * waiters waiting for completion. + */ + reinit_completion(&fsvq->in_flight_zero); + spin_unlock(&fsvq->lock); + wait_for_completion(&fsvq->in_flight_zero); + } else { spin_unlock(&fsvq->lock); - /* TODO use completion instead of timeout */ - usleep_range(1000, 2000); } flush_work(&fsvq->done_work); flush_delayed_work(&fsvq->dispatch_work); } -static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; int i; @@ -137,6 +145,19 @@ static void virtio_fs_drain_all_queues(struct virtio_fs *fs) } } +static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +{ + /* Provides mutual exclusion between ->remove and ->kill_sb + * paths. We don't want both of these draining queue at the + * same time. Current completion logic reinits completion + * and that means there should not be any other thread + * doing reinit or waiting for completion already. + */ + mutex_lock(&virtio_fs_mutex); + virtio_fs_drain_all_queues_locked(fs); + mutex_unlock(&virtio_fs_mutex); +} + static void virtio_fs_start_all_queues(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; @@ -313,17 +334,72 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) } } +/* + * Returns 1 if queue is full and sender should wait a bit before sending + * next request, 0 otherwise. 
+ */ +static int send_forget_request(struct virtio_fs_vq *fsvq, + struct virtio_fs_forget *forget, + bool in_flight) +{ + struct scatterlist sg; + struct virtqueue *vq; + int ret = 0; + bool notify; + struct virtio_fs_forget_req *req = &forget->req; + + spin_lock(&fsvq->lock); + if (!fsvq->connected) { + if (in_flight) + dec_in_flight_req(fsvq); + kfree(forget); + goto out; + } + + sg_init_one(&sg, req, sizeof(*req)); + vq = fsvq->vq; + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + + ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", + ret); + list_add_tail(&forget->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + if (!in_flight) + inc_in_flight_req(fsvq); + /* Queue is full */ + ret = 1; + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + if (in_flight) + dec_in_flight_req(fsvq); + } + goto out; + } + + if (!in_flight) + inc_in_flight_req(fsvq); + notify = virtqueue_kick_prepare(vq); + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + return ret; +out: + spin_unlock(&fsvq->lock); + return ret; +} + static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) { struct virtio_fs_forget *forget; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work.work); - struct virtqueue *vq = fsvq->vq; - struct scatterlist sg; - struct scatterlist *sgs[] = {&sg}; - bool notify; - int ret; - pr_debug("virtio-fs: worker %s called.\n", __func__); while (1) { spin_lock(&fsvq->lock); @@ -335,43 +411,9 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) } list_del(&forget->list); - if (!fsvq->connected) { - dec_in_flight_req(fsvq); - spin_unlock(&fsvq->lock); - kfree(forget); - continue; - } - - sg_init_one(&sg, forget, sizeof(*forget)); - - /* Enqueue the request */ - 
dev_dbg(&vq->vdev->dev, "%s\n", __func__); - ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", - ret); - list_add_tail(&forget->list, - &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); - } else { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", - ret); - dec_in_flight_req(fsvq); - kfree(forget); - } - spin_unlock(&fsvq->lock); - return; - } - - notify = virtqueue_kick_prepare(vq); spin_unlock(&fsvq->lock); - - if (notify) - virtqueue_notify(vq); - pr_debug("virtio-fs: worker %s dispatched one forget request.\n", - __func__); + if (send_forget_request(fsvq, forget, true)) + return; } } @@ -556,6 +598,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs); INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, virtio_fs_hiprio_dispatch_work); + init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero); spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); /* Initialize the requests virtqueues */ @@ -566,6 +609,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, virtio_fs_request_dispatch_work); INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); INIT_LIST_HEAD(&fs->vqs[i].end_reqs); + init_completion(&fs->vqs[i].in_flight_zero); snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), "requests.%u", i - VQ_REQUEST); callbacks[i] = virtio_fs_vq_done; @@ -659,7 +703,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) /* This device is going away. 
No one should get new reference */ list_del_init(&fs->list); virtio_fs_stop_all_queues(fs); - virtio_fs_drain_all_queues(fs); + virtio_fs_drain_all_queues_locked(fs); vdev->config->reset(vdev); virtio_fs_cleanup_vqs(vdev, fs); @@ -684,12 +728,12 @@ static int virtio_fs_restore(struct virtio_device *vdev) } #endif /* CONFIG_PM_SLEEP */ -const static struct virtio_device_id id_table[] = { +static const struct virtio_device_id id_table[] = { { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, {}, }; -const static unsigned int feature_table[] = {}; +static const unsigned int feature_table[] = {}; static struct virtio_driver virtio_fs_driver = { .driver.name = KBUILD_MODNAME, @@ -710,14 +754,10 @@ __releases(fiq->lock) { struct fuse_forget_link *link; struct virtio_fs_forget *forget; - struct scatterlist sg; - struct scatterlist *sgs[] = {&sg}; + struct virtio_fs_forget_req *req; struct virtio_fs *fs; - struct virtqueue *vq; struct virtio_fs_vq *fsvq; - bool notify; u64 unique; - int ret; link = fuse_dequeue_forget(fiq, 1, NULL); unique = fuse_get_unique(fiq); @@ -728,57 +768,19 @@ __releases(fiq->lock) /* Allocate a buffer for the request */ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); + req = &forget->req; - forget->ih = (struct fuse_in_header){ + req->ih = (struct fuse_in_header){ .opcode = FUSE_FORGET, .nodeid = link->forget_one.nodeid, .unique = unique, - .len = sizeof(*forget), + .len = sizeof(*req), }; - forget->arg = (struct fuse_forget_in){ + req->arg = (struct fuse_forget_in){ .nlookup = link->forget_one.nlookup, }; - sg_init_one(&sg, forget, sizeof(*forget)); - - /* Enqueue the request */ - spin_lock(&fsvq->lock); - - if (!fsvq->connected) { - kfree(forget); - spin_unlock(&fsvq->lock); - goto out; - } - - vq = fsvq->vq; - dev_dbg(&vq->vdev->dev, "%s\n", __func__); - - ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. 
Will try later.\n", - ret); - list_add_tail(&forget->list, &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); - inc_in_flight_req(fsvq); - } else { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", - ret); - kfree(forget); - } - spin_unlock(&fsvq->lock); - goto out; - } - - inc_in_flight_req(fsvq); - notify = virtqueue_kick_prepare(vq); - - spin_unlock(&fsvq->lock); - - if (notify) - virtqueue_notify(vq); -out: + send_forget_request(fsvq, forget, false); kfree(link); } @@ -1026,7 +1028,7 @@ __releases(fiq->lock) } } -const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { +static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index b9fe975d7625..9c6df721321a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -133,7 +133,7 @@ static int gfs2_write_full_page(struct page *page, get_block_t *get_block, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." 
*/ - offset = i_size & (PAGE_SIZE-1); + offset = i_size & (PAGE_SIZE - 1); if (page->index == end_index && offset) zero_user_segment(page, offset, PAGE_SIZE); @@ -497,7 +497,7 @@ static int __gfs2_readpage(void *file, struct page *page) error = mpage_readpage(page, gfs2_block_map); } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; return error; @@ -614,7 +614,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) ret = -EIO; return ret; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 516103248272..08f6fbb3655e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2441,8 +2441,16 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int blocksize = i_blocksize(inode); + loff_t start, end; int error; + start = round_down(offset, blocksize); + end = round_up(offset + length, blocksize) - 1; + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + if (gfs2_is_jdata(ip)) error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, GFS2_JTRUNC_REVOKES); @@ -2456,9 +2464,8 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) if (error) goto out; } else { - unsigned int start_off, end_len, blocksize; + unsigned int start_off, end_len; - blocksize = i_blocksize(inode); start_off = offset & (blocksize - 1); end_len = (offset + length) & (blocksize - 1); if (start_off) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d07a295f9cac..9d58295ccf7a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -407,27 +407,28 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) /** * gfs2_allocate_page_backing - Allocate 
blocks for a write fault * @page: The (locked) page to allocate backing for + * @length: Size of the allocation * * We try to allocate all the blocks required for the page in one go. This * might fail for various reasons, so we keep trying until all the blocks to * back this page are allocated. If some of the blocks are already allocated, * that is ok too. */ -static int gfs2_allocate_page_backing(struct page *page) +static int gfs2_allocate_page_backing(struct page *page, unsigned int length) { u64 pos = page_offset(page); - u64 size = PAGE_SIZE; do { struct iomap iomap = { }; - if (gfs2_iomap_get_alloc(page->mapping->host, pos, 1, &iomap)) + if (gfs2_iomap_get_alloc(page->mapping->host, pos, length, &iomap)) return -EIO; - iomap.length = min(iomap.length, size); - size -= iomap.length; + if (length < iomap.length) + iomap.length = length; + length -= iomap.length; pos += iomap.length; - } while (size > 0); + } while (length > 0); return 0; } @@ -448,10 +449,10 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; - unsigned long last_index; - u64 pos = page_offset(page); + u64 offset = page_offset(page); unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; + unsigned int length; loff_t size; int ret; @@ -461,20 +462,39 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (ret) goto out; - gfs2_size_hint(vmf->vma->vm_file, pos, PAGE_SIZE); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; + /* Check page index against inode size */ + size = i_size_read(inode); + if (offset >= size) { + ret = -EINVAL; + goto out_unlock; + } + /* Update file times before taking page lock */ file_update_time(vmf->vma->vm_file); + /* page is wholly or partially inside EOF */ + if (offset > size - PAGE_SIZE) + length = offset_in_page(size); + else + length = PAGE_SIZE; + + 
gfs2_size_hint(vmf->vma->vm_file, offset, length); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - if (!gfs2_write_alloc_required(ip, pos, PAGE_SIZE)) { + /* + * iomap_writepage / iomap_writepages currently don't support inline + * files, so always unstuff here. + */ + + if (!gfs2_is_stuffed(ip) && + !gfs2_write_alloc_required(ip, offset, length)) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { ret = -EAGAIN; @@ -487,7 +507,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (ret) goto out_unlock; - gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + gfs2_write_calc_reserv(ip, length, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; ret = gfs2_quota_lock_check(ip, &ap); if (ret) @@ -508,13 +528,6 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) goto out_trans_fail; lock_page(page); - ret = -EINVAL; - size = i_size_read(inode); - last_index = (size - 1) >> PAGE_SHIFT; - /* Check page index against inode size */ - if (size == 0 || (page->index > last_index)) - goto out_trans_end; - ret = -EAGAIN; /* If truncated, we must retry the operation, we may have raced * with the glock demotion code. @@ -527,7 +540,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); if (ret == 0) - ret = gfs2_allocate_page_backing(page); + ret = gfs2_allocate_page_backing(page, length); out_trans_end: if (ret) @@ -961,6 +974,7 @@ out: brelse(dibh); return error; } + /** * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of * blocks, determine how many bytes can be written. 
@@ -1208,7 +1222,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) { + if (unlikely(gfs2_withdrawn(sdp))) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 0290a22ebccf..b7123de7c180 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -549,7 +549,7 @@ __acquires(&gl->gl_lockref.lock) unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) && + if (unlikely(gfs2_withdrawn(sdp)) && target != LM_ST_UNLOCKED) return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | @@ -558,7 +558,14 @@ __acquires(&gl->gl_lockref.lock) GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { - set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + /* + * If another process is already doing the invalidate, let that + * finish first. The glock state machine will get back to this + * holder again later. 
+ */ + if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, + &gl->gl_flags)) + return; do_error(gl, 0); /* Fail queued try locks */ } gl->gl_req = target; @@ -586,8 +593,7 @@ __acquires(&gl->gl_lockref.lock) } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, !test_bit(SDF_WITHDRAWN, - &sdp->sd_flags)); + GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp)); } } else { /* lock_nolock */ finish_xmote(gl, target); @@ -1191,7 +1197,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error = 0; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; if (test_bit(GLF_LRU, &gl->gl_flags)) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index ff213690e364..4ede1f18de85 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -350,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), be32_to_cpu(str->di_minor)); break; - }; + } i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); @@ -540,7 +540,7 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) gfs2_consist(sdp); /* Initialize some head of the log stuff */ - if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { + if (!gfs2_withdrawn(sdp)) { sdp->sd_log_sequence = head.lh_sequence + 1; gfs2_log_pointers_init(sdp, head.lh_blkno); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1e18fb587eb..dafef10b91f1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -656,7 +656,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, inode->i_rdev = dev; inode->i_size = size; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); check_and_update_goal(dip); ip->i_goal = dip->i_goal; @@ -712,7 +711,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = 
gfs2_trans_begin(sdp, blocks, 0); if (error) - goto fail_gunlock2; + goto fail_free_inode; if (blocks > 1) { ip->i_eattr = ip->i_no_addr + 1; @@ -723,7 +722,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) - goto fail_gunlock2; + goto fail_free_inode; BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags)); @@ -732,7 +731,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock2; glock_set_object(ip->i_iopen_gh.gh_gl, ip); - gfs2_glock_put(io_gl); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -765,6 +763,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); d_instantiate(dentry, inode); + /* After instantiate, errors should result in evict which will destroy + * both inode and iopen glocks properly. */ if (file) { file->f_mode |= FMODE_CREATED; error = finish_open(file, dentry, gfs2_open_common); @@ -772,15 +772,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_glock_dq_uninit(ghs); gfs2_glock_dq_uninit(ghs + 1); clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + gfs2_glock_put(io_gl); return error; fail_gunlock3: glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); - gfs2_glock_put(io_gl); fail_gunlock2: - if (io_gl) - clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + gfs2_glock_put(io_gl); fail_free_inode: if (ip->i_gl) { glock_clear_object(ip->i_gl, ip); @@ -1475,7 +1475,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = -EEXIST; default: goto out_gunlock; - }; + } if (odip != ndip) { if (!ndip->i_inode.i_nlink) { diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 58e237fba565..eb3f2e7b8085 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -31,6 +31,8 @@ #include "dir.h" #include "trace_gfs2.h" +static void 
gfs2_log_shutdown(struct gfs2_sbd *sdp); + /** * gfs2_struct2blk - compute stuff * @sdp: the filesystem @@ -159,7 +161,8 @@ restart: list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw)) + if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw) && + !gfs2_withdrawn(sdp)) goto restart; } spin_unlock(&sdp->sd_ail_lock); @@ -609,6 +612,14 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) list_add(&bd->bd_list, &sdp->sd_log_revokes); } +void gfs2_glock_remove_revoke(struct gfs2_glock *gl) +{ + if (atomic_dec_return(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + gfs2_glock_queue_put(gl); + } +} + void gfs2_write_revokes(struct gfs2_sbd *sdp) { struct gfs2_trans *tr; @@ -682,12 +693,16 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, { struct gfs2_log_header *lh; u32 hash, crc; - struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + struct page *page; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct timespec64 tv; struct super_block *sb = sdp->sd_vfs; u64 dblock; + if (gfs2_withdrawn(sdp)) + goto out; + + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); lh = page_address(page); clear_page(lh); @@ -707,7 +722,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_nsec = cpu_to_be32(tv.tv_nsec); lh->lh_sec = cpu_to_be64(tv.tv_sec); if (!list_empty(&jd->extent_list)) - dblock = gfs2_log_bmap(sdp); + dblock = gfs2_log_bmap(jd, lblock); else { int ret = gfs2_lblk_to_dblk(jd->jd_inode, lblock, &dblock); if (gfs2_assert_withdraw(sdp, ret == 0)) @@ -740,6 +755,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); +out: log_flush_wait(sdp); } @@ -768,6 +784,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) sdp->sd_log_idle = 
(tail == sdp->sd_log_flush_head); gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail, sdp->sd_log_flush_head, flags, op_flags); + gfs2_log_incr_head(sdp); if (sdp->sd_log_tail != tail) log_pull_tail(sdp, tail); @@ -948,7 +965,7 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) * */ -void gfs2_log_shutdown(struct gfs2_sbd *sdp) +static void gfs2_log_shutdown(struct gfs2_sbd *sdp) { gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 2315fca47a2b..2ff163a8dce1 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -74,9 +74,9 @@ extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); -extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); extern void gfs2_write_revokes(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 5b17979af539..55fed7daf2b1 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -129,7 +129,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, atomic_dec(&sdp->sd_log_pinned); } -static void gfs2_log_incr_head(struct gfs2_sbd *sdp) +void gfs2_log_incr_head(struct gfs2_sbd *sdp) { BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && (sdp->sd_log_flush_head != sdp->sd_log_head)); @@ -138,18 +138,13 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp) sdp->sd_log_flush_head = 0; } -u64 gfs2_log_bmap(struct gfs2_sbd *sdp) +u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) { - unsigned int lbn = sdp->sd_log_flush_head; struct gfs2_journal_extent *je; - u64 block; - list_for_each_entry(je, 
&sdp->sd_jdesc->extent_list, list) { - if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) { - block = je->dblock + lbn - je->lblock; - gfs2_log_incr_head(sdp); - return block; - } + list_for_each_entry(je, &jd->extent_list, list) { + if (lblock >= je->lblock && lblock < je->lblock + je->blocks) + return je->dblock + lblock - je->lblock; } return -1; @@ -351,8 +346,11 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) { - gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), - gfs2_log_bmap(sdp)); + u64 dblock; + + dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); + gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), dblock); } /** @@ -369,8 +367,11 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) { struct super_block *sb = sdp->sd_vfs; - gfs2_log_write(sdp, page, sb->s_blocksize, 0, - gfs2_log_bmap(sdp)); + u64 dblock; + + dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); + gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); } /** @@ -882,10 +883,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gl = bd->bd_gl; - if (atomic_dec_return(&gl->gl_revokes) == 0) { - clear_bit(GLF_LFLUSH, &gl->gl_flags); - gfs2_glock_queue_put(gl); - } + gfs2_glock_remove_revoke(gl); kmem_cache_free(gfs2_bufdata_cachep, bd); } } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c059957a733..9c5e4e491e03 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -18,7 +18,8 @@ ~(2 * sizeof(__be64) - 1)) extern const struct gfs2_log_operations *gfs2_log_ops[]; -extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); +extern u64 
gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 662ef36c1874..0c3772974030 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -251,7 +251,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) { + if (unlikely(gfs2_withdrawn(sdp))) { *bhp = NULL; return -EIO; } @@ -309,7 +309,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; wait_on_buffer(bh); @@ -320,7 +320,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) gfs2_io_error_bh_wd(sdp, bh); return -EIO; } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 18daf494abab..e8b7b0ce8404 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1006,8 +1006,7 @@ hostdata_error: void gfs2_lm_unmount(struct gfs2_sbd *sdp) { const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; - if (likely(!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) && - lm->lm_unmount) + if (likely(!gfs2_withdrawn(sdp)) && lm->lm_unmount) lm->lm_unmount(sdp); } @@ -1328,7 +1327,7 @@ static const struct fs_parameter_enum gfs2_param_enums[] = { {} }; -const struct fs_parameter_description gfs2_fs_parameters = { +static const struct fs_parameter_description gfs2_fs_parameters = { .name = "gfs2", .specs = gfs2_param_specs, .enums = gfs2_param_enums, diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 7c016a082aa6..e9f93045eb01 100644 --- 
a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1273,7 +1273,7 @@ int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; - unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder); + unsigned int max_qd = PAGE_SIZE / sizeof(struct gfs2_holder); unsigned int num_qd; unsigned int x; int error = 0; @@ -1475,7 +1475,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { + if (!gfs2_withdrawn(sdp)) { fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); sdp->sd_log_error = error; wake_up(&sdp->sd_logd_waitq); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index c529f8749a89..85f830e56945 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -263,11 +263,13 @@ static void clean_journal(struct gfs2_jdesc *jd, u32 lblock = head->lh_blkno; gfs2_replay_incr_blk(jd, &lblock); - if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) - sdp->sd_log_flush_head = lblock; gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock, GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY, REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC); + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + sdp->sd_log_flush_head = lblock; + gfs2_log_incr_head(sdp); + } } @@ -326,7 +328,7 @@ void gfs2_recover_func(struct work_struct *work) default: goto fail; - }; + } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5fa1eec4fb4f..68cc7c291a81 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -399,8 +399,7 @@ struct lfcc { * Returns: errno */ -static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, - struct gfs2_holder *freeze_gh) +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) { struct gfs2_inode *ip; struct gfs2_jdesc *jd; @@ -425,7 +424,9 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } error = 
gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, - GL_NOCACHE, freeze_gh); + GL_NOCACHE, &sdp->sd_freeze_gh); + if (error) + goto out; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); @@ -441,7 +442,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } if (error) - gfs2_glock_dq_uninit(freeze_gh); + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); out: while (!list_empty(&list)) { @@ -553,7 +554,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) if (!(flags & I_DIRTY_INODE)) return; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); @@ -602,7 +603,7 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, &freeze_gh); - if (error && !test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) + if (error && !gfs2_withdrawn(sdp)) return error; flush_workqueue(gfs2_delete_workqueue); @@ -761,21 +762,25 @@ static int gfs2_freeze(struct super_block *sb) if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) goto out; - if (test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { - error = -EINVAL; - goto out; - } - for (;;) { - error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (gfs2_withdrawn(sdp)) { + error = -EINVAL; + goto out; + } + + error = gfs2_lock_fs_check_clean(sdp); if (!error) break; if (error == -EBUSY) fs_err(sdp, "waiting for recovery before freeze\n"); - else + else if (error == -EIO) { + fs_err(sdp, "Fatal IO error: cannot freeze gfs2 due " + "to recovery error.\n"); + goto out; + } else { fs_err(sdp, "error freezing FS: %d\n", error); - + } fs_err(sdp, "retrying...\n"); msleep(1000); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index dd15b8e4af2c..8ccb68f4ed16 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -118,7 +118,7 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, 
size_t len) static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) { - unsigned int b = test_bit(SDF_WITHDRAWN, &sdp->sd_flags); + unsigned int b = gfs2_withdrawn(sdp); return snprintf(buf, PAGE_SIZE, "%u\n", b); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 35e3059255fe..9d4227330de4 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -262,6 +262,8 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) list_del_init(&bd->bd_list); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); sdp->sd_log_num_revoke--; + if (bd->bd_gl) + gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); tr->tr_num_revoke--; if (--n == 0) diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index c45159133d8e..ec600b487498 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -258,7 +258,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line, bool withdraw) { - if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) + if (!gfs2_withdrawn(sdp)) fs_err(sdp, "fatal: I/O error\n" " block = %llu\n" diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 4b68b2c1fe56..f2702bc9837c 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -164,6 +164,15 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, return x; } +/** + * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn + * @sdp: the superblock + */ +static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) +{ + return test_bit(SDF_WITHDRAWN, &sdp->sd_flags); +} + #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) diff --git a/fs/io-wq.c b/fs/io-wq.c index 91b85df0861e..74b40506c5d9 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -111,7 +111,7 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - struct cred *creds; + const struct cred *creds; struct mm_struct *mm; refcount_t refs; struct completion done; diff --git a/fs/io-wq.h 
b/fs/io-wq.h index 600e0158cba7..7c333a28e2a7 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -52,6 +52,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, list->last = prev; if (prev) prev->next = node->next; + node->next = NULL; } #define wq_list_for_each(pos, prv, head) \ @@ -87,7 +88,7 @@ typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { struct mm_struct *mm; struct user_struct *user; - struct cred *creds; + const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; @@ -118,10 +119,6 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk) static inline void io_wq_worker_running(struct task_struct *tsk) { } -#endif +#endif /* CONFIG_IO_WQ */ -static inline bool io_wq_current_is_worker(void) -{ - return in_task() && (current->flags & PF_IO_WORKER); -} -#endif +#endif /* INTERNAL_IO_WQ_H */ diff --git a/fs/io_uring.c b/fs/io_uring.c index ec53aa7cdc94..405be10da73d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -145,7 +145,7 @@ struct io_rings { /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure - * there are not more requests pending thatn there is space in + * there are not more requests pending than there is space in * the completion queue. * * Written by the kernel, shouldn't be modified by the @@ -238,7 +238,7 @@ struct io_ring_ctx { struct user_struct *user; - struct cred *creds; + const struct cred *creds; /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; @@ -275,7 +275,8 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. 
*/ struct list_head poll_list; - struct rb_root cancel_tree; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; spinlock_t inflight_lock; struct list_head inflight_list; @@ -303,9 +304,32 @@ struct io_timeout_data { u32 seq_offset; }; -struct io_timeout { - struct file *file; - struct io_timeout_data *data; +struct io_async_connect { + struct sockaddr_storage address; +}; + +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + struct sockaddr __user *uaddr; + struct msghdr msg; +}; + +struct io_async_rw { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + ssize_t nr_segs; + ssize_t size; +}; + +struct io_async_ctx { + struct io_uring_sqe sqe; + union { + struct io_async_rw rw; + struct io_async_msghdr msg; + struct io_async_connect connect; + struct io_timeout_data timeout; + }; }; /* @@ -319,10 +343,10 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; - struct io_timeout timeout; }; const struct io_uring_sqe *sqe; + struct io_async_ctx *io; struct file *ring_file; int ring_fd; bool has_user; @@ -332,7 +356,7 @@ struct io_kiocb { struct io_ring_ctx *ctx; union { struct list_head list; - struct rb_node rb_node; + struct hlist_node hash_node; }; struct list_head link_list; unsigned int flags; @@ -353,7 +377,6 @@ struct io_kiocb { #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */ u64 user_data; u32 result; u32 sequence; @@ -422,6 +445,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -435,6 +459,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx->completions) goto err; + /* + * Use 5 
bits less than the max cq entries, that should give us around + * 32 entries per hash list if totally full and uniformly spread. + */ + hash_bits = ilog2(p->cq_entries); + hash_bits -= 5; + if (hash_bits <= 0) + hash_bits = 1; + ctx->cancel_hash_bits = hash_bits; + ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), + GFP_KERNEL); + if (!ctx->cancel_hash) + goto err; + __hash_init(ctx->cancel_hash, 1U << hash_bits); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -448,7 +487,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - ctx->cancel_tree = RB_ROOT; INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -459,6 +497,7 @@ err: if (ctx->fallback_req) kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx->completions); + kfree(ctx->cancel_hash); kfree(ctx); return NULL; } @@ -592,7 +631,7 @@ static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); @@ -806,6 +845,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, } got_it: + req->io = NULL; req->ring_file = NULL; req->file = NULL; req->ctx = ctx; @@ -836,8 +876,8 @@ static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_FREE_SQE) - kfree(req->sqe); + if (req->io) + kfree(req->io); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -849,8 +889,6 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - if (req->flags & REQ_F_TIMEOUT) - 
kfree(req->timeout.data); percpu_ref_put(&ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -863,7 +901,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -878,7 +916,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt; bool wake_ev = false; /* Already got next link */ @@ -890,24 +927,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * potentially happen if the chain is messed up, check to be on the * safe side. */ - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - while (nxt) { - list_del_init(&nxt->list); + while (!list_empty(&req->link_list)) { + struct io_kiocb *nxt = list_first_entry(&req->link_list, + struct io_kiocb, link_list); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT)) { + if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT))) { + list_del_init(&nxt->link_list); wake_ev |= io_link_cancel_timeout(nxt); - nxt = list_first_entry_or_null(&req->link_list, - struct io_kiocb, list); req->flags &= ~REQ_F_LINK_TIMEOUT; continue; } - if (!list_empty(&req->link_list)) { - INIT_LIST_HEAD(&nxt->link_list); - list_splice(&req->link_list, &nxt->link_list); - nxt->flags |= REQ_F_LINK; - } + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK; *nxtptr = nxt; break; } @@ -923,15 +957,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; unsigned long 
flags; spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { - link = list_first_entry(&req->link_list, struct io_kiocb, list); - list_del_init(&link->list); + struct io_kiocb *link = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && @@ -1079,9 +1113,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands. */ - if (((req->flags & - (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && + !req->io) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -1410,15 +1444,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) if (S_ISREG(file_inode(req->file)->i_mode)) req->flags |= REQ_F_ISREG; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; - } - kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1587,6 +1612,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return io_import_fixed(req->ctx, rw, sqe, iter); } + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = iorw->iov; + iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); + if (iorw->iov == iorw->fast_iov) + *iovec = NULL; + return iorw->size; + } + if (!req->has_user) return -EFAULT; @@ -1657,6 +1692,50 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; 
} +static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io->rw.nr_segs = iter->nr_segs; + req->io->rw.size = io_size; + req->io->rw.iov = iovec; + if (!req->io->rw.iov) { + req->io->rw.iov = req->io->rw.fast_iov; + memcpy(req->io->rw.iov, fast_iov, + sizeof(struct iovec) * iter->nr_segs); + } +} + +static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + if (req->io) { + io_req_map_io(req, io_size, iovec, fast_iov, iter); + memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); + req->sqe = &req->io->sqe; + return 0; + } + + return -ENOMEM; +} + +static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; + + return io_import_iovec(READ, req, iovec, iter); +} + static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1665,23 +1744,31 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t read_size, ret; - - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; - file = kiocb->ki_filp; + ssize_t io_size, ret; - if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; - - ret = io_import_iovec(READ, req, &iovec, &iter); - if (ret < 0) - return ret; + if (!req->io) { + ret = io_read_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(READ, req, &iovec, &iter); + if (ret < 0) + return ret; + } - read_size = ret; + file = req->file; + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = read_size; + req->result = io_size; + + /* + * If the file 
doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); @@ -1703,18 +1790,40 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, */ if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && (req->flags & REQ_F_ISREG) && - ret2 > 0 && ret2 < read_size) + ret2 > 0 && ret2 < io_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) + if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); - else - ret = -EAGAIN; + } else { +copy_iov: + ret = io_setup_async_io(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } +out_free: kfree(iovec); return ret; } +static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; + + return io_import_iovec(WRITE, req, iovec, iter); +} + static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1723,29 +1832,36 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; + ssize_t ret, io_size; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + if (!req->io) { + ret = io_write_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(WRITE, req, &iovec, &iter); + if (ret < 0) + return ret; + } file = kiocb->ki_filp; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; - - ret = io_import_iovec(WRITE, req, &iovec, &iter); - if 
(ret < 0) - return ret; - + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = ret; + req->result = io_size; - iov_count = iov_iter_count(&iter); + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } - ret = -EAGAIN; if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) - goto out_free; + goto copy_iov; + iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1769,10 +1885,16 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = call_write_iter(file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); - if (!force_nonblock || ret2 != -EAGAIN) + if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); - else - ret = -EAGAIN; + } else { +copy_iov: + ret = io_setup_async_io(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } out_free: kfree(iovec); @@ -1888,12 +2010,25 @@ static int io_sync_file_range(struct io_kiocb *req, return 0; } +static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ #if defined(CONFIG_NET) -static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock, - long (*fn)(struct socket *, struct user_msghdr __user *, - unsigned int)) + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); +#else + return 0; +#endif +} + +static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { +#if 
defined(CONFIG_NET) struct socket *sock; int ret; @@ -1902,7 +2037,9 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; + struct io_async_ctx io, *copy; + struct sockaddr_storage addr; + struct msghdr *kmsg; unsigned flags; flags = READ_ONCE(sqe->msg_flags); @@ -1911,30 +2048,59 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, else if (force_nonblock) flags |= MSG_DONTWAIT; - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg.msg; + kmsg->msg_name = &addr; + } else { + kmsg = &io.msg.msg; + kmsg->msg_name = &addr; + io.msg.iov = io.msg.fast_iov; + ret = io_sendmsg_prep(req, &io); + if (ret) + goto out; + } - ret = fn(sock, msg, flags); - if (force_nonblock && ret == -EAGAIN) + ret = __sys_sendmsg_sock(sock, kmsg, flags); + if (force_nonblock && ret == -EAGAIN) { + copy = kmalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) { + ret = -ENOMEM; + goto out; + } + memcpy(&copy->msg, &io.msg, sizeof(copy->msg)); + req->io = copy; + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &req->io->sqe; return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; } +out: io_cqring_add_event(req, ret); if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_put_req_find_next(req, nxt); return 0; -} +#else + return -EOPNOTSUPP; #endif +} -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_sendmsg_sock); + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); 
+ return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, + &io->msg.iov); #else - return -EOPNOTSUPP; + return 0; #endif } @@ -1942,8 +2108,63 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_recvmsg_sock); + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct user_msghdr __user *msg; + struct io_async_ctx io, *copy; + struct sockaddr_storage addr; + struct msghdr *kmsg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + msg = (struct user_msghdr __user *) (unsigned long) + READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg.msg; + kmsg->msg_name = &addr; + } else { + kmsg = &io.msg.msg; + kmsg->msg_name = &addr; + io.msg.iov = io.msg.fast_iov; + ret = io_recvmsg_prep(req, &io); + if (ret) + goto out; + } + + ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + copy = kmalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) { + ret = -ENOMEM; + goto out; + } + memcpy(copy, &io, sizeof(*copy)); + req->io = copy; + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &req->io->sqe; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + +out: + io_cqring_add_event(req, ret); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_put_req_find_next(req, nxt); + return 0; #else return -EOPNOTSUPP; #endif @@ -1985,11 +2206,26 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } +static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = 
req->sqe; + struct sockaddr __user *addr; + int addr_len; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = READ_ONCE(sqe->addr2); + return move_addr_to_kernel(addr, addr_len, &io->connect.address); +#else + return 0; +#endif +} + static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - struct sockaddr __user *addr; + struct io_async_ctx __io, *io; unsigned file_flags; int addr_len, ret; @@ -1998,15 +2234,35 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); addr_len = READ_ONCE(sqe->addr2); file_flags = force_nonblock ? O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, addr, addr_len, file_flags); - if (ret == -EAGAIN && force_nonblock) + if (req->io) { + io = req->io; + } else { + ret = io_connect_prep(req, &__io); + if (ret) + goto out; + io = &__io; + } + + ret = __sys_connect_file(req->file, &io->connect.address, addr_len, + file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) { + ret = -ENOMEM; + goto out; + } + memcpy(&io->connect, &__io.connect, sizeof(io->connect)); + req->io = io; + memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &io->sqe; return -EAGAIN; + } if (ret == -ERESTARTSYS) ret = -EINTR; +out: if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req, ret); @@ -2017,14 +2273,6 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } -static inline void io_poll_remove_req(struct io_kiocb *req) -{ - if (!RB_EMPTY_NODE(&req->rb_node)) { - rb_erase(&req->rb_node, &req->ctx->cancel_tree); - RB_CLEAR_NODE(&req->rb_node); - } -} - static void io_poll_remove_one(struct io_kiocb *req) { 
struct io_poll_iocb *poll = &req->poll; @@ -2036,36 +2284,34 @@ static void io_poll_remove_one(struct io_kiocb *req) io_queue_async_work(req); } spin_unlock(&poll->head->lock); - io_poll_remove_req(req); + hash_del(&req->hash_node); } static void io_poll_remove_all(struct io_ring_ctx *ctx) { - struct rb_node *node; + struct hlist_node *tmp; struct io_kiocb *req; + int i; spin_lock_irq(&ctx->completion_lock); - while ((node = rb_first(&ctx->cancel_tree)) != NULL) { - req = rb_entry(node, struct io_kiocb, rb_node); - io_poll_remove_one(req); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) + io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) { - struct rb_node *p, *parent = NULL; + struct hlist_head *list; struct io_kiocb *req; - p = ctx->cancel_tree.rb_node; - while (p) { - parent = p; - req = rb_entry(parent, struct io_kiocb, rb_node); - if (sqe_addr < req->user_data) { - p = p->rb_left; - } else if (sqe_addr > req->user_data) { - p = p->rb_right; - } else { + list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (sqe_addr == req->user_data) { io_poll_remove_one(req); return 0; } @@ -2147,7 +2393,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) spin_unlock_irq(&ctx->completion_lock); return; } - io_poll_remove_req(req); + hash_del(&req->hash_node); io_poll_complete(req, mask, ret); spin_unlock_irq(&ctx->completion_lock); @@ -2182,7 +2428,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * for finalizing the request, mark us as having grabbed that already. 
*/ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - io_poll_remove_req(req); + hash_del(&req->hash_node); io_poll_complete(req, mask, 0); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -2220,20 +2466,10 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static void io_poll_req_insert(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct rb_node **p = &ctx->cancel_tree.rb_node; - struct rb_node *parent = NULL; - struct io_kiocb *tmp; - - while (*p) { - parent = *p; - tmp = rb_entry(parent, struct io_kiocb, rb_node); - if (req->user_data < tmp->user_data) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&req->rb_node, parent, p); - rb_insert_color(&req->rb_node, &ctx->cancel_tree); + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); } static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -2257,11 +2493,11 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->wait) return -ENOMEM; - req->sqe = NULL; + req->io = NULL; INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - RB_CLEAR_NODE(&req->rb_node); + INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; poll->done = false; @@ -2368,7 +2604,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret == -1) return -EALREADY; @@ -2410,7 +2646,8 @@ static int io_timeout_remove(struct io_kiocb *req, return 0; } -static int io_timeout_setup(struct io_kiocb *req) +static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, + bool is_timeout_link) { const struct io_uring_sqe *sqe = req->sqe; struct 
io_timeout_data *data; @@ -2420,15 +2657,14 @@ static int io_timeout_setup(struct io_kiocb *req) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) return -EINVAL; + if (sqe->off && is_timeout_link) + return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL); - if (!data) - return -ENOMEM; + data = &io->timeout; data->req = req; - req->timeout.data = data; req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) @@ -2440,6 +2676,7 @@ static int io_timeout_setup(struct io_kiocb *req) data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + req->io = io; return 0; } @@ -2448,16 +2685,24 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; + struct io_async_ctx *io; struct list_head *entry; unsigned span = 0; - int ret; - ret = io_timeout_setup(req); - /* common setup allows flags (like links) set, we don't */ - if (!ret && sqe->flags) - ret = -EINVAL; - if (ret) - return ret; + io = req->io; + if (!io) { + int ret; + + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + return -ENOMEM; + ret = io_timeout_prep(req, io, false); + if (ret) { + kfree(io); + return ret; + } + } + data = &req->io->timeout; /* * sqe->off holds how many events that need to occur for this @@ -2473,7 +2718,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->sequence = ctx->cached_sq_head + count - 1; - req->timeout.data->seq_offset = count; + data->seq_offset = count; /* * Insertion sort, ensuring the first entry in the list is always @@ -2484,7 +2729,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); unsigned nxt_sq_head; long long tmp, tmp_nxt; - u32 nxt_offset = 
nxt->timeout.data->seq_offset; + u32 nxt_offset = nxt->io->timeout.seq_offset; if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) continue; @@ -2517,7 +2762,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->sequence -= span; add: list_add(&req->list, entry); - data = req->timeout.data; data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); @@ -2598,30 +2842,76 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; + ssize_t ret; + + memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); + req->sqe = &io->sqe; + + switch (io->sqe.opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + ret = io_read_prep(req, &iovec, &iter, true); + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + ret = io_write_prep(req, &iovec, &iter, true); + break; + case IORING_OP_SENDMSG: + ret = io_sendmsg_prep(req, io); + break; + case IORING_OP_RECVMSG: + ret = io_recvmsg_prep(req, io); + break; + case IORING_OP_CONNECT: + ret = io_connect_prep(req, io); + break; + case IORING_OP_TIMEOUT: + return io_timeout_prep(req, io, false); + case IORING_OP_LINK_TIMEOUT: + return io_timeout_prep(req, io, true); + default: + req->io = io; + return 0; + } + + if (ret < 0) + return ret; + + req->io = io; + io_req_map_io(req, ret, iovec, inline_vecs, &iter); + return 0; +} + static int io_req_defer(struct io_kiocb *req) { - struct io_uring_sqe *sqe_copy; struct io_ring_ctx *ctx = req->ctx; + struct io_async_ctx *io; + int ret; /* Still need defer if there is pending req in defer list. 
*/ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) return -EAGAIN; + ret = io_req_defer_prep(req, io); + if (ret < 0) { + kfree(io); + return ret; + } + spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - kfree(sqe_copy); return 0; } - memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy)); - req->flags |= REQ_F_FREE_SQE; - req->sqe = sqe_copy; - trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); @@ -2876,10 +3166,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (!list_empty(&req->list)) { - prev = list_entry(req->list.prev, struct io_kiocb, link_list); + if (!list_empty(&req->link_list)) { + prev = list_entry(req->link_list.prev, struct io_kiocb, + link_list); if (refcount_inc_not_zero(&prev->refs)) { - list_del_init(&req->list); + list_del_init(&req->link_list); prev->flags &= ~REQ_F_LINK_TIMEOUT; } else prev = NULL; @@ -2909,8 +3200,8 @@ static void io_queue_linked_timeout(struct io_kiocb *req) * we got a chance to setup the timer */ spin_lock_irq(&ctx->completion_lock); - if (!list_empty(&req->list)) { - struct io_timeout_data *data = req->timeout.data; + if (!list_empty(&req->link_list)) { + struct io_timeout_data *data = &req->io->timeout; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), @@ -2929,7 +3220,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK)) return NULL; - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, + 
link_list); if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) return NULL; @@ -2953,15 +3245,6 @@ static void __io_queue_sqe(struct io_kiocb *req) */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - struct io_uring_sqe *sqe_copy; - - sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) - goto err; - - req->sqe = sqe_copy; - req->flags |= REQ_F_FREE_SQE; - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { ret = io_grab_files(req); if (ret) @@ -3030,7 +3313,7 @@ static inline void io_queue_link_head(struct io_kiocb *req) #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) -static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, +static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; @@ -3049,7 +3332,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); - return; + return false; } /* @@ -3061,32 +3344,25 @@ err_req: */ if (*link) { struct io_kiocb *prev = *link; - struct io_uring_sqe *sqe_copy; + struct io_async_ctx *io; if (req->sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { - ret = io_timeout_setup(req); - /* common setup allows offset being set, we don't */ - if (!ret && req->sqe->off) - ret = -EINVAL; - if (ret) { - prev->flags |= REQ_F_FAIL_LINK; - goto err_req; - } - } - - sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) { + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) { ret = -EAGAIN; goto err_req; } - req->sqe = sqe_copy; - req->flags |= REQ_F_FREE_SQE; + ret = io_req_defer_prep(req, io); + if (ret) { + kfree(io); + prev->flags |= REQ_F_FAIL_LINK; + goto err_req; + } trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->list, 
&prev->link_list); + list_add_tail(&req->link_list, &prev->link_list); } else if (req->sqe->flags & IOSQE_IO_LINK) { req->flags |= REQ_F_LINK; @@ -3095,6 +3371,8 @@ err_req: } else { io_queue_sqe(req); } + + return true; } /* @@ -3113,7 +3391,7 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - struct io_ring_ctx *ctx, unsigned max_ios) + unsigned int max_ios) { blk_start_plug(&state->plug); state->free_reqs = 0; @@ -3197,7 +3475,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, ctx, nr); + io_submit_state_start(&state, nr); statep = &state; } @@ -3224,6 +3502,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } + submitted++; sqe_flags = req->sqe->flags; req->ring_file = ring_file; @@ -3233,9 +3512,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->sqe->user_data, true, async); - io_submit_sqe(req, statep, &link); - submitted++; - + if (!io_submit_sqe(req, statep, &link)) + break; /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. 
@@ -4363,6 +4641,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->completions); + kfree(ctx->cancel_hash); kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); } @@ -4759,7 +5038,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ctx->compat = in_compat_syscall(); ctx->account_mem = account_mem; ctx->user = user; - ctx->creds = prepare_creds(); + ctx->creds = get_current_cred(); ret = io_allocate_scq_urings(ctx, p); if (ret) @@ -4794,7 +5073,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP; + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 4d31503abaee..9dc7e7a64e10 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -223,7 +223,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, dput(dentry); return ERR_PTR(-EINVAL); } - dtmp = lookup_one_len_unlocked(kntmp->name, dentry, + dtmp = lookup_positive_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) diff --git a/fs/namei.c b/fs/namei.c index 2dda552bcf7a..d6c91d1e88cb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1210,25 +1210,25 @@ static int follow_automount(struct path *path, struct nameidata *nd, * - Flagged as automount point * * This may only be called in refwalk mode. + * On success path->dentry is known positive. 
* * Serialization is taken care of in namespace.c */ static int follow_managed(struct path *path, struct nameidata *nd) { struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ - unsigned managed; + unsigned flags; bool need_mntput = false; int ret = 0; /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see * the components of that value change under us */ - while (managed = READ_ONCE(path->dentry->d_flags), - managed &= DCACHE_MANAGED_DENTRY, - unlikely(managed != 0)) { + while (flags = smp_load_acquire(&path->dentry->d_flags), + unlikely(flags & DCACHE_MANAGED_DENTRY)) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ - if (managed & DCACHE_MANAGE_TRANSIT) { + if (flags & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); @@ -1237,7 +1237,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) } /* Transit to a mounted filesystem. 
*/ - if (managed & DCACHE_MOUNTED) { + if (flags & DCACHE_MOUNTED) { struct vfsmount *mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); @@ -1256,7 +1256,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) } /* Handle an automount point */ - if (managed & DCACHE_NEED_AUTOMOUNT) { + if (flags & DCACHE_NEED_AUTOMOUNT) { ret = follow_automount(path, nd, &need_mntput); if (ret < 0) break; @@ -1269,10 +1269,12 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (need_mntput && path->mnt == mnt) mntput(path->mnt); - if (ret == -EISDIR || !ret) - ret = 1; if (need_mntput) nd->flags |= LOOKUP_JUMPED; + if (ret == -EISDIR || !ret) + ret = 1; + if (ret > 0 && unlikely(d_flags_negative(flags))) + ret = -ENOENT; if (unlikely(ret < 0)) path_put_conditional(path, nd); return ret; @@ -1621,10 +1623,6 @@ static int lookup_fast(struct nameidata *nd, dput(dentry); return status; } - if (unlikely(d_is_negative(dentry))) { - dput(dentry); - return -ENOENT; - } path->mnt = mnt; path->dentry = dentry; @@ -1811,11 +1809,6 @@ static int walk_component(struct nameidata *nd, int flags) if (unlikely(err < 0)) return err; - if (unlikely(d_is_negative(path.dentry))) { - path_to_nameidata(&path, nd); - return -ENOENT; - } - seq = 0; /* we are already out of RCU mode */ inode = d_backing_inode(path.dentry); } @@ -2568,6 +2561,26 @@ struct dentry *lookup_one_len_unlocked(const char *name, } EXPORT_SYMBOL(lookup_one_len_unlocked); +/* + * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) + * on negatives. Returns known positive or ERR_PTR(); that's what + * most of the users want. Note that pinned negative with unlocked parent + * _can_ become positive at any time, so callers of lookup_one_len_unlocked() + * need to be very careful; pinned positives have ->d_inode stable, so + * this one avoids such problems. 
+ */ +struct dentry *lookup_positive_unlocked(const char *name, + struct dentry *base, int len) +{ + struct dentry *ret = lookup_one_len_unlocked(name, base, len); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_positive_unlocked); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { @@ -2662,7 +2675,7 @@ mountpoint_last(struct nameidata *nd) return PTR_ERR(path.dentry); } } - if (d_is_negative(path.dentry)) { + if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) { dput(path.dentry); return -ENOENT; } @@ -3356,11 +3369,6 @@ static int do_last(struct nameidata *nd, if (unlikely(error < 0)) return error; - if (unlikely(d_is_negative(path.dentry))) { - path_to_nameidata(&path, nd); - return -ENOENT; - } - /* * create/update audit record if it already exists. */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 86e5658651f1..195ab7a0fc89 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -863,13 +863,11 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, } else dchild = dget(dparent); } else - dchild = lookup_one_len_unlocked(name, dparent, namlen); + dchild = lookup_positive_unlocked(name, dparent, namlen); if (IS_ERR(dchild)) return rv; if (d_mountpoint(dchild)) goto out; - if (d_really_is_negative(dchild)) - goto out; if (dchild->d_inode->i_ino != ino) goto out; rv = fh_compose(fhp, exp, dchild, &cd->fh); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 533d0fc3c96b..b09237431ae2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2991,18 +2991,9 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, __be32 nfserr; int ignore_crossmnt = 0; - dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen); + dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); - if 
(d_really_is_negative(dentry)) { - /* - * we're not holding the i_mutex here, so there's - * a window where this directory entry could have gone - * away. - */ - dput(dentry); - return nfserr_noent; - } exp_get(exp); /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index e9717c2f7d45..c269d6033525 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -200,7 +200,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, int err; bool last_element = !post[0]; - this = lookup_one_len_unlocked(name, base, namelen); + this = lookup_positive_unlocked(name, base, namelen); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; @@ -208,8 +208,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto out; goto out_err; } - if (!this->d_inode) - goto put_and_out; if (ovl_dentry_weird(this)) { /* Don't support traversing automounts and other weirdness */ @@ -651,7 +649,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) if (err) return ERR_PTR(err); - index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) @@ -659,9 +657,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) return index; } - if (d_is_negative(index)) - err = 0; - else if (ovl_is_whiteout(index)) + if (ovl_is_whiteout(index)) err = -ESTALE; else if (ovl_dentry_weird(index)) err = -EIO; @@ -685,7 +681,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { @@ -700,9 +696,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, } inode = d_inode(index); - if 
(d_is_negative(index)) { - goto out_dput; - } else if (ovl_is_whiteout(index) && !verify) { + if (ovl_is_whiteout(index) && !verify) { /* * When index lookup is called with !verify for decoding an * overlay file handle, a whiteout index implies that decode @@ -1131,7 +1125,7 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct dentry *lowerdir = poe->lowerstack[i].dentry; - this = lookup_one_len_unlocked(name->name, lowerdir, + this = lookup_positive_unlocked(name->name, lowerdir, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { @@ -1148,10 +1142,8 @@ bool ovl_lower_positive(struct dentry *dentry) break; } } else { - if (this->d_inode) { - positive = !ovl_is_whiteout(this); - done = true; - } + positive = !ovl_is_whiteout(this); + done = true; dput(this); } } diff --git a/fs/pipe.c b/fs/pipe.c index 648ce440ca85..b901c8eefafd 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -389,7 +389,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - unsigned int head, max_usage, mask; + unsigned int head; ssize_t ret = 0; int do_wakeup = 0; size_t total_len = iov_iter_count(from); @@ -408,12 +408,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } head = pipe->head; - max_usage = pipe->max_usage; - mask = pipe->ring_size - 1; /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (!pipe_empty(head, pipe->tail) && chars != 0) { + unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; @@ -443,7 +442,8 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } head = pipe->head; - if (!pipe_full(head, pipe->tail, max_usage)) { + if (!pipe_full(head, pipe->tail, pipe->max_usage)) { + unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[head & mask]; struct page *page = pipe->tmp_page; int copied; @@ 
-465,7 +465,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) spin_lock_irq(&pipe->wait.lock); head = pipe->head; - if (pipe_full(head, pipe->tail, max_usage)) { + if (pipe_full(head, pipe->tail, pipe->max_usage)) { spin_unlock_irq(&pipe->wait.lock); continue; } @@ -510,7 +510,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) break; } - if (!pipe_full(head, pipe->tail, max_usage)) + if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; /* Wait for buffer space to become available. */ @@ -579,8 +579,6 @@ pipe_poll(struct file *filp, poll_table *wait) poll_wait(filp, &pipe->wait, wait); - BUG_ON(pipe_occupancy(head, tail) > pipe->ring_size); - /* Reading only -- no need for acquiring the semaphore. */ mask = 0; if (filp->f_mode & FMODE_READ) { @@ -1176,6 +1174,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; + wake_up_interruptible_all(&pipe->wait); return pipe->max_usage * PAGE_SIZE; out_revert_acct: diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 4639d53e96a3..b0688c02dc90 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2487,21 +2487,15 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name)); + dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name)); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (d_really_is_negative(dentry)) { - error = -ENOENT; - goto out; - } - error = security_quota_on(dentry); if (!error) error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); -out: dput(dentry); return error; } diff --git a/fs/splice.c b/fs/splice.c index f2400ce7d528..fa1f3773c8cd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -495,7 +495,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des unsigned int mask = 
pipe->ring_size - 1; int ret; - while (!pipe_empty(tail, head)) { + while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; sd->len = buf->len; @@ -711,9 +711,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { struct iov_iter from; - unsigned int head = pipe->head; - unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; + unsigned int head, tail, mask; size_t left; int n; @@ -732,6 +730,10 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } } + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) { |