diff options
Diffstat (limited to 'fs')
90 files changed, 2057 insertions, 2702 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 30b751c7f11a..5976eb33535f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -245,19 +245,27 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS -config HUGETLB_PAGE_FREE_VMEMMAP +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. +# +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + bool + +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. config MEMFD_CREATE diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 9d334816eac0..1dd995efd5b8 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -116,7 +116,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) tcon->ses->server->ops->dump_share_caps(m, tcon); if (tcon->use_witness) seq_puts(m, " Witness"); - + if (tcon->broken_sparse_sup) + seq_puts(m, " nosparse"); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); seq_putc(m, '\n'); @@ -386,7 +387,7 @@ skip_rdma: (ses->serverNOS == NULL)) { seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ", i, ses->ip_addr, ses->ses_count, - ses->capabilities, ses->status); + ses->capabilities, ses->ses_status); if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) seq_printf(m, "Guest "); else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) @@ -398,7 +399,7 @@ skip_rdma: "\n\tSMB session status: %d ", i, ses->ip_addr, ses->serverDomain, ses->ses_count, ses->serverOS, ses->serverNOS, - ses->capabilities, ses->status); + ses->capabilities, ses->ses_status); } seq_printf(m, "\n\tSecurity type: %s ", @@ -418,6 +419,8 @@ skip_rdma: spin_lock(&ses->chan_lock); if (CIFS_CHAN_NEEDS_RECONNECT(ses, 0)) seq_puts(m, "\tPrimary channel: DISCONNECTED "); + if (CIFS_CHAN_IN_RECONNECT(ses, 0)) + seq_puts(m, "\t[RECONNECTING] "); if (ses->chan_count > 1) { seq_printf(m, "\n\n\tExtra Channels: %zu ", @@ -426,6 +429,8 @@ skip_rdma: cifs_dump_channel(m, j, &ses->chans[j]); if (CIFS_CHAN_NEEDS_RECONNECT(ses, j)) seq_puts(m, "\tDISCONNECTED "); + if (CIFS_CHAN_IN_RECONNECT(ses, j)) + seq_puts(m, "\t[RECONNECTING] "); } } spin_unlock(&ses->chan_lock); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2b1a1c029c75..f539a39d47f5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -582,6 +582,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nocase"); if (tcon->nodelete) seq_puts(s, ",nodelete"); + if (cifs_sb->ctx->no_sparse) + seq_puts(s, ",nosparse"); if (tcon->local_lease) seq_puts(s, ",locallease"); if (tcon->retry) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8de977c359b1..68da230c7f11 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -106,7 +106,7 @@ * CIFS vfs client Status information (based on what we know.) */ -/* associated with each tcp and smb session */ +/* associated with each connection */ enum statusEnum { CifsNew = 0, CifsGood, @@ -114,8 +114,15 @@ enum statusEnum { CifsNeedReconnect, CifsNeedNegotiate, CifsInNegotiate, - CifsNeedSessSetup, - CifsInSessSetup, +}; + +/* associated with each smb session */ +enum ses_status_enum { + SES_NEW = 0, + SES_GOOD, + SES_EXITING, + SES_NEED_RECON, + SES_IN_SETUP }; /* associated with each tree connection to the server */ @@ -915,6 +922,7 @@ struct cifs_server_iface { }; struct cifs_chan { + unsigned int in_reconnect : 1; /* if session setup in progress for this channel */ struct TCP_Server_Info *server; __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; @@ -930,7 +938,7 @@ struct cifs_ses { struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ - enum statusEnum status; /* updates protected by cifs_tcp_ses_lock */ + enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */ unsigned overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ @@ -944,7 +952,7 @@ struct cifs_ses { and after mount option parsing we fill it */ char *domainName; char *password; - char *workstation_name; + char workstation_name[CIFS_MAX_WORKSTATION_LEN]; struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ enum securityEnum sectype; /* what security flavor was specified? */ @@ -977,12 +985,16 @@ struct cifs_ses { #define CIFS_MAX_CHANNELS 16 #define CIFS_ALL_CHANNELS_SET(ses) \ ((1UL << (ses)->chan_count) - 1) +#define CIFS_ALL_CHANS_GOOD(ses) \ + (!(ses)->chans_need_reconnect) #define CIFS_ALL_CHANS_NEED_RECONNECT(ses) \ ((ses)->chans_need_reconnect == CIFS_ALL_CHANNELS_SET(ses)) #define CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses) \ ((ses)->chans_need_reconnect = CIFS_ALL_CHANNELS_SET(ses)) #define CIFS_CHAN_NEEDS_RECONNECT(ses, index) \ test_bit((index), &(ses)->chans_need_reconnect) +#define CIFS_CHAN_IN_RECONNECT(ses, index) \ + ((ses)->chans[(index)].in_reconnect) struct cifs_chan chans[CIFS_MAX_CHANNELS]; size_t chan_count; @@ -1009,6 +1021,58 @@ cap_unix(struct cifs_ses *ses) return ses->server->vals->cap_unix & ses->capabilities; } +/* + * common struct for holding inode info when searching for or updating an + * inode with new info + */ + +#define CIFS_FATTR_DFS_REFERRAL 0x1 +#define CIFS_FATTR_DELETE_PENDING 0x2 +#define CIFS_FATTR_NEED_REVAL 0x4 +#define CIFS_FATTR_INO_COLLISION 0x8 +#define CIFS_FATTR_UNKNOWN_NLINK 0x10 +#define CIFS_FATTR_FAKE_ROOT_INO 0x20 + +struct cifs_fattr { + u32 cf_flags; + u32 cf_cifsattrs; + u64 cf_uniqueid; + u64 cf_eof; + u64 cf_bytes; + u64 cf_createtime; + kuid_t cf_uid; + kgid_t cf_gid; + umode_t cf_mode; + dev_t cf_rdev; + unsigned int cf_nlink; + unsigned int cf_dtype; + struct timespec64 cf_atime; + struct timespec64 cf_mtime; + struct timespec64 cf_ctime; + u32 cf_cifstag; +}; + +struct cached_dirent { + struct list_head entry; + char *name; + int namelen; + loff_t pos; + + struct cifs_fattr fattr; +}; + +struct cached_dirents { + bool is_valid:1; + bool is_failed:1; + struct dir_context *ctx; /* + * Only used to make sure we only take entries + * from a single context. Never dereferenced. + */ + struct mutex de_mutex; + int pos; /* Expected ctx->pos */ + struct list_head entries; +}; + struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; @@ -1021,6 +1085,7 @@ struct cached_fid { struct dentry *dentry; struct work_struct lease_break; struct smb2_file_all_info file_all_info; + struct cached_dirents dirents; }; /* @@ -1641,37 +1706,6 @@ struct file_list { struct cifsFileInfo *cfile; }; -/* - * common struct for holding inode info when searching for or updating an - * inode with new info - */ - -#define CIFS_FATTR_DFS_REFERRAL 0x1 -#define CIFS_FATTR_DELETE_PENDING 0x2 -#define CIFS_FATTR_NEED_REVAL 0x4 -#define CIFS_FATTR_INO_COLLISION 0x8 -#define CIFS_FATTR_UNKNOWN_NLINK 0x10 -#define CIFS_FATTR_FAKE_ROOT_INO 0x20 - -struct cifs_fattr { - u32 cf_flags; - u32 cf_cifsattrs; - u64 cf_uniqueid; - u64 cf_eof; - u64 cf_bytes; - u64 cf_createtime; - kuid_t cf_uid; - kgid_t cf_gid; - umode_t cf_mode; - dev_t cf_rdev; - unsigned int cf_nlink; - unsigned int cf_dtype; - struct timespec64 cf_atime; - struct timespec64 cf_mtime; - struct timespec64 cf_ctime; - u32 cf_cifstag; -}; - static inline void free_dfs_info_param(struct dfs_info3_param *param) { if (param) { @@ -1979,4 +2013,22 @@ static inline bool cifs_is_referral_server(struct cifs_tcon *tcon, return is_tcon_dfs(tcon) || (ref && (ref->flags & DFSREF_REFERRAL_SERVER)); } +static inline u64 cifs_flock_len(struct file_lock *fl) +{ + return fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; +} + +static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses) +{ + if (WARN_ON_ONCE(!ses || !ses->server)) + return 0; + /* + * Make workstation name no more than 15 chars when using insecure dialects as some legacy + * servers do require it during NTLMSSP. + */ + if (ses->server->dialect <= SMB20_PROT_ID) + return min_t(size_t, sizeof(ses->workstation_name), RFC1001_NAME_LEN_WITH_NULL); + return sizeof(ses->workstation_name); +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 0df3b24a0bf4..3b7366ec03c7 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -619,6 +619,15 @@ unsigned int cifs_ses_get_chan_index(struct cifs_ses *ses, struct TCP_Server_Info *server); void +cifs_chan_set_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +void +cifs_chan_clear_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +bool +cifs_chan_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); void diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 47e927c4ff8d..6371b9eebdad 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -75,7 +75,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if ((tcon->ses->status != CifsGood) || (tcon->status != TID_NEED_RECON)) { + if ((tcon->ses->ses_status != SES_GOOD) || (tcon->status != TID_NEED_RECON)) { spin_unlock(&cifs_tcp_ses_lock); return; } @@ -2558,7 +2558,8 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, pLockData->fl_start = le64_to_cpu(parm_data->start); pLockData->fl_end = pLockData->fl_start + - le64_to_cpu(parm_data->length) - 1; + (le64_to_cpu(parm_data->length) ? + le64_to_cpu(parm_data->length) - 1 : 0); pLockData->fl_pid = -le32_to_cpu(parm_data->pid); } } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 42e14f408856..53373a3649e1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -241,7 +241,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) goto next_session; - ses->status = CifsNeedReconnect; + ses->ses_status = SES_NEED_RECON; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { tcon->need_reconnect = true; @@ -1789,7 +1789,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out; } - cifs_dbg(FYI, "IPC tcon rc = %d ipc tid = %d\n", rc, tcon->tid); + cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid); ses->tcon_ipc = tcon; out: @@ -1828,7 +1828,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->status == CifsExiting) + if (ses->ses_status == SES_EXITING) continue; if (!match_session(ses, ctx)) continue; @@ -1848,7 +1848,7 @@ void cifs_put_smb_ses(struct cifs_ses *ses) cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); spin_lock(&cifs_tcp_ses_lock); - if (ses->status == CifsExiting) { + if (ses->ses_status == SES_EXITING) { spin_unlock(&cifs_tcp_ses_lock); return; } @@ -1864,13 +1864,13 @@ void cifs_put_smb_ses(struct cifs_ses *ses) /* ses_count can never go negative */ WARN_ON(ses->ses_count < 0); - if (ses->status == CifsGood) - ses->status = CifsExiting; + if (ses->ses_status == SES_GOOD) + ses->ses_status = SES_EXITING; spin_unlock(&cifs_tcp_ses_lock); cifs_free_ipc(ses); - if (ses->status == CifsExiting && server->ops->logoff) { + if (ses->ses_status == SES_EXITING && server->ops->logoff) { xid = get_xid(); rc = server->ops->logoff(xid, ses); if (rc) @@ -2037,18 +2037,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } } - ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL); - if (!ctx->workstation_name) { - cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n"); - rc = -ENOMEM; - kfree(ctx->username); - ctx->username = NULL; - kfree_sensitive(ctx->password); - ctx->password = NULL; - kfree(ctx->domainname); - ctx->domainname = NULL; - goto out_key_put; - } + strscpy(ctx->workstation_name, ses->workstation_name, sizeof(ctx->workstation_name)); out_key_put: up_read(&key->sem); @@ -2090,7 +2079,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) ses = cifs_find_smb_ses(server, ctx); if (ses) { cifs_dbg(FYI, "Existing smb sess found (status=%d)\n", - ses->status); + ses->ses_status); spin_lock(&ses->chan_lock); if (cifs_chan_needs_reconnect(ses, server)) { @@ -2157,12 +2146,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) if (!ses->domainName) goto get_ses_fail; } - if (ctx->workstation_name) { - ses->workstation_name = kstrdup(ctx->workstation_name, - GFP_KERNEL); - if (!ses->workstation_name) - goto get_ses_fail; - } + + strscpy(ses->workstation_name, ctx->workstation_name, sizeof(ses->workstation_name)); + if (ctx->domainauto) ses->domainAuto = ctx->domainauto; ses->cred_uid = ctx->cred_uid; @@ -2509,6 +2495,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) */ tcon->retry = ctx->retry; tcon->nocase = ctx->nocase; + tcon->broken_sparse_sup = ctx->no_sparse; if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) tcon->nohandlecache = ctx->nohandlecache; else @@ -3420,8 +3407,9 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server, } /* - * Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is, - * otherwise 0. + * Check if path is remote (i.e. a DFS share). + * + * Return -EREMOTE if it is, otherwise 0 or -errno. */ static int is_path_remote(struct mount_ctx *mnt_ctx) { @@ -3432,6 +3420,9 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) struct cifs_tcon *tcon = mnt_ctx->tcon; struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; char *full_path; +#ifdef CONFIG_CIFS_DFS_UPCALL + bool nodfs = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS; +#endif if (!server->ops->is_path_accessible) return -EOPNOTSUPP; @@ -3449,14 +3440,20 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, full_path); #ifdef CONFIG_CIFS_DFS_UPCALL + if (nodfs) { + if (rc == -EREMOTE) + rc = -EOPNOTSUPP; + goto out; + } + + /* path *might* exist with non-ASCII characters in DFS root + * try again with full path (only if nodfs is not set) */ if (rc == -ENOENT && is_tcon_dfs(tcon)) rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, full_path); #endif - if (rc != 0 && rc != -EREMOTE) { - kfree(full_path); - return rc; - } + if (rc != 0 && rc != -EREMOTE) + goto out; if (rc != -EREMOTE) { rc = cifs_are_all_path_components_accessible(server, xid, tcon, @@ -3468,6 +3465,7 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) } } +out: kfree(full_path); return rc; } @@ -3703,6 +3701,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) if (!isdfs) goto out; + /* proceed as DFS mount */ uuid_gen(&mnt_ctx.mount_id); rc = connect_dfs_root(&mnt_ctx, &tl); dfs_cache_free_tgts(&tl); @@ -3960,7 +3959,7 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, if (rc == 0) { spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsInNegotiate) - server->tcpStatus = CifsNeedSessSetup; + server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; spin_unlock(&cifs_tcp_ses_lock); @@ -3982,20 +3981,31 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, int rc = -ENOSYS; bool is_binding = false; - /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); - if ((server->tcpStatus != CifsNeedSessSetup) && - (ses->status == CifsGood)) { + if (ses->ses_status != SES_GOOD && + ses->ses_status != SES_NEW && + ses->ses_status != SES_NEED_RECON) { spin_unlock(&cifs_tcp_ses_lock); return 0; } - server->tcpStatus = CifsInSessSetup; - spin_unlock(&cifs_tcp_ses_lock); + /* only send once per connect */ spin_lock(&ses->chan_lock); + if (CIFS_ALL_CHANS_GOOD(ses) || + cifs_chan_in_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + cifs_chan_set_in_reconnect(ses, server); spin_unlock(&ses->chan_lock); + if (!is_binding) + ses->ses_status = SES_IN_SETUP; + spin_unlock(&cifs_tcp_ses_lock); + if (!is_binding) { ses->capabilities = server->capabilities; if (!linuxExtEnabled) @@ -4019,20 +4029,21 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (rc) { cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); spin_lock(&cifs_tcp_ses_lock); - if (server->tcpStatus == CifsInSessSetup) - server->tcpStatus = CifsNeedSessSetup; + if (ses->ses_status == SES_IN_SETUP) + ses->ses_status = SES_NEED_RECON; + spin_lock(&ses->chan_lock); + cifs_chan_clear_in_reconnect(ses, server); + spin_unlock(&ses->chan_lock); spin_unlock(&cifs_tcp_ses_lock); } else { spin_lock(&cifs_tcp_ses_lock); - if (server->tcpStatus == CifsInSessSetup) - server->tcpStatus = CifsGood; - /* Even if one channel is active, session is in good state */ - ses->status = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - + if (ses->ses_status == SES_IN_SETUP) + ses->ses_status = SES_GOOD; spin_lock(&ses->chan_lock); + cifs_chan_clear_in_reconnect(ses, server); cifs_chan_clear_need_reconnect(ses, server); spin_unlock(&ses->chan_lock); + spin_unlock(&cifs_tcp_ses_lock); } return rc; @@ -4497,7 +4508,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->status != CifsGood || + if (tcon->ses->ses_status != SES_GOOD || (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON)) { spin_unlock(&cifs_tcp_ses_lock); @@ -4565,7 +4576,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->status != CifsGood || + if (tcon->ses->ses_status != SES_GOOD || (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON)) { spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 956f8e5cf3e7..c5dd6f7305bd 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -654,7 +654,7 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h return ce; } } - return ERR_PTR(-EEXIST); + return ERR_PTR(-ENOENT); } /* @@ -662,7 +662,7 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h * * Use whole path components in the match. Must be called with htable_rw_lock held. * - * Return ERR_PTR(-EEXIST) if the entry is not found. + * Return ERR_PTR(-ENOENT) if the entry is not found. */ static struct cache_entry *lookup_cache_entry(const char *path) { @@ -710,7 +710,7 @@ static struct cache_entry *lookup_cache_entry(const char *path) while (e > s && *e != sep) e--; } - return ERR_PTR(-EEXIST); + return ERR_PTR(-ENOENT); } /** diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 06003bb9cbe9..1618e0537d58 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1395,7 +1395,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) cifs_dbg(VFS, "Can't push all brlocks!\n"); break; } - length = 1 + flock->fl_end - flock->fl_start; + length = cifs_flock_len(flock); if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) type = CIFS_RDLCK; else @@ -1511,7 +1511,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, bool wait_flag, bool posix_lck, unsigned int xid) { int rc = 0; - __u64 length = 1 + flock->fl_end - flock->fl_start; + __u64 length = cifs_flock_len(flock); struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -1609,7 +1609,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; - __u64 length = 1 + flock->fl_end - flock->fl_start; + __u64 length = cifs_flock_len(flock); struct list_head tmp_llist; INIT_LIST_HEAD(&tmp_llist); @@ -1713,7 +1713,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, unsigned int xid) { int rc = 0; - __u64 length = 1 + flock->fl_end - flock->fl_start; + __u64 length = cifs_flock_len(flock); struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -2777,8 +2777,11 @@ int cifs_flush(struct file *file, fl_owner_t id) rc = filemap_write_and_wait(inode->i_mapping); cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc); - if (rc) + if (rc) { + /* get more nuanced writeback errors */ + rc = filemap_check_wb_err(file->f_mapping, 0); trace_cifs_flush_err(inode->i_ino, rc); + } return rc; } @@ -4906,6 +4909,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis, cifs_dbg(FYI, "swap activate\n"); + if (!swap_file->f_mapping->a_ops->swap_rw) + /* Cannot support swap */ + return -EINVAL; + spin_lock(&inode->i_lock); blocks = inode->i_blocks; isize = inode->i_size; @@ -4934,7 +4941,8 @@ static int cifs_swap_activate(struct swap_info_struct *sis, * from reading or writing the file */ - return 0; + sis->flags |= SWP_FS_OPS; + return add_swap_extent(sis, 0, sis->max, 0); } static void cifs_swap_deactivate(struct file *file) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index a92e9eec521f..8dc0d923ef6a 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -119,6 +119,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag_no("persistenthandles", Opt_persistent), fsparam_flag_no("resilienthandles", Opt_resilient), fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay), + fsparam_flag("nosparse", Opt_nosparse), fsparam_flag("domainauto", Opt_domainauto), fsparam_flag("rdma", Opt_rdma), fsparam_flag("modesid", Opt_modesid), @@ -312,7 +313,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->password = NULL; new_ctx->server_hostname = NULL; new_ctx->domainname = NULL; - new_ctx->workstation_name = NULL; new_ctx->UNC = NULL; new_ctx->source = NULL; new_ctx->iocharset = NULL; @@ -327,7 +327,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(UNC); DUP_CTX_STR(source); DUP_CTX_STR(domainname); - DUP_CTX_STR(workstation_name); DUP_CTX_STR(nodename); DUP_CTX_STR(iocharset); @@ -766,8 +765,7 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, cifs_errorf(fc, "can not change domainname during remount\n"); return -EINVAL; } - if (new_ctx->workstation_name && - (!old_ctx->workstation_name || strcmp(new_ctx->workstation_name, old_ctx->workstation_name))) { + if (strcmp(new_ctx->workstation_name, old_ctx->workstation_name)) { cifs_errorf(fc, "can not change workstation_name during remount\n"); return -EINVAL; } @@ -814,7 +812,6 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, username); STEAL_STRING(cifs_sb, ctx, password); STEAL_STRING(cifs_sb, ctx, domainname); - STEAL_STRING(cifs_sb, ctx, workstation_name); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); @@ -943,6 +940,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_nolease: ctx->no_lease = 1; break; + case Opt_nosparse: + ctx->no_sparse = 1; + break; case Opt_nodelete: ctx->nodelete = 1; break; @@ -1467,22 +1467,15 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, int smb3_init_fs_context(struct fs_context *fc) { - int rc; struct smb3_fs_context *ctx; char *nodename = utsname()->nodename; int i; ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL); - if (unlikely(!ctx)) { - rc = -ENOMEM; - goto err_exit; - } + if (unlikely(!ctx)) + return -ENOMEM; - ctx->workstation_name = kstrdup(nodename, GFP_KERNEL); - if (unlikely(!ctx->workstation_name)) { - rc = -ENOMEM; - goto err_exit; - } + strscpy(ctx->workstation_name, nodename, sizeof(ctx->workstation_name)); /* * does not have to be perfect mapping since field is @@ -1555,14 +1548,6 @@ int smb3_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &smb3_fs_context_ops; return 0; - -err_exit: - if (ctx) { - kfree(ctx->workstation_name); - kfree(ctx); - } - - return rc; } void @@ -1588,8 +1573,6 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->source = NULL; kfree(ctx->domainname); ctx->domainname = NULL; - kfree(ctx->workstation_name); - ctx->workstation_name = NULL; kfree(ctx->nodename); ctx->nodename = NULL; kfree(ctx->iocharset); diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index e54090d9ef36..5f093cb7e9b9 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -62,6 +62,7 @@ enum cifs_param { Opt_noblocksend, Opt_noautotune, Opt_nolease, + Opt_nosparse, Opt_hard, Opt_soft, Opt_perm, @@ -170,7 +171,7 @@ struct smb3_fs_context { char *server_hostname; char *UNC; char *nodename; - char *workstation_name; + char workstation_name[CIFS_MAX_WORKSTATION_LEN]; char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ @@ -222,6 +223,7 @@ struct smb3_fs_context { bool noautotune:1; bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ bool no_lease:1; /* disable requesting leases */ + bool no_sparse:1; /* do not attempt to set files sparse */ bool fsc:1; /* enable fscache */ bool mfsymlinks:1; /* use Minshall+French Symlinks */ bool multiuser:1; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index afaf59c22193..35962a1a23b9 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -69,7 +69,7 @@ sesInfoAlloc(void) ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL); if (ret_buf) { atomic_inc(&sesInfoAllocCount); - ret_buf->status = CifsNew; + ret_buf->ses_status = SES_NEW; ++ret_buf->ses_count; INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); @@ -95,7 +95,6 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree_sensitive(buf_to_free->password); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); - kfree(buf_to_free->workstation_name); kfree_sensitive(buf_to_free->auth_key.response); kfree(buf_to_free->iface_list); kfree_sensitive(buf_to_free); @@ -114,6 +113,8 @@ tconInfoAlloc(void) kfree(ret_buf); return NULL; } + INIT_LIST_HEAD(&ret_buf->crfid.dirents.entries); + mutex_init(&ret_buf->crfid.dirents.de_mutex); atomic_inc(&tconInfoAllocCount); ret_buf->status = TID_NEW; @@ -1309,7 +1310,7 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix) * for "\<server>\<dfsname>\<linkpath>" DFS reference, * where <dfsname> contains non-ASCII unicode symbols. * - * Check such DFS reference and emulate -ENOENT if it is actual. + * Check such DFS reference. */ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, struct cifs_tcon *tcon, @@ -1341,10 +1342,6 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n", dfspath); rc = -EREMOTE; - } else if (rc == -EEXIST) { - cifs_dbg(FYI, "DFS ref '%s' is not found, emulate -ENOENT\n", - dfspath); - rc = -ENOENT; } else { cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 1929e80c09ee..384cabdf47ca 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -840,9 +840,109 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, return rc; } +static bool emit_cached_dirents(struct cached_dirents *cde, + struct dir_context *ctx) +{ + struct cached_dirent *dirent; + int rc; + + list_for_each_entry(dirent, &cde->entries, entry) { + if (ctx->pos >= dirent->pos) + continue; + ctx->pos = dirent->pos; + rc = dir_emit(ctx, dirent->name, dirent->namelen, + dirent->fattr.cf_uniqueid, + dirent->fattr.cf_dtype); + if (!rc) + return rc; + } + return true; +} + +static void update_cached_dirents_count(struct cached_dirents *cde, + struct dir_context *ctx) +{ + if (cde->ctx != ctx) + return; + if (cde->is_valid || cde->is_failed) + return; + + cde->pos++; +} + +static void finished_cached_dirents_count(struct cached_dirents *cde, + struct dir_context *ctx) +{ + if (cde->ctx != ctx) + return; + if (cde->is_valid || cde->is_failed) + return; + if (ctx->pos != cde->pos) + return; + + cde->is_valid = 1; +} + +static void add_cached_dirent(struct cached_dirents *cde, + struct dir_context *ctx, + const char *name, int namelen, + struct cifs_fattr *fattr) +{ + struct cached_dirent *de; + + if (cde->ctx != ctx) + return; + if (cde->is_valid || cde->is_failed) + return; + if (ctx->pos != cde->pos) { + cde->is_failed = 1; + return; + } + de = kzalloc(sizeof(*de), GFP_ATOMIC); + if (de == NULL) { + cde->is_failed = 1; + return; + } + de->namelen = namelen; + de->name = kstrndup(name, namelen, GFP_ATOMIC); + if (de->name == NULL) { + kfree(de); + cde->is_failed = 1; + return; + } + de->pos = ctx->pos; + + memcpy(&de->fattr, fattr, sizeof(struct cifs_fattr)); + + list_add_tail(&de->entry, &cde->entries); +} + +static bool cifs_dir_emit(struct dir_context *ctx, + const char *name, int namelen, + struct cifs_fattr *fattr, + struct cached_fid *cfid) +{ + bool rc; + ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); + + rc = dir_emit(ctx, name, namelen, ino, fattr->cf_dtype); + if (!rc) + return rc; + + if (cfid) { + mutex_lock(&cfid->dirents.de_mutex); + add_cached_dirent(&cfid->dirents, ctx, name, namelen, + fattr); + mutex_unlock(&cfid->dirents.de_mutex); + } + + return rc; +} + static int cifs_filldir(char *find_entry, struct file *file, - struct dir_context *ctx, - char *scratch_buf, unsigned int max_len) + struct dir_context *ctx, + char *scratch_buf, unsigned int max_len, + struct cached_fid *cfid) { struct cifsFileInfo *file_info = file->private_data; struct super_block *sb = file_inode(file)->i_sb; @@ -851,7 +951,6 @@ static int cifs_filldir(char *find_entry, struct file *file, struct cifs_fattr fattr; struct qstr name; int rc = 0; - ino_t ino; rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level, file_info->srch_inf.unicode); @@ -931,8 +1030,8 @@ static int cifs_filldir(char *find_entry, struct file *file, cifs_prime_dcache(file_dentry(file), &name, &fattr); - ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype); + return !cifs_dir_emit(ctx, name.name, name.len, + &fattr, cfid); } @@ -941,8 +1040,9 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) int rc = 0; unsigned int xid; int i; + struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; - struct cifsFileInfo *cifsFile = NULL; + struct cifsFileInfo *cifsFile; char *current_entry; int num_to_fill = 0; char *tmp_buf = NULL; @@ -950,6 +1050,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) unsigned int max_len; const char *full_path; void *page = alloc_dentry_path(); + struct cached_fid *cfid = NULL; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); xid = get_xid(); @@ -959,6 +1061,54 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) goto rddir2_exit; } + if (file->private_data == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + goto cache_not_found; + tcon = tlink_tcon(tlink); + } else { + cifsFile = file->private_data; + tcon = tlink_tcon(cifsFile->tlink); + } + + rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); + cifs_put_tlink(tlink); + if (rc) + goto cache_not_found; + + mutex_lock(&cfid->dirents.de_mutex); + /* + * If this was reading from the start of the directory + * we need to initialize scanning and storing the + * directory content. + */ + if (ctx->pos == 0 && cfid->dirents.ctx == NULL) { + cfid->dirents.ctx = ctx; + cfid->dirents.pos = 2; + } + /* + * If we already have the entire directory cached then + * we can just serve the cache. + */ + if (cfid->dirents.is_valid) { + if (!dir_emit_dots(file, ctx)) { + mutex_unlock(&cfid->dirents.de_mutex); + goto rddir2_exit; + } + emit_cached_dirents(&cfid->dirents, ctx); + mutex_unlock(&cfid->dirents.de_mutex); + goto rddir2_exit; + } + mutex_unlock(&cfid->dirents.de_mutex); + + /* Drop the cache while calling initiate_cifs_search and + * find_cifs_entry in case there will be reconnects during + * query_directory. + */ + close_cached_dir(cfid); + cfid = NULL; + + cache_not_found: /* * Ensure FindFirst doesn't fail before doing filldir() for '.' and * '..'. Otherwise we won't be able to notify VFS in case of failure. @@ -977,7 +1127,6 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) is in current search buffer? if it before then restart search if after then keep searching till find it */ - cifsFile = file->private_data; if (cifsFile->srch_inf.endOfSearch) { if (cifsFile->srch_inf.emptyDir) { @@ -993,12 +1142,18 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) tcon = tlink_tcon(cifsFile->tlink); rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path, ¤t_entry, &num_to_fill); + open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); if (rc) { cifs_dbg(FYI, "fce error %d\n", rc); goto rddir2_exit; } else if (current_entry != NULL) { cifs_dbg(FYI, "entry %lld found\n", ctx->pos); } else { + if (cfid) { + mutex_lock(&cfid->dirents.de_mutex); + finished_cached_dirents_count(&cfid->dirents, ctx); + mutex_unlock(&cfid->dirents.de_mutex); + } cifs_dbg(FYI, "Could not find entry\n"); goto rddir2_exit; } @@ -1028,7 +1183,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) */ *tmp_buf = 0; rc = cifs_filldir(current_entry, file, ctx, - tmp_buf, max_len); + tmp_buf, max_len, cfid); if (rc) { if (rc > 0) rc = 0; @@ -1036,6 +1191,12 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) } ctx->pos++; + if (cfid) { + mutex_lock(&cfid->dirents.de_mutex); + update_cached_dirents_count(&cfid->dirents, ctx); + mutex_unlock(&cfid->dirents.de_mutex); + } + if (ctx->pos == cifsFile->srch_inf.index_of_last_entry) { cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", @@ -1050,6 +1211,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) kfree(tmp_buf); rddir2_exit: + if (cfid) + close_cached_dir(cfid); free_dentry_path(page); free_xid(xid); return rc; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 32f478c7a66d..c6214cfc575f 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -86,6 +86,33 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, } void +cifs_chan_set_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + ses->chans[chan_index].in_reconnect = true; +} + +void +cifs_chan_clear_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + ses->chans[chan_index].in_reconnect = false; +} + +bool +cifs_chan_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + return CIFS_CHAN_IN_RECONNECT(ses, chan_index); +} + +void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server) { @@ -714,9 +741,9 @@ static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size) else sz += sizeof(__le16); - if (ses->workstation_name) + if (ses->workstation_name[0]) sz += sizeof(__le16) * strnlen(ses->workstation_name, - CIFS_MAX_WORKSTATION_LEN); + ntlmssp_workstation_name_size(ses)); else sz += sizeof(__le16); @@ -960,7 +987,7 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer, cifs_security_buffer_from_str(&sec_blob->WorkstationName, ses->workstation_name, - CIFS_MAX_WORKSTATION_LEN, + ntlmssp_workstation_name_size(ses), *pbuffer, &tmp, nls_cp); diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index fe5bfa245fa7..8571a459c710 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -362,8 +362,6 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, num_rqst++; if (cfile) { - cifsFileInfo_put(cfile); - cfile = NULL; rc = compound_send_recv(xid, ses, server, flags, num_rqst - 2, &rqst[1], &resp_buftype[1], @@ -514,8 +512,11 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (smb2_data == NULL) return -ENOMEM; + if (strcmp(full_path, "")) + rc = -ENOENT; + else + rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); /* If it is a root and its handle is cached then use it */ - rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); if (!rc) { if (tcon->crfid.file_all_info_is_valid) { move_smb2_info_to_cifs(data, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 3fe47a88f47d..17813c3d0c6e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -656,6 +656,12 @@ smb2_is_valid_lease_break(char *buffer) } spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "Can not process lease break - no lease matched\n"); + trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState), + le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), + le64_to_cpu(rsp->hdr.SessionId), + *((u64 *)rsp->LeaseKey), + *((u64 *)&rsp->LeaseKey[8])); + return false; } @@ -726,6 +732,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) } spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No file id matched, oplock break ignored\n"); + trace_smb3_oplock_not_found(0 /* no xid */, rsp->PersistentFid, + le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), + le64_to_cpu(rsp->hdr.SessionId)); + return true; } @@ -798,7 +808,7 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, if (tcon->ses) server = tcon->ses->server; - cifs_server_dbg(FYI, "tid=%u: tcon is closing, skipping async close retry of fid %llu %llu\n", + cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n", tcon->tid, persistent_fid, volatile_fid); return 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d6aaeff4a30a..d7ade739cde1 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -699,6 +699,7 @@ smb2_close_cached_fid(struct kref *ref) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); + struct cached_dirent *dirent, *q; if (cfid->is_valid) { cifs_dbg(FYI, "clear cached root file handle\n"); @@ -718,6 +719,21 @@ smb2_close_cached_fid(struct kref *ref) dput(cfid->dentry); cfid->dentry = NULL; } + /* + * Delete all cached dirent names + */ + mutex_lock(&cfid->dirents.de_mutex); + list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { + list_del(&dirent->entry); + kfree(dirent->name); + kfree(dirent); + } + cfid->dirents.is_valid = 0; + cfid->dirents.is_failed = 0; + cfid->dirents.ctx = NULL; + cfid->dirents.pos = 0; + mutex_unlock(&cfid->dirents.de_mutex); + } void close_cached_dir(struct cached_fid *cfid) @@ -754,14 +770,15 @@ smb2_cached_lease_break(struct work_struct *work) /* * Open the and cache a directory handle. * Only supported for the root handle. + * If error then *cfid is not initialized. */ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cached_fid **cfid) { - struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; + struct cifs_ses *ses; + struct TCP_Server_Info *server; struct cifs_open_parms oparms; struct smb2_create_rsp *o_rsp = NULL; struct smb2_query_info_rsp *qi_rsp = NULL; @@ -776,9 +793,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid; struct dentry *dentry; - if (tcon->nohandlecache) + if (tcon == NULL || tcon->nohandlecache || + is_smb1_server(tcon->ses->server)) return -ENOTSUPP; + ses = tcon->ses; + server = ses->server; + if (cifs_sb->root == NULL) return -ENOENT; @@ -824,7 +845,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms.tcon = tcon; - oparms.create_options = cifs_create_options(cifs_sb, 0); + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE); oparms.desired_access = FILE_READ_ATTRIBUTES; oparms.disposition = FILE_OPEN; oparms.fid = pfid; @@ -2695,7 +2716,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); - rc = open_cached_dir(xid, tcon, path, cifs_sb, &cfid); + if (!strcmp(path, "")) + open_cached_dir(xid, tcon, path, cifs_sb, &cfid); /* cfid null if open dir failed */ memset(&open_iov, 0, sizeof(open_iov)); rqst[0].rq_iov = open_iov; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1b7ad0c09566..084be3a90198 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -179,7 +179,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, } } spin_unlock(&cifs_tcp_ses_lock); - if ((!tcon->ses) || (tcon->ses->status == CifsExiting) || + if ((!tcon->ses) || (tcon->ses->ses_status == SES_EXITING) || (!tcon->ses->server) || !server) return -EIO; @@ -3899,7 +3899,8 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id); spin_lock(&cifs_tcp_ses_lock); - if (server->tcpStatus == CifsNeedNegotiate) { + if (server->ops->need_neg && + server->ops->need_neg(server)) { spin_unlock(&cifs_tcp_ses_lock); /* No need to send echo on newly established connections */ mod_delayed_work(cifsiod_wq, &server->reconnect, 0); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index d8c4388b190d..f57881b8464f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -260,28 +260,6 @@ struct get_retrieval_pointers_refcount_rsp { struct smb3_extents extents[]; } __packed; -struct fsctl_set_integrity_information_req { - __le16 ChecksumAlgorithm; - __le16 Reserved; - __le32 Flags; -} __packed; - -struct fsctl_get_integrity_information_rsp { - __le16 ChecksumAlgorithm; - __le16 Reserved; - __le32 Flags; - __le32 ChecksumChunkSizeInBytes; - __le32 ClusterSizeInBytes; -} __packed; - -/* Integrity ChecksumAlgorithm choices for above */ -#define CHECKSUM_TYPE_NONE 0x0000 -#define CHECKSUM_TYPE_CRC64 0x0002 -#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */ - -/* Integrity flags for above */ -#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 - /* See MS-DFSC 2.2.2 */ struct fsctl_get_dfs_referral_req { __le16 MaxReferralLevel; diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 2af79093b78b..55e79f6ee78d 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -641,7 +641,8 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) if (!is_signed) return 0; spin_lock(&cifs_tcp_ses_lock); - if (server->tcpStatus == CifsNeedNegotiate) { + if (server->ops->need_neg && + server->ops->need_neg(server)) { spin_unlock(&cifs_tcp_ses_lock); return 0; } @@ -779,7 +780,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, return -EAGAIN; } - if (ses->status == CifsNew) { + if (ses->ses_status == SES_NEW) { if ((shdr->Command != SMB2_SESSION_SETUP) && (shdr->Command != SMB2_NEGOTIATE)) { spin_unlock(&cifs_tcp_ses_lock); @@ -788,7 +789,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, /* else ok - we are setting up session */ } - if (ses->status == CifsExiting) { + if (ses->ses_status == SES_EXITING) { if (shdr->Command != SMB2_LOGOFF) { spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 31ef64eb7fbb..c3278db1cade 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -649,7 +649,7 @@ static int smbd_ia_open( smbd_max_frmr_depth, info->id->device->attrs.max_fast_reg_page_list_len); info->mr_type = IB_MR_TYPE_MEM_REG; - if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) info->mr_type = IB_MR_TYPE_SG_GAPS; info->pd = ib_alloc_pd(info->id->device, 0); @@ -1350,7 +1350,7 @@ void smbd_destroy(struct TCP_Server_Info *server) wait_event(info->wait_send_pending, atomic_read(&info->send_pending) == 0); - /* It's not posssible for upper layer to get to reassembly */ + /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); do { spin_lock_irqsave(&info->reassembly_queue_lock, flags); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index bc279616c513..2be5e0c8564d 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -158,6 +158,7 @@ DEFINE_SMB3_FD_EVENT(flush_enter); DEFINE_SMB3_FD_EVENT(flush_done); DEFINE_SMB3_FD_EVENT(close_enter); DEFINE_SMB3_FD_EVENT(close_done); +DEFINE_SMB3_FD_EVENT(oplock_not_found); DECLARE_EVENT_CLASS(smb3_fd_err_class, TP_PROTO(unsigned int xid, @@ -814,6 +815,7 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high)) DEFINE_SMB3_LEASE_DONE_EVENT(lease_done); +DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found); DECLARE_EVENT_CLASS(smb3_lease_err_class, TP_PROTO(__u32 lease_state, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c667e6ddfe2f..05eca41e3b1e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -726,7 +726,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { spin_lock(&cifs_tcp_ses_lock); - if (ses->status == CifsNew) { + if (ses->ses_status == SES_NEW) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && (in_buf->Command != SMB_COM_NEGOTIATE)) { spin_unlock(&cifs_tcp_ses_lock); @@ -735,7 +735,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, /* else ok - we are setting up session */ } - if (ses->status == CifsExiting) { + if (ses->ses_status == SES_EXITING) { /* check if SMB session is bad because we are setting it up */ if (in_buf->Command != SMB_COM_LOGOFF_ANDX) { spin_unlock(&cifs_tcp_ses_lock); @@ -1187,7 +1187,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * Compounding is never used during session establish. */ spin_lock(&cifs_tcp_ses_lock); - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { + if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { spin_unlock(&cifs_tcp_ses_lock); mutex_lock(&server->srv_mutex); @@ -1260,7 +1260,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * Compounding is never used during session establish. */ spin_lock(&cifs_tcp_ses_lock); - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { + if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { struct kvec iov = { .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len @@ -24,6 +24,7 @@ #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> +#include <linux/rmap.h> #include <asm/pgalloc.h> #define CREATE_TRACE_POINTS @@ -721,7 +722,8 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter int id; id = dax_read_lock(); - rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL); + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, + &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; @@ -789,95 +791,12 @@ static void *dax_insert_entry(struct xa_state *xas, return entry; } -static inline -unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) -{ - unsigned long address; - - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - return address; -} - -/* Walk all mappings of a given index of a file and writeprotect them */ -static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, - unsigned long pfn) -{ - struct vm_area_struct *vma; - pte_t pte, *ptep = NULL; - pmd_t *pmdp = NULL; - spinlock_t *ptl; - - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - struct mmu_notifier_range range; - unsigned long address; - - cond_resched(); - - if (!(vma->vm_flags & VM_SHARED)) - continue; - - address = pgoff_address(index, vma); - - /* - * follow_invalidate_pte() will use the range to call - * mmu_notifier_invalidate_range_start() on our behalf before - * taking any lock. - */ - if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep, - &pmdp, &ptl)) - continue; - - /* - * No need to call mmu_notifier_invalidate_range() as we are - * downgrading page table protection not changing it to point - * to a new page. - * - * See Documentation/vm/mmu_notifier.rst - */ - if (pmdp) { -#ifdef CONFIG_FS_DAX_PMD - pmd_t pmd; - - if (pfn != pmd_pfn(*pmdp)) - goto unlock_pmd; - if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) - goto unlock_pmd; - - flush_cache_page(vma, address, pfn); - pmd = pmdp_invalidate(vma, address, pmdp); - pmd = pmd_wrprotect(pmd); - pmd = pmd_mkclean(pmd); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); -unlock_pmd: -#endif - spin_unlock(ptl); - } else { - if (pfn != pte_pfn(*ptep)) - goto unlock_pte; - if (!pte_dirty(*ptep) && !pte_write(*ptep)) - goto unlock_pte; - - flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); -unlock_pte: - pte_unmap_unlock(ptep, ptl); - } - - mmu_notifier_invalidate_range_end(&range); - } - i_mmap_unlock_read(mapping); -} - static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { - unsigned long pfn, index, count; + unsigned long pfn, index, count, end; long ret = 0; + struct vm_area_struct *vma; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -935,8 +854,16 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, pfn = dax_to_pfn(entry); count = 1UL << dax_entry_order(entry); index = xas->xa_index & ~(count - 1); + end = index + count - 1; + + /* Walk all mappings of a given index of a file and writeprotect them */ + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { + pfn_mkclean_range(pfn, count, index, vma); + cond_resched(); + } + i_mmap_unlock_read(mapping); - dax_entry_mkclean(mapping, index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There @@ -1013,7 +940,7 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), - NULL, pfnp); + DAX_ACCESS, NULL, pfnp); if (length < 0) { rc = length; goto out; @@ -1122,7 +1049,7 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, void *kaddr; long ret; - ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret > 0) { memset(kaddr + offset, 0, size); dax_flush(dax_dev, kaddr + offset, size); @@ -1239,6 +1166,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; + bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { @@ -1247,7 +1175,14 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), - &kaddr, NULL); + DAX_ACCESS, &kaddr, NULL); + if (map_len == -EIO && iov_iter_rw(iter) == WRITE) { + map_len = dax_direct_access(dax_dev, pgoff, + PHYS_PFN(size), DAX_RECOVERY_WRITE, + &kaddr, NULL); + if (map_len > 0) + recovery = true; + } if (map_len < 0) { ret = map_len; break; @@ -1259,7 +1194,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (map_len > end - pos) map_len = end - pos; - if (iov_iter_rw(iter) == WRITE) + if (recovery) + xfer = dax_recovery_write(dax_dev, pgoff, kaddr, + map_len, iter); + else if (iov_iter_rw(iter) == WRITE) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else diff --git a/fs/exec.c b/fs/exec.c index e3e55d5e0be1..14b4b3755580 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; + struct mmu_gather tlb; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -812,8 +813,11 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; - ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + tlb_gather_mmu(&tlb, mm); + ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); + tlb_finish_mmu(&tlb); + if (ret) goto out_unlock; BUG_ON(prev != vma); diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 02d4d4234956..a415c02ede39 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -126,6 +126,7 @@ struct msdos_inode_info { struct hlist_node i_fat_hash; /* hash by i_location */ struct hlist_node i_dir_hash; /* hash by i_logstart */ struct rw_semaphore truncate_lock; /* protect bmap against truncate */ + struct timespec64 i_crtime; /* File creation (birth) time */ struct inode vfs_inode; }; @@ -433,8 +434,15 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) + +#define FAT_PRINTK_PREFIX "%sFAT-fs (%s): " +#define fat_msg(sb, level, fmt, args...) \ +do { \ + printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\ + _fat_msg(sb, level, fmt, ##args); \ +} while (0) __printf(3, 4) __cold -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); +void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); #define fat_msg_ratelimit(sb, level, fmt, args...) \ do { \ if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \ @@ -446,6 +454,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); +extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); +extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); extern int fat_update_time(struct inode *inode, struct timespec64 *now, diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 978ac6751aeb..1db348f8f887 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -94,7 +94,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, err_brelse: brelse(bhs[0]); err: - fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); + fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); return -EIO; } @@ -107,8 +108,8 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { - fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", - (llu)blocknr); + fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); return -EIO; } fatent->nr_bhs = 1; diff --git a/fs/fat/file.c b/fs/fat/file.c index bf91f977debe..3dae3ed60f3a 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -398,13 +398,21 @@ int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + generic_fillattr(mnt_userns, inode, stat); - stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + stat->blksize = sbi->cluster_size; - if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { /* Use i_pos for ino. This is used as fileid of nfs. */ - stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode); + stat->ino = fat_i_pos_read(sbi, inode); } + + if (sbi->options.isvfat && request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = MSDOS_I(inode)->i_crtime; + } + return 0; } EXPORT_SYMBOL_GPL(fat_getattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 69b4d4ae64d7..a38238d75c08 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -567,12 +567,13 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) & ~((loff_t)sbi->cluster_size - 1)) >> 9; fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); + inode->i_ctime = inode->i_mtime; if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, - de->cdate, de->ctime_cs); fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, + de->cdate, de->ctime_cs); } else - fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME); + inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime); return 0; } @@ -757,6 +758,8 @@ static struct inode *fat_alloc_inode(struct super_block *sb) ei->i_logstart = 0; ei->i_attrs = 0; ei->i_pos = 0; + ei->i_crtime.tv_sec = 0; + ei->i_crtime.tv_nsec = 0; return &ei->vfs_inode; } @@ -888,10 +891,10 @@ retry: &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; - fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, - &raw_entry->cdate, &raw_entry->ctime_cs); fat_time_unix2fat(sbi, &inode->i_atime, &atime, &raw_entry->adate, NULL); + fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime, + &raw_entry->cdate, &raw_entry->ctime_cs); } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); @@ -1885,10 +1888,8 @@ out_invalid: fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); out_fail: - if (fsinfo_inode) - iput(fsinfo_inode); - if (fat_inode) - iput(fat_inode); + iput(fsinfo_inode); + iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); fat_reset_iocharset(&sbi->options); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 91ca3c304211..7e5d6ae305f2 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -42,10 +42,16 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) EXPORT_SYMBOL_GPL(__fat_fs_error); /** - * fat_msg() - print preformated FAT specific messages. Every thing what is - * not fat_fs_error() should be fat_msg(). + * _fat_msg() - Print a preformatted FAT message based on a superblock. + * @sb: A pointer to a &struct super_block + * @level: A Kernel printk level constant + * @fmt: The printf-style format string to print. + * + * Everything that is not fat_fs_error() should be fat_msg(). + * + * fat_msg() wraps _fat_msg() for printk indexing. */ -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -53,7 +59,7 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + _printk(FAT_PRINTK_PREFIX "%pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -187,7 +193,7 @@ static long days_in_year[] = { 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; -static inline int fat_tz_offset(struct msdos_sb_info *sbi) +static inline int fat_tz_offset(const struct msdos_sb_info *sbi) { return (sbi->options.tz_set ? -sbi->options.time_offset : @@ -275,23 +281,35 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; } -static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts) +/* + * truncate atime to 24 hour granularity (00:00:00 in local timezone) + */ +struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) +{ + /* to localtime */ + time64_t seconds = ts->tv_sec - fat_tz_offset(sbi); + s32 remainder; + + div_s64_rem(seconds, SECS_PER_DAY, &remainder); + /* to day boundary, and back to unix time */ + seconds = seconds + fat_tz_offset(sbi) - remainder; + + return (struct timespec64){ seconds, 0 }; +} + +/* + * truncate mtime to 2 second granularity + */ +struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) { - if (ts.tv_nsec) - ts.tv_nsec -= ts.tv_nsec % 10000000UL; - return ts; + return fat_timespec64_trunc_2secs(*ts); } /* * truncate the various times with appropriate granularity: - * root inode: - * all times always 0 - * all other inodes: - * mtime - 2 seconds - * ctime - * msdos - 2 seconds - * vfat - 10 milliseconds - * atime - 24 hours (00:00:00 in local timezone) + * all times in root node are always 0 */ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) { @@ -306,25 +324,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) ts = current_time(inode); } - if (flags & S_ATIME) { - /* to localtime */ - time64_t seconds = now->tv_sec - fat_tz_offset(sbi); - s32 remainder; - - div_s64_rem(seconds, SECS_PER_DAY, &remainder); - /* to day boundary, and back to unix time */ - seconds = seconds + fat_tz_offset(sbi) - remainder; - - inode->i_atime = (struct timespec64){ seconds, 0 }; - } - if (flags & S_CTIME) { - if (sbi->options.isvfat) - inode->i_ctime = fat_timespec64_trunc_10ms(*now); - else - inode->i_ctime = fat_timespec64_trunc_2secs(*now); - } + if (flags & S_ATIME) + inode->i_atime = fat_truncate_atime(sbi, now); + /* + * ctime and mtime share the same on-disk field, and should be + * identical in memory. all mtime updates will be applied to ctime, + * but ctime updates are ignored. + */ if (flags & S_MTIME) - inode->i_mtime = fat_timespec64_trunc_2secs(*now); + inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now); return 0; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 5369d82e0bfb..c573314806cf 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -780,8 +780,6 @@ static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, goto out; } inode_inc_iversion(inode); - fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); - /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); out: @@ -878,8 +876,6 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, } inode_inc_iversion(inode); set_nlink(inode, 2); - fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); - /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); diff --git a/fs/fcntl.c b/fs/fcntl.c index f15d885b9796..34a3faa4886d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ - if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) { - if (!filp->f_mapping || !filp->f_mapping->a_ops || - !filp->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if (!S_ISFIFO(inode->i_mode) && + (arg & O_DIRECT) && + !(filp->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index d7d3a7f06862..10eb50cbf398 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1241,8 +1241,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); id = dax_read_lock(); - nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL, - NULL); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), + DAX_ACCESS, NULL, NULL); dax_read_unlock(id); if (nr_pages < 0) { pr_debug("dax_direct_access() returned %ld\n", nr_pages); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 86b7dbb6a0d4..8db53fa67359 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -752,7 +752,8 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, * offset. */ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); @@ -772,7 +773,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev, long rc; void *kaddr; - rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr, + NULL); if (rc < 0) return rc; memset(kaddr, 0, nr_pages << PAGE_SHIFT); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2de9ca5d260d..62408047e8d7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -195,7 +195,6 @@ out: * Called under mmap_write_lock(mm). */ -#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -206,7 +205,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr); + info.high_limit = arch_get_mmap_end(addr, len, flags); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -237,21 +236,22 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr); + info.high_limit = arch_get_mmap_end(addr, len, flags); addr = vm_unmapped_area(&info); } return addr; } -static unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long +generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); - const unsigned long mmap_end = arch_get_mmap_end(addr); + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -283,6 +283,15 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); } + +#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +static unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); +} #endif static size_t @@ -405,7 +414,8 @@ static void remove_huge_page(struct page *page) } static void -hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, + zap_flags_t zap_flags) { struct vm_area_struct *vma; @@ -439,7 +449,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) } unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, - NULL); + NULL, zap_flags); } } @@ -517,7 +527,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h)); + (index + 1) * pages_per_huge_page(h), + ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); } @@ -583,7 +594,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_mmap_lock_write(mapping); i_size_write(inode, offset); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) - hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); + hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, + ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); } @@ -616,8 +628,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, - hole_start >> PAGE_SHIFT, - hole_end >> PAGE_SHIFT); + hole_start >> PAGE_SHIFT, + hole_end >> PAGE_SHIFT, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); inode_unlock(inode); @@ -1048,12 +1060,12 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) if (sbinfo->spool) { long free_pages; - spin_lock(&sbinfo->spool->lock); + spin_lock_irq(&sbinfo->spool->lock); buf->f_blocks = sbinfo->spool->max_hpages; free_pages = sbinfo->spool->max_hpages - sbinfo->spool->used_hpages; buf->f_bavail = buf->f_bfree = free_pages; - spin_unlock(&sbinfo->spool->lock); + spin_unlock_irq(&sbinfo->spool->lock); buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile index 285ec189ed5c..7156d2c218c7 100644 --- a/fs/jfs/Makefile +++ b/fs/jfs/Makefile @@ -13,5 +13,3 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ resize.o xattr.o ioctl.o jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o - -ccflags-y := -D_JFS_4K diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index a5dd7e53754a..259326556ada 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -224,18 +224,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock, * this as a hole */ goto unlock; -#ifdef _JFS_4K XADoffset(&xad, lblock64); XADlength(&xad, xlen); XADaddress(&xad, xaddr); -#else /* _JFS_4K */ - /* - * As long as block size = 4K, this isn't a problem. - * We should mark the whole page not ABNR, but how - * will we know to mark the other blocks BH_New? - */ - BUG(); -#endif /* _JFS_4K */ rc = extRecord(ip, &xad); if (rc) goto unlock; @@ -252,7 +243,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, /* * Allocate a new block */ -#ifdef _JFS_4K if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad))) goto unlock; rc = extAlloc(ip, xlen, lblock64, &xad, false); @@ -263,14 +253,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, map_bh(bh_result, ip->i_sb, addressXAD(&xad)); bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits; -#else /* _JFS_4K */ - /* - * We need to do whatever it takes to keep all but the last buffers - * in 4K pages - see jfs_write.c - */ - BUG(); -#endif /* _JFS_4K */ - unlock: /* * Release lock on inode diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index d8502f4989d9..6b838d3ae7c2 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -385,7 +385,8 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) } /* write the last buffer. */ - write_metapage(mp); + if (mp) + write_metapage(mp); IREAD_UNLOCK(ipbmap); @@ -868,74 +869,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) return (rc); } -#ifdef _NOTYET -/* - * NAME: dbAllocExact() - * - * FUNCTION: try to allocate the requested extent; - * - * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - extent address; - * nblocks - extent length; - * - * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error - */ -int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) -{ - int rc; - struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; - struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; - struct dmap *dp; - s64 lblkno; - struct metapage *mp; - - IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); - - /* - * validate extent request: - * - * note: defragfs policy: - * max 64 blocks will be moved. - * allocation request size must be satisfied from a single dmap. - */ - if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { - IREAD_UNLOCK(ipbmap); - return -EINVAL; - } - - if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { - /* the free space is no longer available */ - IREAD_UNLOCK(ipbmap); - return -ENOSPC; - } - - /* read in the dmap covering the extent */ - lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); - mp = read_metapage(ipbmap, lblkno, PSIZE, 0); - if (mp == NULL) { - IREAD_UNLOCK(ipbmap); - return -EIO; - } - dp = (struct dmap *) mp->data; - - /* try to allocate the requested extent */ - rc = dbAllocNext(bmp, dp, blkno, nblocks); - - IREAD_UNLOCK(ipbmap); - - if (rc == 0) - mark_metapage_dirty(mp); - - release_metapage(mp); - - return (rc); -} -#endif /* _NOTYET */ - /* * NAME: dbReAlloc() * diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 837d42f61464..92b7c533407c 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -2423,304 +2423,6 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, return 0; } -#ifdef _NOTYET -/* - * NAME: dtRelocate() - * - * FUNCTION: relocate dtpage (internal or leaf) of directory; - * This function is mainly used by defragfs utility. - */ -int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, - s64 nxaddr) -{ - int rc = 0; - struct metapage *mp, *pmp, *lmp, *rmp; - dtpage_t *p, *pp, *rp = 0, *lp= 0; - s64 bn; - int index; - struct btstack btstack; - pxd_t *pxd; - s64 oxaddr, nextbn, prevbn; - int xlen, xsize; - struct tlock *tlck; - struct dt_lock *dtlck; - struct pxd_lock *pxdlock; - s8 *stbl; - struct lv *lv; - - oxaddr = addressPXD(opxd); - xlen = lengthPXD(opxd); - - jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d", - (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr, - xlen); - - /* - * 1. get the internal parent dtpage covering - * router entry for the tartget page to be relocated; - */ - rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); - if (rc) - return rc; - - /* retrieve search result */ - DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); - jfs_info("dtRelocate: parent router entry validated."); - - /* - * 2. relocate the target dtpage - */ - /* read in the target page from src extent */ - DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); - if (rc) { - /* release the pinned parent page */ - DT_PUTPAGE(pmp); - return rc; - } - - /* - * read in sibling pages if any to update sibling pointers; - */ - rmp = NULL; - if (p->header.next) { - nextbn = le64_to_cpu(p->header.next); - DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); - if (rc) { - DT_PUTPAGE(mp); - DT_PUTPAGE(pmp); - return (rc); - } - } - - lmp = NULL; - if (p->header.prev) { - prevbn = le64_to_cpu(p->header.prev); - DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); - if (rc) { - DT_PUTPAGE(mp); - DT_PUTPAGE(pmp); - if (rmp) - DT_PUTPAGE(rmp); - return (rc); - } - } - - /* at this point, all xtpages to be updated are in memory */ - - /* - * update sibling pointers of sibling dtpages if any; - */ - if (lmp) { - tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK); - dtlck = (struct dt_lock *) & tlck->lock; - /* linelock header */ - ASSERT(dtlck->index == 0); - lv = & dtlck->lv[0]; - lv->offset = 0; - lv->length = 1; - dtlck->index++; - - lp->header.next = cpu_to_le64(nxaddr); - DT_PUTPAGE(lmp); - } - - if (rmp) { - tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK); - dtlck = (struct dt_lock *) & tlck->lock; - /* linelock header */ - ASSERT(dtlck->index == 0); - lv = & dtlck->lv[0]; - lv->offset = 0; - lv->length = 1; - dtlck->index++; - - rp->header.prev = cpu_to_le64(nxaddr); - DT_PUTPAGE(rmp); - } - - /* - * update the target dtpage to be relocated - * - * write LOG_REDOPAGE of LOG_NEW type for dst page - * for the whole target page (logredo() will apply - * after image and update bmap for allocation of the - * dst extent), and update bmap for allocation of - * the dst extent; - */ - tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW); - dtlck = (struct dt_lock *) & tlck->lock; - /* linelock header */ - ASSERT(dtlck->index == 0); - lv = & dtlck->lv[0]; - - /* update the self address in the dtpage header */ - pxd = &p->header.self; - PXDaddress(pxd, nxaddr); - - /* the dst page is the same as the src page, i.e., - * linelock for afterimage of the whole page; - */ - lv->offset = 0; - lv->length = p->header.maxslot; - dtlck->index++; - - /* update the buffer extent descriptor of the dtpage */ - xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; - - /* unpin the relocated page */ - DT_PUTPAGE(mp); - jfs_info("dtRelocate: target dtpage relocated."); - - /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec - * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec - * will also force a bmap update ). - */ - - /* - * 3. acquire maplock for the source extent to be freed; - */ - /* for dtpage relocation, write a LOG_NOREDOPAGE record - * for the source dtpage (logredo() will init NoRedoPage - * filter and will also update bmap for free of the source - * dtpage), and upadte bmap for free of the source dtpage; - */ - tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); - pxdlock = (struct pxd_lock *) & tlck->lock; - pxdlock->flag = mlckFREEPXD; - PXDaddress(&pxdlock->pxd, oxaddr); - PXDlength(&pxdlock->pxd, xlen); - pxdlock->index = 1; - - /* - * 4. update the parent router entry for relocation; - * - * acquire tlck for the parent entry covering the target dtpage; - * write LOG_REDOPAGE to apply after image only; - */ - jfs_info("dtRelocate: update parent router entry."); - tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); - dtlck = (struct dt_lock *) & tlck->lock; - lv = & dtlck->lv[dtlck->index]; - - /* update the PXD with the new address */ - stbl = DT_GETSTBL(pp); - pxd = (pxd_t *) & pp->slot[stbl[index]]; - PXDaddress(pxd, nxaddr); - lv->offset = stbl[index]; - lv->length = 1; - dtlck->index++; - - /* unpin the parent dtpage */ - DT_PUTPAGE(pmp); - - return rc; -} - -/* - * NAME: dtSearchNode() - * - * FUNCTION: Search for an dtpage containing a specified address - * This function is mainly used by defragfs utility. - * - * NOTE: Search result on stack, the found page is pinned at exit. - * The result page must be an internal dtpage. - * lmxaddr give the address of the left most page of the - * dtree level, in which the required dtpage resides. - */ -static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, - struct btstack * btstack) -{ - int rc = 0; - s64 bn; - struct metapage *mp; - dtpage_t *p; - int psize = 288; /* initial in-line directory */ - s8 *stbl; - int i; - pxd_t *pxd; - struct btframe *btsp; - - BT_CLR(btstack); /* reset stack */ - - /* - * descend tree to the level with specified leftmost page - * - * by convention, root bn = 0. - */ - for (bn = 0;;) { - /* get/pin the page to search */ - DT_GETPAGE(ip, bn, mp, psize, p, rc); - if (rc) - return rc; - - /* does the xaddr of leftmost page of the levevl - * matches levevl search key ? - */ - if (p->header.flag & BT_ROOT) { - if (lmxaddr == 0) - break; - } else if (addressPXD(&p->header.self) == lmxaddr) - break; - - /* - * descend down to leftmost child page - */ - if (p->header.flag & BT_LEAF) { - DT_PUTPAGE(mp); - return -ESTALE; - } - - /* get the leftmost entry */ - stbl = DT_GETSTBL(p); - pxd = (pxd_t *) & p->slot[stbl[0]]; - - /* get the child page block address */ - bn = addressPXD(pxd); - psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; - /* unpin the parent page */ - DT_PUTPAGE(mp); - } - - /* - * search each page at the current levevl - */ - loop: - stbl = DT_GETSTBL(p); - for (i = 0; i < p->header.nextindex; i++) { - pxd = (pxd_t *) & p->slot[stbl[i]]; - - /* found the specified router entry */ - if (addressPXD(pxd) == addressPXD(kpxd) && - lengthPXD(pxd) == lengthPXD(kpxd)) { - btsp = btstack->top; - btsp->bn = bn; - btsp->index = i; - btsp->mp = mp; - - return 0; - } - } - - /* get the right sibling page if any */ - if (p->header.next) - bn = le64_to_cpu(p->header.next); - else { - DT_PUTPAGE(mp); - return -ESTALE; - } - - /* unpin current page */ - DT_PUTPAGE(mp); - - /* get the right sibling page */ - DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - goto loop; -} -#endif /* _NOTYET */ - /* * dtRelink() * diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index bb4a342a193d..ae99a7e232ee 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -16,9 +16,6 @@ * forward references */ static int extBalloc(struct inode *, s64, s64 *, s64 *); -#ifdef _NOTYET -static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); -#endif static s64 extRoundDown(s64 nb); #define DPD(a) (printk("(a): %d\n",(a))) @@ -177,162 +174,6 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) return (0); } - -#ifdef _NOTYET -/* - * NAME: extRealloc() - * - * FUNCTION: extend the allocation of a file extent containing a - * partial back last page. - * - * PARAMETERS: - * ip - the inode of the file. - * cp - cbuf for the partial backed last page. - * xlen - request size of the resulting extent. - * xp - pointer to an xad. on successful exit, the xad - * describes the newly allocated extent. - * abnr - bool indicating whether the newly allocated extent - * should be marked as allocated but not recorded. - * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. - */ -int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) -{ - struct super_block *sb = ip->i_sb; - s64 xaddr, xlen, nxaddr, delta, xoff; - s64 ntail, nextend, ninsert; - int rc, nbperpage = JFS_SBI(sb)->nbperpage; - int xflag; - - /* This blocks if we are low on resources */ - txBeginAnon(ip->i_sb); - - mutex_lock(&JFS_IP(ip)->commit_mutex); - /* validate extent length */ - if (nxlen > MAXXLEN) - nxlen = MAXXLEN; - - /* get the extend (partial) page's disk block address and - * number of blocks. - */ - xaddr = addressXAD(xp); - xlen = lengthXAD(xp); - xoff = offsetXAD(xp); - - /* if the extend page is abnr and if the request is for - * the extent to be allocated and recorded, - * make the page allocated and recorded. - */ - if ((xp->flag & XAD_NOTRECORDED) && !abnr) { - xp->flag = 0; - if ((rc = xtUpdate(0, ip, xp))) - goto exit; - } - - /* try to allocated the request number of blocks for the - * extent. dbRealloc() first tries to satisfy the request - * by extending the allocation in place. otherwise, it will - * try to allocate a new set of blocks large enough for the - * request. in satisfying a request, dbReAlloc() may allocate - * less than what was request but will always allocate enough - * space as to satisfy the extend page. - */ - if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr))) - goto exit; - - /* Allocat blocks to quota. */ - rc = dquot_alloc_block(ip, nxlen); - if (rc) { - dbFree(ip, nxaddr, (s64) nxlen); - mutex_unlock(&JFS_IP(ip)->commit_mutex); - return rc; - } - - delta = nxlen - xlen; - - /* check if the extend page is not abnr but the request is abnr - * and the allocated disk space is for more than one page. if this - * is the case, there is a miss match of abnr between the extend page - * and the one or more pages following the extend page. as a result, - * two extents will have to be manipulated. the first will be that - * of the extent of the extend page and will be manipulated thru - * an xtExtend() or an xtTailgate(), depending upon whether the - * disk allocation occurred as an inplace extension. the second - * extent will be manipulated (created) through an xtInsert() and - * will be for the pages following the extend page. - */ - if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) { - ntail = nbperpage; - nextend = ntail - xlen; - ninsert = nxlen - nbperpage; - - xflag = XAD_NOTRECORDED; - } else { - ntail = nxlen; - nextend = delta; - ninsert = 0; - - xflag = xp->flag; - } - - /* if we were able to extend the disk allocation in place, - * extend the extent. otherwise, move the extent to a - * new disk location. - */ - if (xaddr == nxaddr) { - /* extend the extent */ - if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { - dbFree(ip, xaddr + xlen, delta); - dquot_free_block(ip, nxlen); - goto exit; - } - } else { - /* - * move the extent to a new location: - * - * xtTailgate() accounts for relocated tail extent; - */ - if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { - dbFree(ip, nxaddr, nxlen); - dquot_free_block(ip, nxlen); - goto exit; - } - } - - - /* check if we need to also insert a new extent */ - if (ninsert) { - /* perform the insert. if it fails, free the blocks - * to be inserted and make it appear that we only did - * the xtExtend() or xtTailgate() above. - */ - xaddr = nxaddr + ntail; - if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert, - &xaddr, 0)) { - dbFree(ip, xaddr, (s64) ninsert); - delta = nextend; - nxlen = ntail; - xflag = 0; - } - } - - /* set the return results */ - XADaddress(xp, nxaddr); - XADlength(xp, nxlen); - XADoffset(xp, xoff); - xp->flag = xflag; - - mark_inode_dirty(ip); -exit: - mutex_unlock(&JFS_IP(ip)->commit_mutex); - return (rc); -} -#endif /* _NOTYET */ - - /* * NAME: extHint() * @@ -423,44 +264,6 @@ int extRecord(struct inode *ip, xad_t * xp) return rc; } - -#ifdef _NOTYET -/* - * NAME: extFill() - * - * FUNCTION: allocate disk space for a file page that represents - * a file hole. - * - * PARAMETERS: - * ip - the inode of the file. - * cp - cbuf of the file page represent the hole. - * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. - */ -int extFill(struct inode *ip, xad_t * xp) -{ - int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; - s64 blkno = offsetXAD(xp) >> ip->i_blkbits; - -// assert(ISSPARSE(ip)); - - /* initialize the extent allocation hint */ - XADaddress(xp, 0); - - /* allocate an extent to fill the hole */ - if ((rc = extAlloc(ip, nbperpage, blkno, xp, false))) - return (rc); - - assert(lengthPXD(xp) == nbperpage); - - return (0); -} -#endif /* _NOTYET */ - - /* * NAME: extBalloc() * @@ -550,64 +353,6 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) return (0); } - -#ifdef _NOTYET -/* - * NAME: extBrealloc() - * - * FUNCTION: attempt to extend an extent's allocation. - * - * Initially, we will try to extend the extent's allocation - * in place. If this fails, we'll try to move the extent - * to a new set of blocks. If moving the extent, we initially - * will try to allocate disk blocks for the requested size - * (newnblks). if this fails (new contiguous free blocks not - * available), we'll try to allocate a smaller number of - * blocks (producing a smaller extent), with this smaller - * number of blocks consisting of the requested number of - * blocks rounded down to the next smaller power of 2 - * number (i.e. 16 -> 8). We'll continue to round down and - * retry the allocation until the number of blocks to allocate - * is smaller than the number of blocks per page. - * - * PARAMETERS: - * ip - the inode of the file. - * blkno - starting block number of the extents current allocation. - * nblks - number of blocks within the extents current allocation. - * newnblks - pointer to a s64 value. on entry, this value is the - * new desired extent size (number of blocks). on - * successful exit, this value is set to the extent's actual - * new size (new number of blocks). - * newblkno - the starting block number of the extents new allocation. - * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. - */ -static int -extBrealloc(struct inode *ip, - s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno) -{ - int rc; - - /* try to extend in place */ - if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) { - *newblkno = blkno; - return (0); - } else { - if (rc != -ENOSPC) - return (rc); - } - - /* in place extension not possible. - * try to move the extent to a new set of blocks. - */ - return (extBalloc(ip, blkno, newnblks, newblkno)); -} -#endif /* _NOTYET */ - - /* * NAME: extRoundDown() * diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 997c81fcea34..695415cbfe98 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -388,14 +388,6 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; linelock = (struct linelock *) & tlck->lock; } -#ifdef _JFS_WIP - else if (tlck->flag & tlckINLINELOCK) { - - inlinelock = (struct inlinelock *) & tlck; - p = (caddr_t) & inlinelock->pxd; - linelock = (struct linelock *) & tlck; - } -#endif /* _JFS_WIP */ else { jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); return 0; /* Probably should trap */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index aa4ff7bcaff2..48d1f70f786c 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -307,13 +307,11 @@ static int chkSuper(struct super_block *sb) } bsize = le32_to_cpu(j_sb->s_bsize); -#ifdef _JFS_4K if (bsize != PSIZE) { - jfs_err("Currently only 4K block size supported!"); + jfs_err("Only 4K block size supported!"); rc = -EINVAL; goto out; } -#endif /* _JFS_4K */ jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx", le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 042bbe6d8ac2..ffd4feece078 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1490,40 +1490,6 @@ static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, tlck->flag |= tlckWRITEPAGE; } else jfs_err("diLog: UFO type tlck:0x%p", tlck); -#ifdef _JFS_WIP - /* - * alloc/free external EA extent - * - * a maplock for txUpdateMap() to update bPWMAP for alloc/free - * of the extent has been formatted at txLock() time; - */ - else { - assert(tlck->type & tlckEA); - - /* log LOG_UPDATEMAP for logredo() to update bmap for - * alloc of new (and free of old) external EA extent; - */ - lrd->type = cpu_to_le16(LOG_UPDATEMAP); - pxdlock = (struct pxd_lock *) & tlck->lock; - nlock = pxdlock->index; - for (i = 0; i < nlock; i++, pxdlock++) { - if (pxdlock->flag & mlckALLOCPXD) - lrd->log.updatemap.type = - cpu_to_le16(LOG_ALLOCPXD); - else - lrd->log.updatemap.type = - cpu_to_le16(LOG_FREEPXD); - lrd->log.updatemap.nxd = cpu_to_le16(1); - lrd->log.updatemap.pxd = pxdlock->pxd; - lrd->backchain = - cpu_to_le32(lmLog(log, tblk, lrd, NULL)); - } - - /* update bmap */ - tlck->flag |= tlckUPDATEMAP; - } -#endif /* _JFS_WIP */ - return; } diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 3148e9b35f3b..2d304cee884c 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -114,17 +114,6 @@ static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, static int xtSplitRoot(tid_t tid, struct inode *ip, struct xtsplit * split, struct metapage ** rmpp); -#ifdef _STILL_TO_PORT -static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, - xtpage_t * fp, struct btstack * btstack); - -static int xtSearchNode(struct inode *ip, - xad_t * xad, - int *cmpp, struct btstack * btstack, int flag); - -static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); -#endif /* _STILL_TO_PORT */ - /* * xtLookup() * @@ -1493,189 +1482,6 @@ int xtExtend(tid_t tid, /* transaction id */ return rc; } -#ifdef _NOTYET -/* - * xtTailgate() - * - * function: split existing 'tail' extent - * (split offset >= start offset of tail extent), and - * relocate and extend the split tail half; - * - * note: existing extent may or may not have been committed. - * caller is responsible for pager buffer cache update, and - * working block allocation map update; - * update pmap: free old split tail extent, alloc new extent; - */ -int xtTailgate(tid_t tid, /* transaction id */ - struct inode *ip, s64 xoff, /* split/new extent offset */ - s32 xlen, /* new extent length */ - s64 xaddr, /* new extent address */ - int flag) -{ - int rc = 0; - int cmp; - struct metapage *mp; /* meta-page buffer */ - xtpage_t *p; /* base B+-tree index page */ - s64 bn; - int index, nextindex, llen, rlen; - struct btstack btstack; /* traverse stack */ - struct xtsplit split; /* split information */ - xad_t *xad; - struct tlock *tlck; - struct xtlock *xtlck = 0; - struct tlock *mtlck; - struct maplock *pxdlock; - -/* -printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", - (ulong)xoff, xlen, (ulong)xaddr); -*/ - - /* there must exist extent to be tailgated */ - if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT))) - return rc; - - /* retrieve search result */ - XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); - - if (cmp != 0) { - XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "couldn't find extent\n"); - return -EIO; - } - - /* entry found must be last entry */ - nextindex = le16_to_cpu(p->header.nextindex); - if (index != nextindex - 1) { - XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "the entry found is not the last entry\n"); - return -EIO; - } - - BT_MARK_DIRTY(mp, ip); - /* - * acquire tlock of the leaf page containing original entry - */ - if (!test_cflag(COMMIT_Nolink, ip)) { - tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); - xtlck = (struct xtlock *) & tlck->lock; - } - - /* completely replace extent ? */ - xad = &p->xad[index]; -/* -printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", - (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); -*/ - if ((llen = xoff - offsetXAD(xad)) == 0) - goto updateOld; - - /* - * partially replace extent: insert entry for new extent - */ -//insertNew: - /* - * if the leaf page is full, insert the new entry and - * propagate up the router entry for the new page from split - * - * The xtSplitUp() will insert the entry and unpin the leaf page. - */ - if (nextindex == le16_to_cpu(p->header.maxentry)) { - /* xtSpliUp() unpins leaf pages */ - split.mp = mp; - split.index = index + 1; - split.flag = XAD_NEW; - split.off = xoff; /* split offset */ - split.len = xlen; - split.addr = xaddr; - split.pxdlist = NULL; - if ((rc = xtSplitUp(tid, ip, &split, &btstack))) - return rc; - - /* get back old page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - /* - * if leaf root has been split, original root has been - * copied to new child page, i.e., original entry now - * resides on the new child page; - */ - if (p->header.flag & BT_INTERNAL) { - ASSERT(p->header.nextindex == - cpu_to_le16(XTENTRYSTART + 1)); - xad = &p->xad[XTENTRYSTART]; - bn = addressXAD(xad); - XT_PUTPAGE(mp); - - /* get new child page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - BT_MARK_DIRTY(mp, ip); - if (!test_cflag(COMMIT_Nolink, ip)) { - tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); - xtlck = (struct xtlock *) & tlck->lock; - } - } - } - /* - * insert the new entry into the leaf page - */ - else { - /* insert the new entry: mark the entry NEW */ - xad = &p->xad[index + 1]; - XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); - - /* advance next available entry index */ - le16_add_cpu(&p->header.nextindex, 1); - } - - /* get back old XAD */ - xad = &p->xad[index]; - - /* - * truncate/relocate old extent at split offset - */ - updateOld: - /* update dmap for old/committed/truncated extent */ - rlen = lengthXAD(xad) - llen; - if (!(xad->flag & XAD_NEW)) { - /* free from PWMAP at commit */ - if (!test_cflag(COMMIT_Nolink, ip)) { - mtlck = txMaplock(tid, ip, tlckMAP); - pxdlock = (struct maplock *) & mtlck->lock; - pxdlock->flag = mlckFREEPXD; - PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); - PXDlength(&pxdlock->pxd, rlen); - pxdlock->index = 1; - } - } else - /* free from WMAP */ - dbFree(ip, addressXAD(xad) + llen, (s64) rlen); - - if (llen) - /* truncate */ - XADlength(xad, llen); - else - /* replace */ - XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); - - if (!test_cflag(COMMIT_Nolink, ip)) { - xtlck->lwm.offset = (xtlck->lwm.offset) ? - min(index, (int)xtlck->lwm.offset) : index; - xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - - xtlck->lwm.offset; - } - - /* unpin the leaf page */ - XT_PUTPAGE(mp); - - return rc; -} -#endif /* _NOTYET */ - /* * xtUpdate() * @@ -1753,32 +1559,12 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) newindex = index + 1; nextindex = le16_to_cpu(p->header.nextindex); -#ifdef _JFS_WIP_NOCOALESCE - if (xoff < nxoff) - goto updateRight; - - /* - * replace XAD with nXAD - */ - replace: /* (nxoff == xoff) */ - if (nxlen == xlen) { - /* replace XAD with nXAD:recorded */ - *xad = *nxad; - xad->flag = xflag & ~XAD_NOTRECORDED; - - goto out; - } else /* (nxlen < xlen) */ - goto updateLeft; -#endif /* _JFS_WIP_NOCOALESCE */ - -/* #ifdef _JFS_WIP_COALESCE */ if (xoff < nxoff) goto coalesceRight; /* * coalesce with left XAD */ -//coalesceLeft: /* (xoff == nxoff) */ /* is XAD first entry of page ? */ if (index == XTENTRYSTART) goto replace; @@ -1897,7 +1683,6 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) jfs_error(ip->i_sb, "xoff >= nxoff\n"); return -EIO; } -/* #endif _JFS_WIP_COALESCE */ /* * split XAD into (lXAD, nXAD): @@ -2305,752 +2090,6 @@ int xtAppend(tid_t tid, /* transaction id */ return rc; } -#ifdef _STILL_TO_PORT - -/* - TBD for defragmentaion/reorganization - - * - * xtDelete() - * - * function: - * delete the entry with the specified key. - * - * N.B.: whole extent of the entry is assumed to be deleted. - * - * parameter: - * - * return: - * ENOENT: if the entry is not found. - * - * exception: - */ -int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) -{ - int rc = 0; - struct btstack btstack; - int cmp; - s64 bn; - struct metapage *mp; - xtpage_t *p; - int index, nextindex; - struct tlock *tlck; - struct xtlock *xtlck; - - /* - * find the matching entry; xtSearch() pins the page - */ - if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) - return rc; - - XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); - if (cmp) { - /* unpin the leaf page */ - XT_PUTPAGE(mp); - return -ENOENT; - } - - /* - * delete the entry from the leaf page - */ - nextindex = le16_to_cpu(p->header.nextindex); - le16_add_cpu(&p->header.nextindex, -1); - - /* - * if the leaf page bocome empty, free the page - */ - if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) - return (xtDeleteUp(tid, ip, mp, p, &btstack)); - - BT_MARK_DIRTY(mp, ip); - /* - * acquire a transaction lock on the leaf page; - * - * action:xad deletion; - */ - tlck = txLock(tid, ip, mp, tlckXTREE); - xtlck = (struct xtlock *) & tlck->lock; - xtlck->lwm.offset = - (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; - - /* if delete from middle, shift left/compact the remaining entries */ - if (index < nextindex - 1) - memmove(&p->xad[index], &p->xad[index + 1], - (nextindex - index - 1) * sizeof(xad_t)); - - XT_PUTPAGE(mp); - - return 0; -} - - -/* - TBD for defragmentaion/reorganization - - * - * xtDeleteUp() - * - * function: - * free empty pages as propagating deletion up the tree - * - * parameter: - * - * return: - */ -static int -xtDeleteUp(tid_t tid, struct inode *ip, - struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) -{ - int rc = 0; - struct metapage *mp; - xtpage_t *p; - int index, nextindex; - s64 xaddr; - int xlen; - struct btframe *parent; - struct tlock *tlck; - struct xtlock *xtlck; - - /* - * keep root leaf page which has become empty - */ - if (fp->header.flag & BT_ROOT) { - /* keep the root page */ - fp->header.flag &= ~BT_INTERNAL; - fp->header.flag |= BT_LEAF; - fp->header.nextindex = cpu_to_le16(XTENTRYSTART); - - /* XT_PUTPAGE(fmp); */ - - return 0; - } - - /* - * free non-root leaf page - */ - if ((rc = xtRelink(tid, ip, fp))) { - XT_PUTPAGE(fmp); - return rc; - } - - xaddr = addressPXD(&fp->header.self); - xlen = lengthPXD(&fp->header.self); - /* free the page extent */ - dbFree(ip, xaddr, (s64) xlen); - - /* free the buffer page */ - discard_metapage(fmp); - - /* - * propagate page deletion up the index tree - * - * If the delete from the parent page makes it empty, - * continue all the way up the tree. - * stop if the root page is reached (which is never deleted) or - * if the entry deletion does not empty the page. - */ - while ((parent = BT_POP(btstack)) != NULL) { - /* get/pin the parent page <sp> */ - XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - index = parent->index; - - /* delete the entry for the freed child page from parent. - */ - nextindex = le16_to_cpu(p->header.nextindex); - - /* - * the parent has the single entry being deleted: - * free the parent page which has become empty. - */ - if (nextindex == 1) { - if (p->header.flag & BT_ROOT) { - /* keep the root page */ - p->header.flag &= ~BT_INTERNAL; - p->header.flag |= BT_LEAF; - p->header.nextindex = - cpu_to_le16(XTENTRYSTART); - - /* XT_PUTPAGE(mp); */ - - break; - } else { - /* free the parent page */ - if ((rc = xtRelink(tid, ip, p))) - return rc; - - xaddr = addressPXD(&p->header.self); - /* free the page extent */ - dbFree(ip, xaddr, - (s64) JFS_SBI(ip->i_sb)->nbperpage); - - /* unpin/free the buffer page */ - discard_metapage(mp); - - /* propagate up */ - continue; - } - } - /* - * the parent has other entries remaining: - * delete the router entry from the parent page. - */ - else { - BT_MARK_DIRTY(mp, ip); - /* - * acquire a transaction lock on the leaf page; - * - * action:xad deletion; - */ - tlck = txLock(tid, ip, mp, tlckXTREE); - xtlck = (struct xtlock *) & tlck->lock; - xtlck->lwm.offset = - (xtlck->lwm.offset) ? min(index, - xtlck->lwm. - offset) : index; - - /* if delete from middle, - * shift left/compact the remaining entries in the page - */ - if (index < nextindex - 1) - memmove(&p->xad[index], &p->xad[index + 1], - (nextindex - index - - 1) << L2XTSLOTSIZE); - - le16_add_cpu(&p->header.nextindex, -1); - jfs_info("xtDeleteUp(entry): 0x%lx[%d]", - (ulong) parent->bn, index); - } - - /* unpin the parent page */ - XT_PUTPAGE(mp); - - /* exit propagation up */ - break; - } - - return 0; -} - - -/* - * NAME: xtRelocate() - * - * FUNCTION: relocate xtpage or data extent of regular file; - * This function is mainly used by defragfs utility. - * - * NOTE: This routine does not have the logic to handle - * uncommitted allocated extent. The caller should call - * txCommit() to commit all the allocation before call - * this routine. - */ -int -xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ - s64 nxaddr, /* new xaddr */ - int xtype) -{ /* extent type: XTPAGE or DATAEXT */ - int rc = 0; - struct tblock *tblk; - struct tlock *tlck; - struct xtlock *xtlck; - struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ - xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ - xad_t *xad; - pxd_t *pxd; - s64 xoff, xsize; - int xlen; - s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; - cbuf_t *cp; - s64 offset, nbytes, nbrd, pno; - int nb, npages, nblks; - s64 bn; - int cmp; - int index; - struct pxd_lock *pxdlock; - struct btstack btstack; /* traverse stack */ - - xtype = xtype & EXTENT_TYPE; - - xoff = offsetXAD(oxad); - oxaddr = addressXAD(oxad); - xlen = lengthXAD(oxad); - - /* validate extent offset */ - offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; - if (offset >= ip->i_size) - return -ESTALE; /* stale extent */ - - jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", - xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); - - /* - * 1. get and validate the parent xtpage/xad entry - * covering the source extent to be relocated; - */ - if (xtype == DATAEXT) { - /* search in leaf entry */ - rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); - if (rc) - return rc; - - /* retrieve search result */ - XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); - - if (cmp) { - XT_PUTPAGE(pmp); - return -ESTALE; - } - - /* validate for exact match with a single entry */ - xad = &pp->xad[index]; - if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { - XT_PUTPAGE(pmp); - return -ESTALE; - } - } else { /* (xtype == XTPAGE) */ - - /* search in internal entry */ - rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); - if (rc) - return rc; - - /* retrieve search result */ - XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); - - if (cmp) { - XT_PUTPAGE(pmp); - return -ESTALE; - } - - /* xtSearchNode() validated for exact match with a single entry - */ - xad = &pp->xad[index]; - } - jfs_info("xtRelocate: parent xad entry validated."); - - /* - * 2. relocate the extent - */ - if (xtype == DATAEXT) { - /* if the extent is allocated-but-not-recorded - * there is no real data to be moved in this extent, - */ - if (xad->flag & XAD_NOTRECORDED) - goto out; - else - /* release xtpage for cmRead()/xtLookup() */ - XT_PUTPAGE(pmp); - - /* - * cmRelocate() - * - * copy target data pages to be relocated; - * - * data extent must start at page boundary and - * multiple of page size (except the last data extent); - * read in each page of the source data extent into cbuf, - * update the cbuf extent descriptor of the page to be - * homeward bound to new dst data extent - * copy the data from the old extent to new extent. - * copy is essential for compressed files to avoid problems - * that can arise if there was a change in compression - * algorithms. - * it is a good strategy because it may disrupt cache - * policy to keep the pages in memory afterwards. - */ - offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; - assert((offset & CM_OFFSET) == 0); - nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; - pno = offset >> CM_L2BSIZE; - npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; -/* - npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - - (offset >> CM_L2BSIZE) + 1; -*/ - sxaddr = oxaddr; - dxaddr = nxaddr; - - /* process the request one cache buffer at a time */ - for (nbrd = 0; nbrd < nbytes; nbrd += nb, - offset += nb, pno++, npages--) { - /* compute page size */ - nb = min(nbytes - nbrd, CM_BSIZE); - - /* get the cache buffer of the page */ - if (rc = cmRead(ip, offset, npages, &cp)) - break; - - assert(addressPXD(&cp->cm_pxd) == sxaddr); - assert(!cp->cm_modified); - - /* bind buffer with the new extent address */ - nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; - cmSetXD(ip, cp, pno, dxaddr, nblks); - - /* release the cbuf, mark it as modified */ - cmPut(cp, true); - - dxaddr += nblks; - sxaddr += nblks; - } - - /* get back parent page */ - if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) - return rc; - - XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); - jfs_info("xtRelocate: target data extent relocated."); - } else { /* (xtype == XTPAGE) */ - - /* - * read in the target xtpage from the source extent; - */ - XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); - if (rc) { - XT_PUTPAGE(pmp); - return rc; - } - - /* - * read in sibling pages if any to update sibling pointers; - */ - rmp = NULL; - if (p->header.next) { - nextbn = le64_to_cpu(p->header.next); - XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); - if (rc) { - XT_PUTPAGE(pmp); - XT_PUTPAGE(mp); - return (rc); - } - } - - lmp = NULL; - if (p->header.prev) { - prevbn = le64_to_cpu(p->header.prev); - XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); - if (rc) { - XT_PUTPAGE(pmp); - XT_PUTPAGE(mp); - if (rmp) - XT_PUTPAGE(rmp); - return (rc); - } - } - - /* at this point, all xtpages to be updated are in memory */ - - /* - * update sibling pointers of sibling xtpages if any; - */ - if (lmp) { - BT_MARK_DIRTY(lmp, ip); - tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); - lp->header.next = cpu_to_le64(nxaddr); - XT_PUTPAGE(lmp); - } - - if (rmp) { - BT_MARK_DIRTY(rmp, ip); - tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); - rp->header.prev = cpu_to_le64(nxaddr); - XT_PUTPAGE(rmp); - } - - /* - * update the target xtpage to be relocated - * - * update the self address of the target page - * and write to destination extent; - * redo image covers the whole xtpage since it is new page - * to the destination extent; - * update of bmap for the free of source extent - * of the target xtpage itself: - * update of bmap for the allocation of destination extent - * of the target xtpage itself: - * update of bmap for the extents covered by xad entries in - * the target xtpage is not necessary since they are not - * updated; - * if not committed before this relocation, - * target page may contain XAD_NEW entries which must - * be scanned for bmap update (logredo() always - * scan xtpage REDOPAGE image for bmap update); - * if committed before this relocation (tlckRELOCATE), - * scan may be skipped by commit() and logredo(); - */ - BT_MARK_DIRTY(mp, ip); - /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ - tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); - xtlck = (struct xtlock *) & tlck->lock; - - /* update the self address in the xtpage header */ - pxd = &p->header.self; - PXDaddress(pxd, nxaddr); - - /* linelock for the after image of the whole page */ - xtlck->lwm.length = - le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; - - /* update the buffer extent descriptor of target xtpage */ - xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; - bmSetXD(mp, nxaddr, xsize); - - /* unpin the target page to new homeward bound */ - XT_PUTPAGE(mp); - jfs_info("xtRelocate: target xtpage relocated."); - } - - /* - * 3. acquire maplock for the source extent to be freed; - * - * acquire a maplock saving the src relocated extent address; - * to free of the extent at commit time; - */ - out: - /* if DATAEXT relocation, write a LOG_UPDATEMAP record for - * free PXD of the source data extent (logredo() will update - * bmap for free of source data extent), and update bmap for - * free of the source data extent; - */ - if (xtype == DATAEXT) - tlck = txMaplock(tid, ip, tlckMAP); - /* if XTPAGE relocation, write a LOG_NOREDOPAGE record - * for the source xtpage (logredo() will init NoRedoPage - * filter and will also update bmap for free of the source - * xtpage), and update bmap for free of the source xtpage; - * N.B. We use tlckMAP instead of tlkcXTREE because there - * is no buffer associated with this lock since the buffer - * has been redirected to the target location. - */ - else /* (xtype == XTPAGE) */ - tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); - - pxdlock = (struct pxd_lock *) & tlck->lock; - pxdlock->flag = mlckFREEPXD; - PXDaddress(&pxdlock->pxd, oxaddr); - PXDlength(&pxdlock->pxd, xlen); - pxdlock->index = 1; - - /* - * 4. update the parent xad entry for relocation; - * - * acquire tlck for the parent entry with XAD_NEW as entry - * update which will write LOG_REDOPAGE and update bmap for - * allocation of XAD_NEW destination extent; - */ - jfs_info("xtRelocate: update parent xad entry."); - BT_MARK_DIRTY(pmp, ip); - tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); - xtlck = (struct xtlock *) & tlck->lock; - - /* update the XAD with the new destination extent; */ - xad = &pp->xad[index]; - xad->flag |= XAD_NEW; - XADaddress(xad, nxaddr); - - xtlck->lwm.offset = min(index, xtlck->lwm.offset); - xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - - xtlck->lwm.offset; - - /* unpin the parent xtpage */ - XT_PUTPAGE(pmp); - - return rc; -} - - -/* - * xtSearchNode() - * - * function: search for the internal xad entry covering specified extent. - * This function is mainly used by defragfs utility. - * - * parameters: - * ip - file object; - * xad - extent to find; - * cmpp - comparison result: - * btstack - traverse stack; - * flag - search process flag; - * - * returns: - * btstack contains (bn, index) of search path traversed to the entry. - * *cmpp is set to result of comparison with the entry returned. - * the page containing the entry is pinned at exit. - */ -static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ - int *cmpp, struct btstack * btstack, int flag) -{ - int rc = 0; - s64 xoff, xaddr; - int xlen; - int cmp = 1; /* init for empty page */ - s64 bn; /* block number */ - struct metapage *mp; /* meta-page buffer */ - xtpage_t *p; /* page */ - int base, index, lim; - struct btframe *btsp; - s64 t64; - - BT_CLR(btstack); - - xoff = offsetXAD(xad); - xlen = lengthXAD(xad); - xaddr = addressXAD(xad); - - /* - * search down tree from root: - * - * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of - * internal page, child page Pi contains entry with k, Ki <= K < Kj. - * - * if entry with search key K is not found - * internal page search find the entry with largest key Ki - * less than K which point to the child page to search; - * leaf page search find the entry with smallest key Kj - * greater than K so that the returned index is the position of - * the entry to be shifted right for insertion of new entry. - * for empty tree, search key is greater than any key of the tree. - * - * by convention, root bn = 0. - */ - for (bn = 0;;) { - /* get/pin the page to search */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - if (p->header.flag & BT_LEAF) { - XT_PUTPAGE(mp); - return -ESTALE; - } - - lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; - - /* - * binary search with search key K on the current page - */ - for (base = XTENTRYSTART; lim; lim >>= 1) { - index = base + (lim >> 1); - - XT_CMP(cmp, xoff, &p->xad[index], t64); - if (cmp == 0) { - /* - * search hit - * - * verify for exact match; - */ - if (xaddr == addressXAD(&p->xad[index]) && - xoff == offsetXAD(&p->xad[index])) { - *cmpp = cmp; - - /* save search result */ - btsp = btstack->top; - btsp->bn = bn; - btsp->index = index; - btsp->mp = mp; - - return 0; - } - - /* descend/search its child page */ - goto next; - } - - if (cmp > 0) { - base = index + 1; - --lim; - } - } - - /* - * search miss - non-leaf page: - * - * base is the smallest index with key (Kj) greater than - * search key (K) and may be zero or maxentry index. - * if base is non-zero, decrement base by one to get the parent - * entry of the child page to search. - */ - index = base ? base - 1 : base; - - /* - * go down to child page - */ - next: - /* get the child page block number */ - bn = addressXAD(&p->xad[index]); - - /* unpin the parent page */ - XT_PUTPAGE(mp); - } -} - - -/* - * xtRelink() - * - * function: - * link around a freed page. - * - * Parameter: - * int tid, - * struct inode *ip, - * xtpage_t *p) - * - * returns: - */ -static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) -{ - int rc = 0; - struct metapage *mp; - s64 nextbn, prevbn; - struct tlock *tlck; - - nextbn = le64_to_cpu(p->header.next); - prevbn = le64_to_cpu(p->header.prev); - - /* update prev pointer of the next page */ - if (nextbn != 0) { - XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* - * acquire a transaction lock on the page; - * - * action: update prev pointer; - */ - BT_MARK_DIRTY(mp, ip); - tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); - - /* the page may already have been tlock'd */ - - p->header.prev = cpu_to_le64(prevbn); - - XT_PUTPAGE(mp); - } - - /* update next pointer of the previous page */ - if (prevbn != 0) { - XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* - * acquire a transaction lock on the page; - * - * action: update next pointer; - */ - BT_MARK_DIRTY(mp, ip); - tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); - - /* the page may already have been tlock'd */ - - p->header.next = le64_to_cpu(nextbn); - - XT_PUTPAGE(mp); - } - - return 0; -} -#endif /* _STILL_TO_PORT */ - /* * xtInitRoot() diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index 5f51be8596b3..142caafc73b1 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -95,10 +95,6 @@ extern int xtInsert(tid_t tid, struct inode *ip, int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, int flag); -#ifdef _NOTYET -extern int xtTailgate(tid_t tid, struct inode *ip, - s64 xoff, int xlen, s64 xaddr, int flag); -#endif extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, int flag); diff --git a/fs/locks.c b/fs/locks.c index 8c6df10cd9ed..ca28e0e50e56 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -300,6 +300,34 @@ void locks_release_private(struct file_lock *fl) } EXPORT_SYMBOL_GPL(locks_release_private); +/** + * locks_owner_has_blockers - Check for blocking lock requests + * @flctx: file lock context + * @owner: lock owner + * + * Return values: + * %true: @owner has at least one blocker + * %false: @owner has no blockers + */ +bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + struct file_lock *fl; + + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner != owner) + continue; + if (!list_empty(&fl->fl_blocked_requests)) { + spin_unlock(&flctx->flc_lock); + return true; + } + } + spin_unlock(&flctx->flc_lock); + return false; +} +EXPORT_SYMBOL_GPL(locks_owner_has_blockers); + /* Free a lock which is not in use. */ void locks_free_lock(struct file_lock *fl) { @@ -874,6 +902,8 @@ posix_test_lock(struct file *filp, struct file_lock *fl) struct file_lock *cfl; struct file_lock_context *ctx; struct inode *inode = locks_inode(filp); + void *owner; + void (*func)(void); ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { @@ -881,12 +911,23 @@ posix_test_lock(struct file *filp, struct file_lock *fl) return; } +retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { - if (posix_locks_conflict(fl, cfl)) { - locks_copy_conflock(fl, cfl); - goto out; + if (!posix_locks_conflict(fl, cfl)) + continue; + if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable + && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { + owner = cfl->fl_lmops->lm_mod_owner; + func = cfl->fl_lmops->lm_expire_lock; + __module_get(owner); + spin_unlock(&ctx->flc_lock); + (*func)(); + module_put(owner); + goto retry; } + locks_copy_conflock(fl, cfl); + goto out; } fl->fl_type = F_UNLCK; out: @@ -1060,6 +1101,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, int error; bool added = false; LIST_HEAD(dispose); + void *owner; + void (*func)(void); ctx = locks_get_lock_context(inode, request->fl_type); if (!ctx) @@ -1078,6 +1121,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } +retry: percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); /* @@ -1089,6 +1133,17 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (!posix_locks_conflict(request, fl)) continue; + if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable + && (*fl->fl_lmops->lm_lock_expirable)(fl)) { + owner = fl->fl_lmops->lm_mod_owner; + func = fl->fl_lmops->lm_expire_lock; + __module_get(owner); + spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); + (*func)(); + module_put(owner); + goto retry; + } if (conflock) locks_copy_conflock(conflock, fl); error = -EAGAIN; diff --git a/fs/namei.c b/fs/namei.c index 896ade8b7400..3dc0db2df561 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1032,7 +1032,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1041,7 +1041,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1050,7 +1050,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1059,7 +1059,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 11c566d8769f..4eb2a8380a28 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -153,28 +153,25 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, } /** - * nfs_direct_IO - NFS address space operation for direct I/O + * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block * @iter: I/O buffer * - * The presence of this routine in the address space ops vector means - * the NFS client supports direct I/O. However, for most direct IO, we - * shunt off direct read and write requests before the VFS gets them, - * so this method is only ever called for swap. + * Perform IO to the swap-file. This is much like direct IO. */ -ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - - /* we only support swap file calling nfs_direct_IO */ - if (!IS_SWAPFILE(inode)) - return 0; + ssize_t ret; VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter, true); - return nfs_file_direct_write(iocb, iter, true); + ret = nfs_file_direct_read(iocb, iter, true); + else + ret = nfs_file_direct_write(iocb, iter, true); + if (ret < 0) + return ret; + return 0; } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d764b3ce7905..6f5425e89ca6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -69,6 +69,8 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; res = nfs_open(inode, filp); + if (res == 0) + filp->f_mode |= FMODE_CAN_ODIRECT; return res; } @@ -480,6 +482,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, { unsigned long blocks; long long isize; + int ret; struct inode *inode = file_inode(file); struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; @@ -493,13 +496,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, return -EINVAL; } - *span = sis->pages; + ret = rpc_clnt_swap_activate(clnt); + if (ret) + return ret; + ret = add_swap_extent(sis, 0, sis->max, 0); + if (ret < 0) { + rpc_clnt_swap_deactivate(clnt); + return ret; + } + *span = sis->pages; if (cl->rpc_ops->enable_swap) cl->rpc_ops->enable_swap(inode); - return rpc_clnt_swap_activate(clnt); + sis->flags |= SWP_FS_OPS; + return ret; } static void nfs_swap_deactivate(struct file *file) @@ -523,7 +535,6 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidate_folio = nfs_invalidate_folio, .release_folio = nfs_release_folio, - .direct_IO = nfs_direct_IO, #ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, #endif @@ -532,6 +543,7 @@ const struct address_space_operations nfs_file_aops = { .error_remove_page = generic_error_remove_page, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, + .swap_rw = nfs_swap_rw, }; /* diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 489c9c1d8f31..f172412447f5 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -303,6 +303,8 @@ nfsd_file_put_noref(struct nfsd_file *nf) void nfsd_file_put(struct nfsd_file *nf) { + might_sleep(); + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { nfsd_file_flush(nf); @@ -899,9 +901,9 @@ nfsd_file_is_cached(struct inode *inode) return ret; } -__be32 -nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf) +static __be32 +nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf, bool open) { __be32 status; struct net *net = SVC_NET(rqstp); @@ -996,10 +998,14 @@ open_file: nfsd_file_gc(); nf->nf_mark = nfsd_file_mark_find_or_create(nf); - if (nf->nf_mark) - status = nfsd_open_verified(rqstp, fhp, S_IFREG, - may_flags, &nf->nf_file); - else + if (nf->nf_mark) { + if (open) { + status = nfsd_open_verified(rqstp, fhp, may_flags, + &nf->nf_file); + trace_nfsd_file_open(nf, status); + } else + status = nfs_ok; + } else status = nfserr_jukebox; /* * If construction failed, or we raced with a call to unlink() @@ -1019,6 +1025,40 @@ open_file: goto out; } +/** + * nfsd_file_acquire - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. + */ +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, true); +} + +/** + * nfsd_file_create - Get a struct nfsd_file, do not open + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file just created + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. + */ +__be32 +nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, false); +} + /* * Note that fields may be added, removed or reordered in the future. Programs * scraping this file for info should test the labels to ensure they're diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 435ceab27897..1da0c79a5580 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -59,5 +59,7 @@ void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); +__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); int nfsd_file_cache_stats_open(struct inode *, struct file *); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 936eebd4c56d..981a3a7a6e16 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/ext2_fs.h> #include <linux/magic.h> +#include <linux/namei.h> #include "cache.h" #include "xdr3.h" @@ -220,17 +221,132 @@ nfsd3_proc_write(struct svc_rqst *rqstp) } /* - * With NFSv3, CREATE processing is a lot easier than with NFSv2. - * At least in theory; we'll see how it fares in practice when the - * first reports about SunOS compatibility problems start to pour in... + * Implement NFSv3's unchecked, guarded, and exclusive CREATE + * semantics for regular files. Except for the created file, + * this operation is stateless on the server. + * + * Upon return, caller must release @fhp and @resfhp. */ static __be32 +nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd3_createargs *argp) +{ + struct iattr *iap = &argp->attrs; + struct dentry *parent, *child; + __u32 v_mtime, v_atime; + struct inode *inode; + __be32 status; + int host_err; + + if (isdotent(argp->name, argp->len)) + return nfserr_exist; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (status != nfs_ok) + return status; + + parent = fhp->fh_dentry; + inode = d_inode(parent); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + + child = lookup_one_len(argp->name, parent, argp->len); + if (IS_ERR(child)) { + status = nfserrno(PTR_ERR(child)); + goto out; + } + + if (d_really_is_negative(child)) { + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (status != nfs_ok) + goto out; + } + + status = fh_compose(resfhp, fhp->fh_export, child, fhp); + if (status != nfs_ok) + goto out; + + v_mtime = 0; + v_atime = 0; + if (argp->createmode == NFS3_CREATE_EXCLUSIVE) { + u32 *verifier = (u32 *)argp->verf; + + /* + * Solaris 7 gets confused (bugid 4218508) if these have + * the high bit set, as do xfs filesystems without the + * "bigtime" feature. So just clear the high bits. + */ + v_mtime = verifier[0] & 0x7fffffff; + v_atime = verifier[1] & 0x7fffffff; + } + + if (d_really_is_positive(child)) { + status = nfs_ok; + + switch (argp->createmode) { + case NFS3_CREATE_UNCHECKED: + if (!d_is_reg(child)) + break; + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + case NFS3_CREATE_GUARDED: + status = nfserr_exist; + break; + case NFS3_CREATE_EXCLUSIVE: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + break; + } + status = nfserr_exist; + } + goto out; + } + + if (!IS_POSIXACL(inode)) + iap->ia_mode &= ~current_umask(); + + host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true); + if (host_err < 0) { + status = nfserrno(host_err); + goto out; + } + + /* A newly created file already has a file size of zero. */ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + if (argp->createmode == NFS3_CREATE_EXCLUSIVE) { + iap->ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_MTIME_SET | ATTR_ATIME_SET; + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + +set_attr: + status = nfsd_create_setattr(rqstp, fhp, resfhp, iap); + +out: + fh_unlock(fhp); + if (child && !IS_ERR(child)) + dput(child); + fh_drop_write(fhp); + return status; +} + +static __be32 nfsd3_proc_create(struct svc_rqst *rqstp) { struct nfsd3_createargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - svc_fh *dirfhp, *newfhp = NULL; - struct iattr *attr; + svc_fh *dirfhp, *newfhp; dprintk("nfsd: CREATE(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -239,21 +355,8 @@ nfsd3_proc_create(struct svc_rqst *rqstp) dirfhp = fh_copy(&resp->dirfh, &argp->fh); newfhp = fh_init(&resp->fh, NFS3_FHSIZE); - attr = &argp->attrs; - - /* Unfudge the mode bits */ - attr->ia_mode &= ~S_IFMT; - if (!(attr->ia_valid & ATTR_MODE)) { - attr->ia_valid |= ATTR_MODE; - attr->ia_mode = S_IFREG; - } else { - attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG; - } - /* Now create the file and set attributes */ - resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, - attr, newfhp, argp->createmode, - (u32 *)argp->verf, NULL, NULL); + resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp); return rpc_success; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b207c76a873f..3895eb52d2b1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -37,6 +37,8 @@ #include <linux/falloc.h> #include <linux/slab.h> #include <linux/kthread.h> +#include <linux/namei.h> + #include <linux/sunrpc/addr.h> #include <linux/nfs_ssc.h> @@ -235,6 +237,183 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate &resfh->fh_handle); } +static inline bool nfsd4_create_is_exclusive(int createmode) +{ + return createmode == NFS4_CREATE_EXCLUSIVE || + createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + +static __be32 +nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child, + struct nfsd4_open *open) +{ + struct file *filp; + struct path path; + int oflags; + + oflags = O_CREAT | O_LARGEFILE; + switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_WRITE: + oflags |= O_WRONLY; + break; + case NFS4_SHARE_ACCESS_BOTH: + oflags |= O_RDWR; + break; + default: + oflags |= O_RDONLY; + } + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = child; + filp = dentry_create(&path, oflags, open->op_iattr.ia_mode, + current_cred()); + if (IS_ERR(filp)) + return nfserrno(PTR_ERR(filp)); + + open->op_filp = filp; + return nfs_ok; +} + +/* + * Implement NFSv4's unchecked, guarded, and exclusive create + * semantics for regular files. Open state for this new file is + * subsequently fabricated in nfsd4_process_open2(). + * + * Upon return, caller must release @fhp and @resfhp. + */ +static __be32 +nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd4_open *open) +{ + struct iattr *iap = &open->op_iattr; + struct dentry *parent, *child; + __u32 v_mtime, v_atime; + struct inode *inode; + __be32 status; + int host_err; + + if (isdotent(open->op_fname, open->op_fnamelen)) + return nfserr_exist; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (status != nfs_ok) + return status; + parent = fhp->fh_dentry; + inode = d_inode(parent); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + + child = lookup_one_len(open->op_fname, parent, open->op_fnamelen); + if (IS_ERR(child)) { + status = nfserrno(PTR_ERR(child)); + goto out; + } + + if (d_really_is_negative(child)) { + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (status != nfs_ok) + goto out; + } + + status = fh_compose(resfhp, fhp->fh_export, child, fhp); + if (status != nfs_ok) + goto out; + + v_mtime = 0; + v_atime = 0; + if (nfsd4_create_is_exclusive(open->op_createmode)) { + u32 *verifier = (u32 *)open->op_verf.data; + + /* + * Solaris 7 gets confused (bugid 4218508) if these have + * the high bit set, as do xfs filesystems without the + * "bigtime" feature. So just clear the high bits. If this + * is ever changed to use different attrs for storing the + * verifier, then do_open_lookup() will also need to be + * fixed accordingly. + */ + v_mtime = verifier[0] & 0x7fffffff; + v_atime = verifier[1] & 0x7fffffff; + } + + if (d_really_is_positive(child)) { + status = nfs_ok; + + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + if (!d_is_reg(child)) + break; + + /* + * In NFSv4, we don't want to truncate the file + * now. This would be wrong if the OPEN fails for + * some other reason. Furthermore, if the size is + * nonzero, we should ignore it according to spec! + */ + open->op_truncate = (iap->ia_valid & ATTR_SIZE) && + !iap->ia_size; + break; + case NFS4_CREATE_GUARDED: + status = nfserr_exist; + break; + case NFS4_CREATE_EXCLUSIVE: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + open->op_created = true; + break; /* subtle */ + } + status = nfserr_exist; + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + open->op_created = true; + goto set_attr; /* subtle */ + } + status = nfserr_exist; + } + goto out; + } + + if (!IS_POSIXACL(inode)) + iap->ia_mode &= ~current_umask(); + + status = nfsd4_vfs_create(fhp, child, open); + if (status != nfs_ok) + goto out; + open->op_created = true; + + /* A newly created file already has a file size of zero. */ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + if (nfsd4_create_is_exclusive(open->op_createmode)) { + iap->ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_MTIME_SET|ATTR_ATIME_SET; + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + +set_attr: + status = nfsd_create_setattr(rqstp, fhp, resfhp, iap); + +out: + fh_unlock(fhp); + if (child && !IS_ERR(child)) + dput(child); + fh_drop_write(fhp); + return status; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { @@ -264,16 +443,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * yes | yes | GUARDED4 | GUARDED4 */ - /* - * Note: create modes (UNCHECKED,GUARDED...) are the same - * in NFSv4 as in v3 except EXCLUSIVE4_1. - */ current->fs->umask = open->op_umask; - status = do_nfsd_create(rqstp, current_fh, open->op_fname, - open->op_fnamelen, &open->op_iattr, - *resfh, open->op_createmode, - (u32 *)open->op_verf.data, - &open->op_truncate, &open->op_created); + status = nfsd4_create_file(rqstp, current_fh, *resfh, open); current->fs->umask = 0; if (!status && open->op_label.len) @@ -284,7 +455,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * use the returned bitmask to indicate which attributes * we used to store the verifier: */ - if (nfsd_create_is_exclusive(open->op_createmode) && status == 0) + if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0) open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); } else @@ -375,6 +546,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (int)open->op_fnamelen, open->op_fname, open->op_openowner); + open->op_filp = NULL; + /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; @@ -427,43 +600,35 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; switch (open->op_claim_type) { - case NFS4_OPEN_CLAIM_DELEGATE_CUR: - case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, cstate, open, &resfh); - if (status) - goto out; - break; - case NFS4_OPEN_CLAIM_PREVIOUS: - status = nfs4_check_open_reclaim(cstate->clp); - if (status) - goto out; - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - reclaim = true; - fallthrough; - case NFS4_OPEN_CLAIM_FH: - case NFS4_OPEN_CLAIM_DELEG_CUR_FH: - status = do_open_fhandle(rqstp, cstate, open); - if (status) - goto out; - resfh = &cstate->current_fh; - break; - case NFS4_OPEN_CLAIM_DELEG_PREV_FH: - case NFS4_OPEN_CLAIM_DELEGATE_PREV: - dprintk("NFSD: unsupported OPEN claim type %d\n", - open->op_claim_type); - status = nfserr_notsupp; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_NULL: + status = do_open_lookup(rqstp, cstate, open, &resfh); + if (status) goto out; - default: - dprintk("NFSD: Invalid OPEN claim type %d\n", - open->op_claim_type); - status = nfserr_inval; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + status = nfs4_check_open_reclaim(cstate->clp); + if (status) + goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + reclaim = true; + fallthrough; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + status = do_open_fhandle(rqstp, cstate, open); + if (status) goto out; + resfh = &cstate->current_fh; + break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + status = nfserr_notsupp; + goto out; + default: + status = nfserr_inval; + goto out; } - /* - * nfsd4_process_open2() does the actual opening of the file. If - * successful, it (1) truncates the file if open->op_truncate was - * set, (2) sets open->op_stateid, (3) sets open->op_delegation. - */ + status = nfsd4_process_open2(rqstp, resfh, open); WARN(status && open->op_created, "nfsd4_process_open2 failed to open newly-created file! status=%u\n", @@ -471,6 +636,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (reclaim && !status) nn->somebody_reclaimed = true; out: + if (open->op_filp) { + fput(open->op_filp); + open->op_filp = NULL; + } if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); fh_put(resfh); @@ -801,7 +970,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * the client wants us to do more in this compound: */ if (!nfsd4_last_compound_op(rqstp)) - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, @@ -2481,11 +2650,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) cstate->minorversion = args->minorversion; fh_init(current_fh, NFS4_FHSIZE); fh_init(save_fh, NFS4_FHSIZE); + /* * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); /* * According to RFC3010, this takes precedence over all other errors. @@ -2600,7 +2770,7 @@ encode_op: out: cstate->status = status; /* Reset deferral mechanism for RPC deferrals */ - set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); return rpc_success; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 234e852fcdfa..9409a0dc1b76 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -125,6 +125,23 @@ static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; +static struct workqueue_struct *laundry_wq; + +int nfsd4_create_laundry_wq(void) +{ + int rc = 0; + + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); + if (laundry_wq == NULL) + rc = -ENOMEM; + return rc; +} + +void nfsd4_destroy_laundry_wq(void) +{ + destroy_workqueue(laundry_wq); +} + static bool is_session_dead(struct nfsd4_session *ses) { return ses->se_flags & NFS4_SESSION_DEAD; @@ -152,6 +169,7 @@ static __be32 get_client_locked(struct nfs4_client *clp) if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_rpc_users); + clp->cl_state = NFSD4_ACTIVE; return nfs_ok; } @@ -172,6 +190,7 @@ renew_client_locked(struct nfs4_client *clp) list_move_tail(&clp->cl_lru, &nn->client_lru); clp->cl_time = ktime_get_boottime_seconds(); + clp->cl_state = NFSD4_ACTIVE; } static void put_client_renew_locked(struct nfs4_client *clp) @@ -690,6 +709,57 @@ static unsigned int file_hashval(struct svc_fh *fh) static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; +/* + * Check if courtesy clients have conflicting access and resolve it if possible + * + * access: is op_share_access if share_access is true. + * Check if access mode, op_share_access, would conflict with + * the current deny mode of the file 'fp'. + * access: is op_share_deny if share_access is false. + * Check if the deny mode, op_share_deny, would conflict with + * current access of the file 'fp'. + * stp: skip checking this entry. + * new_stp: normal open, not open upgrade. + * + * Function returns: + * false - access/deny mode conflict with normal client. + * true - no conflict or conflict with courtesy client(s) is resolved. + */ +static bool +nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp, + struct nfs4_ol_stateid *stp, u32 access, bool share_access) +{ + struct nfs4_ol_stateid *st; + bool resolvable = true; + unsigned char bmap; + struct nfsd_net *nn; + struct nfs4_client *clp; + + lockdep_assert_held(&fp->fi_lock); + list_for_each_entry(st, &fp->fi_stateids, st_perfile) { + /* ignore lock stateid */ + if (st->st_openstp) + continue; + if (st == stp && new_stp) + continue; + /* check file access against deny mode or vice versa */ + bmap = share_access ? st->st_deny_bmap : st->st_access_bmap; + if (!(access & bmap_to_share_mode(bmap))) + continue; + clp = st->st_stid.sc_client; + if (try_to_expire_client(clp)) + continue; + resolvable = false; + break; + } + if (resolvable) { + clp = stp->st_stid.sc_client; + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + } + return resolvable; +} + static void __nfs4_file_get_access(struct nfs4_file *fp, u32 access) { @@ -1090,6 +1160,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, get_clnt_odstate(odstate); dp->dl_type = NFS4_OPEN_DELEGATE_READ; dp->dl_retries = 1; + dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); get_nfs4_file(fp); @@ -2004,6 +2075,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) idr_init(&clp->cl_stateids); atomic_set(&clp->cl_rpc_users, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; + clp->cl_state = NFSD4_ACTIVE; + atomic_set(&clp->cl_delegs_in_recall, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); @@ -2408,10 +2481,17 @@ static int client_info_show(struct seq_file *m, void *v) memcpy(&clid, &clp->cl_clientid, sizeof(clid)); seq_printf(m, "clientid: 0x%llx\n", clid); seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); - if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + + if (clp->cl_state == NFSD4_COURTESY) + seq_puts(m, "status: courtesy\n"); + else if (clp->cl_state == NFSD4_EXPIRABLE) + seq_puts(m, "status: expirable\n"); + else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) seq_puts(m, "status: confirmed\n"); else seq_puts(m, "status: unconfirmed\n"); + seq_printf(m, "seconds from last renew: %lld\n", + ktime_get_boottime_seconds() - clp->cl_time); seq_printf(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); @@ -4694,9 +4774,18 @@ nfsd_break_deleg_cb(struct file_lock *fl) bool ret = false; struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; + struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfsd_net *nn; trace_nfsd_cb_recall(&dp->dl_stid); + dp->dl_recalled = true; + atomic_inc(&clp->cl_delegs_in_recall); + if (try_to_expire_client(clp)) { + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + } + /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a delegation isn't returned @@ -4739,9 +4828,14 @@ static int nfsd_change_deleg_cb(struct file_lock *onlist, int arg, struct list_head *dispose) { - if (arg & F_UNLCK) + struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner; + struct nfs4_client *clp = dp->dl_stid.sc_client; + + if (arg & F_UNLCK) { + if (dp->dl_recalled) + atomic_dec(&clp->cl_delegs_in_recall); return lease_modify(onlist, arg, dispose); - else + } else return -EAGAIN; } @@ -4947,7 +5041,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, - struct nfsd4_open *open) + struct nfsd4_open *open, bool new_stp) { struct nfsd_file *nf = NULL; __be32 status; @@ -4963,6 +5057,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, */ status = nfs4_file_check_deny(fp, open->op_share_deny); if (status != nfs_ok) { + if (status != nfserr_share_denied) { + spin_unlock(&fp->fi_lock); + goto out; + } + if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } @@ -4970,6 +5071,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, /* set access to the file */ status = nfs4_file_get_access(fp, open->op_share_access); if (status != nfs_ok) { + if (status != nfserr_share_denied) { + spin_unlock(&fp->fi_lock); + goto out; + } + if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, + stp, open->op_share_access, true)) + status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } @@ -4985,9 +5093,19 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); - if (status) - goto out_put_access; + + if (!open->op_filp) { + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); + if (status != nfs_ok) + goto out_put_access; + } else { + status = nfsd_file_create(rqstp, cur_fh, access, &nf); + if (status != nfs_ok) + goto out_put_access; + nf->nf_file = open->op_filp; + open->op_filp = NULL; + } + spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { fp->fi_fds[oflag] = nf; @@ -5016,21 +5134,29 @@ out_put_access: } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, + struct nfsd4_open *open) { __be32 status; unsigned char old_deny_bmap = stp->st_deny_bmap; if (!test_access(open->op_share_access, stp)) - return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); + return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false); /* test and set deny mode */ spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); if (status == nfs_ok) { - set_deny(open->op_share_deny, stp); - fp->fi_share_deny |= + if (status != nfserr_share_denied) { + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + } else { + if (nfs4_resolve_deny_conflicts_locked(fp, false, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; + } } spin_unlock(&fp->fi_lock); @@ -5322,6 +5448,18 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, */ } +/** + * nfsd4_process_open2 - finish open processing + * @rqstp: the RPC transaction being executed + * @current_fh: NFSv4 COMPOUND's current filehandle + * @open: OPEN arguments + * + * If successful, (1) truncate the file if open->op_truncate was + * set, (2) set open->op_stateid, (3) set open->op_delegation. + * + * Returns %nfs_ok on success; otherwise an nfs4stat value in + * network byte order is returned. + */ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { @@ -5371,7 +5509,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } } else { - status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); @@ -5605,6 +5743,81 @@ static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) } #endif +/* Check if any lock belonging to this lockowner has any blockers */ +static bool +nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) +{ + struct file_lock_context *ctx; + struct nfs4_ol_stateid *stp; + struct nfs4_file *nf; + + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + nf = stp->st_stid.sc_file; + ctx = nf->fi_inode->i_flctx; + if (!ctx) + continue; + if (locks_owner_has_blockers(ctx, lo)) + return true; + } + return false; +} + +static bool +nfs4_anylock_blockers(struct nfs4_client *clp) +{ + int i; + struct nfs4_stateowner *so; + struct nfs4_lockowner *lo; + + if (atomic_read(&clp->cl_delegs_in_recall)) + return true; + spin_lock(&clp->cl_lock); + for (i = 0; i < OWNER_HASH_SIZE; i++) { + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i], + so_strhash) { + if (so->so_is_open_owner) + continue; + lo = lockowner(so); + if (nfs4_lockowner_has_blockers(lo)) { + spin_unlock(&clp->cl_lock); + return true; + } + } + } + spin_unlock(&clp->cl_lock); + return false; +} + +static void +nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, + struct laundry_time *lt) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + + INIT_LIST_HEAD(reaplist); + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state == NFSD4_EXPIRABLE) + goto exp_client; + if (!state_expired(lt, clp->cl_time)) + break; + if (!atomic_read(&clp->cl_rpc_users)) + clp->cl_state = NFSD4_COURTESY; + if (!client_has_state(clp) || + ktime_get_boottime_seconds() >= + (clp->cl_time + NFSD_COURTESY_CLIENT_TIMEOUT)) + goto exp_client; + if (nfs4_anylock_blockers(clp)) { +exp_client: + if (!mark_client_expired_locked(clp)) + list_add(&clp->cl_lru, reaplist); + } + } + spin_unlock(&nn->client_lock); +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -5627,7 +5840,6 @@ nfs4_laundromat(struct nfsd_net *nn) goto out; } nfsd4_end_grace(nn); - INIT_LIST_HEAD(&reaplist); spin_lock(&nn->s2s_cp_lock); idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { @@ -5637,17 +5849,7 @@ nfs4_laundromat(struct nfsd_net *nn) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); - - spin_lock(&nn->client_lock); - list_for_each_safe(pos, next, &nn->client_lru) { - clp = list_entry(pos, struct nfs4_client, cl_lru); - if (!state_expired(<, clp->cl_time)) - break; - if (mark_client_expired_locked(clp)) - continue; - list_add(&clp->cl_lru, &reaplist); - } - spin_unlock(&nn->client_lock); + nfs4_get_client_reaplist(nn, &reaplist, <); list_for_each_safe(pos, next, &reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); trace_nfsd_clid_purged(&clp->cl_clientid); @@ -5722,7 +5924,6 @@ out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } -static struct workqueue_struct *laundry_wq; static void laundromat_main(struct work_struct *); static void @@ -6551,6 +6752,29 @@ nfsd4_lm_put_owner(fl_owner_t owner) nfs4_put_stateowner(&lo->lo_owner); } +/* return pointer to struct nfs4_client if client is expirable */ +static bool +nfsd4_lm_lock_expirable(struct file_lock *cfl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner; + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfsd_net *nn; + + if (try_to_expire_client(clp)) { + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + return true; + } + return false; +} + +/* schedule laundromat to run immediately and wait for it to complete */ +static void +nfsd4_lm_expire_lock(void) +{ + flush_workqueue(laundry_wq); +} + static void nfsd4_lm_notify(struct file_lock *fl) { @@ -6577,9 +6801,12 @@ nfsd4_lm_notify(struct file_lock *fl) } static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_mod_owner = THIS_MODULE, .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_lm_get_owner, .lm_put_owner = nfsd4_lm_put_owner, + .lm_lock_expirable = nfsd4_lm_lock_expirable, + .lm_expire_lock = nfsd4_lm_expire_lock, }; static inline void @@ -7297,22 +7524,36 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) return status; } +/** + * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations + * @rqstp: RPC transaction + * @cstate: NFSv4 COMPOUND state + * @u: RELEASE_LOCKOWNER arguments + * + * The lockowner's so_count is bumped when a lock record is added + * or when copying a conflicting lock. The latter case is brief, + * but can lead to fleeting false positives when looking for + * locks-in-use. + * + * Return values: + * %nfs_ok: lockowner released or not found + * %nfserr_locks_held: lockowner still in use + * %nfserr_stale_clientid: clientid no longer active + * %nfserr_expired: clientid not recognized + */ __be32 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); clientid_t *clid = &rlockowner->rl_clientid; - struct nfs4_stateowner *sop; - struct nfs4_lockowner *lo = NULL; struct nfs4_ol_stateid *stp; - struct xdr_netobj *owner = &rlockowner->rl_owner; - unsigned int hashval = ownerstr_hashval(owner); - __be32 status; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_lockowner *lo; struct nfs4_client *clp; - LIST_HEAD (reaplist); + LIST_HEAD(reaplist); + __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -7320,34 +7561,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, status = set_client(clid, cstate, nn); if (status) return status; - clp = cstate->clp; - /* Find the matching lock stateowner */ - spin_lock(&clp->cl_lock); - list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval], - so_strhash) { - - if (sop->so_is_open_owner || !same_owner_str(sop, owner)) - continue; - /* see if there are still any locks associated with it */ - lo = lockowner(sop); - list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_stid.sc_file, lo)) { - status = nfserr_locks_held; - spin_unlock(&clp->cl_lock); - return status; - } - } - - nfs4_get_stateowner(sop); - break; - } + spin_lock(&clp->cl_lock); + lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner); if (!lo) { spin_unlock(&clp->cl_lock); - return status; + return nfs_ok; + } + if (atomic_read(&lo->lo_owner.so_count) != 2) { + spin_unlock(&clp->cl_lock); + nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; } - unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, @@ -7357,11 +7583,11 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); - - return status; + return nfs_ok; } static inline struct nfs4_client_reclaim * @@ -7602,22 +7828,12 @@ nfs4_state_start(void) { int ret; - laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); - if (laundry_wq == NULL) { - ret = -ENOMEM; - goto out; - } ret = nfsd4_create_callback_queue(); if (ret) - goto out_free_laundry; + return ret; set_max_delegations(); return 0; - -out_free_laundry: - destroy_workqueue(laundry_wq); -out: - return ret; } void @@ -7654,7 +7870,6 @@ nfs4_state_shutdown_net(struct net *net) void nfs4_state_shutdown(void) { - destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index da92e7d2ab6a..61b2aae81abb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2411,7 +2411,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); return true; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 0b3f12aa37ff..7da88bdc0d6c 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -206,7 +206,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct svc_cacherep *rp; unsigned int i; - nfsd_reply_cache_stats_destroy(nn); unregister_shrinker(&nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { @@ -217,6 +216,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) rp, nn); } } + nfsd_reply_cache_stats_destroy(nn); kvfree(nn->drc_hashtbl); nn->drc_hashtbl = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 16920e4512bd..0621c2faf242 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1535,20 +1535,25 @@ static int __init init_nfsd(void) retval = create_proc_exports_entry(); if (retval) goto out_free_lockd; - retval = register_filesystem(&nfsd_fs_type); - if (retval) - goto out_free_exports; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_free_filesystem; + goto out_free_exports; retval = register_cld_notifier(); if (retval) + goto out_free_subsys; + retval = nfsd4_create_laundry_wq(); + if (retval) + goto out_free_cld; + retval = register_filesystem(&nfsd_fs_type); + if (retval) goto out_free_all; return 0; out_free_all: + nfsd4_destroy_laundry_wq(); +out_free_cld: + unregister_cld_notifier(); +out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); -out_free_filesystem: - unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); @@ -1566,6 +1571,8 @@ out_free_slabs: static void __exit exit_nfsd(void) { + unregister_filesystem(&nfsd_fs_type); + nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); @@ -1575,7 +1582,6 @@ static void __exit exit_nfsd(void) nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); - unregister_filesystem(&nfsd_fs_type); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 4fc1fd639527..847b482155ae 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -162,6 +162,8 @@ void nfs4_state_shutdown_net(struct net *net); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); +int nfsd4_create_laundry_wq(void); +void nfsd4_destroy_laundry_wq(void); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -175,6 +177,8 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) { return false; } +static inline int nfsd4_create_laundry_wq(void) { return 0; }; +static inline void nfsd4_destroy_laundry_wq(void) {}; #endif /* @@ -336,6 +340,7 @@ void nfsd_lockd_shutdown(void); #define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ +#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */ /* * The following attributes are currently not supported by the NFSv4 server: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 95457cfd37fc..f3d6313914ed 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -149,6 +149,7 @@ struct nfs4_delegation { /* For recall: */ int dl_retries; struct nfsd4_callback dl_recall; + bool dl_recalled; }; #define cb_to_delegation(cb) \ @@ -283,6 +284,28 @@ struct nfsd4_sessionid { #define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ /* + * State Meaning Where set + * -------------------------------------------------------------------------- + * | NFSD4_ACTIVE | Confirmed, active | Default | + * |------------------- ----------------------------------------------------| + * | NFSD4_COURTESY | Courtesy state. | nfs4_get_client_reaplist | + * | | Lease/lock/share | | + * | | reservation conflict | | + * | | can cause Courtesy | | + * | | client to be expired | | + * |------------------------------------------------------------------------| + * | NFSD4_EXPIRABLE | Courtesy client to be| nfs4_laundromat | + * | | expired by Laundromat| try_to_expire_client | + * | | due to conflict | | + * |------------------------------------------------------------------------| + */ +enum { + NFSD4_ACTIVE = 0, + NFSD4_COURTESY, + NFSD4_EXPIRABLE, +}; + +/* * struct nfs4_client - one per client. Clientids live here. * * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0) @@ -385,6 +408,9 @@ struct nfs4_client { struct list_head async_copies; /* list of async copies */ spinlock_t async_lock; /* lock for async copies */ atomic_t cl_cb_inflight; /* Outstanding callbacks */ + + unsigned int cl_state; + atomic_t cl_delegs_in_recall; }; /* struct nfs4_client_reset @@ -702,4 +728,9 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int nfsd4_client_record_check(struct nfs4_client *clp); extern void nfsd4_record_grace_done(struct nfsd_net *nn); +static inline bool try_to_expire_client(struct nfs4_client *clp) +{ + cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE); + return clp->cl_state == NFSD4_EXPIRABLE; +} #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 242fa123e0e9..a60ead3b227a 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -692,12 +692,6 @@ DEFINE_CLID_EVENT(confirmed_r); /* * from fs/nfsd/filecache.h */ -TRACE_DEFINE_ENUM(NFSD_FILE_HASHED); -TRACE_DEFINE_ENUM(NFSD_FILE_PENDING); -TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ); -TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE); -TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED); - #define show_nf_flags(val) \ __print_flags(val, "|", \ { 1 << NFSD_FILE_HASHED, "HASHED" }, \ @@ -784,6 +778,34 @@ TRACE_EVENT(nfsd_file_acquire, __entry->nf_file, __entry->status) ); +TRACE_EVENT(nfsd_file_open, + TP_PROTO(struct nfsd_file *nf, __be32 status), + TP_ARGS(nf, status), + TP_STRUCT__entry( + __field(unsigned int, nf_hashval) + __field(void *, nf_inode) /* cannot be dereferenced */ + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(void *, nf_file) /* cannot be dereferenced */ + ), + TP_fast_assign( + __entry->nf_hashval = nf->nf_hashval; + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p", + __entry->nf_hashval, + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file) +) + DECLARE_EVENT_CLASS(nfsd_file_search_class, TP_PROTO(struct inode *inode, unsigned int hash, int found), TP_ARGS(inode, hash, found), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c22ad0532e8e..840e3af63a6f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -827,14 +827,23 @@ retry: return err; } +/** + * nfsd_open_verified - Open a regular file for the filecache + * @rqstp: RPC request + * @fhp: NFS filehandle of the file to open + * @may_flags: internal permission flags + * @filp: OUT: open "struct file *" + * + * Returns an nfsstat value in network byte order. + */ __be32 -nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, - int may_flags, struct file **filp) +nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, + struct file **filp) { __be32 err; validate_process_creds(); - err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); validate_process_creds(); return err; } @@ -849,17 +858,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct svc_rqst *rqstp = sd->u.data; - struct page **pp = rqstp->rq_next_page; - struct page *page = buf->page; - if (rqstp->rq_res.page_len == 0) { - svc_rqst_replace_page(rqstp, page); + svc_rqst_replace_page(rqstp, buf->page); + if (rqstp->rq_res.page_len == 0) rqstp->rq_res.page_base = buf->offset; - } else if (page != pp[-1]) { - svc_rqst_replace_page(rqstp, page); - } rqstp->rq_res.page_len += sd->len; - return sd->len; } @@ -1187,14 +1190,26 @@ out: return err; } -static __be32 -nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, - struct iattr *iap) +/** + * nfsd_create_setattr - Set a created file's attributes + * @rqstp: RPC transaction being executed + * @fhp: NFS filehandle of parent directory + * @resfhp: NFS filehandle of new object + * @iap: requested attributes of new object + * + * Returns nfs_ok on success, or an nfsstat in network byte order. + */ +__be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct iattr *iap) { + __be32 status; + /* - * Mode has already been set earlier in create: + * Mode has already been set by file creation. */ iap->ia_valid &= ~ATTR_MODE; + /* * Setting uid/gid works only for root. Irix appears to * send along the gid on create when it tries to implement @@ -1202,10 +1217,31 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, */ if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) iap->ia_valid &= ~(ATTR_UID|ATTR_GID); + + /* + * Callers expect new file metadata to be committed even + * if the attributes have not changed. + */ if (iap->ia_valid) - return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0); - /* Callers expect file metadata to be committed here */ - return nfserrno(commit_metadata(resfhp)); + status = nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0); + else + status = nfserrno(commit_metadata(resfhp)); + + /* + * Transactional filesystems had a chance to commit changes + * for both parent and child simultaneously making the + * following commit_metadata a noop in many cases. + */ + if (!status) + status = nfserrno(commit_metadata(fhp)); + + /* + * Update the new filehandle to pick up the new attributes. + */ + if (!status) + status = fh_update(resfhp); + + return status; } /* HPUX client sometimes creates a file in mode 000, and sets size to 0. @@ -1232,7 +1268,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dchild; struct inode *dirp; __be32 err; - __be32 err2; int host_err; dentry = fhp->fh_dentry; @@ -1305,22 +1340,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err < 0) goto out_nfserr; - err = nfsd_create_setattr(rqstp, resfhp, iap); + err = nfsd_create_setattr(rqstp, fhp, resfhp, iap); - /* - * nfsd_create_setattr already committed the child. Transactional - * filesystems had a chance to commit changes for both parent and - * child simultaneously making the following commit_metadata a - * noop. - */ - err2 = nfserrno(commit_metadata(fhp)); - if (err2) - err = err2; - /* - * Update the file handle to get the new inode info. - */ - if (!err) - err = fh_update(resfhp); out: dput(dchild); return err; @@ -1376,172 +1397,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, } /* - * NFSv3 and NFSv4 version of nfsd_create - */ -__be32 -do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, struct iattr *iap, - struct svc_fh *resfhp, int createmode, u32 *verifier, - bool *truncp, bool *created) -{ - struct dentry *dentry, *dchild = NULL; - struct inode *dirp; - __be32 err; - int host_err; - __u32 v_mtime=0, v_atime=0; - - err = nfserr_perm; - if (!flen) - goto out; - err = nfserr_exist; - if (isdotent(fname, flen)) - goto out; - if (!(iap->ia_valid & ATTR_MODE)) - iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); - if (err) - goto out; - - dentry = fhp->fh_dentry; - dirp = d_inode(dentry); - - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - - fh_lock_nested(fhp, I_MUTEX_PARENT); - - /* - * Compose the response file handle. - */ - dchild = lookup_one_len(fname, dentry, flen); - host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; - - /* If file doesn't exist, check for permissions to create one */ - if (d_really_is_negative(dchild)) { - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); - if (err) - goto out; - } - - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) - goto out; - - if (nfsd_create_is_exclusive(createmode)) { - /* solaris7 gets confused (bugid 4218508) if these have - * the high bit set, as do xfs filesystems without the - * "bigtime" feature. So just clear the high bits. If this is - * ever changed to use different attrs for storing the - * verifier, then do_open_lookup() will also need to be fixed - * accordingly. - */ - v_mtime = verifier[0]&0x7fffffff; - v_atime = verifier[1]&0x7fffffff; - } - - if (d_really_is_positive(dchild)) { - err = 0; - - switch (createmode) { - case NFS3_CREATE_UNCHECKED: - if (! d_is_reg(dchild)) - goto out; - else if (truncp) { - /* in nfsv4, we need to treat this case a little - * differently. we don't want to truncate the - * file now; this would be wrong if the OPEN - * fails for some other reason. furthermore, - * if the size is nonzero, we should ignore it - * according to spec! - */ - *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; - } - else { - iap->ia_valid &= ATTR_SIZE; - goto set_attr; - } - break; - case NFS3_CREATE_EXCLUSIVE: - if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime - && d_inode(dchild)->i_atime.tv_sec == v_atime - && d_inode(dchild)->i_size == 0 ) { - if (created) - *created = true; - break; - } - fallthrough; - case NFS4_CREATE_EXCLUSIVE4_1: - if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime - && d_inode(dchild)->i_atime.tv_sec == v_atime - && d_inode(dchild)->i_size == 0 ) { - if (created) - *created = true; - goto set_attr; - } - fallthrough; - case NFS3_CREATE_GUARDED: - err = nfserr_exist; - } - fh_drop_write(fhp); - goto out; - } - - if (!IS_POSIXACL(dirp)) - iap->ia_mode &= ~current_umask(); - - host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); - if (host_err < 0) { - fh_drop_write(fhp); - goto out_nfserr; - } - if (created) - *created = true; - - nfsd_check_ignore_resizing(iap); - - if (nfsd_create_is_exclusive(createmode)) { - /* Cram the verifier into atime/mtime */ - iap->ia_valid = ATTR_MTIME|ATTR_ATIME - | ATTR_MTIME_SET|ATTR_ATIME_SET; - /* XXX someone who knows this better please fix it for nsec */ - iap->ia_mtime.tv_sec = v_mtime; - iap->ia_atime.tv_sec = v_atime; - iap->ia_mtime.tv_nsec = 0; - iap->ia_atime.tv_nsec = 0; - } - - set_attr: - err = nfsd_create_setattr(rqstp, resfhp, iap); - - /* - * nfsd_create_setattr already committed the child - * (and possibly also the parent). - */ - if (!err) - err = nfserrno(commit_metadata(fhp)); - - /* - * Update the filehandle to get the new inode info. - */ - if (!err) - err = fh_update(resfhp); - - out: - fh_unlock(fhp); - if (dchild && !IS_ERR(dchild)) - dput(dchild); - fh_drop_write(fhp); - return err; - - out_nfserr: - err = nfserrno(host_err); - goto out; -} - -/* * Read a symlink. On entry, *lenp must contain the maximum path length that * fits into the buffer. On return, it contains the true length. * N.B. After this call fhp needs an fh_put diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index ccb87b2864f6..26347d76f44a 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -69,10 +69,8 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, int type, dev_t rdev, struct svc_fh *res); __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); -__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct iattr *attrs, - struct svc_fh *res, int createmode, - u32 *verifier, bool *truncp, bool *created); +__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct iattr *iap); __be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, u64 offset, u32 count, __be32 *verf); #ifdef CONFIG_NFSD_V4 @@ -88,7 +86,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, int, struct file **); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, @@ -159,10 +157,4 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) AT_STATX_SYNC_AS_STAT)); } -static inline int nfsd_create_is_exclusive(int createmode) -{ - return createmode == NFS3_CREATE_EXCLUSIVE - || createmode == NFS4_CREATE_EXCLUSIVE4_1; -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 846ab6df9d48..7b744011f2d3 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,6 +273,7 @@ struct nfsd4_open { bool op_truncate; /* used during processing */ bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ + struct file *op_filp; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_clnt_odstate *op_odstate; /* used during processing */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index e1392a9b8ceb..a8abe2296514 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1772,11 +1772,11 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, last_vcn = -1; do { VCN vcn; - pgoff_t idx, start_idx; + pgoff_t start_idx; unsigned ofs, do_pages, u; size_t copied; - start_idx = idx = pos >> PAGE_SHIFT; + start_idx = pos >> PAGE_SHIFT; ofs = pos & ~PAGE_MASK; bytes = PAGE_SIZE - ofs; do_pages = 1; diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 5781b9e8e3d8..0c6de6287737 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -668,9 +668,11 @@ static u32 format_size_gb(const u64 bytes, u32 *mb) static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) { - return boot->sectors_per_clusters <= 0x80 - ? boot->sectors_per_clusters - : (1u << (0 - boot->sectors_per_clusters)); + if (boot->sectors_per_clusters <= 0x80) + return boot->sectors_per_clusters; + if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ + return 1U << (0 - boot->sectors_per_clusters); + return -EINVAL; } /* @@ -713,6 +715,8 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, /* cluster size: 512, 1K, 2K, 4K, ... 2M */ sct_per_clst = true_sectors_per_clst(boot); + if ((int)sct_per_clst < 0) + goto out; if (!is_power_of_2(sct_per_clst)) goto out; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index d442cf5dda8a..be5e9ed7da8d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -541,7 +541,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) struct debug_lockres *dl = m->private; struct dlm_ctxt *dlm = dl->dl_ctxt; struct dlm_lock_resource *oldres = dl->dl_res; - struct dlm_lock_resource *res = NULL; + struct dlm_lock_resource *res = NULL, *iter; struct list_head *track_list; spin_lock(&dlm->track_lock); @@ -556,11 +556,11 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) } } - list_for_each_entry(res, track_list, tracking) { - if (&res->tracking == &dlm->tracking_list) - res = NULL; - else - dlm_lockres_get(res); + list_for_each_entry(iter, track_list, tracking) { + if (&iter->tracking != &dlm->tracking_list) { + dlm_lockres_get(iter); + res = iter; + } break; } spin_unlock(&dlm->track_lock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 61103b2d69fb..7318e4794ef9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -392,9 +392,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock = NULL, *iter; enum dlm_status status = DLM_NORMAL; - int found = 0, i; + int i; struct dlm_lockstatus *lksb = NULL; int ignore; u32 flags; @@ -437,7 +437,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } queue=&res->granted; - found = 0; spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { spin_unlock(&res->spinlock); @@ -461,21 +460,21 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } for (i=0; i<3; i++) { - list_for_each_entry(lock, queue, list) { - if (lock->ml.cookie == unlock->cookie && - lock->ml.node == unlock->node_idx) { - dlm_lock_get(lock); - found = 1; + list_for_each_entry(iter, queue, list) { + if (iter->ml.cookie == unlock->cookie && + iter->ml.node == unlock->node_idx) { + dlm_lock_get(iter); + lock = iter; break; } } - if (found) + if (lock) break; /* scan granted -> converting -> blocked queues */ queue++; } spin_unlock(&res->spinlock); - if (!found) { + if (!lock) { status = DLM_IVLOCKID; goto not_found; } @@ -505,7 +504,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_kick_thread(dlm, res); not_found: - if (!found) + if (!lock) mlog(ML_ERROR, "failed to find lock to unlock! " "cookie=%u:%llu\n", dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 29f183a15798..617c92e7b925 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -433,6 +433,11 @@ again: } spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + status = -EAGAIN; + goto bail; + } /* We only compare against the currently granted level * here. If the lock is blocked waiting on a downconvert, @@ -595,7 +600,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); - return 0; + goto bail; } lockres->l_flags |= USER_LOCK_IN_TEARDOWN; @@ -609,22 +614,30 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) } if (lockres->l_ro_holders || lockres->l_ex_holders) { + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + /* + * lock is never requested, leave USER_LOCK_IN_TEARDOWN set + * to avoid new lock request coming in. + */ spin_unlock(&lockres->l_lock); goto bail; } - lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 5739dc301569..bb116c39b581 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -125,6 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = osb->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -171,11 +172,10 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ - if (osb->journal) { + if (journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); - journal_t *journal = osb->journal->j_journal; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1887a2708709..fa87d89cf754 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -810,22 +810,20 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) +/* + * alloc & initialize skeleton for journal structure. + * ocfs2_journal_init() will make fs have journal ability. + */ +int ocfs2_journal_alloc(struct ocfs2_super *osb) { - int status = -1; - struct inode *inode = NULL; /* the journal inode */ - journal_t *j_journal = NULL; - struct ocfs2_journal *journal = NULL; - struct ocfs2_dinode *di = NULL; - struct buffer_head *bh = NULL; - int inode_lock = 0; + int status = 0; + struct ocfs2_journal *journal; - /* initialize our journal structure */ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); if (!journal) { mlog(ML_ERROR, "unable to alloc journal\n"); status = -ENOMEM; - goto done; + goto bail; } osb->journal = journal; journal->j_osb = osb; @@ -839,6 +837,21 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; +bail: + return status; +} + +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t *j_journal = NULL; + struct ocfs2_journal *journal = osb->journal; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + int inode_lock = 0; + + BUG_ON(!journal); /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, osb->slot_num); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 8dcb2f2cadbc..969d0aa28718 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -154,6 +154,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. * + * ocfs2_journal_alloc - Initialize skeleton for journal structure. * ocfs2_journal_init - Initialize journal structures in the OSB. * ocfs2_journal_load - Load the given journal off disk. Replay it if * there's transactions still in there. @@ -167,6 +168,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. */ void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_alloc(struct ocfs2_super *osb); int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b1a8b046f4c2..5022b3e9bfcd 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -921,19 +921,19 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; - struct ocfs2_quota_chunk *chunk; + struct ocfs2_quota_chunk *chunk = NULL, *iter; struct ocfs2_local_disk_chunk *dchunk; int found = 0, len; - list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) { dchunk = (struct ocfs2_local_disk_chunk *) - chunk->qc_headerbh->b_data; + iter->qc_headerbh->b_data; if (le32_to_cpu(dchunk->dqc_free) > 0) { - found = 1; + chunk = iter; break; } } - if (!found) + if (!chunk) return NULL; if (chunk->qc_num < oinfo->dqi_chunks - 1) { diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 769e466887b0..a9d1296d736d 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -198,7 +198,7 @@ void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, resv->r_flags |= flags; } -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap) { memset(resmap, 0, sizeof(*resmap)); @@ -207,8 +207,6 @@ int ocfs2_resmap_init(struct ocfs2_super *osb, resmap->m_reservations = RB_ROOT; /* m_bitmap_len is initialized to zero by the above memset. */ INIT_LIST_HEAD(&resmap->m_lru); - - return 0; } static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 677c50663595..ec8101ef5717 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -73,15 +73,10 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @osb: struct ocfs2_super to be saved in resmap * @resmap: struct ocfs2_reservation_map to initialize - * @obj: unused for now - * @ops: unused for now - * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) - * - * Only possible return value other than '0' is -ENOMEM for failure to - * allocation mirror bitmap. */ -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap); /** diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 477cdf94122e..f7298816d8d9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -989,28 +989,27 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { status = -EINVAL; - goto read_super_error; + goto out; } /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, §or_size, &stats); if (status < 0) { mlog(ML_ERROR, "superblock probe failed!\n"); - goto read_super_error; + goto out; } status = ocfs2_initialize_super(sb, bh, sector_size, &stats); - osb = OCFS2_SB(sb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } brelse(bh); bh = NULL; + if (status < 0) + goto out; + + osb = OCFS2_SB(sb); if (!ocfs2_check_set_options(sb, &parsed_options)) { status = -EINVAL; - goto read_super_error; + goto out_super; } osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; @@ -1027,7 +1026,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) - goto read_super_error; + goto out_super; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -1041,7 +1040,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -EACCES; mlog(ML_ERROR, "Readonly device detected but readonly " "mount was not specified.\n"); - goto read_super_error; + goto out_super; } /* You should not be able to start a local heartbeat @@ -1050,7 +1049,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -EROFS; mlog(ML_ERROR, "Local heartbeat specified on readonly " "device.\n"); - goto read_super_error; + goto out_super; } status = ocfs2_check_journals_nolocks(osb); @@ -1059,9 +1058,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) mlog(ML_ERROR, "Recovery required on readonly " "file system, but write access is " "unavailable.\n"); - else - mlog_errno(status); - goto read_super_error; + goto out_super; } ocfs2_set_ro_flag(osb, 1); @@ -1077,10 +1074,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } status = ocfs2_verify_heartbeat(osb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } + if (status < 0) + goto out_super; osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); @@ -1094,15 +1089,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = ocfs2_mount_volume(sb); if (status < 0) - goto read_super_error; + goto out_debugfs; if (osb->root_inode) inode = igrab(osb->root_inode); if (!inode) { status = -EIO; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, @@ -1110,7 +1104,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (!osb->osb_dev_kset) { status = -ENOMEM; mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); - goto read_super_error; + goto out_dismount; } /* Create filecheck sysfs related directories/files at @@ -1119,14 +1113,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -ENOMEM; mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); - goto read_super_error; + goto out_dismount; } root = d_make_root(inode); if (!root) { status = -ENOMEM; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } sb->s_root = root; @@ -1178,17 +1171,21 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) return status; -read_super_error: - brelse(bh); - - if (status) - mlog_errno(status); +out_dismount: + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + goto out; - if (osb) { - atomic_set(&osb->vol_state, VOLUME_DISABLED); - wake_up(&osb->osb_mount_event); - ocfs2_dismount_volume(sb, 1); - } +out_debugfs: + debugfs_remove_recursive(osb->osb_debug_root); +out_super: + ocfs2_release_system_inodes(osb); + kfree(osb->recovery_map); + ocfs2_delete_osb(osb); + kfree(osb); +out: + mlog_errno(status); return status; } @@ -1803,11 +1800,10 @@ static int ocfs2_get_sector(struct super_block *sb, static int ocfs2_mount_volume(struct super_block *sb) { int status = 0; - int unlock_super = 0; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_is_hard_readonly(osb)) - goto leave; + goto out; mutex_init(&osb->obs_trim_fs_mutex); @@ -1817,44 +1813,56 @@ static int ocfs2_mount_volume(struct super_block *sb) if (status == -EBADR && ocfs2_userspace_stack(osb)) mlog(ML_ERROR, "couldn't mount because cluster name on" " disk does not match the running cluster name.\n"); - goto leave; + goto out; } status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); - goto leave; + goto out_dlm; } - unlock_super = 1; /* This will load up the node map and add ourselves to it. */ status = ocfs2_find_slot(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } /* load all node-local system inodes */ status = ocfs2_init_local_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } status = ocfs2_check_volume(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_system_inodes; } status = ocfs2_truncate_log_init(osb); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out_system_inodes; + } -leave: - if (unlock_super) - ocfs2_super_unlock(osb, 1); + ocfs2_super_unlock(osb, 1); + return 0; +out_system_inodes: + if (osb->local_alloc_state == OCFS2_LA_ENABLED) + ocfs2_shutdown_local_alloc(osb); + ocfs2_release_system_inodes(osb); + /* before journal shutdown, we should release slot_info */ + ocfs2_free_slot_info(osb); + ocfs2_journal_shutdown(osb); +out_super_lock: + ocfs2_super_unlock(osb, 1); +out_dlm: + ocfs2_dlm_shutdown(osb, 0); +out: return status; } @@ -2022,7 +2030,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out; } sb->s_fs_info = osb; @@ -2083,7 +2091,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Invalid number of node slots (%u)\n", osb->max_slots); status = -EINVAL; - goto bail; + goto out; } ocfs2_orphan_scan_init(osb); @@ -2092,7 +2100,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); mlog_errno(status); - goto bail; + goto out; } init_waitqueue_head(&osb->checkpoint_event); @@ -2110,17 +2118,13 @@ static int ocfs2_initialize_super(struct super_block *sb, init_waitqueue_head(&osb->osb_mount_event); - status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); - if (status) { - mlog_errno(status); - goto bail; - } + ocfs2_resmap_init(osb, &osb->osb_la_resmap); osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); status = -ENOMEM; - goto bail; + goto out_recovery_map; } osb->slot_recovery_generations = @@ -2129,7 +2133,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->slot_recovery_generations) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_vol_label; } init_waitqueue_head(&osb->osb_wipe_event); @@ -2139,7 +2143,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_orphan_wipes) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_slot_recovery_gen; } osb->osb_rf_lock_tree = RB_ROOT; @@ -2155,13 +2159,13 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "couldn't mount because of unsupported " "optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { mlog(ML_ERROR, "couldn't mount RDWR because of " "unsupported optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (ocfs2_clusterinfo_valid(osb)) { @@ -2182,7 +2186,7 @@ static int ocfs2_initialize_super(struct super_block *sb, "cluster stack label (%s) \n", osb->osb_cluster_stack); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, @@ -2195,6 +2199,15 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); + /* + * FIXME + * This should be done in ocfs2_journal_init(), but any inode + * writes back operation will cause the filesystem to crash. + */ + status = ocfs2_journal_alloc(osb); + if (status < 0) + goto out_orphan_wipes; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2208,7 +2221,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", osb->s_clustersize); status = -EINVAL; - goto bail; + goto out_journal; } total_blocks = ocfs2_clusters_to_blocks(osb->sb, @@ -2220,14 +2233,14 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume too large " "to mount safely on this system"); status = -EFBIG; - goto bail; + goto out_journal; } if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid))) { mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); status = -ENOMEM; - goto bail; + goto out_journal; } strlcpy(osb->vol_label, di->id2.i_super.s_label, @@ -2247,7 +2260,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_dlm_debug) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_uuid_str; } atomic_set(&osb->vol_state, VOLUME_INIT); @@ -2256,7 +2269,7 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_global_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_dlm_out; } /* @@ -2267,7 +2280,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!inode) { status = -EINVAL; mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; @@ -2280,16 +2293,39 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_slot_info(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { status = -ENOMEM; mlog_errno(status); + goto out_slot_info; } -bail: + return status; + +out_slot_info: + ocfs2_free_slot_info(osb); +out_system_inodes: + ocfs2_release_system_inodes(osb); +out_dlm_out: + ocfs2_put_dlm_debug(osb->osb_dlm_debug); +out_uuid_str: + kfree(osb->uuid_str); +out_journal: + kfree(osb->journal); +out_orphan_wipes: + kfree(osb->osb_orphan_wipes); +out_slot_recovery_gen: + kfree(osb->slot_recovery_generations); +out_vol_label: + kfree(osb->vol_label); +out_recovery_map: + kfree(osb->recovery_map); +out: + kfree(osb); + sb->s_fs_info = NULL; return status; } @@ -2483,6 +2519,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the middle of ocfs2_initialize_super(), + * we free it here. + */ + kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); diff --git a/fs/open.c b/fs/open.c index 1315253e0247..be849dcca032 100644 --- a/fs/open.c +++ b/fs/open.c @@ -834,16 +834,15 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) + f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page @@ -981,6 +980,48 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * dentry_create - Create and open a file + * @path: path to create + * @flags: O_ flags + * @mode: mode bits for new file + * @cred: credentials to use + * + * Caller must hold the parent directory's lock, and have prepared + * a negative dentry, placed in @path->dentry, for the new file. + * + * Caller sets @path->mnt to the vfsmount of the filesystem where + * the new file is to be created. The parent directory and the + * negative dentry must reside on the same filesystem instance. + * + * On success, returns a "struct file *". Otherwise a ERR_PTR + * is returned. + */ +struct file *dentry_create(const struct path *path, int flags, umode_t mode, + const struct cred *cred) +{ + struct file *f; + int error; + + validate_creds(cred); + f = alloc_empty_file(flags, cred); + if (IS_ERR(f)) + return f; + + error = vfs_create(mnt_user_ns(path->mnt), + d_inode(path->dentry->d_parent), + path->dentry, mode, true); + if (!error) + error = vfs_open(path, f); + + if (unlikely(error)) { + fput(f); + return ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL(dentry_create); + struct file *open_with_fake_path(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fa125feed0ff..9d69b4dbb8c4 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -82,11 +82,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags) if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; - if (flags & O_DIRECT) { - if (!file->f_mapping->a_ops || - !file->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; if (file->f_op->check_flags) { err = file->f_op->check_flags(flags); @@ -306,8 +303,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) ret = -EINVAL; if (iocb->ki_flags & IOCB_DIRECT && - (!real.file->f_mapping->a_ops || - !real.file->f_mapping->a_ops->direct_IO)) + !(real.file->f_mode & FMODE_CAN_ODIRECT)) goto out_fdput; old_cred = ovl_override_creds(file_inode(file)->i_sb); @@ -367,8 +363,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) ret = -EINVAL; if (iocb->ki_flags & IOCB_DIRECT && - (!real.file->f_mapping->a_ops || - !real.file->f_mapping->a_ops->direct_IO)) + !(real.file->f_mode & FMODE_CAN_ODIRECT)) goto out_fdput; if (!ovl_should_sync(OVL_FS(inode->i_sb))) diff --git a/fs/pipe.c b/fs/pipe.c index e140ea150bbb..74ae9fafd25a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -653,7 +653,7 @@ pipe_poll(struct file *filp, poll_table *wait) unsigned int head, tail; /* Epoll has some historical nasty semantics, this enables them */ - pipe->poll_usage = 1; + WRITE_ONCE(pipe->poll_usage, true); /* * Reading pipe state only -- no need for acquiring the semaphore. @@ -1245,30 +1245,33 @@ unsigned int round_pipe_size(unsigned long size) /* * Resize the pipe ring to a number of slots. + * + * Note the pipe can be reduced in capacity, but only if the current + * occupancy doesn't exceed nr_slots; if it does, EBUSY will be + * returned instead. */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) { struct pipe_buffer *bufs; unsigned int head, tail, mask, n; - /* - * We can shrink the pipe, if arg is greater than the ring occupancy. - * Since we don't expect a lot of shrink+grow operations, just free and - * allocate again like we would do for growing. If the pipe currently - * contains more buffers than arg, then return busy. - */ - mask = pipe->ring_size - 1; - head = pipe->head; - tail = pipe->tail; - n = pipe_occupancy(pipe->head, pipe->tail); - if (nr_slots < n) - return -EBUSY; - bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; + spin_lock_irq(&pipe->rd_wait.lock); + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + + n = pipe_occupancy(head, tail); + if (nr_slots < n) { + spin_unlock_irq(&pipe->rd_wait.lock); + kfree(bufs); + return -EBUSY; + } + /* * The pipe array wraps around, so just start the new one at zero * and adjust the indices. @@ -1300,6 +1303,8 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) pipe->tail = tail; pipe->head = head; + spin_unlock_irq(&pipe->rd_wait.lock); + /* This might have made more room for writers */ wake_up_interruptible(&pipe->wr_wait); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index c1031843cc6a..8dfa36a99c74 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3154,6 +3154,22 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_LIVEPATCH */ +#ifdef CONFIG_KSM +static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "%lu\n", mm->ksm_merging_pages); + mmput(mm); + } + + return 0; +} +#endif /* CONFIG_KSM */ + #ifdef CONFIG_STACKLEAK_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) @@ -3285,6 +3301,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3618,6 +3637,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f2132407e133..587b91d9d998 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -448,6 +448,9 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, proc_set_user(ent, (*parent)->uid, (*parent)->gid); ent->proc_dops = &proc_misc_dentry_ops; + /* Revalidate everything under /proc/${pid}/net */ + if ((*parent)->proc_dops == &proc_net_dentry_ops) + pde_force_lookup(ent); out: return ent; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 982e694aae77..dff921f7ca33 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -479,10 +479,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * the previous entry, search for a matching entry. */ if (!m || start < m->addr || start >= m->addr + m->size) { - list_for_each_entry(m, &kclist_head, list) { - if (start >= m->addr && - start < m->addr + m->size) + struct kcore_list *iter; + + m = NULL; + list_for_each_entry(iter, &kclist_head, list) { + if (start >= iter->addr && + start < iter->addr + iter->size) { + m = iter; break; + } } } @@ -492,12 +497,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) page_offline_freeze(); } - if (&m->list == &kclist_head) { + if (!m) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; } - m = NULL; /* skip the list anchor */ goto skip; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6fa761c9cc78..6e89f0e2fd20 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -86,6 +86,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); +#ifdef CONFIG_ZSWAP + seq_printf(m, "Zswap: %8lu kB\n", + (unsigned long)(zswap_pool_total_size >> 10)); + seq_printf(m, "Zswapped: %8lu kB\n", + (unsigned long)atomic_read(&zswap_stored_pages) << + (PAGE_SHIFT - 10)); +#endif show_val_kb(m, "Dirty: ", global_node_page_state(NR_FILE_DIRTY)); show_val_kb(m, "Writeback: ", diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index e1cfeda397f3..913e5acefbb6 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -376,6 +376,9 @@ static __net_init int proc_net_ns_init(struct net *net) proc_set_user(netd, uid, gid); + /* Seed dentry revalidation for /proc/${pid}/net */ + pde_force_lookup(netd); + err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c29f39c01a9a..021e83fe831f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -19,6 +19,9 @@ #include <linux/kmemleak.h> #include "internal.h" +#define list_for_each_table_entry(entry, table) \ + for ((entry) = (table); (entry)->procname; (entry)++) + static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; @@ -215,15 +218,19 @@ static void init_header(struct ctl_table_header *head, INIT_HLIST_HEAD(&head->inodes); if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) + + list_for_each_table_entry(entry, table) { node->header = head; + node++; + } } } static void erase_header(struct ctl_table_header *head) { struct ctl_table *entry; - for (entry = head->ctl_table; entry->procname; entry++) + + list_for_each_table_entry(entry, head->ctl_table) erase_entry(head, entry); } @@ -248,7 +255,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) err = insert_links(header); if (err) goto fail_links; - for (entry = header->ctl_table; entry->procname; entry++) { + list_for_each_table_entry(entry, header->ctl_table) { err = insert_entry(header, entry); if (err) goto fail; @@ -978,7 +985,6 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, table = (struct ctl_table *)(node + 1); new_name = (char *)(table + 2); memcpy(new_name, name, namelen); - new_name[namelen] = '\0'; table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; init_header(&new->header, set->dir.header.root, set, node, table); @@ -1130,35 +1136,36 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) static int sysctl_check_table(const char *path, struct ctl_table *table) { + struct ctl_table *entry; int err = 0; - for (; table->procname; table++) { - if (table->child) - err |= sysctl_err(path, table, "Not a file"); + list_for_each_table_entry(entry, table) { + if (entry->child) + err |= sysctl_err(path, entry, "Not a file"); - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_douintvec) || - (table->proc_handler == proc_douintvec_minmax) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dou8vec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - err |= sysctl_err(path, table, "No data"); - if (!table->maxlen) - err |= sysctl_err(path, table, "No maxlen"); + if ((entry->proc_handler == proc_dostring) || + (entry->proc_handler == proc_dointvec) || + (entry->proc_handler == proc_douintvec) || + (entry->proc_handler == proc_douintvec_minmax) || + (entry->proc_handler == proc_dointvec_minmax) || + (entry->proc_handler == proc_dou8vec_minmax) || + (entry->proc_handler == proc_dointvec_jiffies) || + (entry->proc_handler == proc_dointvec_userhz_jiffies) || + (entry->proc_handler == proc_dointvec_ms_jiffies) || + (entry->proc_handler == proc_doulongvec_minmax) || + (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!entry->data) + err |= sysctl_err(path, entry, "No data"); + if (!entry->maxlen) + err |= sysctl_err(path, entry, "No maxlen"); else - err |= sysctl_check_table_array(path, table); + err |= sysctl_check_table_array(path, entry); } - if (!table->proc_handler) - err |= sysctl_err(path, table, "No proc_handler"); + if (!entry->proc_handler) + err |= sysctl_err(path, entry, "No proc_handler"); - if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) - err |= sysctl_err(path, table, "bogus .mode 0%o", - table->mode); + if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode) + err |= sysctl_err(path, entry, "bogus .mode 0%o", + entry->mode); } return err; } @@ -1174,7 +1181,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table name_bytes = 0; nr_entries = 0; - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { nr_entries++; name_bytes += strlen(entry->procname) + 1; } @@ -1191,14 +1198,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table node = (struct ctl_node *)(links + 1); link_table = (struct ctl_table *)(node + nr_entries); link_name = (char *)&link_table[nr_entries + 1]; + link = link_table; - for (link = link_table, entry = table; entry->procname; link++, entry++) { + list_for_each_table_entry(entry, table) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; link->data = link_root; link_name += len; + link++; } init_header(links, dir->header.root, dir->header.set, node, link_table); links->nreg = nr_entries; @@ -1213,7 +1222,7 @@ static bool get_links(struct ctl_dir *dir, struct ctl_table *entry, *link; /* Are there links available for every entry in table? */ - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { const char *procname = entry->procname; link = find_entry(&head, dir, procname, strlen(procname)); if (!link) @@ -1226,7 +1235,7 @@ static bool get_links(struct ctl_dir *dir, } /* The checks passed. Increase the registration count on the links */ - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { const char *procname = entry->procname; link = find_entry(&head, dir, procname, strlen(procname)); head->nreg++; @@ -1329,7 +1338,7 @@ struct ctl_table_header *__register_sysctl_table( struct ctl_node *node; int nr_entries = 0; - for (entry = table; entry->procname; entry++) + list_for_each_table_entry(entry, table) nr_entries++; header = kzalloc(sizeof(struct ctl_table_header) + @@ -1456,7 +1465,7 @@ static int count_subheaders(struct ctl_table *table) if (!table || !table->procname) return 1; - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { if (entry->child) nr_subheaders += count_subheaders(entry->child); else @@ -1475,7 +1484,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, int nr_dirs = 0; int err = -ENOMEM; - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { if (entry->child) nr_dirs++; else @@ -1492,7 +1501,9 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, goto out; ctl_table_arg = files; - for (new = files, entry = table; entry->procname; entry++) { + new = files; + + list_for_each_table_entry(entry, table) { if (entry->child) continue; *new = *entry; @@ -1516,7 +1527,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, } /* Recurse into the subdirectories. */ - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { char *child_pos; if (!entry->child) @@ -1670,7 +1681,7 @@ static void put_links(struct ctl_table_header *header) if (IS_ERR(core_parent)) return; - for (entry = header->ctl_table; entry->procname; entry++) { + list_for_each_table_entry(entry, header->ctl_table) { struct ctl_table_header *link_head; struct ctl_table *link; const char *name = entry->procname; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f46060eb91b5..2d04e3470d4c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1421,6 +1421,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); + if (pte_marker_entry_uffd_wp(entry)) + flags |= PM_UFFD_WP; } if (page && !PageAnon(page)) @@ -1556,10 +1558,15 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, if (page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; + if (huge_pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; + flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + ((addr & ~hmask) >> PAGE_SHIFT); + } else if (pte_swp_uffd_wp_any(pte)) { + flags |= PM_UFFD_WP; } for (; addr != end; addr += PAGE_SIZE) { @@ -1873,8 +1880,6 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, return 0; page = pte_page(huge_pte); - if (!page) - return 0; md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 6f1b8ddc6f7a..4eaeb645e759 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -26,6 +26,7 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/uio.h> #include <linux/cc_platform.h> #include <asm/io.h> #include "internal.h" @@ -128,9 +129,8 @@ static int open_vmcore(struct inode *inode, struct file *file) } /* Reads a page from the oldmem device from given offset. */ -ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) { unsigned long pfn, offset; size_t nr_bytes; @@ -152,29 +152,23 @@ ssize_t read_from_oldmem(char *buf, size_t count, /* If pfn is not ram, return zeros for sparse dump files */ if (!pfn_is_ram(pfn)) { - tmp = 0; - if (!userbuf) - memset(buf, 0, nr_bytes); - else if (clear_user(buf, nr_bytes)) - tmp = -EFAULT; + tmp = iov_iter_zero(nr_bytes, iter); } else { if (encrypted) - tmp = copy_oldmem_page_encrypted(pfn, buf, + tmp = copy_oldmem_page_encrypted(iter, pfn, nr_bytes, - offset, - userbuf); + offset); else - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); + tmp = copy_oldmem_page(iter, pfn, nr_bytes, + offset); } - if (tmp < 0) { + if (tmp < nr_bytes) { srcu_read_unlock(&vmcore_cb_srcu, idx); - return tmp; + return -EFAULT; } *ppos += nr_bytes; count -= nr_bytes; - buf += nr_bytes; read += nr_bytes; ++pfn; offset = 0; @@ -203,7 +197,12 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, false); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, false); } /* @@ -211,7 +210,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); } /* @@ -228,29 +233,14 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, /* * Architectures which support memory encryption override this. */ -ssize_t __weak -copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter, + unsigned long pfn, size_t csize, unsigned long offset) { - return copy_oldmem_page(pfn, buf, csize, offset, userbuf); -} - -/* - * Copy to either kernel or user space - */ -static int copy_to(void *target, void *src, size_t size, int userbuf) -{ - if (userbuf) { - if (copy_to_user((char __user *) target, src, size)) - return -EFAULT; - } else { - memcpy(target, src, size); - } - return 0; + return copy_oldmem_page(iter, pfn, csize, offset); } #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP -static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) +static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) { struct vmcoredd_node *dump; u64 offset = 0; @@ -263,14 +253,13 @@ static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (copy_to(dst, buf, tsz, userbuf)) { + if (copy_to_iter(buf, tsz, iter) < tsz) { ret = -EFAULT; goto out_unlock; } size -= tsz; start += tsz; - dst += tsz; /* Leave now if buffer filled already */ if (!size) @@ -326,33 +315,28 @@ out_unlock: /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ -static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, - int userbuf) +static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) { ssize_t acc = 0, tmp; size_t tsz; u64 start; struct vmcore *m = NULL; - if (buflen == 0 || *fpos >= vmcore_size) + if (!iov_iter_count(iter) || *fpos >= vmcore_size) return 0; - /* trim buflen to not go beyond EOF */ - if (buflen > vmcore_size - *fpos) - buflen = vmcore_size - *fpos; + iov_iter_truncate(iter, vmcore_size - *fpos); /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); - if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) + tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter)); + if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } @@ -373,35 +357,32 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, /* Read device dumps */ if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - - (size_t)*fpos, buflen); + (size_t)*fpos, iov_iter_count(iter)); start = *fpos - elfcorebuf_sz; - if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf)) + if (vmcoredd_copy_dumps(iter, start, tsz)) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (!buflen) + if (!iov_iter_count(iter)) return acc; } #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Read remaining elf notes */ - tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, + iov_iter_count(iter)); kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; - if (copy_to(buffer, kaddr, tsz, userbuf)) + if (copy_to_iter(kaddr, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } @@ -409,19 +390,17 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, if (*fpos < m->offset + m->size) { tsz = (size_t)min_t(unsigned long long, m->offset + m->size - *fpos, - buflen); + iov_iter_count(iter)); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, - userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + tmp = read_from_oldmem(iter, tsz, &start, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); if (tmp < 0) return tmp; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } } @@ -429,15 +408,14 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, return acc; } -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) { - return __read_vmcore((__force char *) buffer, buflen, fpos, 1); + return __read_vmcore(iter, &iocb->ki_pos); } /* * The vmcore fault handler uses the page cache and fills data using the - * standard __vmcore_read() function. + * standard __read_vmcore() function. * * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). @@ -447,9 +425,10 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) #ifdef CONFIG_S390 struct address_space *mapping = vmf->vma->vm_file->f_mapping; pgoff_t index = vmf->pgoff; + struct iov_iter iter; + struct kvec kvec; struct page *page; loff_t offset; - char *buf; int rc; page = find_or_create_page(mapping, index, GFP_KERNEL); @@ -457,8 +436,11 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) return VM_FAULT_OOM; if (!PageUptodate(page)) { offset = (loff_t) index << PAGE_SHIFT; - buf = __va((page_to_pfn(page) << PAGE_SHIFT)); - rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + kvec.iov_base = page_address(page); + kvec.iov_len = PAGE_SIZE; + iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE); + + rc = __read_vmcore(&iter, &offset); if (rc < 0) { unlock_page(page); put_page(page); @@ -708,7 +690,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) static const struct proc_ops vmcore_proc_ops = { .proc_open = open_vmcore, - .proc_read = read_vmcore, + .proc_read_iter = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, }; diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 0507aecfc669..2cab413fffee 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -1244,6 +1244,106 @@ struct file_zero_data_information { __le64 BeyondFinalZero; } __packed; +/* See MS-FSCC 2.3.7 */ +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + +/* See MS-FSCC 2.3.8 */ +#define DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC 0x00000001 +struct duplicate_extents_to_file_ex { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ + __le32 Flags; + __le32 Reserved; +} __packed; + + +/* See MS-FSCC 2.3.20 */ +struct fsctl_get_integrity_information_rsp { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; + __le32 ChecksumChunkSizeInBytes; + __le32 ClusterSizeInBytes; +} __packed; + +/* See MS-FSCC 2.3.55 */ +struct fsctl_query_file_regions_req { + __le64 FileOffset; + __le64 Length; + __le32 DesiredUsage; + __le32 Reserved; +} __packed; + +/* DesiredUsage flags see MS-FSCC 2.3.56.1 */ +#define FILE_USAGE_INVALID_RANGE 0x00000000 +#define FILE_USAGE_VALID_CACHED_DATA 0x00000001 +#define FILE_USAGE_NONCACHED_DATA 0x00000002 + +struct file_region_info { + __le64 FileOffset; + __le64 Length; + __le32 DesiredUsage; + __le32 Reserved; +} __packed; + +/* See MS-FSCC 2.3.56 */ +struct fsctl_query_file_region_rsp { + __le32 Flags; + __le32 TotalRegionEntryCount; + __le32 RegionEntryCount; + __u32 Reserved; + struct file_region_info Regions[]; +} __packed; + +/* See MS-FSCC 2.3.58 */ +struct fsctl_query_on_disk_vol_info_rsp { + __le64 DirectoryCount; + __le64 FileCount; + __le16 FsFormatMajVersion; + __le16 FsFormatMinVersion; + __u8 FsFormatName[24]; + __le64 FormatTime; + __le64 LastUpdateTime; + __u8 CopyrightInfo[68]; + __u8 AbstractInfo[68]; + __u8 FormatImplInfo[68]; + __u8 LastModifyImplInfo[68]; +} __packed; + +/* See MS-FSCC 2.3.73 */ +struct fsctl_set_integrity_information_req { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; +} __packed; + +/* See MS-FSCC 2.3.75 */ +struct fsctl_set_integrity_info_ex_req { + __u8 EnableIntegrity; + __u8 KeepState; + __u16 Reserved; + __le32 Flags; + __u8 Version; + __u8 Reserved2[7]; +} __packed; + +/* Integrity ChecksumAlgorithm choices for above */ +#define CHECKSUM_TYPE_NONE 0x0000 +#define CHECKSUM_TYPE_CRC64 0x0002 +#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */ + +/* Integrity flags for above */ +#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 + /* Reparse structures - see MS-FSCC 2.1.2 */ /* struct fsctl_reparse_info_req is empty, only response structs (see below) */ @@ -1304,13 +1404,6 @@ struct validate_negotiate_info_rsp { __le16 Dialect; /* Dialect in use for the connection */ } __packed; -struct duplicate_extents_to_file { - __u64 PersistentFileHandle; /* source file handle, opaque endianness */ - __u64 VolatileFileHandle; - __le64 SourceFileOffset; - __le64 TargetFileOffset; - __le64 ByteCount; /* Bytes to be copied */ -} __packed; /* Possible InfoType values */ #define SMB2_O_INFO_FILE 0x01 @@ -1419,6 +1512,7 @@ struct smb2_query_info_rsp { * PDU query infolevel structure definitions */ +/* See MS-FSCC 2.3.52 */ struct file_allocated_range_buffer { __le64 file_offset; __le64 length; diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h index d51939c43ad7..edd7fc2a7921 100644 --- a/fs/smbfs_common/smbfsctl.h +++ b/fs/smbfs_common/smbfsctl.h @@ -88,21 +88,27 @@ #define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */ #define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */ #define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */ +#define FSCTL_MARK_HANDLE 0x000900FC /* BB add struct */ #define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */ #define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */ #define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ +#define FSCTL_QUERY_ON_DISK_VOLUME_INFO 0x0009013C #define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C +#define FSCTL_QUERY_FILE_REGIONS 0x00090284 #define FSCTL_GET_REFS_VOLUME_DATA 0x000902D8 /* See MS-FSCC 2.3.24 */ #define FSCTL_SET_INTEGRITY_INFORMATION_EXT 0x00090380 #define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3 #define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b #define FSCTL_REFS_STREAM_SNAPSHOT_MANAGEMENT 0x00090440 #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF +#define FSCTL_OFFLOAD_READ 0x00094264 /* BB add struct */ +#define FSCTL_OFFLOAD_WRITE 0x00098268 /* BB add struct */ #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ #define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344 +#define FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX 0x000983E8 #define FSCTL_SIS_LINK_FILES 0x0009C104 #define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280 #define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ diff --git a/fs/sysv/super.c b/fs/sysv/super.c index d1def0771a40..3365a30dc1e0 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -312,7 +312,9 @@ static int complete_read_super(struct super_block *sb, int silent, int size) sbi->s_firstinodezone = 2; flavour_setup[sbi->s_type](sbi, &sb->s_max_links); - + if (sbi->s_firstdatazone < sbi->s_firstinodezone) + return 0; + sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; sbi->s_inodes_per_block = bsize >> 6; sbi->s_inodes_per_block_1 = (bsize >> 6)-1; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index aa0c47cb0d16..e943370107d0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,6 +29,7 @@ #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> +#include <linux/swapops.h> int sysctl_unprivileged_userfaultfd __read_mostly; @@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. + * changes under us. PTE markers should be handled the same as none + * ptes here. */ - if (huge_pte_none(pte)) + if (huge_pte_none_mostly(pte)) ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) ret = true; @@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte = pte_offset_map(pmd, address); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. + * changes under us. PTE markers should be handled the same as none + * ptes here. */ - if (pte_none(*pte)) + if (pte_none_mostly(*pte)) ret = true; if (!pte_write(*pte) && (reason & VM_UFFD_WP)) ret = true; @@ -1255,24 +1258,6 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* FIXME: add WP support to hugetlbfs and shmem */ - if (vm_flags & VM_UFFD_WP) { - if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) - return false; - } - - if (vm_flags & VM_UFFD_MINOR) { - if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) - return false; - } - - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} - static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1954,6 +1939,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; #endif +#ifndef CONFIG_PTE_MARKER_UFFD_WP + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; +#endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) |
